In [10]:
import json
import os

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

### 1. Scraping Teams Data

In [4]:
# Collect [team name, players-page URL] for every team listed on the index page.
data = []

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP error directly instead of letting it
# surface later as an obscure failure while parsing an error page.
response = requests.get('https://www.cricbuzz.com/cricket-team', timeout = 30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Data of all the teams
sp = soup.find('div', class_ = 'cb-col cb-col-100')

# Teams are displayed at the left and at the right; the two kinds of entries
# differ only in their last CSS class. Left entries are collected first and
# right entries second, which keeps the original row order of teams.csv.
for team_class in ('cb-col cb-col-50 cb-team-item cb-lst-itm cb-team-lft-item',
                   'cb-col cb-col-50 cb-team-item cb-lst-itm cb-team-rght-item'):
    for team_data in sp.find_all('div', class_ = team_class):
        team_name = team_data.text.strip()
        # '/players' is appended so the link points at the team's players page
        team_link = 'https://www.cricbuzz.com' + team_data.find('a').get('href') + '/players'
        data.append([team_name, team_link])

df = pd.DataFrame(data, columns = ['name','link'])

df.to_csv('teams.csv',index = False)
print('teams.csv is created!')

teams.csv is created!


### 2. Scraping Players Link

In [5]:
# Visit every team's players page and record [player, team, profile URL].
players_data = []

for team_name, team_link in tqdm(data):
    team_page = requests.get(team_link)
    soup = BeautifulSoup(team_page.content, 'html.parser')

    # Every <a> inside this container is treated as one player of the team
    roster = soup.find('div', class_ = 'cb-col-67 cb-col cb-left cb-top-zero')
    players_data.extend(
        [anchor.text.strip(), team_name, 'https://www.cricbuzz.com' + anchor.get('href').strip()]
        for anchor in roster.find_all('a')
    )

df = pd.DataFrame(players_data, columns = ['player','team','link'])
df.to_csv('players.csv', index = False)
print('players.csv is created!')

100%|███████████████████████████████████████████| 37/37 [00:28<00:00,  1.28it/s]

players.csv is created!





### 3. Scraping Players Details

In [6]:
# Scrape the profile page of every player into one row of cricket.csv.
final_data = []

# Output columns, in the exact order the values are placed in each row.
detail_columns = ['name','team','date_of_birth','palce_of_birth','role','height',
                  'test_bat_rank','odi_bat_rank','t20_bat_rank',
                  'test_bow_rank','odi_bow_rank','t20_bow_rank',
                  'link']

for name, team, link in tqdm(players_data):

    soup = BeautifulSoup(requests.get(link).content,'html.parser')

    try:
        # Container div that every field below is read from; looked up once
        # instead of once per field.
        info_panel = soup.find('div', class_ = 'cb-col cb-col-33 text-black')

        # Basic Information: the first three entries, stored as
        # date_of_birth, palce_of_birth and role
        basic_info = [cell.text.strip() for cell in info_panel.find_all('div', class_ = 'cb-col cb-col-60 cb-lst-itm-sm')[:3]]

        # Data of Height
        height = info_panel.find('div', class_ = 'cb-col cb-col-60').text.strip()

        # Ranks
        ranks = [cell.text.strip() for cell in info_panel.find_all('div', class_ = 'cb-col cb-col-25 cb-plyr-rank text-right')]

    except Exception:
        # Best effort: a page without the expected elements is skipped.
        # Catching Exception (not a bare except) keeps Ctrl-C able to stop
        # this long-running loop.
        continue

    details = [name, team, *basic_info, height, *ranks, link]

    # pandas pads short rows with missing values, which would shift 'link'
    # (and the ranks) into the wrong columns; skip rows of unexpected width.
    if len(details) != len(detail_columns):
        continue

    final_data.append(details)

df = pd.DataFrame(final_data, columns = detail_columns)

df.to_csv('cricket.csv', index = False)

100%|█████████████████████████████████████████| 453/453 [08:05<00:00,  1.07s/it]


### 4. Scraping Players Batting and Bowling history

In [9]:
# Profile URLs of the players saved in cricket.csv, as a plain Python list
links = [url for url in pd.read_csv('cricket.csv')['link'].values]

def get_col_names(table):
    """Return the whitespace-stripped text of every ``th`` element in ``table``."""
    headers = []
    for header_cell in table.find_all('th'):
        headers.append(header_cell.text.strip())
    return headers

def get_data(table):
    """Return the whitespace-stripped text of every ``td`` element in ``table``.

    The values come back as one flat list in the order ``find_all`` yields them.
    """
    values = []
    for data_cell in table.find_all('td'):
        values.append(data_cell.text.strip())
    return values

def get_rows(table):
    """Return the number of ``tr`` elements inside the ``tbody`` of ``table``."""
    body = table.find('tbody')
    body_rows = body.find_all('tr')
    return len(body_rows)
    
def get_cols(table):
    """Return the number of columns of ``table``, i.e. how many header names it has."""
    header_names = get_col_names(table)
    return len(header_names)

def create_df(table):
    """Build a DataFrame from ``table``.

    The flat list of cell texts is reshaped to (number of body rows, number of
    header cells) and the header texts become the column names. ``np.reshape``
    raises ``ValueError`` when the cell count does not match that shape.
    """
    cells = get_data(table)
    shape = (get_rows(table), get_cols(table))
    grid = np.reshape(cells, shape)
    return pd.DataFrame(grid, columns = get_col_names(table))

# Save each player's batting and bowling tables as CSV files under Scores/
# and remember the file paths (NaN for players whose page could not be used).
bat_paths = []
bow_paths = []

# to_csv does not create missing directories. Without this, every write fails,
# the handler below records NaN for every player, and the dropna() at the end
# silently produces an empty players_record.csv.
os.makedirs('Scores', exist_ok = True)

for link in tqdm(links):                                  # Going through each player data one by one
    try:
        soup = BeautifulSoup(requests.get(link).content,'lxml')

        # Search once: the first matching table is used as the batting table
        # and the second as the bowling table.
        tables = soup.find_all('table', class_ = 'table cb-col-100 cb-plyr-thead')
        bat_table, bow_table = tables[0], tables[1]

        df_bat = create_df(bat_table)
        df_bow = create_df(bow_table)

        # Second-to-last segment of the profile URL identifies the player
        player_id = link.split('/')[-2]

        bat_path = 'Scores/bat_' + player_id + '.csv'
        bow_path = 'Scores/bow_' + player_id + '.csv'

        df_bat.to_csv(bat_path, index = False)
        df_bow.to_csv(bow_path, index = False)

        # Appended only after both files were written, so the two lists always
        # stay the same length as `links`.
        bat_paths.append(bat_path)
        bow_paths.append(bow_path)

    except Exception:
        # Best effort: a player whose page cannot be fetched or parsed gets
        # NaN paths. Catching Exception (not a bare except) keeps Ctrl-C able
        # to stop this long-running loop.
        bat_paths.append(np.nan)
        bow_paths.append(np.nan)


df = pd.read_csv('cricket.csv')

df['bat_path'] = bat_paths
df['bow_path'] = bow_paths

# NOTE: dropna() removes rows with a missing value in ANY column, not only the
# rows whose score files are missing.
df = df.dropna()

df.to_csv('players_record.csv', index = False)

100%|█████████████████████████████████████████| 453/453 [06:42<00:00,  1.13it/s]


### 5. Merging the Records of Batting and Bowling Career of Players

In [11]:
# Player records; the last two columns are read below as the batting and bowling file paths
df = pd.read_csv(filepath_or_buffer = 'players_record.csv')

def get_cols(file_path, sep):
    """Build the flattened column names for one kind of score file.

    Reads the header of the CSV at ``file_path``, drops its first column and
    returns ``<sep>_<format>_<column>`` for each of Test, ODI, T20I and IPL
    (in that order), with the columns of one format kept together.

    NOTE: this reuses the name of the earlier ``get_cols(table)`` helper and
    replaces it once this cell has run.
    """
    stat_names = pd.read_csv(file_path).columns[1:]
    return [
        sep + "_" + match_format + "_" + stat_name
        for match_format in ['Test','ODI','T20I','IPL']
        for stat_name in stat_names
    ]

def get_data(file_path, seq_len):
    """Flatten one player's score file into a single list of values.

    Parameters
    ----------
    file_path : path of a CSV whose first column holds the format name
        ('Test', 'ODI', 'T20I', 'IPL', ...) and whose remaining columns hold
        the values for that format.
    seq_len : number of '-' placeholders used for a format that has no row
        in the file.

    Returns
    -------
    list
        The values of the Test, ODI, T20I and IPL rows, concatenated in that
        order. Rows for any other format are ignored; when a format occurs
        more than once the last row wins.

    NOTE: this reuses the name of the earlier ``get_data(table)`` helper and
    replaces it once this cell has run.
    """
    df_ = pd.read_csv(file_path)

    formats = ('Test', 'ODI', 'T20I', 'IPL')

    # Start every format with placeholders; rows found in the file replace them.
    stats = {match_format: ['-'] * seq_len for match_format in formats}

    for row in df_.values:
        if row[0] in stats:
            stats[row[0]] = row[1:]

    # Assembled after the loop: building it inside the loop left the result
    # undefined (UnboundLocalError) for a file without any data rows.
    final = []
    for match_format in formats:
        final.extend(stats[match_format])
    return final

# Flatten every player's batting and bowling files into one row per player.
# The last two columns of df hold the batting and bowling file paths.
# 13 and 12 are the number of '-' placeholders get_data fills in per missing
# format -- presumably the number of value columns in the batting and bowling
# files; confirm against the files in Scores/.
bat_data = [get_data(record[-2], 13) for record in df.values]
bow_data = [get_data(record[-1], 12) for record in df.values]

# Column names are taken from the files of one reference player (id 1413).
df_bat = pd.DataFrame(bat_data, columns = get_cols('Scores/bat_1413.csv', 'BT'))
df_bat.to_csv('bat.csv', index = False)

df_bow = pd.DataFrame(bow_data, columns = get_cols('Scores/bow_1413.csv', 'BW'))
df_bow.to_csv('ball.csv', index = False)

# Side-by-side join of the player records with both flattened tables
df_fin = pd.concat([df, df_bat, df_bow], axis = 'columns')
df_fin.to_csv('final.csv', index = False)

### 6. Creating Column Documentation JSON

In [12]:
# Build columns.json: a short description for every column of the final data,
# derived from the header tooltips of the first player's profile page.
link = pd.read_csv('cricket.csv')['link'].values[0]
soup = BeautifulSoup(requests.get(link).content,'lxml')

tables = soup.find_all('table')

# [column name, description] for every header except the first one.
# The name is stripped so the keys agree with column names built from stripped
# header text; a header without a 'title' falls back to its own name instead
# of raising TypeError during string concatenation.
batt_col = [[th.text.strip(), th.get('title') or th.text.strip()] for th in tables[0].find_all('th')][1:]
ball_col = [[th.text.strip(), th.get('title') or th.text.strip()] for th in tables[1].find_all('th')][1:]

dct = {}

# Batting columns are prefixed 'BT' and bowling columns 'BW'. Using 'BT' for
# both made the bowling entries overwrite batting entries with the same
# header and left every BW_* column undocumented.
for prefix, columns, role in (('BT', batt_col, 'as a battsman'),
                              ('BW', ball_col, 'as a baller')):
    for col_name, title in columns:
        for match_format in ['Test','ODI','T20I','IPL']:
            key = prefix + '_' + match_format + '_' + col_name
            dct[key] = match_format + ' ' + title + ' ' + role


for i in  ['name','team',
        'date_of_birth','palce_of_birth',
        'role',    'height',
        'test_bat_rank','odi_bat_rank','t20_bat_rank',
        'test_bow_rank','odi_bow_rank',
        't20_bow_rank','link', 'bat_path','bow_path']:

    dct[i] = i + ' of the player'

# 'with' closes the file even if serialisation fails
with open('columns.json','w') as fd:
    json.dump(dct, fd)