In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
from tqdm import tqdm

# Output directory for the generated CSV files: a "clean_data" folder that
# sits next to the directory this notebook is run from.
clean_data_path = os.path.join(os.pardir, "clean_data")

### Utility functions.

In [None]:
def get_from_obj(obj, key, default=""):
    """Return ``obj[key]``, falling back to ``default`` when the lookup fails.

    Args:
        obj: Container to read from (e.g. a dict or list). Values that do not
            support subscripting, such as ``None``, yield ``default``.
        key: Key or index to look up in ``obj``.
        default: Value returned when the lookup fails. Defaults to ``""`` so
            existing two-argument calls behave as before.

    Returns:
        ``obj[key]`` when the lookup succeeds, otherwise ``default``.
    """
    try:
        return obj[key]
    except (LookupError, TypeError):
        # LookupError covers a missing dict key (KeyError) and an out-of-range
        # index (IndexError); TypeError covers non-subscriptable objects and
        # unhashable keys. Anything else is a genuine bug and should surface.
        return default

In [None]:
# Scrape the IPL 2021 fixture list from ESPNcricinfo and write it to
# <clean_data_path>/ipl_schedule.csv, one row per match.
schedule = {}

URL = 'https://www.espncricinfo.com/series/ipl-2021-1249214/match-schedule-fixtures'
# Without a timeout, requests waits indefinitely on a stalled connection, and
# without raise_for_status() an HTTP error page would be parsed as if it were
# the fixture list.
page = requests.get(URL, timeout=30)
page.raise_for_status()
page_content = BeautifulSoup(page.content, 'html.parser')
matches = page_content.find_all('div', class_='match-info match-info-FIXTURES')
if not matches:
    # Fail loudly rather than silently overwriting the CSV with an empty file.
    raise RuntimeError(
        "No fixtures found at {}; the page layout may have changed".format(URL)
    )

# Matches are numbered from 1 in the order they appear on the page.
for match_no, match in enumerate(matches, start=1):
    status = match.find('div', class_='status')
    time_tag = status.find('span') if status is not None else None
    description = match.find('div', class_='description')
    # The venue is taken from the second comma-separated field of the description.
    description_parts = description.text.split(",") if description is not None else []
    teams = match.find_all('p', class_='name')
    if len(teams) < 2:
        raise ValueError(
            "Match {} does not list two teams; the page layout may have changed".format(match_no)
        )
    schedule[match_no] = {
        "match_no" : match_no,
        # A fixture without a status/time element gets an empty time.
        "time" : time_tag.text if time_tag is not None else "",
        # strip() removes the whitespace surrounding the field (e.g. the space
        # that follows the comma); a description without a second field gets
        # an empty venue instead of raising IndexError.
        "venue" : description_parts[1].strip() if len(description_parts) > 1 else "",
        "team_1" : teams[0].text,
        "team_2" : teams[1].text
    }

df_ipl_schedule_csv = pd.DataFrame.from_dict(schedule, orient="index")
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(clean_data_path, exist_ok=True)
df_ipl_schedule_csv.to_csv(os.path.join(clean_data_path, "ipl_schedule.csv"), index=False)

In [None]:
# Maps each team's short code to its ESPNcricinfo squad page. Every URL has
# the same prefix and ".html" suffix; only the numeric squad id differs.
team_urls = {
    team_code: "https://www.espncricinfo.com/ci/content/squad/" + squad_id + ".html"
    for team_code, squad_id in (
        ("csk", "1252150"),
        ("dc", "1252198"),
        ("kkr", "1252188"),
        ("mi", "1252149"),
        ("pk", "1252194"),
        ("rr", "1252201"),
        ("rcb", "1252176"),
        ("srh", "1252199"),
    )
}

In [None]:
# Scrape every squad page listed in team_urls, follow each player's profile
# page, and write the combined roster to <clean_data_path>/ipl_squads.csv,
# one row per player.
ipl_squads_csv = {}
player_id = 1
# A single Session reuses the underlying connection for the many requests
# made to the same host.
with requests.Session() as session:
    for team in tqdm(team_urls):
        team_url = team_urls[team]
        # The timeout keeps a stalled connection from hanging the cell, and
        # raise_for_status() stops an HTTP error page from being parsed as a squad.
        team_page = session.get(team_url, timeout=30)
        team_page.raise_for_status()
        team_page_content = BeautifulSoup(team_page.content, 'html.parser')
        main = team_page_content.find_all('div', class_='squads_main')
        if not main:
            raise ValueError(
                "No squad header found at {}; the page layout may have changed".format(team_url)
            )
        # Keep the part of the heading before " / " and drop its last six
        # characters (presumably a trailing " Squad" - TODO confirm against the page).
        team_name = main[0].find_all('h1')[0].text.split(" / ")[0][:-6]
        players = team_page_content.find_all('div', class_='large-13')
        # leave=False removes each finished per-team bar so only the outer bar remains.
        for player in tqdm(players, desc=team, leave=False):
            player_header = player.find('a')
            # The id is the last URL segment minus its final five characters
            # (presumably the ".html" extension - TODO confirm).
            espn_player_id = player_header['href'].split("/")[-1][:-5]
            player_name = player_header.text.strip()
            player_display_name = "" # This needs to be updated later with fuzzy matching

            player_url = 'https://www.espncricinfo.com/ci/content/player/' + espn_player_id +'.html'
            player_page = session.get(player_url, timeout=30)
            player_page.raise_for_status()
            player_page_content = BeautifulSoup(player_page.content, 'html.parser')
            player_infos = player_page_content.find_all('p', class_='ciPlayerinformationtxt')
            # Each info paragraph holds a <b> label and a <span> value;
            # paragraphs missing either part are skipped instead of raising.
            player_info_map = {}
            for info in player_infos:
                label = info.find('b')
                value = info.find('span')
                if label is None or value is None:
                    continue
                player_info_map[label.text] = value.text
            ipl_squads_csv[player_id] = {
                "player_id" : player_id,
                "team_name" : team_name,
                "player_name" : player_name,
                "player_display_name" : player_display_name,
                # Read through get_from_obj like the other optional fields, so
                # a profile without a full name yields "" instead of KeyError.
                "player_full_name" : get_from_obj(player_info_map, "Full name"),
                "batting_style" : get_from_obj(player_info_map, "Batting style"),
                "bowling_style" : get_from_obj(player_info_map, "Bowling style"),
                # NOTE(review): "playling_role" looks like a typo for
                # "playing_role". It is left unchanged because it is the CSV
                # column name and readers of the file may depend on it.
                "playling_role" : get_from_obj(player_info_map, "Playing role")
            }
            player_id += 1

if not ipl_squads_csv:
    # Fail loudly rather than silently overwriting the CSV with an empty file.
    raise RuntimeError("No players were scraped; the page layout may have changed")

df_ipl_squads_csv = pd.DataFrame.from_dict(ipl_squads_csv, orient="index")
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(clean_data_path, exist_ok=True)
df_ipl_squads_csv.to_csv(os.path.join(clean_data_path, "ipl_squads.csv"), index=False)