In [1]:
import os
import import_ipynb
import Connections as conn
import cx_Oracle
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import json
import time
import datetime
import yaml

importing Jupyter notebook from Connections.ipynb


### Define functions

In [4]:
# Open a new session against the Oracle database
def orcl_conn():
    """Return a new cx_Oracle connection.

    The connect string is obtained from ``conn.conn_string()`` (the imported
    ``Connections`` notebook) and passed straight to ``cx_Oracle.connect``.
    The connection is not closed here; that is left to the caller.
    """
    return cx_Oracle.connect(conn.conn_string())

# Fetch matches list based on files present in directory
def match_list_file(input_path=r'C:/Users/ninju/OneDrive/Desktop/Cricket_Analysis/Cricsheet data/Cricsheet_Input'):
    """List the entries of the match input directory.

    Parameters
    ----------
    input_path : str, optional
        Directory to scan. Defaults to the original hard-coded input folder,
        so existing zero-argument calls keep working; pass another path to
        run the notebook on a different machine or data set.

    Returns
    -------
    tuple
        ``(input_path, match_list)`` where ``match_list`` holds every entry
        name returned by ``os.listdir(input_path)``, sorted by name so that
        repeated runs process matches in the same order.

    Raises
    ------
    OSError
        If ``input_path`` does not exist or cannot be read.
    """
    match_list = sorted(os.listdir(input_path))
    return (input_path, match_list)

# Log match ids for which execution didn't complete due to errors
def error_log(mid,err,connection):
    """Record a failed match load in ``temp_tgt_dbo.error_log``.

    Parameters
    ----------
    mid
        Match identifier, bound to the ``match_id`` column.
    err
        The error that stopped the load; stored as ``str(err)`` in ``error_msg``.
    connection
        Open database connection; its ``cursor()`` and ``commit()`` are used.

    Raises
    ------
    Exception
        Anything raised by the insert or the commit propagates to the caller.
        The cursor is closed in that case too.
    """
    print('Error while loading match {match} data'.format(match=mid))
    print('Error logging begins for match {match}'.format(match=mid))
    cursor = connection.cursor()
    try:
        sql_qry = "insert into temp_tgt_dbo.error_log (match_id,error_msg) values (:1,:2)"
        cursor.execute(sql_qry,(mid,str(err)))
        connection.commit()
    finally:
        # Release the cursor even when the insert or commit fails.
        cursor.close()
    print('Error logging successful')
    print(' ')

### Read the YAML files, parse them into pandas DataFrames, and insert the rows into Oracle tables

In [5]:
# Column labels of the intermediate DataFrames; the order matches the bind
# positions (:1, :2, ...) of the corresponding INSERT statement below.
MATCHES_COLUMNS = ['match_id','tournament','gender','match_type','overs','match_date','team1','team2','venue','city',
                   'winner','margin_type','margin_number','player_of_match','toss_winner','toss_decision',
                   'umpire1','umpire2']
BBB_COLUMNS = ['MATCH_ID','INNINGS','BATTING_TEAM','BOWLING_TEAM','BALL','STRIKER','NON_STRIKER','BOWLER',
               'RUNS_OFF_BAT','EXTRAS','TOTAL_BALL_RUNS','EXTRAS_TYPE','WICKET_TYPE','PLAYER_DISMISSED']

MATCHES_INSERT_SQL = ("INSERT INTO temp_tgt_dbo.matches (MATCH_ID,TOURNAMENT,GENDER,MATCH_TYPE,OVERS,MATCH_DATE,TEAM_1,TEAM_2,VENUE,CITY,"
                      "WINNER,MARGIN_TYPE,MARGIN_NUMBER,PLAYER_OF_MATCH,TOSS_WINNER,TOSS_DECISION,UMPIRE_1,UMPIRE_2)"
                      "VALUES (:1,:2,:3,:4,:5,:6,:7,:8,:9,:10,:11,:12,:13,:14,:15,:16,:17,:18)")
BBB_INSERT_SQL = ("INSERT INTO temp_tgt_dbo.bbb_data (MATCH_ID,INNINGS,BATTING_TEAM,BOWLING_TEAM,BALL,STRIKER,NON_STRIKER,"
                  "BOWLER,RUNS_OFF_BAT,EXTRAS,TOTAL_BALL_RUNS,EXTRAS_TYPE,WICKET_TYPE,PLAYER_DISMISSED)"
                  "VALUES (:1,:2,:3,:4,:5,:6,:7,:8,:9,:10,:11,:12,:13,:14)")


def _first_item(mapping):
    """Return the first ``(key, value)`` pair of ``mapping`` (IndexError if it is empty)."""
    return list(mapping.items())[0]


def _records_to_rows(records, columns):
    """Turn a list of records into a list of tuples ready for ``executemany``.

    The records go through a DataFrame so that missing values (NaN) are
    replaced by ``None`` before being bound. An empty ``records`` list
    yields an empty list.
    """
    frame = pd.DataFrame(records, columns=columns)
    frame = frame.where(pd.notnull(frame), None)
    return [tuple(values) for values in frame.values]


def _parse_match_record(match_id, info):
    """Build one ``matches`` record (ordered as MATCHES_COLUMNS) from the YAML ``info`` section."""
    tournament = info.get('competition', 'International')

    # The first match date is either already a date object or a 'YYYY-MM-DD' string.
    first_date = info['dates'][0]
    if not isinstance(first_date, datetime.date):
        first_date = datetime.datetime.strptime(first_date, '%Y-%m-%d')
    match_date = first_date.strftime('%d-%b-%Y')

    outcome = info['outcome']
    margin_type = None
    margin_number = None
    if 'winner' in outcome:
        winner = outcome['winner']
        if 'by' in outcome:
            # Only the first entry of 'by' is stored.
            margin_type, margin_number = _first_item(outcome['by'])
    else:
        winner = outcome['result']
        if 'eliminator' in outcome:
            # NOTE(review): the 'eliminator' value is stored in MARGIN_NUMBER as-is;
            # confirm the column type accepts whatever that field holds.
            margin_type = 'eliminator'
            margin_number = outcome['eliminator']

    player_of_match = info['player_of_match'][0] if 'player_of_match' in info else None

    # Pad with None so a file listing fewer than two umpires does not fail the whole match.
    umpires = list(info['umpires']) if 'umpires' in info else []
    umpire1, umpire2 = (umpires + [None, None])[:2]

    return [match_id, tournament, info['gender'], info['match_type'], info['overs'], match_date,
            info['teams'][0], info['teams'][1], info['venue'], info.get('city'), winner, margin_type,
            margin_number, player_of_match, info['toss']['winner'], info['toss']['decision'],
            umpire1, umpire2]


def _parse_bbb_records(match_id, innings, team1, team2):
    """Build one ball-by-ball record (ordered as BBB_COLUMNS) per delivery of every innings."""
    records = []
    for inn_dict in innings:
        inns, inn_data = _first_item(inn_dict)
        batting_team = inn_data['team']
        bowling_team = team2 if batting_team == team1 else team1
        for ball_dict in inn_data['deliveries']:
            ball_key, ball = _first_item(ball_dict)

            if 'wicket' in ball:
                wicket_type = ball['wicket']['kind']
                player_dismissed = ball['wicket']['player_out']
            else:
                wicket_type = None
                player_dismissed = None

            # Only the first extras kind of a delivery is stored.
            extras_type = _first_item(ball['extras'])[0] if 'extras' in ball else None

            records.append([match_id, inns, batting_team, bowling_team, float(ball_key), ball['batsman'],
                            ball['non_striker'], ball['bowler'], ball['runs']['batsman'],
                            ball['runs']['extras'], ball['runs']['total'], extras_type, wicket_type,
                            player_dismissed])
    return records


def _parse_match_file(input_path, match, match_id):
    """Read one match YAML file and return ``(match_rows, bbb_rows)`` ready for insertion."""
    # NOTE(review): yaml.FullLoader can construct more than plain data types; if these
    # files could ever come from an untrusted source, consider yaml.safe_load instead.
    with open(r'{path}/{match}'.format(path=input_path,match=match)) as yaml_file:
        yaml_dict = yaml.load(yaml_file, Loader=yaml.FullLoader)
    print('Parsing file {match}'.format(match=match))

    # Parsing Matches data
    print('Loading match data')
    info = yaml_dict['info']
    match_rows = _records_to_rows([_parse_match_record(match_id, info)], MATCHES_COLUMNS)

    # Parsing BBB data
    print('Loading BBB data')
    bbb_records = _parse_bbb_records(match_id, yaml_dict['innings'], info['teams'][0], info['teams'][1])
    bbb_rows = _records_to_rows(bbb_records, BBB_COLUMNS)
    return match_rows, bbb_rows


def _insert_match(connection, match_rows, bbb_rows):
    """Insert the match row and its ball-by-ball rows, committing once at the end."""
    cursor = connection.cursor()
    try:
        cursor.executemany(MATCHES_INSERT_SQL, match_rows)
        if bbb_rows:
            # Skip the call for a match without deliveries instead of re-sending stale rows.
            cursor.executemany(BBB_INSERT_SQL, bbb_rows)
        connection.commit()
    finally:
        cursor.close()


input_path,match_list=match_list_file()
connection = orcl_conn()
try:
    for match in match_list:
        # Resolve the id up front so the error log always refers to the file being processed.
        try:
            match_id = int(match.split('.')[0])
        except ValueError:
            print('Skipping {match}: file name does not start with a numeric match id'.format(match=match))
            print(' ')
            continue

        try:
            match_rows, bbb_rows = _parse_match_file(input_path, match, match_id)
            _insert_match(connection, match_rows, bbb_rows)
        except Exception as err:
            # Discard uncommitted inserts so a match is never left half-loaded, then record the failure.
            connection.rollback()
            error_log(match_id,err,connection)
        else:
            print('Parsing completed for match {match}'.format(match=match))
            print(' ')

    print('Parsing done successfully')
finally:
    # Always release the database session, even if the loop aborts.
    connection.close()

Parsing file 1023655.yaml
Loading match data
Loading BBB data
Parsing completed for match 1023655.yaml
 
Parsing file 1023659.yaml
Loading match data
Loading BBB data
Parsing completed for match 1023659.yaml
 
Parsing file 1023661.yaml
Loading match data
Loading BBB data
Parsing completed for match 1023661.yaml
 
Parsing file 1023667.yaml
Loading match data
Loading BBB data
Parsing completed for match 1023667.yaml
 
Parsing file 1023671.yaml
Loading match data
Loading BBB data
Parsing completed for match 1023671.yaml
 
Parsing file 1023673.yaml
Loading match data
Loading BBB data
Parsing completed for match 1023673.yaml
 
Parsing file 1023675.yaml
Loading match data
Loading BBB data
Parsing completed for match 1023675.yaml
 
Parsing file 1023677.yaml
Loading match data
Loading BBB data
Parsing completed for match 1023677.yaml
 
Parsing file 1023679.yaml
Loading match data
Loading BBB data
Parsing completed for match 1023679.yaml
 
Parsing file 1023681.yaml
Loading match data
Loading BB

Parsing completed for match 1118520.yaml
 
Parsing file 1118521.yaml
Loading match data
Loading BBB data
Parsing completed for match 1118521.yaml
 
Parsing file 1118522.yaml
Loading match data
Loading BBB data
Parsing completed for match 1118522.yaml
 
Parsing file 1118523.yaml
Loading match data
Loading BBB data
Parsing completed for match 1118523.yaml
 
Parsing file 1118524.yaml
Loading match data
Loading BBB data
Parsing completed for match 1118524.yaml
 
Parsing file 1118525.yaml
Loading match data
Loading BBB data
Parsing completed for match 1118525.yaml
 
Parsing file 1118526.yaml
Loading match data
Loading BBB data
Parsing completed for match 1118526.yaml
 
Parsing file 1118527.yaml
Loading match data
Loading BBB data
Parsing completed for match 1118527.yaml
 
Parsing file 1118528.yaml
Loading match data
Loading BBB data
Parsing completed for match 1118528.yaml
 
Parsing file 1118529.yaml
Loading match data
Loading BBB data
Parsing completed for match 1118529.yaml
 
Parsing file

Parsing file 1188392.yaml
Loading match data
Loading BBB data
Parsing completed for match 1188392.yaml
 
Parsing file 1188393.yaml
Loading match data
Loading BBB data
Parsing completed for match 1188393.yaml
 
Parsing file 1188394.yaml
Loading match data
Loading BBB data
Parsing completed for match 1188394.yaml
 
Parsing file 1188395.yaml
Loading match data
Loading BBB data
Parsing completed for match 1188395.yaml
 
Parsing file 1188396.yaml
Loading match data
Loading BBB data
Parsing completed for match 1188396.yaml
 
Parsing file 1188397.yaml
Loading match data
Loading BBB data
Parsing completed for match 1188397.yaml
 
Parsing file 1188398.yaml
Loading match data
Loading BBB data
Parsing completed for match 1188398.yaml
 
Parsing file 1188399.yaml
Loading match data
Loading BBB data
Parsing completed for match 1188399.yaml
 
Parsing file 1188401.yaml
Loading match data
Loading BBB data
Parsing completed for match 1188401.yaml
 
Parsing file 1188403.yaml
Loading match data
Loading BB

Parsing completed for match 1226945.yaml
 
Parsing file 1226946.yaml
Loading match data
Loading BBB data
Parsing completed for match 1226946.yaml
 
Parsing file 1226947.yaml
Loading match data
Loading BBB data
Parsing completed for match 1226947.yaml
 
Parsing file 1226948.yaml
Loading match data
Loading BBB data
Parsing completed for match 1226948.yaml
 
Parsing file 1226949.yaml
Loading match data
Loading BBB data
Parsing completed for match 1226949.yaml
 
Parsing file 1226950.yaml
Loading match data
Loading BBB data
Parsing completed for match 1226950.yaml
 
Parsing file 1226951.yaml
Loading match data
Loading BBB data
Parsing completed for match 1226951.yaml
 
Parsing file 1226952.yaml
Loading match data
Loading BBB data
Parsing completed for match 1226952.yaml
 
Parsing file 1226953.yaml
Loading match data
Loading BBB data
Parsing completed for match 1226953.yaml
 
Parsing file 1226954.yaml
Loading match data
Loading BBB data
Parsing completed for match 1226954.yaml
 
Parsing file

### Load player data and additional match data from the Cricinfo API response

In [None]:
input_path,match_list=match_list_file()
for match in match_list:
    try:
        match_id=int(match.split('.')[0])
        driver = webdriver.Chrome("../chromedriver_win32/chromedriver.exe")

        url_page_hca_scr = 'https://hs-consumer-api.espncricinfo.com/v1/pages/match/scorecard?seriesId=366616&matchId={match}'.format(match=match_id)
        driver.get(url_page_hca_scr)
        content_hca_scr = driver.page_source
        page_hca_scr = BeautifulSoup(content_hca_scr)
        hca_scr=json.loads(page_hca_scr.text)

        print('Extracting Match {} data'.format(match_id))
        start_time = time.time()
        
        