In [24]:
# Importing necessary libraries

import numpy as np
import pandas as pd
from yaml import safe_load
import os
from tqdm import tqdm

In [25]:
# Creating an empty list to store file names with their paths
names_of_files = []

# Iterating through the files in the 'odi_dataset' directory
for file in os.listdir('odi_dataset'):
    names_of_files.append(os.path.join('odi_dataset', file)) # Appending the absolute path of each file to the list

In [26]:
# Showing the names of the files in the directory
names_of_files[0:5]

['odi_dataset\\1000887.yaml',
 'odi_dataset\\1000889.yaml',
 'odi_dataset\\1000891.yaml',
 'odi_dataset\\1000893.yaml',
 'odi_dataset\\1000895.yaml']

In [27]:
# Initializing an empty list to store DataFrame
dataframe_1 = pd.DataFrame()

# Initializing a counter for a new column - 'match_id'
counter = 1

# Iterating through the files with a progress bar(tqdm)
for file in tqdm(names_of_files):
    with open(file, 'r') as f:
        # Load YAML file, normalize to DataFrame, and add 'match_id' column
        df = pd.json_normalize(safe_load(f))
        df['match_id'] = counter
        dataframe_1 = pd.concat([dataframe_1, df], ignore_index=True)
        counter += 1

# Display the resulting DataFrame
dataframe_1

100%|██████████| 2822/2822 [24:12<00:00,  1.94it/s]


Unnamed: 0,innings,meta.data_version,meta.created,meta.revision,info.balls_per_over,info.city,info.dates,info.gender,info.match_type,info.outcome.by.runs,...,info.registry.people.SP Terry,info.registry.people.DJ Worrall,info.registry.people.CP Tremain,info.registry.people.JM Mennie,info.registry.people.DS Weerakkody,info.registry.people.IG Warne,info.registry.people.Rokhan Barakzai,info.registry.people.CC Dalton,info.registry.people.S Wijesundera,info.registry.people.U Raymond-Hoey
0,"[{'1st innings': {'team': 'Australia', 'delive...",0.91,2017-01-14,2,6,Brisbane,[2017-01-13],male,ODI,92.0,...,,,,,,,,,,
1,"[{'1st innings': {'team': 'Australia', 'delive...",0.91,2017-01-18,1,6,,[2017-01-15],male,ODI,,...,,,,,,,,,,
2,"[{'1st innings': {'team': 'Pakistan', 'deliver...",0.91,2017-01-21,1,6,Perth,[2017-01-19],male,ODI,,...,,,,,,,,,,
3,"[{'1st innings': {'team': 'Australia', 'delive...",0.91,2017-01-22,1,6,,[2017-01-22],male,ODI,86.0,...,,,,,,,,,,
4,"[{'1st innings': {'team': 'Australia', 'delive...",0.91,2017-01-28,1,6,,[2017-01-26],male,ODI,57.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2817,"[{'1st innings': {'team': 'Afghanistan', 'deli...",0.91,2016-07-13,1,6,Belfast,[2016-07-12],male,ODI,39.0,...,,,,,,,,,,
2818,"[{'1st innings': {'team': 'Afghanistan', 'deli...",0.91,2016-07-20,1,6,Belfast,[2016-07-17],male,ODI,79.0,...,7f87204d,,,,,,,,,
2819,"[{'1st innings': {'team': 'Ireland', 'deliveri...",0.91,2016-07-20,1,6,Belfast,[2016-07-19],male,ODI,12.0,...,7f87204d,,,,,,,,,
2820,"[{'1st innings': {'team': 'Scotland', 'deliver...",0.91,2016-08-20,1,6,Edinburgh,[2016-08-14],male,ODI,98.0,...,,,,,,,,,,


In [43]:
# Checking if any column names contain 'weather'
weather_columns = [col for col in dataframe_1.columns if 'weather' in col.lower()]

# Checking if any column names contain 'condition'
condition_columns = [col for col in dataframe_1.columns if 'condition' in col.lower()]

# Checking if any column names contain 'pitch'
pitch_columns = [col for col in dataframe_1.columns if 'pitch' in col.lower()]

# Displaying the columns containing 'weather'
print(weather_columns)

# Displaying the columns containing 'condition'
print(condition_columns)

# Displaying the columns containing 'pitch'
print(pitch_columns)

[]
[]
['info.registry.people.OL Pitcher', 'info.registry.people.ADJ Pitcher', 'info.registry.people.AR Pitcher']


In [44]:
# Dropping 'info.registry.' type columns
dataframe_1 = dataframe_1.loc[:, ~dataframe_1.columns.str.startswith('info.registry.people.')] 

In [45]:
dataframe_1

Unnamed: 0,innings,meta.data_version,meta.created,meta.revision,info.balls_per_over,info.city,info.dates,info.gender,info.match_type,info.outcome.by.runs,...,info.supersubs.Bangladesh,info.supersubs.New Zealand,info.supersubs.India,info.players.ICC World XI,info.supersubs.ICC World XI,info.supersubs.South Africa,info.supersubs.Pakistan,info.supersubs.West Indies,info.players.Bermuda,info.players.Kenya
0,"[{'1st innings': {'team': 'Australia', 'delive...",0.91,2017-01-14,2,6,Brisbane,[2017-01-13],male,ODI,92.0,...,,,,,,,,,,
1,"[{'1st innings': {'team': 'Australia', 'delive...",0.91,2017-01-18,1,6,,[2017-01-15],male,ODI,,...,,,,,,,,,,
2,"[{'1st innings': {'team': 'Pakistan', 'deliver...",0.91,2017-01-21,1,6,Perth,[2017-01-19],male,ODI,,...,,,,,,,,,,
3,"[{'1st innings': {'team': 'Australia', 'delive...",0.91,2017-01-22,1,6,,[2017-01-22],male,ODI,86.0,...,,,,,,,,,,
4,"[{'1st innings': {'team': 'Australia', 'delive...",0.91,2017-01-28,1,6,,[2017-01-26],male,ODI,57.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2817,"[{'1st innings': {'team': 'Afghanistan', 'deli...",0.91,2016-07-13,1,6,Belfast,[2016-07-12],male,ODI,39.0,...,,,,,,,,,,
2818,"[{'1st innings': {'team': 'Afghanistan', 'deli...",0.91,2016-07-20,1,6,Belfast,[2016-07-17],male,ODI,79.0,...,,,,,,,,,,
2819,"[{'1st innings': {'team': 'Ireland', 'deliveri...",0.91,2016-07-20,1,6,Belfast,[2016-07-19],male,ODI,12.0,...,,,,,,,,,,
2820,"[{'1st innings': {'team': 'Scotland', 'deliver...",0.91,2016-08-20,1,6,Edinburgh,[2016-08-14],male,ODI,98.0,...,,,,,,,,,,


In [47]:
# Dropping 'info.supersubs.' type columns
dataframe_1 = dataframe_1.loc[:, ~dataframe_1.columns.str.startswith('info.supersubs.')] 

In [48]:
dataframe_1

Unnamed: 0,innings,meta.data_version,meta.created,meta.revision,info.balls_per_over,info.city,info.dates,info.gender,info.match_type,info.outcome.by.runs,...,info.players.Oman,info.players.Netherlands,info.players.Thailand,info.players.Canada,info.players.Jersey,info.players.Africa XI,info.players.Asia XI,info.players.ICC World XI,info.players.Bermuda,info.players.Kenya
0,"[{'1st innings': {'team': 'Australia', 'delive...",0.91,2017-01-14,2,6,Brisbane,[2017-01-13],male,ODI,92.0,...,,,,,,,,,,
1,"[{'1st innings': {'team': 'Australia', 'delive...",0.91,2017-01-18,1,6,,[2017-01-15],male,ODI,,...,,,,,,,,,,
2,"[{'1st innings': {'team': 'Pakistan', 'deliver...",0.91,2017-01-21,1,6,Perth,[2017-01-19],male,ODI,,...,,,,,,,,,,
3,"[{'1st innings': {'team': 'Australia', 'delive...",0.91,2017-01-22,1,6,,[2017-01-22],male,ODI,86.0,...,,,,,,,,,,
4,"[{'1st innings': {'team': 'Australia', 'delive...",0.91,2017-01-28,1,6,,[2017-01-26],male,ODI,57.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2817,"[{'1st innings': {'team': 'Afghanistan', 'deli...",0.91,2016-07-13,1,6,Belfast,[2016-07-12],male,ODI,39.0,...,,,,,,,,,,
2818,"[{'1st innings': {'team': 'Afghanistan', 'deli...",0.91,2016-07-20,1,6,Belfast,[2016-07-17],male,ODI,79.0,...,,,,,,,,,,
2819,"[{'1st innings': {'team': 'Ireland', 'deliveri...",0.91,2016-07-20,1,6,Belfast,[2016-07-19],male,ODI,12.0,...,,,,,,,,,,
2820,"[{'1st innings': {'team': 'Scotland', 'deliver...",0.91,2016-08-20,1,6,Edinburgh,[2016-08-14],male,ODI,98.0,...,,,,,,,,,,


In [49]:
# Dropping 'info.players.' type columns
dataframe_1 = dataframe_1.loc[:, ~dataframe_1.columns.str.startswith('info.players.')]  

In [50]:
dataframe_1

Unnamed: 0,innings,meta.data_version,meta.created,meta.revision,info.balls_per_over,info.city,info.dates,info.gender,info.match_type,info.outcome.by.runs,...,info.toss.winner,info.umpires,info.venue,match_id,info.outcome.by.wickets,info.outcome.result,info.outcome.method,info.neutral_venue,info.match_type_number,info.outcome.eliminator
0,"[{'1st innings': {'team': 'Australia', 'delive...",0.91,2017-01-14,2,6,Brisbane,[2017-01-13],male,ODI,92.0,...,Australia,"[MD Martell, C Shamshuddin]","Brisbane Cricket Ground, Woolloongabba",1,,,,,,
1,"[{'1st innings': {'team': 'Australia', 'delive...",0.91,2017-01-18,1,6,,[2017-01-15],male,ODI,,...,Australia,"[CB Gaffaney, P Wilson]",Melbourne Cricket Ground,2,6.0,,,,,
2,"[{'1st innings': {'team': 'Pakistan', 'deliver...",0.91,2017-01-21,1,6,Perth,[2017-01-19],male,ODI,,...,Australia,"[SD Fry, C Shamshuddin]",Western Australia Cricket Association Ground,3,7.0,,,,,
3,"[{'1st innings': {'team': 'Australia', 'delive...",0.91,2017-01-22,1,6,,[2017-01-22],male,ODI,86.0,...,Australia,"[CB Gaffaney, MD Martell]",Sydney Cricket Ground,4,,,,,,
4,"[{'1st innings': {'team': 'Australia', 'delive...",0.91,2017-01-28,1,6,,[2017-01-26],male,ODI,57.0,...,Australia,"[SD Fry, C Shamshuddin]",Adelaide Oval,5,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2817,"[{'1st innings': {'team': 'Afghanistan', 'deli...",0.91,2016-07-13,1,6,Belfast,[2016-07-12],male,ODI,39.0,...,Ireland,"[AJ Neill, IN Ramage]","Civil Service Cricket Club, Stormont",2818,,,,,,
2818,"[{'1st innings': {'team': 'Afghanistan', 'deli...",0.91,2016-07-20,1,6,Belfast,[2016-07-17],male,ODI,79.0,...,Afghanistan,"[AJ Neill, C Shamshuddin]","Civil Service Cricket Club, Stormont",2819,,,,,,
2819,"[{'1st innings': {'team': 'Ireland', 'deliveri...",0.91,2016-07-20,1,6,Belfast,[2016-07-19],male,ODI,12.0,...,Afghanistan,"[R Black, C Shamshuddin]","Civil Service Cricket Club, Stormont",2820,,,,,,
2820,"[{'1st innings': {'team': 'Scotland', 'deliver...",0.91,2016-08-20,1,6,Edinburgh,[2016-08-14],male,ODI,98.0,...,United Arab Emirates,"[DA Haggo, M Hawthorne]","Grange Cricket Club Ground, Raeburn Place",2821,,,,,,


In [51]:
# Dropping 'meta.data_version', 'meta.created', 'meta.revision' columns
dataframe_1 = dataframe_1.drop(columns=['meta.data_version', 'meta.created', 'meta.revision']) 

In [52]:
dataframe_1

Unnamed: 0,innings,info.balls_per_over,info.city,info.dates,info.gender,info.match_type,info.outcome.by.runs,info.outcome.winner,info.overs,info.player_of_match,...,info.toss.winner,info.umpires,info.venue,match_id,info.outcome.by.wickets,info.outcome.result,info.outcome.method,info.neutral_venue,info.match_type_number,info.outcome.eliminator
0,"[{'1st innings': {'team': 'Australia', 'delive...",6,Brisbane,[2017-01-13],male,ODI,92.0,Australia,50,[MS Wade],...,Australia,"[MD Martell, C Shamshuddin]","Brisbane Cricket Ground, Woolloongabba",1,,,,,,
1,"[{'1st innings': {'team': 'Australia', 'delive...",6,,[2017-01-15],male,ODI,,Pakistan,50,[Mohammad Hafeez],...,Australia,"[CB Gaffaney, P Wilson]",Melbourne Cricket Ground,2,6.0,,,,,
2,"[{'1st innings': {'team': 'Pakistan', 'deliver...",6,Perth,[2017-01-19],male,ODI,,Australia,50,[SPD Smith],...,Australia,"[SD Fry, C Shamshuddin]",Western Australia Cricket Association Ground,3,7.0,,,,,
3,"[{'1st innings': {'team': 'Australia', 'delive...",6,,[2017-01-22],male,ODI,86.0,Australia,50,[DA Warner],...,Australia,"[CB Gaffaney, MD Martell]",Sydney Cricket Ground,4,,,,,,
4,"[{'1st innings': {'team': 'Australia', 'delive...",6,,[2017-01-26],male,ODI,57.0,Australia,50,[DA Warner],...,Australia,"[SD Fry, C Shamshuddin]",Adelaide Oval,5,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2817,"[{'1st innings': {'team': 'Afghanistan', 'deli...",6,Belfast,[2016-07-12],male,ODI,39.0,Afghanistan,50,,...,Ireland,"[AJ Neill, IN Ramage]","Civil Service Cricket Club, Stormont",2818,,,,,,
2818,"[{'1st innings': {'team': 'Afghanistan', 'deli...",6,Belfast,[2016-07-17],male,ODI,79.0,Afghanistan,50,,...,Afghanistan,"[AJ Neill, C Shamshuddin]","Civil Service Cricket Club, Stormont",2819,,,,,,
2819,"[{'1st innings': {'team': 'Ireland', 'deliveri...",6,Belfast,[2016-07-19],male,ODI,12.0,Ireland,50,,...,Afghanistan,"[R Black, C Shamshuddin]","Civil Service Cricket Club, Stormont",2820,,,,,,
2820,"[{'1st innings': {'team': 'Scotland', 'deliver...",6,Edinburgh,[2016-08-14],male,ODI,98.0,Scotland,50,,...,United Arab Emirates,"[DA Haggo, M Hawthorne]","Grange Cricket Club Ground, Raeburn Place",2821,,,,,,


In [72]:
#  Dropping 'info.outcome.eliminator', 'info.outcome.result', 'info.outcome.method', 'info.neutral_venue', 'info.match_type_number', 'info.outcome.by.wickets', 'info.outcome.by.runs' columns
columns_to_drop = ['info.outcome.eliminator', 'info.outcome.result', 'info.outcome.method', 'info.neutral_venue', 'info.match_type_number', 'info.outcome.by.wickets', 'info.outcome.by.runs'] 

In [54]:
dataframe_1 = dataframe_1.drop(columns=columns_to_drop)

In [55]:
dataframe_1

Unnamed: 0,innings,info.balls_per_over,info.city,info.dates,info.gender,info.match_type,info.outcome.winner,info.overs,info.player_of_match,info.teams,info.toss.decision,info.toss.winner,info.umpires,info.venue,match_id
0,"[{'1st innings': {'team': 'Australia', 'delive...",6,Brisbane,[2017-01-13],male,ODI,Australia,50,[MS Wade],"[Australia, Pakistan]",bat,Australia,"[MD Martell, C Shamshuddin]","Brisbane Cricket Ground, Woolloongabba",1
1,"[{'1st innings': {'team': 'Australia', 'delive...",6,,[2017-01-15],male,ODI,Pakistan,50,[Mohammad Hafeez],"[Australia, Pakistan]",bat,Australia,"[CB Gaffaney, P Wilson]",Melbourne Cricket Ground,2
2,"[{'1st innings': {'team': 'Pakistan', 'deliver...",6,Perth,[2017-01-19],male,ODI,Australia,50,[SPD Smith],"[Australia, Pakistan]",field,Australia,"[SD Fry, C Shamshuddin]",Western Australia Cricket Association Ground,3
3,"[{'1st innings': {'team': 'Australia', 'delive...",6,,[2017-01-22],male,ODI,Australia,50,[DA Warner],"[Australia, Pakistan]",bat,Australia,"[CB Gaffaney, MD Martell]",Sydney Cricket Ground,4
4,"[{'1st innings': {'team': 'Australia', 'delive...",6,,[2017-01-26],male,ODI,Australia,50,[DA Warner],"[Australia, Pakistan]",bat,Australia,"[SD Fry, C Shamshuddin]",Adelaide Oval,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2817,"[{'1st innings': {'team': 'Afghanistan', 'deli...",6,Belfast,[2016-07-12],male,ODI,Afghanistan,50,,"[Ireland, Afghanistan]",field,Ireland,"[AJ Neill, IN Ramage]","Civil Service Cricket Club, Stormont",2818
2818,"[{'1st innings': {'team': 'Afghanistan', 'deli...",6,Belfast,[2016-07-17],male,ODI,Afghanistan,50,,"[Ireland, Afghanistan]",bat,Afghanistan,"[AJ Neill, C Shamshuddin]","Civil Service Cricket Club, Stormont",2819
2819,"[{'1st innings': {'team': 'Ireland', 'deliveri...",6,Belfast,[2016-07-19],male,ODI,Ireland,50,,"[Ireland, Afghanistan]",field,Afghanistan,"[R Black, C Shamshuddin]","Civil Service Cricket Club, Stormont",2820
2820,"[{'1st innings': {'team': 'Scotland', 'deliver...",6,Edinburgh,[2016-08-14],male,ODI,Scotland,50,,"[Scotland, United Arab Emirates]",field,United Arab Emirates,"[DA Haggo, M Hawthorne]","Grange Cricket Club Ground, Raeburn Place",2821


In [56]:
# Dropping 'info.balls_per_over' column
dataframe_1 = dataframe_1.drop(columns=['info.balls_per_over'])

In [57]:
dataframe_1

Unnamed: 0,innings,info.city,info.dates,info.gender,info.match_type,info.outcome.winner,info.overs,info.player_of_match,info.teams,info.toss.decision,info.toss.winner,info.umpires,info.venue,match_id
0,"[{'1st innings': {'team': 'Australia', 'delive...",Brisbane,[2017-01-13],male,ODI,Australia,50,[MS Wade],"[Australia, Pakistan]",bat,Australia,"[MD Martell, C Shamshuddin]","Brisbane Cricket Ground, Woolloongabba",1
1,"[{'1st innings': {'team': 'Australia', 'delive...",,[2017-01-15],male,ODI,Pakistan,50,[Mohammad Hafeez],"[Australia, Pakistan]",bat,Australia,"[CB Gaffaney, P Wilson]",Melbourne Cricket Ground,2
2,"[{'1st innings': {'team': 'Pakistan', 'deliver...",Perth,[2017-01-19],male,ODI,Australia,50,[SPD Smith],"[Australia, Pakistan]",field,Australia,"[SD Fry, C Shamshuddin]",Western Australia Cricket Association Ground,3
3,"[{'1st innings': {'team': 'Australia', 'delive...",,[2017-01-22],male,ODI,Australia,50,[DA Warner],"[Australia, Pakistan]",bat,Australia,"[CB Gaffaney, MD Martell]",Sydney Cricket Ground,4
4,"[{'1st innings': {'team': 'Australia', 'delive...",,[2017-01-26],male,ODI,Australia,50,[DA Warner],"[Australia, Pakistan]",bat,Australia,"[SD Fry, C Shamshuddin]",Adelaide Oval,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2817,"[{'1st innings': {'team': 'Afghanistan', 'deli...",Belfast,[2016-07-12],male,ODI,Afghanistan,50,,"[Ireland, Afghanistan]",field,Ireland,"[AJ Neill, IN Ramage]","Civil Service Cricket Club, Stormont",2818
2818,"[{'1st innings': {'team': 'Afghanistan', 'deli...",Belfast,[2016-07-17],male,ODI,Afghanistan,50,,"[Ireland, Afghanistan]",bat,Afghanistan,"[AJ Neill, C Shamshuddin]","Civil Service Cricket Club, Stormont",2819
2819,"[{'1st innings': {'team': 'Ireland', 'deliveri...",Belfast,[2016-07-19],male,ODI,Ireland,50,,"[Ireland, Afghanistan]",field,Afghanistan,"[R Black, C Shamshuddin]","Civil Service Cricket Club, Stormont",2820
2820,"[{'1st innings': {'team': 'Scotland', 'deliver...",Edinburgh,[2016-08-14],male,ODI,Scotland,50,,"[Scotland, United Arab Emirates]",field,United Arab Emirates,"[DA Haggo, M Hawthorne]","Grange Cricket Club Ground, Raeburn Place",2821


In [59]:
# We need only 'male' data, so checking 'gender' column
dataframe_1['info.gender'].value_counts()

info.gender
male      2422
female     400
Name: count, dtype: int64

In [60]:
# Dropping 'info.gender' column after extracting only 'male'
dataframe_1 = dataframe_1[dataframe_1['info.gender'] == 'male']
dataframe_1.drop(columns = ['info.gender'],inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe_1.drop(columns = ['info.gender'],inplace = True)


In [61]:
dataframe_1

Unnamed: 0,innings,info.city,info.dates,info.match_type,info.outcome.winner,info.overs,info.player_of_match,info.teams,info.toss.decision,info.toss.winner,info.umpires,info.venue,match_id
0,"[{'1st innings': {'team': 'Australia', 'delive...",Brisbane,[2017-01-13],ODI,Australia,50,[MS Wade],"[Australia, Pakistan]",bat,Australia,"[MD Martell, C Shamshuddin]","Brisbane Cricket Ground, Woolloongabba",1
1,"[{'1st innings': {'team': 'Australia', 'delive...",,[2017-01-15],ODI,Pakistan,50,[Mohammad Hafeez],"[Australia, Pakistan]",bat,Australia,"[CB Gaffaney, P Wilson]",Melbourne Cricket Ground,2
2,"[{'1st innings': {'team': 'Pakistan', 'deliver...",Perth,[2017-01-19],ODI,Australia,50,[SPD Smith],"[Australia, Pakistan]",field,Australia,"[SD Fry, C Shamshuddin]",Western Australia Cricket Association Ground,3
3,"[{'1st innings': {'team': 'Australia', 'delive...",,[2017-01-22],ODI,Australia,50,[DA Warner],"[Australia, Pakistan]",bat,Australia,"[CB Gaffaney, MD Martell]",Sydney Cricket Ground,4
4,"[{'1st innings': {'team': 'Australia', 'delive...",,[2017-01-26],ODI,Australia,50,[DA Warner],"[Australia, Pakistan]",bat,Australia,"[SD Fry, C Shamshuddin]",Adelaide Oval,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2817,"[{'1st innings': {'team': 'Afghanistan', 'deli...",Belfast,[2016-07-12],ODI,Afghanistan,50,,"[Ireland, Afghanistan]",field,Ireland,"[AJ Neill, IN Ramage]","Civil Service Cricket Club, Stormont",2818
2818,"[{'1st innings': {'team': 'Afghanistan', 'deli...",Belfast,[2016-07-17],ODI,Afghanistan,50,,"[Ireland, Afghanistan]",bat,Afghanistan,"[AJ Neill, C Shamshuddin]","Civil Service Cricket Club, Stormont",2819
2819,"[{'1st innings': {'team': 'Ireland', 'deliveri...",Belfast,[2016-07-19],ODI,Ireland,50,,"[Ireland, Afghanistan]",field,Afghanistan,"[R Black, C Shamshuddin]","Civil Service Cricket Club, Stormont",2820
2820,"[{'1st innings': {'team': 'Scotland', 'deliver...",Edinburgh,[2016-08-14],ODI,Scotland,50,,"[Scotland, United Arab Emirates]",field,United Arab Emirates,"[DA Haggo, M Hawthorne]","Grange Cricket Club Ground, Raeburn Place",2821


In [63]:
# We need only '50 overs - ODI MAtch' data, so checking 'match_type' column
dataframe_1['info.match_type'].value_counts()

info.match_type
ODI    2422
Name: count, dtype: int64

In [64]:
# Dropping 'info.overs' column after extracting only '50' overs match data
dataframe_1 = dataframe_1[dataframe_1['info.overs'] == 50]
dataframe_1.drop(columns = ['info.overs','info.match_type'],inplace = True)

In [65]:
dataframe_1

Unnamed: 0,innings,info.city,info.dates,info.outcome.winner,info.player_of_match,info.teams,info.toss.decision,info.toss.winner,info.umpires,info.venue,match_id
0,"[{'1st innings': {'team': 'Australia', 'delive...",Brisbane,[2017-01-13],Australia,[MS Wade],"[Australia, Pakistan]",bat,Australia,"[MD Martell, C Shamshuddin]","Brisbane Cricket Ground, Woolloongabba",1
1,"[{'1st innings': {'team': 'Australia', 'delive...",,[2017-01-15],Pakistan,[Mohammad Hafeez],"[Australia, Pakistan]",bat,Australia,"[CB Gaffaney, P Wilson]",Melbourne Cricket Ground,2
2,"[{'1st innings': {'team': 'Pakistan', 'deliver...",Perth,[2017-01-19],Australia,[SPD Smith],"[Australia, Pakistan]",field,Australia,"[SD Fry, C Shamshuddin]",Western Australia Cricket Association Ground,3
3,"[{'1st innings': {'team': 'Australia', 'delive...",,[2017-01-22],Australia,[DA Warner],"[Australia, Pakistan]",bat,Australia,"[CB Gaffaney, MD Martell]",Sydney Cricket Ground,4
4,"[{'1st innings': {'team': 'Australia', 'delive...",,[2017-01-26],Australia,[DA Warner],"[Australia, Pakistan]",bat,Australia,"[SD Fry, C Shamshuddin]",Adelaide Oval,5
...,...,...,...,...,...,...,...,...,...,...,...
2817,"[{'1st innings': {'team': 'Afghanistan', 'deli...",Belfast,[2016-07-12],Afghanistan,,"[Ireland, Afghanistan]",field,Ireland,"[AJ Neill, IN Ramage]","Civil Service Cricket Club, Stormont",2818
2818,"[{'1st innings': {'team': 'Afghanistan', 'deli...",Belfast,[2016-07-17],Afghanistan,,"[Ireland, Afghanistan]",bat,Afghanistan,"[AJ Neill, C Shamshuddin]","Civil Service Cricket Club, Stormont",2819
2819,"[{'1st innings': {'team': 'Ireland', 'deliveri...",Belfast,[2016-07-19],Ireland,,"[Ireland, Afghanistan]",field,Afghanistan,"[R Black, C Shamshuddin]","Civil Service Cricket Club, Stormont",2820
2820,"[{'1st innings': {'team': 'Scotland', 'deliver...",Edinburgh,[2016-08-14],Scotland,,"[Scotland, United Arab Emirates]",field,United Arab Emirates,"[DA Haggo, M Hawthorne]","Grange Cricket Club Ground, Raeburn Place",2821


In [66]:
# Going inside the 'innings' to check the data
dataframe_1['innings']

0       [{'1st innings': {'team': 'Australia', 'delive...
1       [{'1st innings': {'team': 'Australia', 'delive...
2       [{'1st innings': {'team': 'Pakistan', 'deliver...
3       [{'1st innings': {'team': 'Australia', 'delive...
4       [{'1st innings': {'team': 'Australia', 'delive...
                              ...                        
2817    [{'1st innings': {'team': 'Afghanistan', 'deli...
2818    [{'1st innings': {'team': 'Afghanistan', 'deli...
2819    [{'1st innings': {'team': 'Ireland', 'deliveri...
2820    [{'1st innings': {'team': 'Scotland', 'deliver...
2821    [{'1st innings': {'team': 'United Arab Emirate...
Name: innings, Length: 2422, dtype: object

In [75]:
# Checking how many dictionaries are there in this 'innings' directory
len(dataframe_1.iloc[0]['innings'])

2

In [76]:
# Going inside the dictionary
dataframe_1.iloc[0]['innings']

[{'1st innings': {'team': 'Australia',
   'deliveries': [{0.1: {'batsman': 'DA Warner',
      'bowler': 'Mohammad Amir',
      'non_striker': 'TM Head',
      'runs': {'batsman': 0, 'extras': 0, 'total': 0}}},
    {0.2: {'batsman': 'DA Warner',
      'bowler': 'Mohammad Amir',
      'non_striker': 'TM Head',
      'runs': {'batsman': 0, 'extras': 0, 'total': 0}}},
    {0.3: {'batsman': 'DA Warner',
      'bowler': 'Mohammad Amir',
      'non_striker': 'TM Head',
      'runs': {'batsman': 0, 'extras': 0, 'total': 0}}},
    {0.4: {'batsman': 'DA Warner',
      'bowler': 'Mohammad Amir',
      'non_striker': 'TM Head',
      'runs': {'batsman': 0, 'extras': 0, 'total': 0}}},
    {0.5: {'batsman': 'DA Warner',
      'bowler': 'Mohammad Amir',
      'extras': {'wides': 1},
      'non_striker': 'TM Head',
      'runs': {'batsman': 0, 'extras': 1, 'total': 1}}},
    {0.6: {'batsman': 'DA Warner',
      'bowler': 'Mohammad Amir',
      'non_striker': 'TM Head',
      'runs': {'batsman': 0, 'ex

In [69]:
# Going inside the 1st - dictionary : 'innings' and found 'deliveries' - another dictionary :
dataframe_1.iloc[0]['innings'][0]

{'1st innings': {'team': 'Australia',
  'deliveries': [{0.1: {'batsman': 'DA Warner',
     'bowler': 'Mohammad Amir',
     'non_striker': 'TM Head',
     'runs': {'batsman': 0, 'extras': 0, 'total': 0}}},
   {0.2: {'batsman': 'DA Warner',
     'bowler': 'Mohammad Amir',
     'non_striker': 'TM Head',
     'runs': {'batsman': 0, 'extras': 0, 'total': 0}}},
   {0.3: {'batsman': 'DA Warner',
     'bowler': 'Mohammad Amir',
     'non_striker': 'TM Head',
     'runs': {'batsman': 0, 'extras': 0, 'total': 0}}},
   {0.4: {'batsman': 'DA Warner',
     'bowler': 'Mohammad Amir',
     'non_striker': 'TM Head',
     'runs': {'batsman': 0, 'extras': 0, 'total': 0}}},
   {0.5: {'batsman': 'DA Warner',
     'bowler': 'Mohammad Amir',
     'extras': {'wides': 1},
     'non_striker': 'TM Head',
     'runs': {'batsman': 0, 'extras': 1, 'total': 1}}},
   {0.6: {'batsman': 'DA Warner',
     'bowler': 'Mohammad Amir',
     'non_striker': 'TM Head',
     'runs': {'batsman': 0, 'extras': 0, 'total': 0}}},
 

In [77]:
# Going inside 'deliveries' : 
dataframe_1.iloc[0]['innings'][0]['1st innings']['deliveries']

[{0.1: {'batsman': 'DA Warner',
   'bowler': 'Mohammad Amir',
   'non_striker': 'TM Head',
   'runs': {'batsman': 0, 'extras': 0, 'total': 0}}},
 {0.2: {'batsman': 'DA Warner',
   'bowler': 'Mohammad Amir',
   'non_striker': 'TM Head',
   'runs': {'batsman': 0, 'extras': 0, 'total': 0}}},
 {0.3: {'batsman': 'DA Warner',
   'bowler': 'Mohammad Amir',
   'non_striker': 'TM Head',
   'runs': {'batsman': 0, 'extras': 0, 'total': 0}}},
 {0.4: {'batsman': 'DA Warner',
   'bowler': 'Mohammad Amir',
   'non_striker': 'TM Head',
   'runs': {'batsman': 0, 'extras': 0, 'total': 0}}},
 {0.5: {'batsman': 'DA Warner',
   'bowler': 'Mohammad Amir',
   'extras': {'wides': 1},
   'non_striker': 'TM Head',
   'runs': {'batsman': 0, 'extras': 1, 'total': 1}}},
 {0.6: {'batsman': 'DA Warner',
   'bowler': 'Mohammad Amir',
   'non_striker': 'TM Head',
   'runs': {'batsman': 0, 'extras': 0, 'total': 0}}},
 {0.7: {'batsman': 'DA Warner',
   'bowler': 'Mohammad Amir',
   'non_striker': 'TM Head',
   'runs': {

In [88]:
count = 1
dataframe_2 = pd.DataFrame()
dfs_to_concatenate = []

for index, row in dataframe_1.iterrows():
    ball_of_match = []
    batsman = []
    bowler = []
    runs = []
    player_of_dismissed = []
    teams = []
    batting_team = []
    match_id = []
    city = []
    venue = []
    toss_winner = []
    toss_decision_from_winner = []
    
    for ball in row['innings'][0]['1st innings']['deliveries']:
        for key in ball.keys():
            match_id.append(count)
            batting_team.append(row['innings'][0]['1st innings']['team'])
            teams.append(row['info.teams'])
            ball_of_match.append(key)
            batsman.append(ball[key]['batsman'])
            bowler.append(ball[key]['bowler'])
            runs.append(ball[key]['runs']['total'])
            city.append(row['info.city'])
            venue.append(row['info.venue'])
            toss_winner.append(row['info.toss.winner'])
            toss_decision_from_winner.append(row['info.toss.decision'])
            
            try:
                player_of_dismissed.append(ball[key]['wicket']['player_out'])
            except:
                player_of_dismissed.append('0')
    
    loop_df = pd.DataFrame({
        'match_id': match_id,
        'teams': teams,
        'batting_team': batting_team,
        'ball': ball_of_match,
        'batsman': batsman,
        'bowler': bowler,
        'runs': runs,
        'player_dismissed': player_of_dismissed,
        'city': city,
        'venue': venue,
        'toss_winner': toss_winner,
        'toss_decision_from_winner': toss_decision_from_winner
    })
    
    dfs_to_concatenate.append(loop_df)
    count += 1

# Concatenate all DataFrames in the list
dataframe_2 = pd.concat(dfs_to_concatenate, ignore_index=True)

In [89]:
dataframe_2

Unnamed: 0,match_id,teams,batting_team,ball,batsman,bowler,runs,player_dismissed,city,venue,toss_winner,toss_decision_from_winner
0,1,"[Australia, Pakistan]",Australia,0.1,DA Warner,Mohammad Amir,0,0,Brisbane,"Brisbane Cricket Ground, Woolloongabba",Australia,bat
1,1,"[Australia, Pakistan]",Australia,0.2,DA Warner,Mohammad Amir,0,0,Brisbane,"Brisbane Cricket Ground, Woolloongabba",Australia,bat
2,1,"[Australia, Pakistan]",Australia,0.3,DA Warner,Mohammad Amir,0,0,Brisbane,"Brisbane Cricket Ground, Woolloongabba",Australia,bat
3,1,"[Australia, Pakistan]",Australia,0.4,DA Warner,Mohammad Amir,0,0,Brisbane,"Brisbane Cricket Ground, Woolloongabba",Australia,bat
4,1,"[Australia, Pakistan]",Australia,0.5,DA Warner,Mohammad Amir,1,0,Brisbane,"Brisbane Cricket Ground, Woolloongabba",Australia,bat
...,...,...,...,...,...,...,...,...,...,...,...,...
700171,2422,"[Scotland, United Arab Emirates]",United Arab Emirates,44.6,Ahmed Raza,CB Sole,0,0,Edinburgh,"Grange Cricket Club Ground, Raeburn Place",United Arab Emirates,bat
700172,2422,"[Scotland, United Arab Emirates]",United Arab Emirates,45.1,Fayyaz Ahmed,AC Evans,1,0,Edinburgh,"Grange Cricket Club Ground, Raeburn Place",United Arab Emirates,bat
700173,2422,"[Scotland, United Arab Emirates]",United Arab Emirates,45.2,Ahmed Raza,AC Evans,0,0,Edinburgh,"Grange Cricket Club Ground, Raeburn Place",United Arab Emirates,bat
700174,2422,"[Scotland, United Arab Emirates]",United Arab Emirates,45.3,Ahmed Raza,AC Evans,0,0,Edinburgh,"Grange Cricket Club Ground, Raeburn Place",United Arab Emirates,bat


In [90]:
# Extracting 'bowling_team', from the 'teams' column, as the 'batting_team' is defined already, it's easier to extract the 'bowling_team:
def bowling_team(row):
    for team in row['teams']:
        if team != row['batting_team']:
            return team

In [91]:
# Applying the 'bowling_team' function to each row of the DataFrame
dataframe_2['bowling_team'] = dataframe_2.apply(bowling_team,axis=1)

In [92]:
dataframe_2

Unnamed: 0,match_id,teams,batting_team,ball,batsman,bowler,runs,player_dismissed,city,venue,toss_winner,toss_decision_from_winner,bowling_team
0,1,"[Australia, Pakistan]",Australia,0.1,DA Warner,Mohammad Amir,0,0,Brisbane,"Brisbane Cricket Ground, Woolloongabba",Australia,bat,Pakistan
1,1,"[Australia, Pakistan]",Australia,0.2,DA Warner,Mohammad Amir,0,0,Brisbane,"Brisbane Cricket Ground, Woolloongabba",Australia,bat,Pakistan
2,1,"[Australia, Pakistan]",Australia,0.3,DA Warner,Mohammad Amir,0,0,Brisbane,"Brisbane Cricket Ground, Woolloongabba",Australia,bat,Pakistan
3,1,"[Australia, Pakistan]",Australia,0.4,DA Warner,Mohammad Amir,0,0,Brisbane,"Brisbane Cricket Ground, Woolloongabba",Australia,bat,Pakistan
4,1,"[Australia, Pakistan]",Australia,0.5,DA Warner,Mohammad Amir,1,0,Brisbane,"Brisbane Cricket Ground, Woolloongabba",Australia,bat,Pakistan
...,...,...,...,...,...,...,...,...,...,...,...,...,...
700171,2422,"[Scotland, United Arab Emirates]",United Arab Emirates,44.6,Ahmed Raza,CB Sole,0,0,Edinburgh,"Grange Cricket Club Ground, Raeburn Place",United Arab Emirates,bat,Scotland
700172,2422,"[Scotland, United Arab Emirates]",United Arab Emirates,45.1,Fayyaz Ahmed,AC Evans,1,0,Edinburgh,"Grange Cricket Club Ground, Raeburn Place",United Arab Emirates,bat,Scotland
700173,2422,"[Scotland, United Arab Emirates]",United Arab Emirates,45.2,Ahmed Raza,AC Evans,0,0,Edinburgh,"Grange Cricket Club Ground, Raeburn Place",United Arab Emirates,bat,Scotland
700174,2422,"[Scotland, United Arab Emirates]",United Arab Emirates,45.3,Ahmed Raza,AC Evans,0,0,Edinburgh,"Grange Cricket Club Ground, Raeburn Place",United Arab Emirates,bat,Scotland


In [93]:
# Dropping the 'teams' column from the DataFrame
dataframe_2.drop(columns=['teams'], inplace=True)

# Displaying the updated DataFrame
dataframe_2

Unnamed: 0,match_id,batting_team,ball,batsman,bowler,runs,player_dismissed,city,venue,toss_winner,toss_decision_from_winner,bowling_team
0,1,Australia,0.1,DA Warner,Mohammad Amir,0,0,Brisbane,"Brisbane Cricket Ground, Woolloongabba",Australia,bat,Pakistan
1,1,Australia,0.2,DA Warner,Mohammad Amir,0,0,Brisbane,"Brisbane Cricket Ground, Woolloongabba",Australia,bat,Pakistan
2,1,Australia,0.3,DA Warner,Mohammad Amir,0,0,Brisbane,"Brisbane Cricket Ground, Woolloongabba",Australia,bat,Pakistan
3,1,Australia,0.4,DA Warner,Mohammad Amir,0,0,Brisbane,"Brisbane Cricket Ground, Woolloongabba",Australia,bat,Pakistan
4,1,Australia,0.5,DA Warner,Mohammad Amir,1,0,Brisbane,"Brisbane Cricket Ground, Woolloongabba",Australia,bat,Pakistan
...,...,...,...,...,...,...,...,...,...,...,...,...
700171,2422,United Arab Emirates,44.6,Ahmed Raza,CB Sole,0,0,Edinburgh,"Grange Cricket Club Ground, Raeburn Place",United Arab Emirates,bat,Scotland
700172,2422,United Arab Emirates,45.1,Fayyaz Ahmed,AC Evans,1,0,Edinburgh,"Grange Cricket Club Ground, Raeburn Place",United Arab Emirates,bat,Scotland
700173,2422,United Arab Emirates,45.2,Ahmed Raza,AC Evans,0,0,Edinburgh,"Grange Cricket Club Ground, Raeburn Place",United Arab Emirates,bat,Scotland
700174,2422,United Arab Emirates,45.3,Ahmed Raza,AC Evans,0,0,Edinburgh,"Grange Cricket Club Ground, Raeburn Place",United Arab Emirates,bat,Scotland


In [94]:
# Retrieving the unique values in the 'batting_team' column of DataFrame dataframe_2
dataframe_2['batting_team'].unique()

array(['Australia', 'Pakistan', 'Afghanistan', 'Scotland', 'Zimbabwe',
       'New Zealand', 'Bangladesh', 'South Africa', 'India', 'England',
       'Sri Lanka', 'Hong Kong', 'Papua New Guinea', 'West Indies',
       'Ireland', 'United Arab Emirates', 'Nepal',
       'United States of America', 'Namibia', 'Oman', 'Netherlands',
       'Canada', 'Jersey', 'Africa XI', 'Kenya', 'Bermuda', 'Asia XI'],
      dtype=object)

In [95]:
# Number of counties
len(dataframe_2['batting_team'].unique())

27

In [97]:
# List of cricket teams
teams = [
    'Australia',
    'Pakistan',
    'Afghanistan',
    'Scotland',
    'Zimbabwe',
    'New Zealand',
    'Bangladesh',
    'South Africa',
    'India',
    'England',
    'Sri Lanka',
    'West Indies',
    'Ireland',
    'Netherlands',
    'Kenya'
]

In [98]:
len(teams)

15

In [99]:
# Filtering DataFrame to include only rows where 'batting_team' is in the provided list of teams
dataframe_2 = dataframe_2[dataframe_2['batting_team'].isin(teams)]

# Further filtering DataFrame to include only rows where 'bowling_team' is in the provided list of teams
dataframe_2 = dataframe_2[dataframe_2['bowling_team'].isin(teams)]

In [100]:
dataframe_2

Unnamed: 0,match_id,batting_team,ball,batsman,bowler,runs,player_dismissed,city,venue,toss_winner,toss_decision_from_winner,bowling_team
0,1,Australia,0.1,DA Warner,Mohammad Amir,0,0,Brisbane,"Brisbane Cricket Ground, Woolloongabba",Australia,bat,Pakistan
1,1,Australia,0.2,DA Warner,Mohammad Amir,0,0,Brisbane,"Brisbane Cricket Ground, Woolloongabba",Australia,bat,Pakistan
2,1,Australia,0.3,DA Warner,Mohammad Amir,0,0,Brisbane,"Brisbane Cricket Ground, Woolloongabba",Australia,bat,Pakistan
3,1,Australia,0.4,DA Warner,Mohammad Amir,0,0,Brisbane,"Brisbane Cricket Ground, Woolloongabba",Australia,bat,Pakistan
4,1,Australia,0.5,DA Warner,Mohammad Amir,1,0,Brisbane,"Brisbane Cricket Ground, Woolloongabba",Australia,bat,Pakistan
...,...,...,...,...,...,...,...,...,...,...,...,...
699581,2420,Ireland,49.2,EC Joyce,Rashid Khan,4,0,Belfast,"Civil Service Cricket Club, Stormont",Afghanistan,field,Afghanistan
699582,2420,Ireland,49.3,EC Joyce,Rashid Khan,4,0,Belfast,"Civil Service Cricket Club, Stormont",Afghanistan,field,Afghanistan
699583,2420,Ireland,49.4,EC Joyce,Rashid Khan,1,0,Belfast,"Civil Service Cricket Club, Stormont",Afghanistan,field,Afghanistan
699584,2420,Ireland,49.5,SW Poynter,Rashid Khan,1,0,Belfast,"Civil Service Cricket Club, Stormont",Afghanistan,field,Afghanistan


In [103]:
# Taking the features only, which are needed for the prediction
features_needed = dataframe_2[['match_id','batting_team','bowling_team','toss_winner','toss_decision_from_winner','ball','runs','player_dismissed','city','venue']]

In [105]:
features_needed

Unnamed: 0,match_id,batting_team,bowling_team,toss_winner,toss_decision_from_winner,ball,runs,player_dismissed,city,venue
0,1,Australia,Pakistan,Australia,bat,0.1,0,0,Brisbane,"Brisbane Cricket Ground, Woolloongabba"
1,1,Australia,Pakistan,Australia,bat,0.2,0,0,Brisbane,"Brisbane Cricket Ground, Woolloongabba"
2,1,Australia,Pakistan,Australia,bat,0.3,0,0,Brisbane,"Brisbane Cricket Ground, Woolloongabba"
3,1,Australia,Pakistan,Australia,bat,0.4,0,0,Brisbane,"Brisbane Cricket Ground, Woolloongabba"
4,1,Australia,Pakistan,Australia,bat,0.5,1,0,Brisbane,"Brisbane Cricket Ground, Woolloongabba"
...,...,...,...,...,...,...,...,...,...,...
699581,2420,Ireland,Afghanistan,Afghanistan,field,49.2,4,0,Belfast,"Civil Service Cricket Club, Stormont"
699582,2420,Ireland,Afghanistan,Afghanistan,field,49.3,4,0,Belfast,"Civil Service Cricket Club, Stormont"
699583,2420,Ireland,Afghanistan,Afghanistan,field,49.4,1,0,Belfast,"Civil Service Cricket Club, Stormont"
699584,2420,Ireland,Afghanistan,Afghanistan,field,49.5,1,0,Belfast,"Civil Service Cricket Club, Stormont"


In [108]:
import pickle

# Serialize and save the 'features_needed' DataFrame to a binary file named 'odi_data_extract.pkl'
pickle.dump(features_needed, open('odi_data_extract.pkl', 'wb'))