# Data Initialisation
This document is concerned with initialising the data to be used throughout the remainder of the project. This section must be completed thoroughly and accurately to ensure optimal outcomes in later stages of this research topic.

In [1]:
# Path to library module.
# NOTE(review): `lib.constants` is imported *before* LIBRARY_PATH is appended to
# sys.path, so the `lib` package must already be importable from the notebook's
# working directory; the append only affects imports that run after it.
from lib.constants import LIBRARY_PATH
import sys
sys.path.append(LIBRARY_PATH)

# Import data cleaning function
from lib.clean_raw_data import clean_raw_data

# Clean the raw match and delivery data files
# (presumably the same steps that the walk-through below reproduces by hand;
# confirm against lib/clean_raw_data.py).
clean_raw_data()

  clean_raw_data()
  clean_raw_data()
  clean_raw_data()


## Walk-Through of Cleaning Process
In this section, the entire cleaning process performed above will be outlined.

In [5]:
# Perform necessary imports.
import pandas as pd
from lib.constants import *

### Basic Clean Match Data
Here, we will perform a basic clean on the match data. This includes performing the following steps:

1. Removing unnecessary columns from match data.
2. Removing female formats.
3. Removing disability teams.
4. Removing international games not involving Australia.
5. Removing uncommon match formats.
6. Removing international games that are not of 1 Day format.

This process is outlined below.

In [6]:
# Read the raw, tab-separated match table from the data directory.
match_data = pd.read_csv(
    DATA_PATH + "/Matches.txt",
    sep="\t",
)

In [7]:
# Drop every column whose name matches the 'Official+' pattern, then show
# the column names that remain.
match_data = match_data.drop(columns=match_data.filter(regex='Official+').columns)
match_data.columns

Index(['Match Id', 'Season Id', 'Season', 'Series Id', 'Series',
       'Series Gender Id', 'Series Gender', 'Match Date', 'Match YYMMDD',
       'Match Type Id', 'Match Type', 'Ball Type Id', 'Ball Type', 'TeamA Id',
       'TeamA', 'TeamA At Home', 'TeamB Id', 'TeamB', 'TeamB At Home',
       'Day/Night', 'Venue Id', 'Venue', 'Toss Won By Id', 'Toss Decision Id',
       'TeamA Innings1 Closure', 'TeamA Innings2 Closure',
       'TeamB Innings1 Closure', 'TeamB Innings2 Closure',
       'TeamA 1st Comparison', 'TeamA Result Id', 'TeamA Result',
       'TeamBattingIdMatchInnings1', 'TeamBattingMatchInnings1',
       'TeamBattingIdMatchInnings2', 'TeamBattingMatchInnings2',
       'TeamBattingIdMatchInnings3', 'TeamBattingMatchInnings3',
       'TeamBattingIdMatchInnings4', 'TeamBattingMatchInnings4',
       'TeamB Result Id', 'TeamB Result', 'TeamA Coach Id',
       'TeamA Coach Surname', 'TeamA Coach Other Names', 'TeamB Coach Id',
       'TeamB Coach Surname', 'TeamB Coach Other Name

2. Removing female formats.

In [8]:
# Keep only rows whose series gender ID is 1; the unique() call below shows
# which gender label(s) that leaves in the data.
is_kept_gender = match_data["Series Gender Id"].eq(1)
match_data = match_data[is_kept_gender]
match_data["Series Gender"].unique()

array(['Male'], dtype=object)

3. Removing disability teams.

In [9]:
# Remove every match in which either side is a disability team.
# A row is kept only when NEITHER TeamA nor TeamB contains "Disability".
# Combining the two negated masks with `|` would keep a row as soon as one
# side is a non-disability team, leaving mixed fixtures in the data.
team_a_is_disability = match_data.TeamA.str.contains("Disability")
team_b_is_disability = match_data.TeamB.str.contains("Disability")
match_data = match_data[~(team_a_is_disability | team_b_is_disability)]

# Show the remaining TeamA values as a sanity check.
match_data["TeamA"].unique()

array(['Australia (M)', 'West Indies (M)', 'Pakistan (M)',
       'Sri Lanka (M)', 'South Africa (M)', 'England (M)',
       'New Zealand (M)', 'India (M)', 'SA (M)', 'Tas (M)', 'NSW (M)',
       'Victoria (M)', 'WA (M)', 'Qld (M)', 'Australia A (M)',
       'Zimbabwe (M)', 'Kenya (M)', 'Scotland (M)', 'Bangladesh (M)',
       'Ireland (M)', 'Sydney Sixers (M)', 'Melbourne Stars (M)',
       'Adelaide Strikers (M)', 'Perth Scorchers  (M)',
       'Brisbane Heat (M)', 'Hobart Hurricanes (M)',
       'Melbourne Renegades (M)', 'Sydney Thunder (M)', 'Canada (M)',
       'Gloucestershire (M)', 'Afghanistan (M)', 'India A (M)',
       'South Africa A (M)', 'CA XI (M)', 'India B (M)',
       'England Lions (M)'], dtype=object)

4. Remove international games where Australia is not playing.

In [10]:
def _foreign_only_international(matches):
    """Return a boolean mask of international matches with no Australian side.

    A row is flagged when its Series contains "International" and neither
    TeamA nor TeamB contains "Australia".
    """
    series_is_international = matches["Series"].str.contains("International")
    team_a_not_australia = ~matches["TeamA"].str.contains("Australia")
    team_b_not_australia = ~matches["TeamB"].str.contains("Australia")
    return series_is_international & team_a_not_australia & team_b_not_australia


# Drop the flagged matches, then count how many flagged rows are left
# (recomputed on the filtered frame; expected to be zero).
match_data = match_data[~_foreign_only_international(match_data)]
len(match_data[_foreign_only_international(match_data)][["TeamA", "TeamB"]])

0

5. Remove uncommon match formats.

In [11]:
# Keep only the match type IDs used for the T20, 1 Day, 4 Day and 5 Day
# formats, then list the match type labels that remain.
kept_match_type_ids = [1, 4, 5, 7]
match_data = match_data.loc[match_data["Match Type Id"].isin(kept_match_type_ids)]
match_data["Match Type"].unique()

array(['5 Day', '1 Day', '4 Day', 'Twenty20'], dtype=object)

6. Remove international matches that are not 1 Day format.

In [12]:
# Remove international games that are not ODI; keep every domestic game.
# (Match Type Id 1 is taken to be the 1 Day format, as the original comment
# and the surrounding walk-through describe.)
#
# Each condition is built separately because `&` binds more tightly than
# `==`: writing `col == 1 & mask` is parsed as `col == (1 & mask)`, which
# compares the ID column against a boolean mask instead of AND-ing two masks.
is_one_day = match_data["Match Type Id"] == 1
is_international = match_data.Series.str.contains("International")
is_domestic = match_data.Series.str.contains("Domestic")
match_data = match_data[(is_one_day & is_international) | is_domestic]

# Show the series that remain.
match_data.Series.unique()

array(['International ODI M', 'Domestic 1st Class M', 'Domestic List A M',
       'Domestic T20 M', 'International ICC Trophy M',
       'International ODI World Cup M', 'International 1st Class M'],
      dtype=object)

We will now write this cleaned data to file to be used later.

In [13]:
# Save the cleaned match table as a tab-separated file without the index.
matches_clean_path = DATA_PATH + "/Matches_Clean.txt"
match_data.to_csv(matches_clean_path, sep="\t", index=False)

### Basic Clean Deliveries Data
Here, we will perform a basic clean on the deliveries data. This includes performing the following steps:

1. Load delivery data only containing relevant matches.
2. Remove deliveries to foreign teams.
3. Remove deliveries to domestic players that have not played at the international level.
4. Remove deliveries to players that have played fewer than a threshold number of international One-Day games.

These steps are outlined below.

1. Load delivery data only containing relevant matches.

In [10]:
# Determine which matches are important for delivery data.
match_ids = match_data["Match Id"]

# Determine duplicate columns between match and delivery data that should be dropped.
# "Match Id" is kept because it is the key linking deliveries to matches.
match_columns = set(match_data.columns)
match_columns.remove("Match Id")

# Load the delivery data in chunks, keeping only deliveries from relevant matches.
# Filtered chunks are collected in a list and concatenated once at the end,
# rather than re-copying the accumulated frame on every iteration.
filtered_chunks = []

for chunk in pd.read_csv(DATA_PATH + "/Deliveries.txt", delimiter="\t", chunksize=10**6):
  chunk = chunk[chunk["Match Id"].isin(match_ids)]
  # Drop the columns via a new frame instead of `inplace=True`: the filtered
  # chunk is a slice of the chunk read from disk, and mutating it in place
  # raises pandas' SettingWithCopyWarning.
  chunk = chunk.drop(columns=[col for col in chunk.columns if col in match_columns])
  filtered_chunks.append(chunk)

# Combine filtered deliveries into single dataframe.
# pd.concat raises on an empty list, so fall back to an empty frame.
delivery_data = pd.concat(filtered_chunks) if filtered_chunks else pd.DataFrame()

  if (await self.run_code(code, result,  async_=asy)):
  if (await self.run_code(code, result,  async_=asy)):
  if (await self.run_code(code, result,  async_=asy)):


Filtering the delivery data chunk by chunk is necessary because the raw dataset is too large to load into memory at once.

2. Remove deliveries to foreign teams.

In [11]:
# Collect the IDs of all matches whose series is international.
series_is_international = match_data["Series"].str.contains("International")
odi_IDs = match_data.loc[series_is_international, "Match Id"].tolist()

# Drop deliveries from those matches unless the batting team is Australian.
in_international_match = delivery_data["Match Id"].isin(odi_IDs)
batting_team_is_australia = delivery_data["Team Batting"].str.contains("Australia")
delivery_data = delivery_data[~(in_international_match & ~batting_team_is_australia)]

# List the batting teams that are left.
delivery_data["Team Batting"].unique()

array(['Australia (M)', 'SA (M)', 'Victoria (M)', 'NSW (M)', 'Tas (M)',
       'WA (M)', 'Qld (M)', 'Australia A (M)', 'Brisbane Heat (M)',
       'Sydney Sixers (M)', 'Melbourne Stars (M)', 'Sydney Thunder (M)',
       'Adelaide Strikers (M)', 'Melbourne Renegades (M)',
       'Hobart Hurricanes (M)', 'Perth Scorchers  (M)', 'CA XI (M)'],
      dtype=object)

3. Remove deliveries to domestic players that have not played at international level.

In [12]:
# Deliveries from the international matches listed in odi_IDs.
australian_odi_deliveries = delivery_data.loc[delivery_data["Match Id"].isin(odi_IDs)]

# Every striker who faced at least one of those deliveries.
australian_odi_batters = australian_odi_deliveries["Striker Id"].unique()

# Keep only deliveries faced by those strikers; batters who appear solely in
# other matches are dropped.
faced_by_odi_batter = delivery_data["Striker Id"].isin(australian_odi_batters)
delivery_data = delivery_data.loc[faced_by_odi_batter]

# Show the batters that remain.
delivery_data["Striker"].unique()

array(['Waugh, Mark', 'Gilchrist, Adam', 'Ponting, Ricky',
       'Bevan, Michael', 'Waugh, Steve', 'Martyn, Damien',
       'Symonds, Andrew', 'Lee, Shane', 'Fleming, Damien',
       'MacGill, Stuart', 'Lee, Brett', 'McGrath, Glen', 'Dale, Adam',
       'Warne, Shane', 'Harvey, Ian', 'Blewett, Greg', 'Hayden, Matthew',
       'Katich, Simon', 'Slater, Michael', 'Hodge, Brad',
       'Lehmann, Darren', 'Haddin, Brad', 'Maher, Jimmy', 'Watson, Shane',
       'Gillespie, Jason', 'Williams, Brad', 'Bichel, Andrew',
       'Campbell, Ryan', 'Hauritz, Nathan', 'Harris, Ryan',
       'White, Cameron', 'Lewis, Michael', 'Manou, Graham',
       'Clarke, Michael', 'Wright, Damian', 'Tait, Shaun',
       'Hussey, Michael', 'Voges, Adam', 'Ronchi, Luke',
       'Doherty, Xavier', 'Cosgrove, Mark', 'Geeves, Brett',
       'Jaques, Philp', 'Clark, Stuart', 'Thornely, Dominic',
       'Bracken, Nathan', 'Bailey, George', 'North, Marcus',
       'Hussey, David', 'Harwood, Shane', 'Ferguson, Callum',


4. Remove deliveries to players that have played fewer than a threshold number of international One-Day games.

In [13]:
# Split the remaining matches into international and domestic match IDs.
series_names = match_data["Series"]
international_matches = match_data.loc[series_names.str.contains("International"), "Match Id"].tolist()
domestic_matches = match_data.loc[series_names.str.contains("Domestic"), "Match Id"].tolist()

# Partition the deliveries the same way.
international_deliveries = delivery_data.loc[delivery_data["Match Id"].isin(international_matches)]
domestic_deliveries = delivery_data.loc[delivery_data["Match Id"].isin(domestic_matches)]

# Grouping key and aggregation: number of distinct matches per striker.
by_columns = ["Striker Id"]
aggregates = {"Match Id": pd.Series.nunique}

# Distinct international / domestic matches in which each striker faced a delivery.
# Both counts are taken before any threshold filtering is applied.
international_groupby_data = international_deliveries.groupby(by_columns).agg(aggregates)
domestic_groupby_data = domestic_deliveries.groupby(by_columns).agg(aggregates)

# Keep strikers who batted in at least 10 distinct international matches.
min_matches = 10
has_enough_international = international_groupby_data["Match Id"] >= min_matches
valid_batters = international_groupby_data.loc[has_enough_international].index
delivery_data = delivery_data.loc[delivery_data["Striker Id"].isin(valid_batters)]

# Of those, keep strikers who also batted in at least 10 distinct domestic matches.
has_enough_domestic = domestic_groupby_data["Match Id"] >= min_matches
valid_batters = domestic_groupby_data.loc[has_enough_domestic].index
delivery_data = delivery_data.loc[delivery_data["Striker Id"].isin(valid_batters)]

# Show the batters that remain.
delivery_data["Striker"].unique()


array(['Waugh, Mark', 'Ponting, Ricky', 'Bevan, Michael', 'Waugh, Steve',
       'Symonds, Andrew', 'Lee, Brett', 'Warne, Shane', 'Harvey, Ian',
       'Hayden, Matthew', 'Katich, Simon', 'Hodge, Brad',
       'Lehmann, Darren', 'Haddin, Brad', 'Maher, Jimmy', 'Watson, Shane',
       'Gillespie, Jason', 'Bichel, Andrew', 'Hauritz, Nathan',
       'White, Cameron', 'Clarke, Michael', 'Hussey, Michael',
       'Voges, Adam', 'Doherty, Xavier', 'Clark, Stuart',
       'Bracken, Nathan', 'Bailey, George', 'Hussey, David',
       'Ferguson, Callum', 'Marsh, Shaun', 'Hogg, Brad', 'Paine, Tim',
       'Wade, Matthew', 'Forrest, Peter', 'Hopes, James',
       'McKay, Clinton', 'Hughes, Phil', 'Christian, Daniel',
       'Khawaja, Usman', 'Finch, Aaron', 'Hastings, John',
       'Henriques, Moises', 'Smith, Steven', 'Warner, David',
       'Faulkner, James', 'Stoinis, Marcus', 'Richardson, Kane',
       'Marsh, Mitchell', 'Starc, Mitchell', 'Hazlewood, Josh',
       'Coulter-Nile, Nathan', 'Max

### Further Cleaning of Entire Dataset
This section tidies the already cleaned data to the final workable state. This includes the following steps:

1. Remove matches that contain no delivery data.

These steps are outlined below.

1. Remove matches that contain no delivery data.

In [14]:
# Distinct match IDs present in the match table and in the delivery table.
match_ids = match_data["Match Id"].unique().tolist()
delivery_ids = delivery_data["Match Id"].unique().tolist()

# Matches that have no delivery rows at all.
empty_matches = set(match_ids).difference(delivery_ids)

# Keep only matches that have at least one delivery.
has_no_deliveries = match_data["Match Id"].isin(empty_matches)
match_data = match_data.loc[~has_no_deliveries]

### Writing Data to File

Finally, we will write both datasets back to file in their final cleaned states.

In [15]:
# Persist the final match and delivery tables as tab-separated files
# (no index column), matches first and deliveries second.
for frame, file_name in (
    (match_data, "/Matches_Clean.txt"),
    (delivery_data, "/Deliveries_Clean.txt"),
):
    frame.to_csv(DATA_PATH + file_name, sep="\t", index=False)