In [10]:
import pandas as pd
import time
from cricsheet.fow_analysis.collapses.extract_collapses import return_collapses
from cricsheet.utils import fuzzy_merge

## Load and Format Single Match

In [72]:
# Load the fall-of-wickets data for a single match file.
filepath = '../data/raw/csv/howstat/fall_of_wickets/'
file = 'fow_1999.csv'
df = pd.read_csv(
    f'{filepath}{file}',
    index_col=0,                 # first CSV column becomes the index
    parse_dates=[2],             # CSV column at position 2 is parsed as dates
    infer_datetime_format=True,
)

In [73]:
# Apply return_collapses to each (MatchId, MatchInnings, Team) group and
# flatten the resulting group index back into ordinary columns.
(
    df
    .groupby(['MatchId', 'MatchInnings', 'Team'])
    .apply(return_collapses)
    .reset_index()
)

Unnamed: 0,MatchId,MatchInnings,Team,level_3,start,end,runs,wickets_lost,batters
0,1999,1,England,0,5.0,7.0,0.0,"[5, 6, 7]","[Cook, Prior, Broad]"
1,1999,2,Australia,0,6.0,9.0,22.0,"[6, 7, 8, 9]","[Haddin, Hussey, Johnson, Siddle]"
2,1999,2,Australia,1,7.0,10.0,23.0,"[7, 8, 9, 10]","[Hussey, Johnson, Siddle, Doherty]"
3,1999,2,Australia,2,1.0,3.0,22.0,"[1, 2, 3]","[Watson, Ponting, Katich]"


Questions to answer:
- Number of collapses by Team, by year (unique collapses, innings with a collapse)
- Positions most often involved
- Batters most often involved

## Load and Format All Matches

### Experiments

~2500 matches. 
1) What is the most efficient way to load all dataframes?

2) Is it more efficient to load and format one-by-one, or to concatenate into a single df and groupby the whole df?


In [5]:
# Try with 100 dataframes initially
# NOTE(review): `n` is not referenced by any loading cell visible in this notebook
# (Method 1 slices `all_files[:]`, i.e. every matched file) — confirm whether the
# limit was meant to be applied.

n = 100
filepath = '../data/raw/csv/howstat/fall_of_wickets/'

#### Method 1: Load all using glob generator + concat

In [50]:
import glob
import os

In [42]:
# Method 1: find every CSV with glob, read them lazily through a generator,
# and concatenate once. The whole sequence is timed with wall-clock time.
start = time.time()

all_files = glob.glob(os.path.join(filepath, "*.csv"))
all_files_to_load = list(all_files)  # shallow copy; every matched file is loaded

df_from_each_file = (
    pd.read_csv(csv_path, index_col=0, parse_dates=[2], infer_datetime_format=True)
    for csv_path in all_files_to_load
)
concatenated_df = pd.concat(df_from_each_file, ignore_index=True)

end = time.time()
time_taken = end - start
print(f'{time_taken} seconds')

6.191442012786865 seconds


In [48]:
"""
Attempts:
13 seconds
6.19 seconds

"""

'\nAttempts:\n13 seconds\n6.19 seconds\n\n'

#### Method 2: os.listdir + concat

In [45]:
# Method 2: list the directory with os.listdir, read each CSV eagerly into a
# list, then concatenate. The whole sequence is timed with wall-clock time.
start = time.time()
df = pd.concat(
    [
        # os.path.join avoids the doubled separator that f'{filepath}/{f}'
        # produced, since filepath already ends with '/'.
        pd.read_csv(os.path.join(filepath, f), index_col=0, parse_dates=[2], infer_datetime_format=True)
        for f in os.listdir(filepath)
        if f.endswith('.csv')
    ]
)

end = time.time()
time_taken = end - start
print(f'{time_taken} seconds')

6.0129618644714355 seconds


In [46]:
"""
Attempts:
6.37 seconds
6.012
"""

'\nAttempts:\n6.37 seconds\n6.012\n'

#### Method 3: Dask

In [58]:
import dask.dataframe as dd

# Method 3: time only the dd.read_csv call over the '*.csv' glob pattern; the
# frame is materialised (and timed) separately in the cell that calls df.compute().
# NOTE(review): unlike the pandas loaders, this call passes neither index_col nor
# parse_dates, so the result is not like-for-like with Methods 1 and 2 (the
# recorded df.info() shows 9 columns starting at 'Unnamed: 0') — confirm.
start = time.time()
df = dd.read_csv(f'{filepath}*.csv')

end = time.time()
time_taken = end - start
print(f'{time_taken} seconds')

2.355700731277466 seconds


In [59]:
# Time the .compute() step of the dask frame on its own.
# NOTE(review): `df` is rebound to the computed result, so this cell presumably
# cannot be re-run without first re-running the dd.read_csv cell — verify.
start = time.time()
df = df.compute()

end = time.time()
time_taken = end - start
print(f'{time_taken} seconds')

30.62992286682129 seconds


In [60]:
"""
Attempts:
33 seconds
"""

'\nAttempts:\n33 seconds\n'

In [56]:
# Summarise the columns and dtypes of `df` as it stood when this cell ran.
df.info()

<class 'dask.dataframe.core.DataFrame'>
Columns: 9 entries, Unnamed: 0 to Player
dtypes: object(4), int64(5)

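I prefer Method 1: it is about as fast as Method 2 (~6 seconds each) and much faster than Dask (~33 seconds once computed).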

### Apply Chosen Method

In [11]:
import glob
import os

# Load every fall-of-wickets CSV and combine them into one frame.
filepath_fow = '../data/raw/csv/howstat/fall_of_wickets/'

# glob.glob returns paths in arbitrary order; sorting makes the row order of the
# concatenated frame reproducible across runs and machines.
all_fow = sorted(glob.glob(os.path.join(filepath_fow, "*.csv")))
all_fow_to_load = all_fow[:]
if not all_fow_to_load:
    # Fail early with a clear message instead of pd.concat's
    # "No objects to concatenate" when the directory is wrong or empty.
    raise FileNotFoundError(f'No CSV files found in {filepath_fow}')

# Generator keeps only one per-file frame alive until pd.concat consumes it.
# NOTE: infer_datetime_format is deprecated as of pandas 2.0 per the read_csv
# docs; it is kept here for behaviour parity on older pandas versions.
df_fow_from_each_file = (
    pd.read_csv(f, index_col=0, parse_dates=[2], infer_datetime_format=True)
    for f in all_fow_to_load
)
concatenated_df_fow = pd.concat(df_fow_from_each_file, ignore_index=True)

In [12]:
# Check row count, dtypes and non-null counts of the combined fall-of-wickets frame.
concatenated_df_fow.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74100 entries, 0 to 74099
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   MatchId       74100 non-null  int64         
 1   MatchDate     74100 non-null  datetime64[ns]
 2   MatchInnings  74100 non-null  int64         
 3   Team          74100 non-null  object        
 4   TeamInnings   74100 non-null  object        
 5   Wicket        74100 non-null  int64         
 6   Runs          74100 non-null  int64         
 7   Player        74100 non-null  object        
dtypes: datetime64[ns](1), int64(4), object(3)
memory usage: 4.5+ MB


In [77]:
#concatenated_df_fow.to_csv('../data/interim/howstat/fall_of_wickets/fow_all.csv')

In [17]:
# Load every scorecard CSV and combine them into one frame.
filepath_scores = '../data/raw/csv/howstat/scorecards/'

# glob.glob returns paths in arbitrary order; sorting makes the row order of the
# concatenated frame reproducible across runs and machines.
all_scores = sorted(glob.glob(os.path.join(filepath_scores, "*.csv")))
all_scores_to_load = all_scores[:]
if not all_scores_to_load:
    # Fail early with a clear message instead of pd.concat's
    # "No objects to concatenate" when the directory is wrong or empty.
    raise FileNotFoundError(f'No CSV files found in {filepath_scores}')

# Generator keeps only one per-file frame alive until pd.concat consumes it.
df_scores_from_each_file = (
    pd.read_csv(f, index_col=0, parse_dates=[2], infer_datetime_format=True)
    for f in all_scores_to_load
)
concatenated_df_scores = pd.concat(df_scores_from_each_file, ignore_index=True)

In [18]:
# Keep only the scorecard columns used downstream. The explicit .copy() makes the
# subset an independent frame, so adding a column below cannot run into pandas'
# view-versus-copy ambiguity (SettingWithCopyWarning).
concatenated_df_scores = concatenated_df_scores[
    ['MatchId', 'MatchInnings', 'Team', 'TeamInnings', 'Player', 'R', 'BF']
].copy()

# 1-based running row count within each (MatchId, MatchInnings, Team) group.
# NOTE(review): this equals the batting position only if scorecard rows are
# stored in batting order — confirm against the raw scorecard files.
concatenated_df_scores['BattingPosition'] = (
    concatenated_df_scores.groupby(['MatchId', 'MatchInnings', 'Team']).cumcount() + 1
)

In [19]:
# Check row count, dtypes and non-null counts of the trimmed scorecard frame.
concatenated_df_scores.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113386 entries, 0 to 113385
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   MatchId          113386 non-null  int64  
 1   MatchInnings     113386 non-null  int64  
 2   Team             113386 non-null  object 
 3   TeamInnings      113386 non-null  object 
 4   Player           113386 non-null  object 
 5   R                113386 non-null  float64
 6   BF               113386 non-null  float64
 7   BattingPosition  113386 non-null  int64  
dtypes: float64(2), int64(3), object(3)
memory usage: 6.9+ MB


In [21]:
#concatenated_df_scores.to_csv('../data/interim/howstat/scorecards/scorecards_all.csv')

In [20]:
# Time the fuzzy join of the fall-of-wickets frame to the scorecard frame on
# their 'Player' columns.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt. The
# final positional argument (80) is presumably a match-score threshold —
# confirm against cricsheet.utils.fuzzy_merge.
start = time.time()

df_merged = fuzzy_merge(concatenated_df_fow, concatenated_df_scores, 'Player', 'Player', 80)

end = time.time()
time_taken = end - start
print(f'{time_taken} seconds')

KeyboardInterrupt: 

In [None]:
# Preview the first rows of the fuzzy-merged frame.
df_merged.head()

In [None]:
# Attach scorecard columns to each fall-of-wickets row.
# The original referenced `df_scores`, which is never defined in this notebook;
# the scorecard frame built above is `concatenated_df_scores`.
# The left side joins on 'matches' (presumably the fuzzy-matched name added by
# fuzzy_merge — verify) against the scorecard's 'Player' column.
# NOTE: this rebinds df_merged and drops 'matches', so the cell cannot be re-run
# without re-running the fuzzy_merge cell first.
merge_keys = ['MatchId', 'MatchInnings', 'Team', 'TeamInnings']
df_merged = (
    df_merged
    .merge(
        concatenated_df_scores,
        how='left',
        left_on=merge_keys + ['matches'],
        right_on=merge_keys + ['Player'],
    )
    # 'Player' exists on both sides, so pandas suffixes it; keep the left-hand name.
    .drop(['Player_y', 'matches'], axis=1)
    .rename(columns={"Player_x": "Player"})
)

In [5]:
# Time return_collapses applied to every (MatchId, MatchInnings, Team) group
# of the merged frame, with the group index flattened back into columns.
start = time.time()

df_grouped = (
    df_merged
    .groupby(['MatchId', 'MatchInnings', 'Team'])
    .apply(return_collapses)
    .reset_index()
)

end = time.time()
time_taken = end - start
print(f'{time_taken} seconds')

AttributeError: 'DataFrame' object has no attribute 'BattingPosition'

In [65]:
# Display the per-group collapse table (the notebook renders it truncated).
df_grouped

Unnamed: 0,MatchId,MatchInnings,Team,level_3,start,end,runs,wickets_lost,batters
0,1,1,Australia,0,4.0,6.0,25.0,"[4, 5, 6]","[Cooper, Midwinter, Gregory]"
1,1,2,England,0,5.0,8.0,24.0,"[5, 6, 7, 8]","[Armitage, Shaw, Jupp, Emmett]"
2,1,2,England,1,2.0,4.0,30.0,"[2, 3, 4]","[Charlwood, Ulyett, Greenwood]"
3,1,2,England,2,3.0,5.0,23.0,"[3, 4, 5]","[Ulyett, Greenwood, Armitage]"
4,1,2,England,3,4.0,6.0,26.0,"[4, 5, 6]","[Greenwood, Armitage, Shaw]"
...,...,...,...,...,...,...,...,...,...
13809,2422,2,South Africa,2,6.0,9.0,26.0,"[6, 7, 8, 9]","[Mulder, Maharaj, Bavuma, Nortje]"
13810,2422,2,South Africa,3,8.0,10.0,26.0,"[8, 9, 10]","[Bavuma, Nortje, Sipamla]"
13811,2422,3,Sri Lanka,0,6.0,10.0,30.0,"[6, 7, 8, 9, 10]","[Dickwella, Shanaka, Silva, Chameera, Fernando]"
13812,2422,3,Sri Lanka,1,2.0,4.0,23.0,"[2, 3, 4]","[Thirimanne, Mendis, Bhanuka]"


In [76]:
# Persist the collapse table. With to_csv's defaults the frame's index is
# written as the first CSV column.
df_grouped.to_csv('../data/processed/collapses/all_collapses.csv')