In [1]:

# Stratifying according to both winning team and year
from sklearn.model_selection import StratifiedShuffleSplit
import pandas as pd


data = pd.read_csv("../Data/selected_data/processed_data.csv")

# Define the test size (e.g., 20% for testing)
test_size = 0.2

# Create a new column for combining year and winning_team
data['year_team'] = data['year'].astype(str) + '_' + data['winning_team']

# Separate features (all columns except 'year_team') and target variable
features = data.drop('winning_team', axis=1)
target = data['winning_team']

# Use StratifiedShuffleSplit to get the test set indices
sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=42)
test_indices, train_indices = next(sss.split(features, target))

# Separate the test and train sets based on the indices
test_set = data.iloc[test_indices]
train_set = data.iloc[train_indices]

# Drop the temporary year_team column
test_set = test_set.drop('year_team', axis=1)
train_set = train_set.drop('year_team', axis=1)

# Print the shapes of the test and train sets to confirm the split
print("Test Set Shape:", test_set.shape)
print("Train Set Shape:", train_set.shape)

Test Set Shape: (173793, 32)
Train Set Shape: (43449, 32)


In [2]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
Index: 43449 entries, 129496 to 158443
Data columns (total 32 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   batter               43449 non-null  object 
 1   bowler               43449 non-null  object 
 2   non_striker          43449 non-null  object 
 3   runs_by_bat          43449 non-null  int64  
 4   extra_runs           43449 non-null  int64  
 5   total_runs_delivery  43449 non-null  int64  
 6   current_team_total   43449 non-null  int64  
 7   runs_remain          43449 non-null  float64
 8   batter_runs          43449 non-null  int64  
 9   balls_faced          43449 non-null  int64  
 10  wickets_fallen       43449 non-null  int64  
 11  extra_type           43449 non-null  object 
 12  delivery             43449 non-null  float64
 13  over                 43449 non-null  int64  
 14  wicket_type          43449 non-null  object 
 15  player_out           43449 non-null

In [3]:
train_set.head()

Unnamed: 0,batter,bowler,non_striker,runs_by_bat,extra_runs,total_runs_delivery,current_team_total,runs_remain,batter_runs,balls_faced,...,match_id,winning_team,batting_team,bowling_team,won,final_team_total,batter_type,non_striker_type,bowler_type,replacements
129496,S Chanderpaul,JM Anderson,DS Smith,4,0,4,42,-1.0,14,6,...,England_West Indies_2007-06-28,West Indies,West Indies,England,1.0,208,Left hand Bat,Left hand Bat,Right arm Fast,No Replacement
161381,PD McGlashan,Shoaib Akhtar,LRPL Taylor,0,0,0,126,17.0,0,3,...,New Zealand_Pakistan_2010-12-26,New Zealand,New Zealand,Pakistan,1.0,146,Right hand Bat,Right hand Bat,Right arm Fast,No Replacement
2638,JM Bairstow,D Paterson,JJ Roy,0,0,0,53,121.0,10,14,...,England_South Africa_2017-06-23,South Africa,England,South Africa,0.0,171,Right hand Bat,Right hand Bat,Right arm Fast,No Replacement
87244,BKG Mendis,CR Woakes,P Nissanka,0,0,0,34,-1.0,14,11,...,Sri Lanka_England_2022-11-05,England,Sri Lanka,England,0.0,141,Right hand Bat,Right hand Bat,Right arm Fast,No Replacement
134971,A Symonds,IK Pathan,MEK Hussey,0,0,0,156,32.0,43,26,...,Australia_India_2007-09-22,India,Australia,India,0.0,173,Right hand Bat,Left hand Bat,Left arm Medium,No Replacement
