In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell (not only the last one),
# which is why later cells can show e.g. both `.shape` and `.head()` output.
InteractiveShell.ast_node_interactivity = 'all'

In [2]:
## Load libraries
import pandas as pd
import numpy as np

In [3]:
# Notebook display settings: show up to 50 columns before pandas truncates wide frames.
pd.options.display.max_columns = 50

## Read data

In [4]:
# Load the train and test match tables (both ship with pre-computed sample features).
train_data, test_data = (
    pd.read_csv(csv_name)
    for csv_name in ('train_data_with_samplefeatures.csv', 'test_data_with_samplefeatures.csv')
)

In [5]:
# Sanity check on the training table: (rows, columns), then a two-row preview.
train_data.shape
train_data.head(2)

(948, 23)

Unnamed: 0,match id,team1,team1_id,team1_roster_ids,team2,team2_id,team2_roster_ids,winner,winner_id,toss winner,toss decision,venue,city,match_dt,lighting,series_name,season,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15
0,9331181,Ba,11283,9373356.0:7857520.0:4232164.0:4566540.0:329940...,Hl Ph,12634,3500958.0:4231751.0:2735081.0:2035102.0:369833...,Hl Ph,12634,Hl Ph,field,Hr Ct Sm Ie,Indore,2022-10-20,day/night match,Sd Mq Ai Ty,2022/23,7398,1.666667,0.672131,139.0,100.0,157.178571
1,8797060,Ed,20,2089079.0:6139370.0:2076192.0:62432.0:2083409....,Wt Is,41,4690258.0:4069666.0:4230127.0:1942317.0:161392...,Ed,20,Wt Is,field,Kn Ol Bn Bs,Bridgetown,2022-01-23,day/night match,Ed tr of Wt Is,2021/22,1406,1.285714,1.952381,156.0,50.0,103.5


In [6]:
# Sanity check on the test table: (rows, columns), then a two-row preview.
# The test table has no `winner` / `winner_id` columns, hence two fewer columns than train.
test_data.shape
test_data.head(2)

(271, 21)

Unnamed: 0,match id,team1,team1_id,team1_roster_ids,team2,team2_id,team2_roster_ids,toss winner,toss decision,venue,city,match_dt,lighting,series_name,season,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15
0,9250275,Jy,28594,7438955.0:8271969.0:8369661.0:3685247.0:259025...,Ud Ss of Aa,90,2436944.0:8117500.0:6703528.0:3010748.0:161387...,Ud Ss of Aa,field,Bo Ac Cb,Bulawayo,2022-07-11,day match,Ud Ss of Aa tr of Ze,2022,3226,0.0,0.019608,,0.0,
1,9262189,Ga An Ws,36084,3715690.0:6818622.0:4069666.0:393014.0:4690188...,Ja Ts,36098,4690258.0:3761246.0:8464385.0:5742470.0:315072...,Ja Ts,field,Pe Sm Ga,Providence,2022-09-21,night match,Cn Pr Le,2022,13915,0.615385,0.344262,151.285714,66.67,153.5


#### Create dependent column

In [7]:
# Target column: 1 when team2 won the match, 0 otherwise.
# Vectorised comparison (same np.where pattern as the toss indicator) instead of a
# row-wise DataFrame.apply, which is slower and misbehaves on an empty frame.
# NOTE(review): a match whose `winner` is neither side (e.g. a no-result) is labelled 0,
# exactly like a team1 win -- confirm that is intended.
train_data['winner_01'] = np.where(train_data['team2'] == train_data['winner'], 1, 0)

#### Preprocessing

In [8]:
## Toss winner as a 0/1 flag: 1 if team2 won the toss, else 0

for matches in (train_data, test_data):
    team2_won_toss = matches['toss winner'] == matches['team2']
    matches['toss_winner_01'] = np.where(team2_won_toss, 1, 0)

In [9]:
## Toss decision as a 0/1 flag: 1 if the toss winner chose to bat, 0 otherwise

for matches in (train_data, test_data):
    chose_to_bat = matches['toss decision'] == 'bat'
    matches['toss_decision_01'] = np.where(chose_to_bat, 1, 0)

#### Selecting relevant columns

In [10]:
# Model inputs: the two toss indicators plus the five pre-computed form/venue features.
feature_cols = [
    'toss_winner_01',
    'toss_decision_01',
    'team_count_50runs_last15',
    'team_winp_last5',
    'team1only_avg_runs_last15',
    'team1_winp_team2_last15',
    'ground_avg_runs_last15',
]

# .copy() detaches X from train_data so later cleaning steps (e.g. filling NaNs) act on an
# independent frame rather than on a slice of train_data (avoids SettingWithCopyWarning).
X = train_data[feature_cols].copy()
# Target: 1 if team2 won, 0 otherwise.
y = train_data['winner_01']

In [11]:
# Same feature columns, in the same order, as the training matrix.
# .copy() keeps X_test independent of test_data so later cleaning steps do not
# operate on a slice of the original frame (avoids SettingWithCopyWarning).
X_test = test_data[X.columns.tolist()].copy()

#### Imputation

In [12]:
# Per-column NaN counts; only columns that actually contain missing values are shown.
X_nans = X.isna().sum().reset_index()
X_nans.loc[X_nans[0].ne(0)]

X_test_nans = X_test.isna().sum().reset_index()
X_test_nans.loc[X_test_nans[0].ne(0)]

Unnamed: 0,index,0
4,team1only_avg_runs_last15,21
6,ground_avg_runs_last15,53


Unnamed: 0,index,0
4,team1only_avg_runs_last15,7
6,ground_avg_runs_last15,10


In [13]:
# Replace missing feature values with 0.
# Rebinding instead of `inplace=True`: X / X_test are column selections of
# train_data / test_data, and in-place mutation of such a selection triggers
# SettingWithCopyWarning. The resulting values are the same.
# NOTE(review): the NaNs sit in the run-average columns, where 0 is far outside the
# observed range -- confirm a 0 sentinel is the intended imputation.
X = X.fillna(0)
X_test = X_test.fillna(0)

In [14]:
# Re-run the NaN audit after imputation; both listings are expected to come back empty.
X_nans = X.isna().sum().reset_index()
X_nans.loc[X_nans[0].ne(0)]

X_test_nans = X_test.isna().sum().reset_index()
X_test_nans.loc[X_test_nans[0].ne(0)]

Unnamed: 0,index,0


Unnamed: 0,index,0


### Stats

In [15]:
# Quick distribution checks on the training features:
# class balance of the two toss flags, then summary statistics of the 50s-ratio feature.
X['toss_winner_01'].value_counts()
X['toss_decision_01'].value_counts()
X['team_count_50runs_last15'].describe()

1    620
0    328
Name: toss_winner_01, dtype: int64

0    617
1    331
Name: toss_decision_01, dtype: int64

count    948.000000
mean       1.202602
std        1.008793
min        0.083333
25%        0.686678
50%        1.000000
75%        1.400000
max       11.000000
Name: team_count_50runs_last15, dtype: float64

## Model

#### GBM

In [16]:
from sklearn.ensemble import GradientBoostingClassifier

In [17]:
# User-defined settings: model identity and hyper-parameters.
# These are passed to the classifier and also recorded in the output file.

algo_name, is_ensemble = 'GradientBoostingClassifier', 'no'
n_trees, depth, lr = 10, 2, 0.1

##### Train

In [18]:
# Fit the gradient-boosted classifier on the training features.
# random_state is pinned so that refitting on a fresh kernel reproduces the same trees
# (it seeds the per-split feature permutation used to break ties between equal splits).
clf_gbm = GradientBoostingClassifier(
    n_estimators=n_trees,
    max_depth=depth,
    learning_rate=lr,
    random_state=42,
).fit(X, y)

In [19]:
# Hard 0/1 predictions (1 = team2 predicted to win) stored next to the match metadata.
for frame, features in ((train_data, X), (test_data, X_test)):
    frame['y_pred_01'] = clf_gbm.predict(features)

In [20]:
from sklearn.metrics import classification_report

In [21]:
# In-sample (training set) precision / recall / F1 / accuracy for both classes.
train_report = classification_report(y, clf_gbm.predict(X), labels=[0, 1])
print(train_report)

              precision    recall  f1-score   support

           0       0.59      0.64      0.61       471
           1       0.61      0.56      0.58       477

    accuracy                           0.60       948
   macro avg       0.60      0.60      0.60       948
weighted avg       0.60      0.60      0.60       948



In [22]:
# Probability of class 1 (team2 wins): column index 1 of predict_proba.
for frame, features in ((train_data, X), (test_data, X_test)):
    class_probs = clf_gbm.predict_proba(features)
    frame['win_pred_score'] = class_probs[:, 1]

##### Getting win_pred_score corresponding to winner team

In [23]:
# Convert P(team2 wins) into the probability of whichever team is predicted to win.
# The predicted winner is the side with the larger probability, so its score is
# max(p, 1 - p). Unlike flipping the rows where y_pred_01 == 0, this is idempotent:
# re-running the cell leaves the scores unchanged instead of flipping them back.
# Assumes y_pred_01 was produced by clf_gbm.predict on the same features, i.e. it is
# the argmax of these probabilities.
for frame in (train_data, test_data):
    score = frame['win_pred_score']
    frame['win_pred_score'] = np.maximum(score, 1 - score)

##### Deriving win_pred_team_id (id of the predicted winner) from the predicted label

In [24]:
# Map the 0/1 prediction back to a team id: 0 -> team1_id, anything else -> team2_id.
for frame in (train_data, test_data):
    team1_predicted = frame['y_pred_01'] == 0
    frame['win_pred_team_id'] = np.where(team1_predicted, frame['team1_id'], frame['team2_id'])

#### Feature importance

In [25]:
# Pair every input column with the importance the fitted model assigned to it ...
feat_imp_unsorted = pd.DataFrame({
    'feat_name': X.columns.tolist(),
    'model_feat_imp_train': clf_gbm.feature_importances_,
})

# ... then rank from most to least important and keep at most the top 10.
df_feat_importance = (
    feat_imp_unsorted
    .sort_values(by='model_feat_imp_train', ascending=False)
    .reset_index(drop=True)
    .head(10)
)
df_feat_importance

Unnamed: 0,feat_name,model_feat_imp_train
0,team_count_50runs_last15,0.506246
1,team_winp_last5,0.196276
2,ground_avg_runs_last15,0.18891
3,team1_winp_team2_last15,0.062128
4,team1only_avg_runs_last15,0.04644
5,toss_winner_01,0.0
6,toss_decision_01,0.0


## File preparation

#### File 1

In [26]:
# Tag every row with the split it came from, so both can share one output file.
for frame, split_label in ((train_data, 'train'), (test_data, 'r1')):
    frame['dataset_type'] = split_label

In [27]:
## File 1: per-match prediction, score and the values of the top-ranked features

top_feats = list(df_feat_importance['feat_name'].head(10))
file1_cols = ['match id', 'dataset_type', 'win_pred_team_id', 'win_pred_score'] + top_feats

# Test rows first, then train rows; each frame keeps its own original row index.
df_file1 = pd.concat([test_data[file1_cols], train_data[file1_cols]])

# Feature columns are renamed by their position in the importance ranking:
# the first entry of df_feat_importance becomes indep_feat_id1, and so on.
renaming_dict = {
    feat: f'indep_feat_id{position}'
    for position, feat in enumerate(top_feats, start=1)
}
df_file1 = df_file1.rename(columns=renaming_dict)

# Pad with all-NaN columns so the file always carries indep_feat_id1..indep_feat_id10.
for slot in range(1, 11):
    slot_name = f'indep_feat_id{slot}'
    if slot_name not in df_file1.columns:
        df_file1[slot_name] = np.nan

# Training metadata, repeated on every row.
df_file1['train_algorithm'] = algo_name
df_file1['is_ensemble'] = is_ensemble
df_file1['train_hps_trees'] = n_trees
df_file1['train_hps_depth'] = depth
df_file1['train_hps_lr'] = lr

In [28]:
# Check the assembled file: (rows, columns), then the first five rows (test rows come first).
df_file1.shape
df_file1.head()

(1219, 19)

Unnamed: 0,match id,dataset_type,win_pred_team_id,win_pred_score,indep_feat_id1,indep_feat_id2,indep_feat_id3,indep_feat_id4,indep_feat_id5,indep_feat_id6,indep_feat_id7,indep_feat_id8,indep_feat_id9,indep_feat_id10,train_algorithm,is_ensemble,train_hps_trees,train_hps_depth,train_hps_lr
0,9250275,r1,90,0.649253,0.0,0.019608,,0.0,,1,0,,,,GradientBoostingClassifier,no,10,2,0.1
1,9262189,r1,36098,0.565718,0.615385,0.344262,153.5,66.67,151.285714,1,0,,,,GradientBoostingClassifier,no,10,2,0.1
2,9128776,r1,48334,0.523879,0.842105,0.753086,179.625,100.0,171.066667,0,1,,,,GradientBoostingClassifier,no,10,2,0.1
3,9586919,r1,36112,0.526343,0.285714,1.487805,155.5,75.0,166.0,1,0,,,,GradientBoostingClassifier,no,10,2,0.1
4,9128538,r1,30414,0.536829,2.375,0.31033,164.125,0.0,169.933333,1,0,,,,GradientBoostingClassifier,no,10,2,0.1


#### File 2

In [29]:
# Human-readable description for each feature name; used to annotate the feature table.
feature_desc = {
    # Pre-computed form / venue features
    'team_count_50runs_last15': 'Ratio of number of 50s by players in team1 to number of 50s by players in team2 in last 15 games',
    'team_winp_last5': "Ratio of team1's win % to team2's win % in last 5 games",
    'ground_avg_runs_last15': 'average runs scored in the ground in last 15 games',
    'team1_winp_team2_last15': "Team1's win percentage against Team2 in last 15 games",
    'team1only_avg_runs_last15': "team1's avg inning runs in last 15 games",

    # Features derived from match metadata
    'season_num': 'Numerical form of season. Takes 1 for oldest season and increases for latest seasons.',
    'toss_winner_01': 'Toss winner to numerical - 1 if team2 wins, else 0',
    'toss_decision_01': 'Toss decision - categorical - 1 if winner bats, 0 otherwise',
}

In [30]:
# File 2: one row per ranked feature, with its importance, rank and description.
# Work on a copy: plain assignment would only alias df_feat_importance (itself a
# .head() slice), so the columns added below would leak into it and could trigger
# SettingWithCopyWarning.
df_file2 = df_feat_importance.copy()

# The frame is already sorted by importance with a 0-based index, so index + 1 is both
# the feature id and its rank.
df_file2['feat_id'] = df_file2.index + 1
df_file2['feat_rank_train'] = df_file2.index + 1
df_file2 = df_file2.set_index('feat_id')

# Features with no entry in feature_desc get NaN as their description.
df_file2['feat_description'] = df_file2['feat_name'].map(feature_desc)

In [31]:
# Show the finished feature table (indexed by feat_id).
df_file2

Unnamed: 0_level_0,feat_name,model_feat_imp_train,feat_rank_train,feat_description
feat_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,team_count_50runs_last15,0.506246,1,Ratio of number of 50s by players in team1 to ...
2,team_winp_last5,0.196276,2,Ratio of team1's win % to team2's win % in las...
3,ground_avg_runs_last15,0.18891,3,average runs scored in the ground in last 15 g...
4,team1_winp_team2_last15,0.062128,4,Team1's win percentage against Team2 in last 1...
5,team1only_avg_runs_last15,0.04644,5,team1's avg inning runs in last 15 games
6,toss_winner_01,0.0,6,"Toss winner to numerical - 1 if team2 wins, el..."
7,toss_decision_01,0.0,7,Toss decision - categorical - 1 if winner bats...


## Save

In [None]:
# Write both deliverables to the working directory.
# file1 drops its row index (it is just the leftover positions from the train/test frames);
# file2 keeps it, because its index is the feat_id column.
df_file1.to_csv('file1.csv', index=False)
df_file2.to_csv('file2.csv', index=True)