In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.decomposition import PCA

In [2]:
raw_data = pd.read_csv("IPL_Ball_by_Ball_2008_2022.csv")
# raw_data = raw_data.sample(frac=0.05, random_state=42)
raw_data

Unnamed: 0,ID,innings,overs,ballnumber,batter,bowler,non-striker,extra_type,batsman_run,extras_run,total_run,non_boundary,isWicketDelivery,player_out,kind,fielders_involved,BattingTeam
0,1312200,1,0,1,YBK Jaiswal,Mohammed Shami,JC Buttler,,0,0,0,0,0,,,,Rajasthan Royals
1,1312200,1,0,2,YBK Jaiswal,Mohammed Shami,JC Buttler,legbyes,0,1,1,0,0,,,,Rajasthan Royals
2,1312200,1,0,3,JC Buttler,Mohammed Shami,YBK Jaiswal,,1,0,1,0,0,,,,Rajasthan Royals
3,1312200,1,0,4,YBK Jaiswal,Mohammed Shami,JC Buttler,,0,0,0,0,0,,,,Rajasthan Royals
4,1312200,1,0,5,YBK Jaiswal,Mohammed Shami,JC Buttler,,0,0,0,0,0,,,,Rajasthan Royals
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225949,335982,2,14,5,P Kumar,I Sharma,SB Joshi,legbyes,0,1,1,0,0,,,,Royal Challengers Bangalore
225950,335982,2,14,6,SB Joshi,I Sharma,P Kumar,,1,0,1,0,0,,,,Royal Challengers Bangalore
225951,335982,2,14,7,P Kumar,I Sharma,SB Joshi,,0,0,0,0,0,,,,Royal Challengers Bangalore
225952,335982,2,15,1,SB Joshi,LR Shukla,P Kumar,wides,0,1,1,0,0,,,,Royal Challengers Bangalore


In [3]:
def over_data(raw_data):
    
    raw_data=raw_data[raw_data['total_run']<=6] # Outliers
    
    over_info = pd.DataFrame()
    
    over_info['overid'] = raw_data['ID']*40+(raw_data['innings']-1)*20+raw_data['overs'] # Unique ID for over
    over_info['type'] = (raw_data['overs']>6).astype(int) + (raw_data['overs']>15).astype(int) # Powerplay/Middle/Death
    
    # Batter Columns
    distinct_batters = raw_data['batter'].unique()
    batter_names = pd.concat([(raw_data['batter'].fillna('').isin([batter]) | raw_data['non-striker'].fillna('').isin([batter])).astype(int) for batter in distinct_batters], axis=1)
    batter_names.columns = distinct_batters
    over_info = pd.concat([over_info,batter_names],axis=1)

    # Bowler Columns
    distinct_bowlers = raw_data['bowler'].unique()
    bowler_names = pd.concat([raw_data['bowler'].fillna('') == bowler for bowler in distinct_bowlers], axis=1).astype(int)
    bowler_names.columns = distinct_bowlers
    over_info = pd.concat([over_info,bowler_names],axis=1)
    
    over_info['total_run'] = raw_data['total_run'] 
    over_info['total_wickets'] = raw_data['isWicketDelivery']
    
    # Grouping each over's information
    over_info = over_info.groupby(over_info['overid']).sum()
    for batter in distinct_batters :
        over_info[batter] = (over_info[batter]>0).astype(int)
    for bowler in distinct_bowlers :
        over_info[bowler]=(over_info[bowler]>0).astype(int)
    
    return over_info

In [4]:
modified_data = over_data(raw_data)
modified_data

Unnamed: 0_level_0,type,YBK Jaiswal,JC Buttler,SV Samson,D Padikkal,SO Hetmyer,R Ashwin,R Parag,TA Boult,OC McCoy,...,Shoaib Malik,S Vidyut,Mohammad Hafeez,LPC Silva,D Kalyankrishna,SB Joshi,MA Khote,AA Noffke,total_run,total_wickets
overid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
13439280,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,0
13439281,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,18,0
13439282,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,6,0
13439283,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,23,0
13439284,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52488034,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,8,0
52488035,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,12,0
52488036,14,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,13,0
52488037,12,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,5,0


In [5]:
X = modified_data.drop(columns = ["total_run","total_wickets"])
Y = modified_data["total_run"]
pca = PCA(n_components=50) 
X = pca.fit_transform(X)     
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state = 1092003)

In [6]:
def Logistic_Regression(X_train,Y_train):
    model = LogisticRegression(n_jobs=-1,max_iter=1000000)
    model.fit(X_train,Y_train)
    return model

In [7]:
def SVM(X_train,Y_train):
    model = SVC(probability=True)
    model.fit(X_train,Y_train)
    return model

In [8]:
def MLP(X_train,Y_train):
    model = MLPClassifier(activation = 'relu',hidden_layer_sizes=(10,5,10), max_iter=30000)
    model.fit(X_train,Y_train)
    return model

In [9]:
print("Using Logistic Regression :-")
model_lr = Logistic_Regression(X_train,Y_train)
Y_train_pred = model_lr.predict(X_train)
Y_test_pred = model_lr.predict(X_test)
print("Accuracy on Trained data - ",accuracy_score(Y_train,Y_train_pred))
print("Classification report on Trained data - \n",classification_report(Y_train,Y_train_pred))
print("Confusion matrix on Trained data - \n",confusion_matrix(Y_train,Y_train_pred))
print("Accuracy on Test data - ",accuracy_score(Y_test,Y_test_pred))
print("Classification report on Test data - \n",classification_report(Y_test,Y_test_pred))
print("Confusion matrix on Test data - \n",confusion_matrix(Y_test,Y_test_pred))

Using Logistic Regression :-
Accuracy on Trained data -  0.1076555023923445
Classification report on Trained data - 
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       239
           1       0.11      0.01      0.02       766
           2       0.07      0.00      0.00      1372
           3       0.12      0.03      0.05      1952
           4       0.12      0.12      0.12      2489
           5       0.11      0.33      0.16      2704
           6       0.11      0.26      0.16      2654
           7       0.10      0.27      0.15      2605
           8       0.10      0.12      0.11      2435
           9       0.10      0.08      0.09      2298
          10       0.06      0.00      0.01      2072
          11       0.09      0.00      0.01      1724
          12       0.12      0.00      0.01      1306
          13       0.21      0.01      0.01      1143
          14       0.22      0.00      0.00       884
          15     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
print("Using SVM :-")
model_SVM = SVM(X_train,Y_train)
Y_train_pred = model_SVM.predict(X_train)
Y_test_pred = model_SVM.predict(X_test)
print("Accuracy on Trained data - ",accuracy_score(Y_train,Y_train_pred))
print("Classification report on Trained data - \n",classification_report(Y_train,Y_train_pred))
print("Confusion matrix on Trained data - \n",confusion_matrix(Y_train,Y_train_pred))
print("Accuracy on Test data - ",accuracy_score(Y_test,Y_test_pred))
print("Classification report on Test data - \n",classification_report(Y_test,Y_test_pred))
print("Confusion matrix on Test data - \n",confusion_matrix(Y_test,Y_test_pred))

Using SVM :-
Accuracy on Trained data -  0.1101161995898838
Classification report on Trained data - 
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       239
           1       0.00      0.00      0.00       766
           2       0.00      0.00      0.00      1372
           3       0.00      0.00      0.00      1952
           4       0.12      0.28      0.17      2489
           5       0.12      0.30      0.17      2704
           6       0.11      0.36      0.16      2654
           7       0.10      0.20      0.13      2605
           8       0.13      0.02      0.04      2435
           9       0.10      0.07      0.08      2298
          10       0.13      0.00      0.00      2072
          11       0.17      0.01      0.01      1724
          12       0.00      0.00      0.00      1306
          13       0.00      0.00      0.00      1143
          14       0.00      0.00      0.00       884
          15       0.00      0.00

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
print("Using MLP :-")
model_MLP = MLP(X_train,Y_train)
Y_train_pred = model_MLP.predict(X_train)
Y_test_pred = model_MLP.predict(X_test)
print("Accuracy on Trained data - ",accuracy_score(Y_train,Y_train_pred))
print("Classification report on Trained data - \n",classification_report(Y_train,Y_train_pred))
print("Confusion matrix on Trained data - \n",confusion_matrix(Y_train,Y_train_pred))
print("Accuracy on Test data - ",accuracy_score(Y_test,Y_test_pred))
print("Classification report on Test data - \n",classification_report(Y_test,Y_test_pred))
print("Confusion matrix on Test data - \n",confusion_matrix(Y_test,Y_test_pred))

Using MLP :-
Accuracy on Trained data -  0.10345181134654818
Classification report on Trained data - 
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       239
           1       0.16      0.04      0.06       766
           2       0.00      0.00      0.00      1372
           3       0.16      0.01      0.02      1952
           4       0.13      0.05      0.07      2489
           5       0.11      0.42      0.18      2704
           6       0.10      0.34      0.16      2654
           7       0.09      0.13      0.11      2605
           8       0.11      0.03      0.04      2435
           9       0.09      0.12      0.10      2298
          10       0.00      0.00      0.00      2072
          11       0.09      0.08      0.09      1724
          12       0.00      0.00      0.00      1306
          13       0.00      0.00      0.00      1143
          14       0.00      0.00      0.00       884
          15       0.00      0.0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
