In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.decomposition import PCA

In [2]:
raw_data = pd.read_csv("IPL_Ball_by_Ball_2008_2022.csv")
raw_data = raw_data.sample(frac=0.1, random_state=42)
raw_data

Unnamed: 0,ID,innings,overs,ballnumber,batter,bowler,non-striker,extra_type,batsman_run,extras_run,total_run,non_boundary,isWicketDelivery,player_out,kind,fielders_involved,BattingTeam
177814,501226,1,12,2,MS Dhoni,Yuvraj Singh,MEK Hussey,,1,0,1,0,0,,,,Chennai Super Kings
3287,1304107,1,17,4,SW Billings,T Natarajan,AD Russell,,6,0,6,0,0,,,,Kolkata Knight Riders
188612,419148,2,13,3,RV Uthappa,AD Mathews,LRPL Taylor,,0,0,0,0,0,,,,Royal Challengers Bangalore
96212,980961,1,13,6,RA Jadeja,Z Khan,KD Karthik,,0,0,0,0,0,,,,Gujarat Lions
30637,1254064,1,5,1,RR Pant,JD Unadkat,AM Rahane,,4,0,4,0,0,,,,Delhi Capitals
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164073,548320,2,16,4,YK Pathan,GB Hogg,Shakib Al Hasan,,0,0,0,0,0,,,,Kolkata Knight Riders
29125,1254071,2,14,1,JM Bairstow,FA Allen,KS Williamson,,1,0,1,0,0,,,,Sunrisers Hyderabad
84133,1082612,1,9,4,WP Saha,JJ Bumrah,HM Amla,,2,0,2,0,0,,,,Kings XI Punjab
164546,548318,2,16,5,MS Dhoni,R Vinay Kumar,DJ Bravo,,0,0,0,0,0,,,,Chennai Super Kings


In [3]:
# Pre processing the data
def rectify_data(raw_data):
    raw_data=raw_data[raw_data['total_run']<=6]
    processed_data = pd.DataFrame()
    processed_data['first_innings'] = 2-raw_data['innings']
    processed_data['second_innings'] = raw_data['innings']-1
    processed_data['powerplay'] = (raw_data['overs']<=6).astype(int)
    processed_data['middle'] = ((raw_data['overs']>6) & (raw_data['overs']<=15)).astype(int)
    processed_data['death'] = (raw_data['overs']>15).astype(int)
    distinct_batters = raw_data['batter'].unique()
    batter_names = pd.concat([raw_data['batter'].fillna('') == batter for batter in distinct_batters], axis=1).astype(int)
    batter_names.columns = distinct_batters
    processed_data = pd.concat([processed_data,batter_names],axis=1)
    distinct_bowlers = raw_data['bowler'].unique()
    bowler_names = pd.concat([raw_data['bowler'].fillna('') == bowler for bowler in distinct_bowlers], axis=1).astype(int)
    bowler_names.columns = distinct_bowlers
    processed_data = pd.concat([processed_data,batter_names],axis=1)
    # processed_data['run_and_wicket'] = raw_data[['total_run','isWicketDelivery']].apply(list,axis=1)
    processed_data['label'] = raw_data['total_run']+7*raw_data['isWicketDelivery']    
    return processed_data

In [4]:
modified_data = rectify_data(raw_data)
X = modified_data.drop(columns = "label")
Y = modified_data["label"]
len(modified_data.columns)

1070

In [5]:
# Applying PCA to the modified data to upto 100 components
pca = PCA(n_components=100) 
X = pca.fit_transform(X)     
pca.explained_variance_ratio_,sum(pca.explained_variance_ratio_)

(array([0.16259857, 0.13126957, 0.07079568, 0.01461266, 0.01374125,
        0.01307626, 0.01237963, 0.01137684, 0.01121262, 0.01102235,
        0.01056392, 0.01015152, 0.01000599, 0.00976529, 0.00970935,
        0.00928315, 0.00826755, 0.00813096, 0.00785032, 0.00774162,
        0.0075144 , 0.0071963 , 0.00703483, 0.0069368 , 0.00678653,
        0.00666877, 0.0062577 , 0.00608423, 0.00595508, 0.00590308,
        0.00579637, 0.00568504, 0.00566275, 0.00561036, 0.00552958,
        0.00547096, 0.00539912, 0.00528916, 0.00524449, 0.00514248,
        0.00513417, 0.00509051, 0.00505698, 0.00502526, 0.00488291,
        0.00480445, 0.00475349, 0.00461422, 0.0045405 , 0.00442212,
        0.00434431, 0.00424603, 0.00408169, 0.00391382, 0.00389423,
        0.00387153, 0.00383679, 0.00378969, 0.00373582, 0.00369517,
        0.00365026, 0.00360803, 0.0035104 , 0.0034322 , 0.00325276,
        0.00311643, 0.0030406 , 0.00300109, 0.00295014, 0.00289269,
        0.00280874, 0.00278367, 0.00271935, 0.00

In [6]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state = 1092003)

In [7]:
def Logistic_Regression(X_train,Y_train):
    model = LogisticRegression(n_jobs=-1,max_iter=100000)
    model.fit(X_train,Y_train)
    return model

In [8]:
def SVM(X_train,Y_train):
    model = SVC()
    model.fit(X_train,Y_train)
    return model

In [9]:
def MLP(X_train,Y_train):
    model = MLPClassifier(activation = 'relu',hidden_layer_sizes=(10,5,10), max_iter=30000)
    model.fit(X_train,Y_train)
    return model

In [10]:
print("Using Logistic Regression :-")
model = Logistic_Regression(X_train,Y_train)
Y_train_pred = model.predict(X_train)
Y_test_pred = model.predict(X_test)
print("Accuracy on Trained data - ",accuracy_score(Y_train,Y_train_pred))
print("Classification report on Trained data - \n",classification_report(Y_train,Y_train_pred))
print("Confusion matrix on Trained data - \n",confusion_matrix(Y_train,Y_train_pred))
print("Accuracy on Test data - ",accuracy_score(Y_test,Y_test_pred))
print("Classification report on Test data - \n",classification_report(Y_test,Y_test_pred))
print("Confusion matrix on Test data - \n",confusion_matrix(Y_test,Y_test_pred))

Using Logistic Regression :-
Accuracy on Trained data -  0.443399358194091
Classification report on Trained data - 
               precision    recall  f1-score   support

           0       0.42      0.37      0.39      5472
           1       0.45      0.80      0.58      7506
           2       0.00      0.00      0.00      1225
           3       0.00      0.00      0.00        56
           4       0.00      0.00      0.00      2073
           5       0.00      0.00      0.00        26
           6       0.00      0.00      0.00       813
           7       0.00      0.00      0.00       868
           8       0.00      0.00      0.00        33
           9       0.00      0.00      0.00         2

    accuracy                           0.44     18074
   macro avg       0.09      0.12      0.10     18074
weighted avg       0.32      0.44      0.36     18074

Confusion matrix on Trained data - 
 [[2000 3472    0    0    0    0    0    0    0    0]
 [1492 6014    0    0    0    0   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
i = 0
probabilities = model.predict_proba(X_test)
classes = model.classes_
dict = {}
for _ in range(len(classes)):
    print(_)
    dict[classes[_]] = probabilities[i][_]
dict

0
1
2
3
4
5
6
7
8
9


{0: 0.34902120209770704,
 1: 0.3872128966801055,
 2: 0.05050174003974715,
 3: 0.0012378042526081687,
 4: 0.1605370583064236,
 5: 0.0010000149400173177,
 6: 0.006933805637178592,
 7: 0.042658384687187224,
 8: 0.0008067280256896018,
 9: 9.036533333584174e-05}

In [12]:
num = model.predict_proba(X_test)
num[0]

array([3.49021202e-01, 3.87212897e-01, 5.05017400e-02, 1.23780425e-03,
       1.60537058e-01, 1.00001494e-03, 6.93380564e-03, 4.26583847e-02,
       8.06728026e-04, 9.03653333e-05])

In [13]:
print("Using SVM :-")
model = SVM(X_train,Y_train)
Y_train_pred = model.predict(X_train)
Y_test_pred = model.predict(X_test)
print("Accuracy on Trained data - ",accuracy_score(Y_train,Y_train_pred))
print("Classification report on Trained data - \n",classification_report(Y_train,Y_train_pred))
print("Confusion matrix on Trained data - \n",confusion_matrix(Y_train,Y_train_pred))
print("Accuracy on Test data - ",accuracy_score(Y_test,Y_test_pred))
print("Classification report on Test data - \n",classification_report(Y_test,Y_test_pred))
print("Confusion matrix on Test data - \n",confusion_matrix(Y_test,Y_test_pred))

Using SVM :-
Accuracy on Trained data -  0.45396702445501824
Classification report on Trained data - 
               precision    recall  f1-score   support

           0       0.43      0.41      0.42      5472
           1       0.46      0.79      0.58      7506
           2       1.00      0.00      0.00      1225
           3       0.00      0.00      0.00        56
           4       0.50      0.00      0.00      2073
           5       0.00      0.00      0.00        26
           6       0.00      0.00      0.00       813
           7       0.00      0.00      0.00       868
           8       0.00      0.00      0.00        33
           9       0.00      0.00      0.00         2

    accuracy                           0.45     18074
   macro avg       0.24      0.12      0.10     18074
weighted avg       0.45      0.45      0.37     18074

Confusion matrix on Trained data - 
 [[2248 3224    0    0    0    0    0    0    0    0]
 [1555 5950    0    0    1    0    0    0    0  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
print("Using MLP :-")
model = MLP(X_train,Y_train)
Y_train_pred = model.predict(X_train)
Y_test_pred = model.predict(X_test)
print("Accuracy on Trained data - ",accuracy_score(Y_train,Y_train_pred))
print("Classification report on Trained data - \n",classification_report(Y_train,Y_train_pred))
print("Confusion matrix on Trained data - \n",confusion_matrix(Y_train,Y_train_pred))
print("Accuracy on Test data - ",accuracy_score(Y_test,Y_test_pred))
print("Classification report on Test data - \n",classification_report(Y_test,Y_test_pred))
print("Confusion matrix on Test data - \n",confusion_matrix(Y_test,Y_test_pred))

Using MLP :-
Accuracy on Trained data -  0.4482682306075025
Classification report on Trained data - 
               precision    recall  f1-score   support

           0       0.43      0.39      0.41      5472
           1       0.46      0.79      0.58      7506
           2       0.00      0.00      0.00      1225
           3       0.00      0.00      0.00        56
           4       0.29      0.00      0.01      2073
           5       0.00      0.00      0.00        26
           6       0.00      0.00      0.00       813
           7       1.00      0.00      0.00       868
           8       0.00      0.00      0.00        33
           9       0.00      0.00      0.00         2

    accuracy                           0.45     18074
   macro avg       0.22      0.12      0.10     18074
weighted avg       0.40      0.45      0.37     18074

Confusion matrix on Trained data - 
 [[2149 3315    0    0    8    0    0    0    0    0]
 [1556 5945    0    0    5    0    0    0    0   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
