In [1]:
import numpy as np
import pandas as pd
import nltk
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.decomposition import PCA

In [2]:
raw_data = pd.read_csv("IPL_Ball_by_Ball_2008_2022.csv")
raw_data = raw_data.sample(frac=0.1, random_state=42)
raw_data

Unnamed: 0,ID,innings,overs,ballnumber,batter,bowler,non-striker,extra_type,batsman_run,extras_run,total_run,non_boundary,isWicketDelivery,player_out,kind,fielders_involved,BattingTeam
177814,501226,1,12,2,MS Dhoni,Yuvraj Singh,MEK Hussey,,1,0,1,0,0,,,,Chennai Super Kings
3287,1304107,1,17,4,SW Billings,T Natarajan,AD Russell,,6,0,6,0,0,,,,Kolkata Knight Riders
188612,419148,2,13,3,RV Uthappa,AD Mathews,LRPL Taylor,,0,0,0,0,0,,,,Royal Challengers Bangalore
96212,980961,1,13,6,RA Jadeja,Z Khan,KD Karthik,,0,0,0,0,0,,,,Gujarat Lions
30637,1254064,1,5,1,RR Pant,JD Unadkat,AM Rahane,,4,0,4,0,0,,,,Delhi Capitals
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164073,548320,2,16,4,YK Pathan,GB Hogg,Shakib Al Hasan,,0,0,0,0,0,,,,Kolkata Knight Riders
29125,1254071,2,14,1,JM Bairstow,FA Allen,KS Williamson,,1,0,1,0,0,,,,Sunrisers Hyderabad
84133,1082612,1,9,4,WP Saha,JJ Bumrah,HM Amla,,2,0,2,0,0,,,,Kings XI Punjab
164546,548318,2,16,5,MS Dhoni,R Vinay Kumar,DJ Bravo,,0,0,0,0,0,,,,Chennai Super Kings


In [3]:
def rectify_data(raw_data):
    raw_data=raw_data[raw_data['total_run']<=6]
    processed_data = pd.DataFrame()
    processed_data['first_innings'] = 2-raw_data['innings']
    processed_data['second_innings'] = raw_data['innings']-1
    processed_data['powerplay'] = (raw_data['overs']<=6).astype(int)
    processed_data['middle'] = ((raw_data['overs']>6) & (raw_data['overs']<=15)).astype(int)
    processed_data['death'] = (raw_data['overs']>15).astype(int)
    distinct_batters = raw_data['batter'].unique()
    batter_names = pd.concat([raw_data['batter'].fillna('') == batter for batter in distinct_batters], axis=1).astype(int)
    batter_names.columns = distinct_batters
    processed_data = pd.concat([processed_data,batter_names],axis=1)
    distinct_bowlers = raw_data['bowler'].unique()
    bowler_names = pd.concat([raw_data['bowler'].fillna('') == bowler for bowler in distinct_bowlers], axis=1).astype(int)
    bowler_names.columns = distinct_bowlers
    processed_data = pd.concat([processed_data,batter_names],axis=1)
    # processed_data['run_and_wicket'] = raw_data[['total_run','isWicketDelivery']].apply(list,axis=1)
    processed_data['label'] = raw_data['total_run']+7*raw_data['isWicketDelivery']    
    return processed_data

In [4]:
# def rectify_data(raw_data):
#     raw_data=raw_data[raw_data['total_run']<=6]
#     processed_data = pd.DataFrame()
#     processed_data['first_innings'] = 2-raw_data['innings']
#     processed_data['second_innings'] = raw_data['innings']-1
#     processed_data['powerplay'] = (raw_data['overs']<=6).astype(int)
#     processed_data['middle'] = ((raw_data['overs']>6) & (raw_data['overs']<=15)).astype(int)
#     processed_data['death'] = (raw_data['overs']>15).astype(int)
#     distinct_batters = {}
#     counter = 0
#     for i in raw_data['batter'].unique() :
#         distinct_batters[i]=counter
#         counter+=1
#     processed_data['batter'] = [distinct_batters[x] for x in raw_data['batter']]
#     distinct_bowlers = {}
#     counter = 0
#     for i in raw_data['bowler'].unique() :
#         distinct_bowlers[i]=counter
#         counter+=1
#     processed_data['bowler'] = [distinct_bowlers[x] for x in raw_data['bowler']]
#     # processed_data['total_run'] = raw_data['total_run']
#     # processed_data['iswicket'] = raw_data['isWicketDelivery']
#     processed_data['label'] = raw_data['total_run']+7*raw_data['isWicketDelivery']
#     return processed_data

In [5]:
modified_data = rectify_data(raw_data)
X = modified_data.drop(columns = "label")
Y = modified_data["label"]
# pca = PCA(n_components=100) 
# X = pca.fit_transform(X)     
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state = 1092003)
len(modified_data.columns)

1070

In [6]:
def Logistic_Regression(X_train,Y_train):
    model = LogisticRegression()
    model.fit(X_train,Y_train)
    return model

In [7]:
def SVM(X_train,Y_train):
    model = SVC()
    model.fit(X_train,Y_train)
    return model

In [8]:
def MLP(X_train,Y_train):
    model = MLPClassifier(activation = 'relu',hidden_layer_sizes=(10,5,10), max_iter=30000)
    model.fit(X_train,Y_train)
    return model

In [9]:
print("Using Logistic Regression :-")
model = Logistic_Regression(X_train,Y_train)
Y_train_pred = model.predict(X_train)
Y_test_pred = model.predict(X_test)
print("Accuracy on Trained data - ",accuracy_score(Y_train,Y_train_pred))
print("Classification report on Trained data - \n",classification_report(Y_train,Y_train_pred))
print("Confusion matrix on Trained data - \n",confusion_matrix(Y_train,Y_train_pred))
print("Accuracy on Test data - ",accuracy_score(Y_test,Y_test_pred))
print("Classification report on Test data - \n",classification_report(Y_test,Y_test_pred))
print("Confusion matrix on Test data - \n",confusion_matrix(Y_test,Y_test_pred))

Using Logistic Regression :-
Accuracy on Trained data -  0.443399358194091
Classification report on Trained data - 
               precision    recall  f1-score   support

           0       0.42      0.37      0.39      5472
           1       0.45      0.80      0.58      7506
           2       0.00      0.00      0.00      1225
           3       0.00      0.00      0.00        56
           4       0.00      0.00      0.00      2073
           5       0.00      0.00      0.00        26
           6       0.00      0.00      0.00       813
           7       0.00      0.00      0.00       868
           8       0.00      0.00      0.00        33
           9       0.00      0.00      0.00         2

    accuracy                           0.44     18074
   macro avg       0.09      0.12      0.10     18074
weighted avg       0.32      0.44      0.36     18074

Confusion matrix on Trained data - 
 [[2004 3468    0    0    0    0    0    0    0    0]
 [1496 6010    0    0    0    0   

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
i = 0
probabilities = model.predict_proba(X_test)
classes = model.classes_
dict = {}
for _ in range(len(classes)):
    print(_)
    dict[classes[_]] = probabilities[i][_]
dict

0
1
2
3
4
5
6
7
8
9


{0: 0.3497631238154191,
 1: 0.38700440296181776,
 2: 0.05046833391087512,
 3: 0.0012029340813427367,
 4: 0.1597841826614829,
 5: 0.0010102381825937792,
 6: 0.007223775271523209,
 7: 0.042613525132183125,
 8: 0.0008482242641277924,
 9: 8.125971863466622e-05}

In [11]:
num = model.predict_proba(X_test)
num[0]

array([3.49763124e-01, 3.87004403e-01, 5.04683339e-02, 1.20293408e-03,
       1.59784183e-01, 1.01023818e-03, 7.22377527e-03, 4.26135251e-02,
       8.48224264e-04, 8.12597186e-05])

In [12]:
print("Using SVM :-")
model = SVM(X_train,Y_train)
Y_train_pred = model.predict(X_train)
Y_test_pred = model.predict(X_test)
print("Accuracy on Trained data - ",accuracy_score(Y_train,Y_train_pred))
print("Classification report on Trained data - \n",classification_report(Y_train,Y_train_pred))
print("Confusion matrix on Trained data - \n",confusion_matrix(Y_train,Y_train_pred))
print("Accuracy on Test data - ",accuracy_score(Y_test,Y_test_pred))
print("Classification report on Test data - \n",classification_report(Y_test,Y_test_pred))
print("Confusion matrix on Test data - \n",confusion_matrix(Y_test,Y_test_pred))

Using SVM :-
Accuracy on Trained data -  0.45396702445501824
Classification report on Trained data - 
               precision    recall  f1-score   support

           0       0.43      0.41      0.42      5472
           1       0.46      0.79      0.58      7506
           2       1.00      0.00      0.00      1225
           3       0.00      0.00      0.00        56
           4       0.50      0.00      0.00      2073
           5       0.00      0.00      0.00        26
           6       0.00      0.00      0.00       813
           7       0.00      0.00      0.00       868
           8       0.00      0.00      0.00        33
           9       0.00      0.00      0.00         2

    accuracy                           0.45     18074
   macro avg       0.24      0.12      0.10     18074
weighted avg       0.45      0.45      0.37     18074

Confusion matrix on Trained data - 
 [[2259 3213    0    0    0    0    0    0    0    0]
 [1566 5939    0    0    1    0    0    0    0  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
print("Using MLP :-")
model = MLP(X_train,Y_train)
Y_train_pred = model.predict(X_train)
Y_test_pred = model.predict(X_test)
print("Accuracy on Trained data - ",accuracy_score(Y_train,Y_train_pred))
print("Classification report on Trained data - \n",classification_report(Y_train,Y_train_pred))
print("Confusion matrix on Trained data - \n",confusion_matrix(Y_train,Y_train_pred))
print("Accuracy on Test data - ",accuracy_score(Y_test,Y_test_pred))
print("Classification report on Test data - \n",classification_report(Y_test,Y_test_pred))
print("Confusion matrix on Test data - \n",confusion_matrix(Y_test,Y_test_pred))

Using MLP :-
Accuracy on Trained data -  0.44754896536461214
Classification report on Trained data - 
               precision    recall  f1-score   support

           0       0.42      0.40      0.41      5472
           1       0.46      0.78      0.58      7506
           2       0.00      0.00      0.00      1225
           3       0.00      0.00      0.00        56
           4       0.20      0.00      0.00      2073
           5       0.00      0.00      0.00        26
           6       0.28      0.01      0.01       813
           7       0.00      0.00      0.00       868
           8       0.00      0.00      0.00        33
           9       0.00      0.00      0.00         2

    accuracy                           0.45     18074
   macro avg       0.14      0.12      0.10     18074
weighted avg       0.35      0.45      0.37     18074

Confusion matrix on Trained data - 
 [[2192 3275    0    0    3    0    2    0    0    0]
 [1613 5889    0    0    3    0    1    0    0  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
