In [1]:
import numpy as np
import pandas as pd
import nltk
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.multioutput import MultiOutputClassifier

In [2]:
raw_data = pd.read_csv("IPL_Ball_by_Ball_2008_2022.csv")
raw_data = raw_data.sample(frac=0.025, random_state=42)
raw_data

Unnamed: 0,ID,innings,overs,ballnumber,batter,bowler,non-striker,extra_type,batsman_run,extras_run,total_run,non_boundary,isWicketDelivery,player_out,kind,fielders_involved,BattingTeam
177814,501226,1,12,2,MS Dhoni,Yuvraj Singh,MEK Hussey,,1,0,1,0,0,,,,Chennai Super Kings
3287,1304107,1,17,4,SW Billings,T Natarajan,AD Russell,,6,0,6,0,0,,,,Kolkata Knight Riders
188612,419148,2,13,3,RV Uthappa,AD Mathews,LRPL Taylor,,0,0,0,0,0,,,,Royal Challengers Bangalore
96212,980961,1,13,6,RA Jadeja,Z Khan,KD Karthik,,0,0,0,0,0,,,,Gujarat Lions
30637,1254064,1,5,1,RR Pant,JD Unadkat,AM Rahane,,4,0,4,0,0,,,,Delhi Capitals
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95356,980969,2,14,1,V Kohli,R Bhatia,SR Watson,,4,0,4,0,0,,,,Royal Challengers Bangalore
216575,336022,1,8,2,V Sehwag,RR Powar,S Dhawan,wides,0,1,1,0,0,,,,Delhi Daredevils
16561,1304052,1,18,5,UT Yadav,Akash Deep,CV Varun,,0,0,0,0,1,UT Yadav,bowled,,Kolkata Knight Riders
8938,1304084,2,13,4,AT Rayudu,Sandeep Sharma,RA Jadeja,,4,0,4,0,0,,,,Chennai Super Kings


In [3]:
def rectify_data(raw_data):
    raw_data=raw_data[raw_data['total_run']<=6]
    processed_data = pd.DataFrame()
    processed_data['first_innings'] = 2-raw_data['innings']
    processed_data['second_innings'] = raw_data['innings']-1
    processed_data['powerplay'] = (raw_data['overs']<=6).astype(int)
    processed_data['middle'] = ((raw_data['overs']>6) & (raw_data['overs']<=15)).astype(int)
    processed_data['death'] = (raw_data['overs']>15).astype(int)
    distinct_batters = raw_data['batter'].unique()
    batter_names = pd.concat([raw_data['batter'].fillna('') == batter for batter in distinct_batters], axis=1).astype(int)
    batter_names.columns = distinct_batters
    processed_data = pd.concat([processed_data,batter_names],axis=1)
    distinct_bowlers = raw_data['bowler'].unique()
    bowler_names = pd.concat([raw_data['bowler'].fillna('') == bowler for bowler in distinct_bowlers], axis=1).astype(int)
    bowler_names.columns = distinct_bowlers
    processed_data = pd.concat([processed_data,batter_names],axis=1)
    # processed_data['run_and_wicket'] = raw_data[['total_run','isWicketDelivery']].apply(list,axis=1)
    processed_data['label'] = raw_data['total_run']+7*raw_data['isWicketDelivery']

    #outliers
    # processed_data = processed_data[processed_data['label']%7<=6]
    
    return processed_data

In [19]:
# def rectify_data(raw_data):
#     raw_data=raw_data[raw_data['total_run']<=6]
#     processed_data = pd.DataFrame()
#     processed_data['first_innings'] = 2-raw_data['innings']
#     processed_data['second_innings'] = raw_data['innings']-1
#     processed_data['powerplay'] = (raw_data['overs']<=6).astype(int)
#     processed_data['middle'] = ((raw_data['overs']>6) & (raw_data['overs']<=15)).astype(int)
#     processed_data['death'] = (raw_data['overs']>15).astype(int)
#     distinct_batters = {}
#     counter = 0
#     for i in raw_data['batter'].unique() :
#         distinct_batters[i]=counter
#         counter+=1
#     processed_data['batter'] = [distinct_batters[x] for x in raw_data['batter']]
#     distinct_bowlers = {}
#     counter = 0
#     for i in raw_data['bowler'].unique() :
#         distinct_bowlers[i]=counter
#         counter+=1
#     processed_data['bowler'] = [distinct_bowlers[x] for x in raw_data['bowler']]
#     # processed_data['total_run'] = raw_data['total_run']
#     # processed_data['iswicket'] = raw_data['isWicketDelivery']
#     processed_data['label'] = raw_data['total_run']+7*raw_data['isWicketDelivery']
#     return processed_data

In [11]:
modified_data = rectify_data(raw_data)
X = modified_data.drop(columns = "label")
Y = modified_data["label"]
# if pca is required :-
# pca = PCA(n_components=100) 
# X = pca.fit_transform(X)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state = 1092003)
modified_data

Unnamed: 0,first_innings,second_innings,powerplay,middle,death,MS Dhoni,SW Billings,RV Uthappa,RA Jadeja,RR Pant,...,RD Chahar,MM Sharma,Virat Singh,M Theekshana,NK Patel,M Prasidh Krishna,AS Joseph,J Suchith,DAJ Bracewell,label
177814,1,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3287,1,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,6
188612,0,1,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
96212,1,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
30637,1,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95356,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
216575,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
16561,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,7
8938,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4


In [28]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Assuming df is your DataFrame containing the features and labels

# Separate features (X) and labels (y)
X = modified_data.drop(columns=['label'])  # Assuming 'label' is the column with the target labels
y = modified_data['label']

# Convert target labels to one-hot encoding
Y_one_hot = pd.get_dummies(y)

# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y_one_hot, test_size=0.2, random_state=42)

def sigmoid(z):
    """Sigmoid activation function"""
    return 1 / (1 + np.exp(-z))

def softmax(X):
    """Softmax activation function"""
    exp_X = np.exp(X - np.max(X, axis=1, keepdims=True))
    return exp_X / np.sum(exp_X, axis=1, keepdims=True)

def initialize_parameters(num_features, num_classes):
    """Initialize weights and bias"""
    W = np.random.randn(num_features, num_classes)
    b = np.zeros((1, num_classes))
    return W, b

def forward_propagation(X, W, b):
    """Forward propagation"""
    Z = np.dot(X, W) + b
    A = softmax(Z)
    return A

def custom_logistic_loss(Y, A, label_coefficients):
    """Compute cross-entropy loss with label-specific coefficients"""
    epsilon = 1e-15  # Small constant to prevent log(0)
    A = np.clip(A, epsilon, 1 - epsilon)
    coefficients_array = np.array([label_coefficients[label] for label in range(len(label_coefficients))])
    # Apply label-specific coefficients to the loss
    loss = -np.sum(Y * np.log(A) * coefficients_array) / Y.shape[0]

    return loss

def backward_propagation(X, A, Y):
    """Backward propagation"""
    dZ = A - Y
    dW = np.dot(X.T, dZ)
    db = np.sum(dZ, axis=0, keepdims=True)
    return dW, db

def update_parameters(W, b, dW, db, learning_rate):
    """Update parameters using gradient descent"""
    W -= learning_rate * dW
    b -= learning_rate * db
    return W, b

def train(X, Y, num_epochs, learning_rate, label_coefficients):
    num_features = X.shape[1]
    num_classes = Y.shape[1]

    W, b = initialize_parameters(num_features, num_classes)

    for epoch in range(num_epochs):
        # Forward propagation
        A = forward_propagation(X, W, b)

        # Compute loss
        loss = custom_logistic_loss(Y, A, label_coefficients)

        # Backward propagation
        dW, db = backward_propagation(X, A, Y)

        # Update parameters
        W, b = update_parameters(W, b, dW, db, learning_rate)

        # Print loss every 100 epochs
        if epoch % 100 == 0:
            print(f"Epoch {epoch}, Loss: {loss}")

    return W, b

def predict(X, W, b):
    """Make predictions and convert to label form"""
    A = forward_propagation(X, W, b)
    predicted_labels = np.argmax(A, axis=1)
    return predicted_labels

# Example usage:
num_epochs = 1000
learning_rate = 0.01

# Assuming label_coefficients is a dictionary specifying coefficients for each label
label_coefficients = {0: 1.0, 1: 1.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0, 7: 20.0, 8: 20.0}

W_trained, b_trained = train(X_train, Y_train.values, num_epochs, learning_rate, label_coefficients)

# Assuming X_test is your test feature matrix
# X_test.shape = (m_test, num_features)

# Make predictions
predicted_labels = predict(X_test, W_trained, b_trained)
print("Predicted Labels:", predicted_labels)


Epoch 0, Loss: 8.220901751993644
Epoch 100, Loss: 18.017364605497082
Epoch 200, Loss: 34.71299413129328
Epoch 300, Loss: 31.98208500706677
Epoch 400, Loss: 37.37135914444258
Epoch 500, Loss: 40.1221648721275
Epoch 600, Loss: 22.442119623727393
Epoch 700, Loss: 34.343369765558315
Epoch 800, Loss: 16.495691300081536
Epoch 900, Loss: 37.24067933903581
Predicted Labels: [1 1 1 ... 1 1 1]


In [29]:
accuracy_score(predicted_labels, Y_test.values.argmax(axis=1))

0.3513274336283186

In [21]:
def Logistic_Regression(X_train,Y_train):
    model = LogisticRegression(n_jobs=-1)
    model.fit(X_train,Y_train)
    return model

In [None]:
def SVM(X_train,Y_train):
    model = SVC(probability=True)
    model.fit(X_train,Y_train)
    return model

In [23]:
def MLP(X_train,Y_train):
    model = MLPClassifier(activation = 'relu',hidden_layer_sizes=(10,5,10), max_iter=30000)
    model.fit(X_train,Y_train)
    return model

In [24]:
print("Using Logistic Regression :-")
model_lr = Logistic_Regression(X_train,Y_train)
Y_train_pred = model_lr.predict(X_train)
Y_test_pred = model_lr.predict(X_test)
print("Accuracy on Trained data - ",accuracy_score(Y_train,Y_train_pred))
print("Classification report on Trained data - \n",classification_report(Y_train,Y_train_pred))
print("Confusion matrix on Trained data - \n",confusion_matrix(Y_train,Y_train_pred))
print("Accuracy on Test data - ",accuracy_score(Y_test,Y_test_pred))
print("Classification report on Test data - \n",classification_report(Y_test,Y_test_pred))
print("Confusion matrix on Test data - \n",confusion_matrix(Y_test,Y_test_pred))

Using Logistic Regression :-
Accuracy on Trained data -  0.4518805309734513
Classification report on Trained data - 
               precision    recall  f1-score   support

           0       0.43      0.50      0.46       560
           1       0.46      0.73      0.57       742
           2       0.00      0.00      0.00       109
           3       0.00      0.00      0.00         6
           4       0.00      0.00      0.00       216
           5       0.00      0.00      0.00         3
           6       0.00      0.00      0.00        83
           7       0.00      0.00      0.00        80
           8       0.00      0.00      0.00         9

    accuracy                           0.45      1808
   macro avg       0.10      0.14      0.11      1808
weighted avg       0.32      0.45      0.38      1808

Confusion matrix on Trained data - 
 [[279 281   0   0   0   0   0   0   0]
 [204 538   0   0   0   0   0   0   0]
 [ 31  78   0   0   0   0   0   0   0]
 [  2   4   0   0   0  

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
print("Using SVM :-")
model_SVM = SVM(X_train,Y_train)
Y_train_pred = model_SVM.predict(X_train)
Y_test_pred = model_SVM.predict(X_test)
print("Accuracy on Trained data - ",accuracy_score(Y_train,Y_train_pred))
print("Classification report on Trained data - \n",classification_report(Y_train,Y_train_pred))
print("Confusion matrix on Trained data - \n",confusion_matrix(Y_train,Y_train_pred))
print("Accuracy on Test data - ",accuracy_score(Y_test,Y_test_pred))
print("Classification report on Test data - \n",classification_report(Y_test,Y_test_pred))
print("Confusion matrix on Test data - \n",confusion_matrix(Y_test,Y_test_pred))

Using SVM :-
Accuracy on Trained data -  0.4103982300884956
Classification report on Trained data - 
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       560
           1       0.41      1.00      0.58       742
           2       0.00      0.00      0.00       109
           3       0.00      0.00      0.00         6
           4       0.00      0.00      0.00       216
           5       0.00      0.00      0.00         3
           6       0.00      0.00      0.00        83
           7       0.00      0.00      0.00        80
           8       0.00      0.00      0.00         9

    accuracy                           0.41      1808
   macro avg       0.05      0.11      0.06      1808
weighted avg       0.17      0.41      0.24      1808

Confusion matrix on Trained data - 
 [[  0 560   0   0   0   0   0   0   0]
 [  0 742   0   0   0   0   0   0   0]
 [  0 109   0   0   0   0   0   0   0]
 [  0   6   0   0   0   0   0   0   0]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [26]:
print("Using MLP :-")
model_MLP = MLP(X_train,Y_train)
Y_train_pred = model_MLP.predict(X_train)
Y_test_pred = model_MLP.predict(X_test)
print("Accuracy on Trained data - ",accuracy_score(Y_train,Y_train_pred))
print("Classification report on Trained data - \n",classification_report(Y_train,Y_train_pred))
print("Confusion matrix on Trained data - \n",confusion_matrix(Y_train,Y_train_pred))
print("Accuracy on Test data - ",accuracy_score(Y_test,Y_test_pred))
print("Classification report on Test data - \n",classification_report(Y_test,Y_test_pred))
print("Confusion matrix on Test data - \n",confusion_matrix(Y_test,Y_test_pred))

Using MLP :-
Accuracy on Trained data -  0.435287610619469
Classification report on Trained data - 
               precision    recall  f1-score   support

           0       0.41      0.30      0.35       560
           1       0.44      0.83      0.58       742
           2       0.00      0.00      0.00       109
           3       0.00      0.00      0.00         6
           4       0.00      0.00      0.00       216
           5       0.00      0.00      0.00         3
           6       0.00      0.00      0.00        83
           7       0.00      0.00      0.00        80
           8       0.00      0.00      0.00         9

    accuracy                           0.44      1808
   macro avg       0.09      0.13      0.10      1808
weighted avg       0.31      0.44      0.35      1808

Confusion matrix on Trained data - 
 [[170 390   0   0   0   0   0   0   0]
 [125 617   0   0   0   0   0   0   0]
 [ 23  86   0   0   0   0   0   0   0]
 [  1   5   0   0   0   0   0   0   0]
 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
