In [2]:
import pandas as pd
import numpy as np 
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier 
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.naive_bayes import GaussianNB 
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score

In [3]:
df = pd.read_csv('PSL-2016-2022.csv')
df.tail()

Unnamed: 0,psl_year,match_number,team_1,team_2,inning,over,ball,runs,wicket,total_runs,wickets,is_four,is_six,is_wicket,wicket_text,result
50257,2022,34,Lahore Qalandars,Multan Sultans,2,19,5,0,2.0,138,9,False,False,True,bowled,Qalandars
50258,2022,34,Lahore Qalandars,Multan Sultans,2,19,6,0,,138,9,False,False,False,,Qalandars
50259,2022,34,Lahore Qalandars,Multan Sultans,2,20,1,0,,138,9,False,False,False,,Qalandars
50260,2022,34,Lahore Qalandars,Multan Sultans,2,20,2,0,,138,9,False,False,False,,Qalandars
50261,2022,34,Lahore Qalandars,Multan Sultans,2,20,3,0,1.0,138,10,False,False,True,caught,Qalandars


In [4]:
def is_winner(row):
    if type(row['result']) == str:
        if row['result'] in row['team_2']:
            return 1
        return 0
    else:
        return 0

In [5]:
def is_out(row):
    if type(row['player_dismissed']) == str:
        return 1
    else:
        return 0

In [6]:
first_inning = df[df['inning']==1]
second_inning = df[df['inning']==2]

In [7]:
total_sum = first_inning.groupby(["psl_year","match_number"]).agg(
   target= ("runs","sum")
)

In [8]:
total_sum = total_sum.reset_index()
total_sum

Unnamed: 0,psl_year,match_number,target
0,2016,1,128
1,2016,2,125
2,2016,3,145
3,2016,4,147
4,2016,5,117
...,...,...,...
210,2022,30,158
211,2022,31,163
212,2022,32,169
213,2022,33,168


In [9]:
new_df = pd.merge(total_sum, second_inning,  how='left', left_on=['psl_year','match_number'], right_on = ['psl_year','match_number'])
new_df

Unnamed: 0,psl_year,match_number,target,team_1,team_2,inning,over,ball,runs,wicket,total_runs,wickets,is_four,is_six,is_wicket,wicket_text,result
0,2016,1,128,Islamabad United,Quetta Gladiators,2.0,1.0,1.0,0.0,,0.0,0.0,False,False,False,,Gladiators
1,2016,1,128,Islamabad United,Quetta Gladiators,2.0,1.0,2.0,0.0,,0.0,0.0,False,False,False,,Gladiators
2,2016,1,128,Islamabad United,Quetta Gladiators,2.0,1.0,3.0,0.0,,0.0,0.0,False,False,False,,Gladiators
3,2016,1,128,Islamabad United,Quetta Gladiators,2.0,1.0,4.0,2.0,,2.0,0.0,False,False,False,,Gladiators
4,2016,1,128,Islamabad United,Quetta Gladiators,2.0,1.0,5.0,0.0,,2.0,0.0,False,False,False,,Gladiators
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24181,2022,34,180,Lahore Qalandars,Multan Sultans,2.0,19.0,5.0,0.0,2.0,138.0,9.0,False,False,True,bowled,Qalandars
24182,2022,34,180,Lahore Qalandars,Multan Sultans,2.0,19.0,6.0,0.0,,138.0,9.0,False,False,False,,Qalandars
24183,2022,34,180,Lahore Qalandars,Multan Sultans,2.0,20.0,1.0,0.0,,138.0,9.0,False,False,False,,Qalandars
24184,2022,34,180,Lahore Qalandars,Multan Sultans,2.0,20.0,2.0,0.0,,138.0,9.0,False,False,False,,Qalandars


In [10]:
new_df['won'] = new_df.apply(is_winner, axis=1)
new_df

Unnamed: 0,psl_year,match_number,target,team_1,team_2,inning,over,ball,runs,wicket,total_runs,wickets,is_four,is_six,is_wicket,wicket_text,result,won
0,2016,1,128,Islamabad United,Quetta Gladiators,2.0,1.0,1.0,0.0,,0.0,0.0,False,False,False,,Gladiators,1
1,2016,1,128,Islamabad United,Quetta Gladiators,2.0,1.0,2.0,0.0,,0.0,0.0,False,False,False,,Gladiators,1
2,2016,1,128,Islamabad United,Quetta Gladiators,2.0,1.0,3.0,0.0,,0.0,0.0,False,False,False,,Gladiators,1
3,2016,1,128,Islamabad United,Quetta Gladiators,2.0,1.0,4.0,2.0,,2.0,0.0,False,False,False,,Gladiators,1
4,2016,1,128,Islamabad United,Quetta Gladiators,2.0,1.0,5.0,0.0,,2.0,0.0,False,False,False,,Gladiators,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24181,2022,34,180,Lahore Qalandars,Multan Sultans,2.0,19.0,5.0,0.0,2.0,138.0,9.0,False,False,True,bowled,Qalandars,0
24182,2022,34,180,Lahore Qalandars,Multan Sultans,2.0,19.0,6.0,0.0,,138.0,9.0,False,False,False,,Qalandars,0
24183,2022,34,180,Lahore Qalandars,Multan Sultans,2.0,20.0,1.0,0.0,,138.0,9.0,False,False,False,,Qalandars,0
24184,2022,34,180,Lahore Qalandars,Multan Sultans,2.0,20.0,2.0,0.0,,138.0,9.0,False,False,False,,Qalandars,0


In [11]:
new_df['balls_left'] = 120-(((new_df['over']-1)*6)+new_df['ball'])
new_df

Unnamed: 0,psl_year,match_number,target,team_1,team_2,inning,over,ball,runs,wicket,total_runs,wickets,is_four,is_six,is_wicket,wicket_text,result,won,balls_left
0,2016,1,128,Islamabad United,Quetta Gladiators,2.0,1.0,1.0,0.0,,0.0,0.0,False,False,False,,Gladiators,1,119.0
1,2016,1,128,Islamabad United,Quetta Gladiators,2.0,1.0,2.0,0.0,,0.0,0.0,False,False,False,,Gladiators,1,118.0
2,2016,1,128,Islamabad United,Quetta Gladiators,2.0,1.0,3.0,0.0,,0.0,0.0,False,False,False,,Gladiators,1,117.0
3,2016,1,128,Islamabad United,Quetta Gladiators,2.0,1.0,4.0,2.0,,2.0,0.0,False,False,False,,Gladiators,1,116.0
4,2016,1,128,Islamabad United,Quetta Gladiators,2.0,1.0,5.0,0.0,,2.0,0.0,False,False,False,,Gladiators,1,115.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24181,2022,34,180,Lahore Qalandars,Multan Sultans,2.0,19.0,5.0,0.0,2.0,138.0,9.0,False,False,True,bowled,Qalandars,0,7.0
24182,2022,34,180,Lahore Qalandars,Multan Sultans,2.0,19.0,6.0,0.0,,138.0,9.0,False,False,False,,Qalandars,0,6.0
24183,2022,34,180,Lahore Qalandars,Multan Sultans,2.0,20.0,1.0,0.0,,138.0,9.0,False,False,False,,Qalandars,0,5.0
24184,2022,34,180,Lahore Qalandars,Multan Sultans,2.0,20.0,2.0,0.0,,138.0,9.0,False,False,False,,Qalandars,0,4.0


In [12]:
# Creating the Final Dataset
final_df = new_df[['over','ball','total_runs','wickets','target','balls_left','won']]
final_df

Unnamed: 0,over,ball,total_runs,wickets,target,balls_left,won
0,1.0,1.0,0.0,0.0,128,119.0,1
1,1.0,2.0,0.0,0.0,128,118.0,1
2,1.0,3.0,0.0,0.0,128,117.0,1
3,1.0,4.0,2.0,0.0,128,116.0,1
4,1.0,5.0,2.0,0.0,128,115.0,1
...,...,...,...,...,...,...,...
24181,19.0,5.0,138.0,9.0,180,7.0,0
24182,19.0,6.0,138.0,9.0,180,6.0,0
24183,20.0,1.0,138.0,9.0,180,5.0,0
24184,20.0,2.0,138.0,9.0,180,4.0,0


In [13]:
final_df['runs_left'] = final_df['target']- final_df['total_runs']
final_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['runs_left'] = final_df['target']- final_df['total_runs']


Unnamed: 0,over,ball,total_runs,wickets,target,balls_left,won,runs_left
0,1.0,1.0,0.0,0.0,128,119.0,1,128.0
1,1.0,2.0,0.0,0.0,128,118.0,1,128.0
2,1.0,3.0,0.0,0.0,128,117.0,1,128.0
3,1.0,4.0,2.0,0.0,128,116.0,1,126.0
4,1.0,5.0,2.0,0.0,128,115.0,1,126.0
...,...,...,...,...,...,...,...,...
24181,19.0,5.0,138.0,9.0,180,7.0,0,42.0
24182,19.0,6.0,138.0,9.0,180,6.0,0,42.0
24183,20.0,1.0,138.0,9.0,180,5.0,0,42.0
24184,20.0,2.0,138.0,9.0,180,4.0,0,42.0


In [14]:
new_cols =['wickets','balls_left','runs_left','won']
psl_df = final_df[new_cols]
psl_df.isnull().sum()

wickets       2
balls_left    2
runs_left     2
won           0
dtype: int64

In [15]:
psl_df = psl_df.apply(pd.to_numeric,errors ='coerce')
psl_df = psl_df.dropna()


In [16]:
psl_df.isnull().sum()

wickets       0
balls_left    0
runs_left     0
won           0
dtype: int64

In [17]:
y = psl_df.iloc[:,3]
x = psl_df.iloc[:,:3]


In [18]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size= 0.2,random_state=69)


In [21]:
SVC_model = SVC()
SVC_model.fit(x_train, y_train)
predicted = SVC_model.predict(x_test)
a = accuracy_score(y_test,predicted)
print('The accuracy using SVC Classifier is:',format(a*100))

The accuracy using SVC Classifier is: 79.65681207359934


In [None]:
current= {
    "wickets":5,
    "balls_left" :66,
    "runs_left":100,
        }
RF = RandomForestClassifier(n_estimators=100, random_state=42)
RF.fit(x_train, y_train)
current_df = pd.DataFrame(current,index=[0])
RF.predict_proba(current_df)

array([[0.83, 0.17]])

In [None]:
logistic_model = LogisticRegression()
logistic_model.fit(x_train, y_train)
y_pred_logistic = logistic_model.predict(x_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_logistic))

Logistic Regression Accuracy: 0.7947074633037007


In [None]:
nb_model = GaussianNB()
nb_model.fit(x_train, y_train)
y_pred_nb = nb_model.predict(x_test)
print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))


Naive Bayes Accuracy: 0.7113913582799256


In [None]:
rf_model = RandomForestClassifier()
rf_model.fit(x_train, y_train)
y_pred_rf = rf_model.predict(x_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))


Random Forest Accuracy: 0.7804424229894563


In [None]:
dt_model = DecisionTreeClassifier()
dt_model.fit(x_train, y_train)
y_pred_dt = dt_model.predict(x_test)
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))


Decision Tree Accuracy: 0.7626628075253257


In [None]:
GB = GradientBoostingClassifier()
GB.fit(x_train, y_train)
y_pred_gb = GB.predict(x_test)
print("Gradient Boosting Accuracy:", accuracy_score(y_test, y_pred_gb))




Gradient Boosting Accuracy: 0.8007029150299773


In [None]:
current= {
    "wickets":7,
    "balls_left" :55,
    "runs_left":88,
        }
GB = GradientBoostingClassifier(n_estimators=100, random_state=42)
GB.fit(x_train, y_train)
current_df = pd.DataFrame(current,index=[0])
# GB.predict_proba(current_df)
probabilities= GB.predict_proba(current_df)
print(f"Win Probability: {probabilities[0][1]:.2f}")
print(f"Lose Probability: {probabilities[0][0]:.2f}")

Win Probability: 0.05
Lose Probability: 0.95
