## Import the datasets

In [1]:
import numpy as np
import pandas as pd
match = pd.read_csv('matches.csv')
delivery = pd.read_csv('deliveries.csv')

In [2]:
match.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 756 entries, 0 to 755
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               756 non-null    int64 
 1   Season           756 non-null    object
 2   city             749 non-null    object
 3   date             756 non-null    object
 4   team1            756 non-null    object
 5   team2            756 non-null    object
 6   toss_winner      756 non-null    object
 7   toss_decision    756 non-null    object
 8   result           756 non-null    object
 9   dl_applied       756 non-null    int64 
 10  winner           752 non-null    object
 11  win_by_runs      756 non-null    int64 
 12  win_by_wickets   756 non-null    int64 
 13  player_of_match  752 non-null    object
 14  venue            756 non-null    object
 15  umpire1          754 non-null    object
 16  umpire2          754 non-null    object
 17  umpire3          119 non-null    ob

In [3]:
delivery.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 179078 entries, 0 to 179077
Data columns (total 21 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   match_id          179078 non-null  int64 
 1   inning            179078 non-null  int64 
 2   batting_team      179078 non-null  object
 3   bowling_team      179078 non-null  object
 4   over              179078 non-null  int64 
 5   ball              179078 non-null  int64 
 6   batsman           179078 non-null  object
 7   non_striker       179078 non-null  object
 8   bowler            179078 non-null  object
 9   is_super_over     179078 non-null  int64 
 10  wide_runs         179078 non-null  int64 
 11  bye_runs          179078 non-null  int64 
 12  legbye_runs       179078 non-null  int64 
 13  noball_runs       179078 non-null  int64 
 14  penalty_runs      179078 non-null  int64 
 15  batsman_runs      179078 non-null  int64 
 16  extra_runs        179078 non-null  int

## Preprocessing of datasets and merging them

In [4]:
total_score_df = delivery.groupby(['match_id','inning']).sum()['total_runs'].reset_index()

In [5]:
total_score_df = total_score_df[total_score_df['inning']==1]

In [6]:
match_df = match.merge(total_score_df[['match_id','total_runs']],left_on='id',right_on='match_id')

In [7]:
teams = ['Sunrisers Hyderabad','Mumbai Indians','Royal Challegers Bangalore','Kolkata Knight Riders','Kings XI Punjab','Chennai Super Kings','Rajasthan Royals','Delhi Capitals']

In [8]:
match_df['team1'] = match_df['team1'].str.replace('Delhi Daredevils','Delhi Capitals')
match_df['team1'] = match_df['team2'].str.replace('Delhi Daredevils','Delhi Capitals')
match_df['team1'] = match_df['team1'].str.replace('Deccan Chargers','Sunrisers Hyderabad')
match_df['team1'] = match_df['team2'].str.replace('Deccan Chargers','Sunrisers Hyderabad')

In [9]:
match_df = match_df[match_df['team1'].isin(teams)]
match_df = match_df[match_df['team2'].isin(teams)]
match_df.head()

Unnamed: 0,id,Season,city,date,team1,team2,toss_winner,toss_decision,result,dl_applied,winner,win_by_runs,win_by_wickets,player_of_match,venue,umpire1,umpire2,umpire3,match_id,total_runs
2,3,IPL-2017,Rajkot,07-04-2017,Kolkata Knight Riders,Kolkata Knight Riders,Kolkata Knight Riders,field,normal,0,Kolkata Knight Riders,0,10,CA Lynn,Saurashtra Cricket Association Stadium,Nitin Menon,CK Nandan,,3,183
3,4,IPL-2017,Indore,08-04-2017,Kings XI Punjab,Kings XI Punjab,Kings XI Punjab,field,normal,0,Kings XI Punjab,0,6,GJ Maxwell,Holkar Cricket Stadium,AK Chaudhary,C Shamshuddin,,4,163
5,6,IPL-2017,Hyderabad,09-04-2017,Sunrisers Hyderabad,Sunrisers Hyderabad,Sunrisers Hyderabad,field,normal,0,Sunrisers Hyderabad,0,9,Rashid Khan,"Rajiv Gandhi International Stadium, Uppal",A Deshmukh,NJ Llong,,6,135
6,7,IPL-2017,Mumbai,09-04-2017,Mumbai Indians,Mumbai Indians,Mumbai Indians,field,normal,0,Mumbai Indians,0,4,N Rana,Wankhede Stadium,Nitin Menon,CK Nandan,,7,178
7,8,IPL-2017,Indore,10-04-2017,Kings XI Punjab,Kings XI Punjab,Royal Challengers Bangalore,bat,normal,0,Kings XI Punjab,0,8,AR Patel,Holkar Cricket Stadium,AK Chaudhary,C Shamshuddin,,8,148


In [10]:
match_df = match_df[match_df['dl_applied']==0]

In [11]:
match_df = match_df[['match_id','city','winner','total_runs']]

In [12]:
final_df = match_df.merge(delivery,on='match_id')

In [13]:
final_df = final_df[final_df['inning']==2]

In [14]:
final_df['current_score']=final_df.groupby('match_id').cumsum()['total_runs_y']

In [15]:
final_df['runs_left'] = final_df['total_runs_x'] - final_df['current_score']+1

In [16]:
final_df['balls_left'] = 126 - (final_df['over']*6+final_df['ball'])

In [17]:
final_df

Unnamed: 0,match_id,city,winner,total_runs_x,inning,batting_team,bowling_team,over,ball,batsman,...,penalty_runs,batsman_runs,extra_runs,total_runs_y,player_dismissed,dismissal_kind,fielder,current_score,runs_left,balls_left
122,3,Rajkot,Kolkata Knight Riders,183,2,Kolkata Knight Riders,Gujarat Lions,1,1,G Gambhir,...,0,1,0,1,,,,1,183,119
123,3,Rajkot,Kolkata Knight Riders,183,2,Kolkata Knight Riders,Gujarat Lions,1,2,CA Lynn,...,0,0,0,0,,,,1,183,118
124,3,Rajkot,Kolkata Knight Riders,183,2,Kolkata Knight Riders,Gujarat Lions,1,3,CA Lynn,...,0,1,0,1,,,,2,182,117
125,3,Rajkot,Kolkata Knight Riders,183,2,Kolkata Knight Riders,Gujarat Lions,1,4,G Gambhir,...,0,0,0,0,,,,2,182,116
126,3,Rajkot,Kolkata Knight Riders,183,2,Kolkata Knight Riders,Gujarat Lions,1,5,G Gambhir,...,0,1,0,1,,,,3,181,115
127,3,Rajkot,Kolkata Knight Riders,183,2,Kolkata Knight Riders,Gujarat Lions,1,6,CA Lynn,...,0,4,0,4,,,,7,177,114
128,3,Rajkot,Kolkata Knight Riders,183,2,Kolkata Knight Riders,Gujarat Lions,2,1,G Gambhir,...,0,1,0,1,,,,8,176,113
129,3,Rajkot,Kolkata Knight Riders,183,2,Kolkata Knight Riders,Gujarat Lions,2,2,CA Lynn,...,0,6,0,6,,,,14,170,112
130,3,Rajkot,Kolkata Knight Riders,183,2,Kolkata Knight Riders,Gujarat Lions,2,3,CA Lynn,...,0,1,0,1,,,,15,169,111
131,3,Rajkot,Kolkata Knight Riders,183,2,Kolkata Knight Riders,Gujarat Lions,2,4,G Gambhir,...,0,4,0,4,,,,19,165,110


In [18]:
final_df['player_dismissed'] = final_df['player_dismissed'].fillna("0")
final_df['player_dismissed'] = final_df['player_dismissed'].apply(lambda x:x if x == "0" else "1")
final_df['player_dismissed'] = final_df['player_dismissed'].astype(int)
wickets = final_df.groupby('match_id').cumsum()['player_dismissed'].values
final_df['wickets'] = 10 - wickets

In [19]:
final_df.head()

Unnamed: 0,match_id,city,winner,total_runs_x,inning,batting_team,bowling_team,over,ball,batsman,...,batsman_runs,extra_runs,total_runs_y,player_dismissed,dismissal_kind,fielder,current_score,runs_left,balls_left,wickets
122,3,Rajkot,Kolkata Knight Riders,183,2,Kolkata Knight Riders,Gujarat Lions,1,1,G Gambhir,...,1,0,1,0,,,1,183,119,10
123,3,Rajkot,Kolkata Knight Riders,183,2,Kolkata Knight Riders,Gujarat Lions,1,2,CA Lynn,...,0,0,0,0,,,1,183,118,10
124,3,Rajkot,Kolkata Knight Riders,183,2,Kolkata Knight Riders,Gujarat Lions,1,3,CA Lynn,...,1,0,1,0,,,2,182,117,10
125,3,Rajkot,Kolkata Knight Riders,183,2,Kolkata Knight Riders,Gujarat Lions,1,4,G Gambhir,...,0,0,0,0,,,2,182,116,10
126,3,Rajkot,Kolkata Knight Riders,183,2,Kolkata Knight Riders,Gujarat Lions,1,5,G Gambhir,...,1,0,1,0,,,3,181,115,10


In [20]:
final_df['crr'] = (final_df['current_score']*6)/(120-final_df['balls_left'])

In [21]:
final_df['rrr'] = (final_df['runs_left']*6)/(final_df['balls_left'])

In [22]:
def result(row):
    return 1 if row['batting_team']==row['winner'] else 0
      
final_df['result'] = final_df.apply(result,axis=1)        

In [23]:
final_df.head()

Unnamed: 0,match_id,city,winner,total_runs_x,inning,batting_team,bowling_team,over,ball,batsman,...,player_dismissed,dismissal_kind,fielder,current_score,runs_left,balls_left,wickets,crr,rrr,result
122,3,Rajkot,Kolkata Knight Riders,183,2,Kolkata Knight Riders,Gujarat Lions,1,1,G Gambhir,...,0,,,1,183,119,10,6.0,9.226891,1
123,3,Rajkot,Kolkata Knight Riders,183,2,Kolkata Knight Riders,Gujarat Lions,1,2,CA Lynn,...,0,,,1,183,118,10,3.0,9.305085,1
124,3,Rajkot,Kolkata Knight Riders,183,2,Kolkata Knight Riders,Gujarat Lions,1,3,CA Lynn,...,0,,,2,182,117,10,4.0,9.333333,1
125,3,Rajkot,Kolkata Knight Riders,183,2,Kolkata Knight Riders,Gujarat Lions,1,4,G Gambhir,...,0,,,2,182,116,10,3.0,9.413793,1
126,3,Rajkot,Kolkata Knight Riders,183,2,Kolkata Knight Riders,Gujarat Lions,1,5,G Gambhir,...,0,,,3,181,115,10,3.6,9.443478,1


In [24]:
main_df = final_df[['batting_team','bowling_team','city','runs_left','balls_left','wickets','total_runs_x','crr','rrr','result']]

In [25]:
main_df.head()

Unnamed: 0,batting_team,bowling_team,city,runs_left,balls_left,wickets,total_runs_x,crr,rrr,result
122,Kolkata Knight Riders,Gujarat Lions,Rajkot,183,119,10,183,6.0,9.226891,1
123,Kolkata Knight Riders,Gujarat Lions,Rajkot,183,118,10,183,3.0,9.305085,1
124,Kolkata Knight Riders,Gujarat Lions,Rajkot,182,117,10,183,4.0,9.333333,1
125,Kolkata Knight Riders,Gujarat Lions,Rajkot,182,116,10,183,3.0,9.413793,1
126,Kolkata Knight Riders,Gujarat Lions,Rajkot,181,115,10,183,3.6,9.443478,1


In [26]:
main_df = main_df.sample(main_df.shape[0])

In [27]:
main_df.dropna(inplace=True)
main_df = main_df[main_df['balls_left'] != 0]

## Splitting train and test data

In [28]:
x = main_df.iloc[:,:-1]
y = main_df.iloc[:,-1]
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x, y, test_size = 0.2, random_state = 6)

## Encoding the data for model training

In [29]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

trf = ColumnTransformer([
    ('trf', OneHotEncoder(sparse=False,drop='first'),['batting_team','bowling_team','city'])
]
,remainder='passthrough')

## Creating pipeline for predictions

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

pipe = Pipeline(steps=[
    ('step1',trf),
    ('step2',LogisticRegression(solver='liblinear'))
    
])

In [31]:
pipe.fit(x_train, y_train)

Pipeline(steps=[('step1',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('trf',
                                                  OneHotEncoder(drop='first',
                                                                sparse=False),
                                                  ['batting_team',
                                                   'bowling_team', 'city'])])),
                ('step2', LogisticRegression(solver='liblinear'))])

In [32]:
predictions = pipe.predict(x_test)

## Testing accuracy of model

In [33]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,predictions)

0.8272659035466317

## Saving the model as pickle file

In [34]:
import pickle
pickle.dump(pipe, open('pipe.pkl','wb'))