# IPL Prediction Probability
matches.csv --> match-level data (schedule, teams, toss, result)  
deliveries.csv --> ball-by-ball data for every match

In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the raw tables: ball-by-ball deliveries and one-row-per-match summaries
delivery = pd.read_csv('data/deliveries.csv')
match = pd.read_csv('data/matches.csv')

In [3]:
# Match table: remove the toss details, player of the match and the umpire columns
match = match.drop(
    columns=[
        'toss_winner', 'toss_decision',
        'player_of_match',
        'umpire1', 'umpire2', 'umpire3',
    ]
)
# Delivery table: remove bowler, super-over flag, per-type extras and dismissal details
delivery = delivery.drop(
    columns=[
        'bowler', 'is_super_over',
        'wide_runs', 'bye_runs', 'legbye_runs', 'noball_runs', 'penalty_runs',
        'dismissal_kind', 'fielder',
    ]
)

In [4]:
# Preview the first three rows of each table after the column drops.
# display() is needed for the first frame because only the last expression of a cell is rendered.
display(match.head(3))
delivery.head(3)

Unnamed: 0,id,Season,city,date,team1,team2,result,dl_applied,winner,win_by_runs,win_by_wickets,venue
0,1,IPL-2017,Hyderabad,05-04-2017,Sunrisers Hyderabad,Royal Challengers Bangalore,normal,0,Sunrisers Hyderabad,35,0,"Rajiv Gandhi International Stadium, Uppal"
1,2,IPL-2017,Pune,06-04-2017,Mumbai Indians,Rising Pune Supergiant,normal,0,Rising Pune Supergiant,0,7,Maharashtra Cricket Association Stadium
2,3,IPL-2017,Rajkot,07-04-2017,Gujarat Lions,Kolkata Knight Riders,normal,0,Kolkata Knight Riders,0,10,Saurashtra Cricket Association Stadium


Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batsman,non_striker,batsman_runs,extra_runs,total_runs,player_dismissed
0,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,1,DA Warner,S Dhawan,0,0,0,
1,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,2,DA Warner,S Dhawan,0,0,0,
2,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,3,DA Warner,S Dhawan,4,0,4,


In [5]:
# Report the (rows, columns) shape of both tables
print("Shape of Dataset:")
print("match: ", match.shape)
print("delivery: ", delivery.shape)

Shape of Dataset:
match:  (756, 12)
delivery:  (179078, 12)


In [6]:
# Total runs per innings: one row for every (match_id, inning) pair
total_scores_df = delivery.groupby(['match_id', 'inning'], as_index=False)['total_runs'].sum()
total_scores_df.head(n=6)

Unnamed: 0,match_id,inning,total_runs
0,1,1,207
1,1,2,172
2,2,1,184
3,2,2,187
4,3,1,183
5,3,2,184


In [7]:
# Keep only the first-innings totals and give the column an explicit name
inn1_total_scores_df = (
    total_scores_df
    .loc[total_scores_df['inning'] == 1]
    .rename(columns={"total_runs": "total_runs_inn1"})
)
# Attach the first-innings total to the match-level information;
# match.id and match_id are the join keys, so the redundant 'id' is dropped afterwards
final_match_df = (
    match
    .merge(
        right=inn1_total_scores_df[['match_id', 'total_runs_inn1']],
        left_on='id',
        right_on='match_id',
    )
    .drop(columns=['id'])
)

In [8]:
# Franchise names of interest.
# NOTE(review): this list spells 'Gujarat Titans', whereas the rename mapping and the
# encoder categories in this notebook use 'Gujrat Titans'; `teams` does not appear to be
# referenced by later cells - confirm which spelling is intended.
teams = [
    'Sunrisers Hyderabad',
    'Mumbai Indians',
    'Gujarat Titans',
    'Royal Challengers Bangalore',
    'Kolkata Knight Riders',
    'Kings XI Punjab',
    'Chennai Super Kings',
    'Rajasthan Royals',
    'Delhi Capitals',
]

In [9]:
# Map older franchise names onto the labels used in the rest of the notebook
replace_team_name = {
    'Delhi Daredevils': 'Delhi Capitals',
    'Deccan Chargers': 'Sunrisers Hyderabad',
    'Gujarat Lions': 'Gujrat Titans',
}
# Apply the same value mapping to every column that holds a team name
final_match_df[['team1', 'team2', 'winner']] = (
    final_match_df[['team1', 'team2', 'winner']].replace(replace_team_name)
)

In [10]:
# Discard every match in which either side is one of the unwanted teams.
# NOTE(review): 'Deccan Chargers' has already been renamed to 'Sunrisers Hyderabad' by the
# replacement step above, so that entry presumably matches nothing here - confirm intent.
drop_team_name = [
    'Rising Pune Supergiant',
    'Deccan Chargers',
    'Kochi Tuskers Kerala',
    'Pune Warriors',
    'Rising Pune Supergiants',
]
final_match_df = final_match_df[
    ~final_match_df['team1'].isin(drop_team_name) & ~final_match_df['team2'].isin(drop_team_name)
]

In [11]:
# Keep only matches where the Duckworth-Lewis (DLS) method was not applied
# (dl_applied == 0 marks a normal match)
final_match_df = final_match_df.loc[final_match_df['dl_applied'].eq(0)]

In [12]:
# Preview the filtered match table
final_match_df.head(3)

Unnamed: 0,Season,city,date,team1,team2,result,dl_applied,winner,win_by_runs,win_by_wickets,venue,match_id,total_runs_inn1
0,IPL-2017,Hyderabad,05-04-2017,Sunrisers Hyderabad,Royal Challengers Bangalore,normal,0,Sunrisers Hyderabad,35,0,"Rajiv Gandhi International Stadium, Uppal",1,207
2,IPL-2017,Rajkot,07-04-2017,Gujrat Titans,Kolkata Knight Riders,normal,0,Kolkata Knight Riders,0,10,Saurashtra Cricket Association Stadium,3,183
4,IPL-2017,Bangalore,08-04-2017,Royal Challengers Bangalore,Delhi Capitals,normal,0,Royal Challengers Bangalore,15,0,M Chinnaswamy Stadium,5,157


In [13]:
# Only these match-level columns are carried into the ball-by-ball table
final_match_df = final_match_df.loc[:, ['match_id', 'city', 'winner', 'total_runs_inn1']]

### Delivery Dataset

In [14]:
# Second-innings deliveries only, joined to the match-level columns of the retained matches
final_delivery_df = final_match_df.merge(
    delivery.loc[delivery['inning'] == 2],
    on='match_id',
)

In [15]:
# Per-delivery runs are called 'ball_score' from here on
final_delivery_df = final_delivery_df.rename(columns={'total_runs': 'ball_score'})

# Map older franchise names onto the labels used in the rest of the notebook
replace_team_name = {
    'Delhi Daredevils': 'Delhi Capitals',
    'Deccan Chargers': 'Sunrisers Hyderabad',
    'Gujarat Lions': 'Gujrat Titans',
}
final_delivery_df[['batting_team', 'bowling_team']] = (
    final_delivery_df[['batting_team', 'bowling_team']].replace(replace_team_name)
)

In [16]:
# Preview the merged second-innings delivery table
final_delivery_df.head(3)

Unnamed: 0,match_id,city,winner,total_runs_inn1,inning,batting_team,bowling_team,over,ball,batsman,non_striker,batsman_runs,extra_runs,ball_score,player_dismissed
0,1,Hyderabad,Sunrisers Hyderabad,207,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,1,CH Gayle,Mandeep Singh,1,0,1,
1,1,Hyderabad,Sunrisers Hyderabad,207,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,2,Mandeep Singh,CH Gayle,0,0,0,
2,1,Hyderabad,Sunrisers Hyderabad,207,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,3,Mandeep Singh,CH Gayle,0,0,0,


**Feature  Engineering**

In [17]:
# Running chase total: cumulative ball_score within each match
final_delivery_df['score'] = final_delivery_df.groupby('match_id')['ball_score'].cumsum()

# Runs still needed: first-innings total plus one, minus the runs scored so far
final_delivery_df['target_left'] = (final_delivery_df['total_runs_inn1'] + 1).sub(final_delivery_df['score'])

# Balls left out of 120, derived from the 1-based over number and ball number.
# NOTE(review): this assumes `ball` never exceeds 6 within an over; if the data numbers
# extra deliveries as 7+, the count is off and can go negative - confirm against the data.
final_delivery_df['remaining_balls'] = 120 - (
    final_delivery_df['over'].sub(1).mul(6) + final_delivery_df['ball']
)

# Wicket indicator per delivery: missing player_dismissed becomes 0, any other value counts as 1
final_delivery_df['player_dismissed'] = final_delivery_df['player_dismissed'].fillna(0)
final_delivery_df['wkt_dissmisal'] = final_delivery_df['player_dismissed'].ne(0).astype('int64')
# Wickets fallen so far in the match
final_delivery_df['wickets'] = final_delivery_df.groupby('match_id')['wkt_dissmisal'].cumsum()
# Wickets in hand
final_delivery_df['wickets_remaining'] = final_delivery_df['wickets'].rsub(10)

In [18]:
# Current run rate: runs scored so far per ball bowled (120 - remaining_balls), scaled to a 6-ball over
final_delivery_df['crr'] = (
    final_delivery_df['score']
    .div(120 - final_delivery_df['remaining_balls'])
    .mul(6)
)

# Required run rate: runs still needed per remaining ball, scaled to a 6-ball over.
# Rows with remaining_balls == 0 divide by zero and yield inf/NaN.
final_delivery_df['rrr'] = (
    final_delivery_df['target_left']
    .div(final_delivery_df['remaining_balls'])
    .mul(6)
)

In [19]:
# result: 1 - batting-team-winner, 0 - bowling-team-winner
def result(row):
    """Return 1 when the row's batting team equals its winner, otherwise 0.

    `row` is any record that can be indexed by 'batting_team' and 'winner'.
    """
    return 1 if row['batting_team'] == row['winner'] else 0

# Label every delivery: 1 when the batting (chasing) team is the match winner, else 0.
# Vectorised column comparison; gives the same values as applying result() row by row
# but avoids one Python-level call per delivery. A missing winner compares unequal -> 0.
final_delivery_df['result'] = (
    final_delivery_df['batting_team'].eq(final_delivery_df['winner']).astype('int64')
)

In [20]:
# Preview the delivery table with the engineered features and the result label
final_delivery_df.head(3)

Unnamed: 0,match_id,city,winner,total_runs_inn1,inning,batting_team,bowling_team,over,ball,batsman,...,player_dismissed,score,target_left,remaining_balls,wkt_dissmisal,wickets,wickets_remaining,crr,rrr,result
0,1,Hyderabad,Sunrisers Hyderabad,207,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,1,CH Gayle,...,0,1,207,119,0,0,10,6.0,10.436975,0
1,1,Hyderabad,Sunrisers Hyderabad,207,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,2,Mandeep Singh,...,0,1,207,118,0,0,10,3.0,10.525424,0
2,1,Hyderabad,Sunrisers Hyderabad,207,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,3,Mandeep Singh,...,0,1,207,117,0,0,10,2.0,10.615385,0


In [21]:
# Modelling table: three categorical features, six numeric match-state features, and the label
model_df = final_delivery_df.loc[
    :,
    [
        'batting_team', 'bowling_team', 'city',
        'score', 'wickets', 'remaining_balls', 'target_left',
        'crr', 'rrr',
        'result',
    ],
]
model_df.head(n=3)

Unnamed: 0,batting_team,bowling_team,city,score,wickets,remaining_balls,target_left,crr,rrr,result
0,Royal Challengers Bangalore,Sunrisers Hyderabad,Hyderabad,1,0,119,207,6.0,10.436975,0
1,Royal Challengers Bangalore,Sunrisers Hyderabad,Hyderabad,1,0,118,207,3.0,10.525424,0
2,Royal Challengers Bangalore,Sunrisers Hyderabad,Hyderabad,1,0,117,207,2.0,10.615385,0


In [22]:
# Missing values per column
model_df.isna().sum()

batting_team         0
bowling_team         0
city               832
score                0
wickets              0
remaining_balls      0
target_left          0
crr                  0
rrr                  5
result               0
dtype: int64

In [23]:
# Drop every row that has a missing value in any column
model_df = model_df.dropna(axis=0, how='any')

In [24]:
# Remove deliveries with no balls remaining, where rrr divides by zero (--> inf)
# NOTE(review): only an exact 0 is removed; negative remaining_balls, if any exist, stay in.
model_df = model_df.loc[model_df['remaining_balls'].ne(0)]

In [25]:
# Shuffle all rows so consecutive deliveries of one match are no longer adjacent.
# frac=1 keeps every row; the fixed random_state makes the shuffle, and therefore the
# downstream split and model, reproducible across a fresh Restart & Run All.
model_df = model_df.sample(frac=1, random_state=0)

In [26]:
# Train-test-split
from sklearn.model_selection import train_test_split

In [27]:
# Features are every column except the label; the label is 'result'
X = model_df.drop(columns='result')
y = model_df['result']

In [28]:
# Hold out 20% of the rows for evaluation, with a fixed seed for a repeatable split.
# NOTE(review): rows are individual deliveries, so deliveries of one match can fall on
# both sides of the split; a match-level split may give a more honest estimate - confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=0,
)

### Encoding

In [29]:
# we will apply onehotencoding
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [30]:
# Columns to one-hot encode
cat_feat = ['batting_team', 'bowling_team', 'city']

# Fixed category lists, so the encoder's output columns do not depend on the training sample
team = [
    'Kings XI Punjab',
    'Kolkata Knight Riders',
    'Chennai Super Kings',
    'Gujrat Titans',
    'Royal Challengers Bangalore',
    'Delhi Capitals',
    'Sunrisers Hyderabad',
    'Mumbai Indians',
    'Rajasthan Royals',
]
# NOTE(review): both 'Bangalore' and 'Bengaluru' are listed as separate categories -
# confirm whether they should be merged upstream.
city = [
    'Hyderabad', 'Jaipur', 'Chennai', 'Bangalore', 'Chandigarh',
    'Mumbai', 'Kolkata', 'Rajkot', 'Delhi', 'Port Elizabeth',
    'Centurion', 'Bengaluru', 'Kimberley', 'Durban', 'Abu Dhabi',
    'Cuttack', 'Johannesburg', 'Pune', 'Cape Town', 'Nagpur',
    'Ahmedabad', 'Bloemfontein', 'Dharamsala', 'Kanpur',
    'Visakhapatnam', 'Mohali', 'Sharjah', 'Raipur', 'East London',
    'Ranchi', 'Indore',
]

# One category list per entry of cat_feat, in the same order
ohe_category = [team, team, city]

In [31]:
# One-hot encoder with the fixed category lists; dense output,
# and the first listed category of each feature is dropped
ohe = OneHotEncoder(
    categories=ohe_category,
    drop='first',
    sparse_output=False,
)

In [32]:
# Preprocessing step: one-hot encode the categorical columns, pass all other columns through
transformer = [('cat', ohe, cat_feat)]
enc_ct = ColumnTransformer(
    transformer,
    remainder='passthrough',
    verbose_feature_names_out=False,
    force_int_remainder_cols=False,
)
# Emit pandas DataFrames instead of bare arrays
enc_ct = enc_ct.set_output(transform='pandas')
enc_ct

### Model Building Pipeline

In [33]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

In [34]:
# Full pipeline: encoding first, then a logistic-regression classifier (liblinear solver)
steps = [
    ('encoding', enc_ct),
    ('logistic_regression', LogisticRegression(solver='liblinear')),
]
pipe = Pipeline(steps=steps)
pipe

### Model Fitting & Evaluation

In [35]:
from sklearn.metrics import accuracy_score

In [36]:
# Fit the encoder and the classifier on the training split
pipe.fit(X_train, y=y_train)

In [37]:
# Predict the held-out rows and report the fraction classified correctly
y_pred = pipe.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8135056890863799

### Exporting the Models

In [38]:
import pickle as pkl

In [39]:
# Exporting the trained model
# 'wb' = write-binary mode ('rb' = read-binary when loading it back).
# The context manager flushes and closes the file even if dump() raises;
# the bare open() used before never closed the handle.
with open('model_pipe.pkl', 'wb') as model_file:
    pkl.dump(pipe, model_file)

In [40]:
# Exporting Required list of team and city
# Each file is written inside a context manager so the handle is flushed and closed
# deterministically; the bare open() calls used before never closed their handles.
with open('teams.pkl', 'wb') as teams_file:
    pkl.dump(team, teams_file)
with open('city.pkl', 'wb') as city_file:
    pkl.dump(city, city_file)