In [96]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Per-season match tables, read from CSV paths given relative to the working directory.
ipl2022, ipl2023 = (
    pd.read_csv(csv_path)
    for csv_path in (
        'datasets/IPL 2022/IPL_Matches_2022.csv',
        'datasets/IPL 2023/matches.csv',
    )
)

In [97]:
# Preview the first five rows of the raw 2022 table to see which columns are available.
ipl2022.head(n=5)

Unnamed: 0,ID,City,Date,Season,MatchNumber,Team1,Team2,Venue,TossWinner,TossDecision,SuperOver,WinningTeam,WonBy,Margin,method,Player_of_Match,Team1Players,Team2Players,Umpire1,Umpire2
0,1312200,Ahmedabad,2022-05-29,2022,Final,Rajasthan Royals,Gujarat Titans,"Narendra Modi Stadium, Ahmedabad",Rajasthan Royals,bat,N,Gujarat Titans,Wickets,7,,HH Pandya,"['YBK Jaiswal', 'JC Buttler', 'SV Samson', 'D ...","['WP Saha', 'Shubman Gill', 'MS Wade', 'HH Pan...",CB Gaffaney,Nitin Menon
1,1312199,Ahmedabad,2022-05-27,2022,Qualifier 2,Royal Challengers Bangalore,Rajasthan Royals,"Narendra Modi Stadium, Ahmedabad",Rajasthan Royals,field,N,Rajasthan Royals,Wickets,7,,JC Buttler,"['V Kohli', 'F du Plessis', 'RM Patidar', 'GJ ...","['YBK Jaiswal', 'JC Buttler', 'SV Samson', 'D ...",CB Gaffaney,Nitin Menon
2,1312198,Kolkata,2022-05-25,2022,Eliminator,Royal Challengers Bangalore,Lucknow Super Giants,"Eden Gardens, Kolkata",Lucknow Super Giants,field,N,Royal Challengers Bangalore,Runs,14,,RM Patidar,"['V Kohli', 'F du Plessis', 'RM Patidar', 'GJ ...","['Q de Kock', 'KL Rahul', 'M Vohra', 'DJ Hooda...",J Madanagopal,MA Gough
3,1312197,Kolkata,2022-05-24,2022,Qualifier 1,Rajasthan Royals,Gujarat Titans,"Eden Gardens, Kolkata",Gujarat Titans,field,N,Gujarat Titans,Wickets,7,,DA Miller,"['YBK Jaiswal', 'JC Buttler', 'SV Samson', 'D ...","['WP Saha', 'Shubman Gill', 'MS Wade', 'HH Pan...",BNJ Oxenford,VK Sharma
4,1304116,Mumbai,2022-05-22,2022,70,Sunrisers Hyderabad,Punjab Kings,"Wankhede Stadium, Mumbai",Sunrisers Hyderabad,bat,N,Punjab Kings,Wickets,5,,Harpreet Brar,"['PK Garg', 'Abhishek Sharma', 'RA Tripathi', ...","['JM Bairstow', 'S Dhawan', 'M Shahrukh Khan',...",AK Chaudhary,NA Patwardhan


In [98]:
# Print the column names, non-null counts and dtypes of the 2022 table.
ipl2022.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74 entries, 0 to 73
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               74 non-null     int64  
 1   City             74 non-null     object 
 2   Date             74 non-null     object 
 3   Season           74 non-null     int64  
 4   MatchNumber      74 non-null     object 
 5   Team1            74 non-null     object 
 6   Team2            74 non-null     object 
 7   Venue            74 non-null     object 
 8   TossWinner       74 non-null     object 
 9   TossDecision     74 non-null     object 
 10  SuperOver        74 non-null     object 
 11  WinningTeam      74 non-null     object 
 12  WonBy            74 non-null     object 
 13  Margin           74 non-null     int64  
 14  method           0 non-null      float64
 15  Player_of_Match  74 non-null     object 
 16  Team1Players     74 non-null     object 
 17  Team2Players     7

In [99]:
# Columns removed from the 2022 table; what remains is the two teams,
# the toss fields, the winner, and how (WonBy) / by how much (Margin) it won.
drop_columns_2022 = [
    'ID', 'City', 'Date', 'Season', 'MatchNumber', 'Venue', 'SuperOver', 'method',
    'Player_of_Match', 'Team1Players', 'Team2Players', 'Umpire1', 'Umpire2',
]

# Show the distinct team names appearing in the Team1 column.
print(ipl2022['Team1'].unique())
ipl2022 = ipl2022.drop(columns=drop_columns_2022)
ipl2022

['Rajasthan Royals' 'Royal Challengers Bangalore' 'Sunrisers Hyderabad'
 'Delhi Capitals' 'Chennai Super Kings' 'Gujarat Titans'
 'Lucknow Super Giants' 'Kolkata Knight Riders' 'Punjab Kings'
 'Mumbai Indians']


Unnamed: 0,Team1,Team2,TossWinner,TossDecision,WinningTeam,WonBy,Margin
0,Rajasthan Royals,Gujarat Titans,Rajasthan Royals,bat,Gujarat Titans,Wickets,7
1,Royal Challengers Bangalore,Rajasthan Royals,Rajasthan Royals,field,Rajasthan Royals,Wickets,7
2,Royal Challengers Bangalore,Lucknow Super Giants,Lucknow Super Giants,field,Royal Challengers Bangalore,Runs,14
3,Rajasthan Royals,Gujarat Titans,Gujarat Titans,field,Gujarat Titans,Wickets,7
4,Sunrisers Hyderabad,Punjab Kings,Sunrisers Hyderabad,bat,Punjab Kings,Wickets,5
...,...,...,...,...,...,...,...
69,Rajasthan Royals,Sunrisers Hyderabad,Sunrisers Hyderabad,field,Rajasthan Royals,Runs,61
70,Lucknow Super Giants,Gujarat Titans,Gujarat Titans,field,Gujarat Titans,Wickets,5
71,Royal Challengers Bangalore,Punjab Kings,Punjab Kings,field,Punjab Kings,Wickets,5
72,Mumbai Indians,Delhi Capitals,Delhi Capitals,field,Delhi Capitals,Wickets,4


In [100]:
# Feature engineering
# WonBy and Margin say how a match was won and by how much. Instead of keeping
# them as raw columns, they are condensed into per-team average winning margins
# and attached to both sides of every fixture.
# NOTE(review): the averages are computed over all 2022 rows, including the row
# they are later attached to -- confirm this is acceptable for the model.

# Mean winning margin per team for wins by runs.
team_avg_margin_runs = (
    ipl2022.loc[ipl2022['WonBy'] == 'Runs']
    .groupby('WinningTeam')['Margin']
    .mean()
)

# Mean winning margin per team for wins by wickets.
team_avg_margin_wickets = (
    ipl2022.loc[ipl2022['WonBy'] == 'Wickets']
    .groupby('WinningTeam')['Margin']
    .mean()
)

# Look up both averages for Team1 and Team2 (a team missing from a lookup gets NaN),
# then discard the raw WonBy / Margin columns.
ipl2022 = ipl2022.assign(**{
    f'{side.lower()}_avg_margin_{kind}': ipl2022[side].map(lookup)
    for kind, lookup in (('runs', team_avg_margin_runs),
                         ('wickets', team_avg_margin_wickets))
    for side in ('Team1', 'Team2')
}).drop(columns=['WonBy', 'Margin'])

In [101]:
# Preview the first five rows of the 2022 table with the new average-margin columns.
ipl2022.head(n=5)

Unnamed: 0,Team1,Team2,TossWinner,TossDecision,WinningTeam,team1_avg_margin_runs,team2_avg_margin_runs,team1_avg_margin_wickets,team2_avg_margin_wickets
0,Rajasthan Royals,Gujarat Titans,Rajasthan Royals,bat,Gujarat Titans,23.142857,30.25,6.0,5.75
1,Royal Challengers Bangalore,Rajasthan Royals,Rajasthan Royals,field,Rajasthan Royals,25.6,23.142857,5.5,6.0
2,Royal Challengers Bangalore,Lucknow Super Giants,Lucknow Super Giants,field,Royal Challengers Bangalore,25.6,24.142857,5.5,6.0
3,Rajasthan Royals,Gujarat Titans,Gujarat Titans,field,Gujarat Titans,23.142857,30.25,6.0,5.75
4,Sunrisers Hyderabad,Punjab Kings,Sunrisers Hyderabad,bat,Punjab Kings,3.0,32.75,7.8,6.0


In [102]:
# Print the column names, non-null counts and dtypes of the 2023 table.
ipl2023.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28 entries, 0 to 27
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   season           28 non-null     int64  
 1   team1            28 non-null     object 
 2   team2            28 non-null     object 
 3   date             28 non-null     object 
 4   match_number     28 non-null     int64  
 5   venue            28 non-null     object 
 6   city             28 non-null     object 
 7   toss_winner      28 non-null     object 
 8   toss_decision    28 non-null     object 
 9   player_of_match  28 non-null     object 
 10  umpire1          28 non-null     object 
 11  umpire2          28 non-null     object 
 12  reserve_umpire   28 non-null     object 
 13  match_referee    28 non-null     object 
 14  winner           28 non-null     object 
 15  winner_runs      14 non-null     float64
 16  winner_wickets   14 non-null     float64
 17  match_type       2

In [103]:
# Columns removed from the 2023 table; only the two teams and the toss fields remain.
drop_columns_2023 = ['season','date','match_number','venue','city','player_of_match','umpire1',
                     'umpire2','reserve_umpire','match_referee','winner','winner_runs','winner_wickets','match_type']

# Rename the retained columns so they match the 2022 column names.
ipl2023 = ipl2023.rename(columns = {'team1':'Team1','team2':'Team2',
                                    'toss_winner':'TossWinner','toss_decision':'TossDecision'})

# Show the distinct team names appearing in the Team1 column.
print(ipl2023['Team1'].unique())

# Rebind instead of using inplace=True, so the cell consistently produces a new
# frame rather than mixing reassignment with in-place mutation.
ipl2023 = ipl2023.drop(columns=drop_columns_2023)

['Chennai Super Kings' 'Punjab Kings' 'Lucknow Super Giants'
 'Rajasthan Royals' 'Mumbai Indians' 'Delhi Capitals'
 'Kolkata Knight Riders' 'Sunrisers Hyderabad' 'Gujarat Titans'
 'Royal Challengers Bangalore']


Some IPL editions introduce new teams, so we need to check that every team in the 2023 data also appears in the 2022 dataset.

In [104]:
# Gather every team name that appears on either side of a fixture, per season.
ipl_2022_teams = set(ipl2022['Team1'].unique()).union(ipl2022['Team2'].unique())
ipl_2023_teams = set(ipl2023['Team1'].unique()).union(ipl2023['Team2'].unique())

# True only when no 2023 team is missing from the 2022 teams.
is_2023_subset = ipl_2023_teams <= ipl_2022_teams

# Report the outcome of the check.
print("Are ipl_2023 teams a subset of ipl_2022 teams?:", is_2023_subset)

Are ipl_2023 teams a subset of ipl_2022 teams?: True


In [105]:
# Attach the per-team average winning margins (the lookups built from the 2022 data)
# to both sides of every 2023 fixture, in the same column order as the 2022 table.
ipl2023 = ipl2023.assign(**{
    f'{side.lower()}_avg_margin_{kind}': ipl2023[side].map(lookup)
    for kind, lookup in (('runs', team_avg_margin_runs),
                         ('wickets', team_avg_margin_wickets))
    for side in ('Team1', 'Team2')
})
ipl2023

Unnamed: 0,Team1,Team2,TossWinner,TossDecision,team1_avg_margin_runs,team2_avg_margin_runs,team1_avg_margin_wickets,team2_avg_margin_wickets
0,Chennai Super Kings,Gujarat Titans,Gujarat Titans,field,42.333333,30.25,3.0,5.75
1,Punjab Kings,Kolkata Knight Riders,Kolkata Knight Riders,field,32.75,53.0,6.0,6.0
2,Lucknow Super Giants,Delhi Capitals,Delhi Capitals,field,24.142857,27.333333,6.0,6.25
3,Rajasthan Royals,Sunrisers Hyderabad,Sunrisers Hyderabad,field,23.142857,3.0,6.0,7.8
4,Mumbai Indians,Royal Challengers Bangalore,Royal Challengers Bangalore,field,5.0,25.6,5.0,5.5
5,Chennai Super Kings,Lucknow Super Giants,Lucknow Super Giants,field,42.333333,24.142857,3.0,6.0
6,Delhi Capitals,Gujarat Titans,Gujarat Titans,field,27.333333,30.25,6.25,5.75
7,Punjab Kings,Rajasthan Royals,Rajasthan Royals,field,32.75,23.142857,6.0,6.0
8,Kolkata Knight Riders,Royal Challengers Bangalore,Royal Challengers Bangalore,field,53.0,25.6,6.0,5.5
9,Sunrisers Hyderabad,Lucknow Super Giants,Sunrisers Hyderabad,bat,3.0,24.142857,7.8,6.0


In [107]:
# Column names of each prepared table, as sets.
columns_2022 = set(ipl2022.columns)
columns_2023 = set(ipl2023.columns)

# Columns present in one table but not the other, in each direction.
diff_2022_to_2023 = columns_2022.difference(columns_2023)
diff_2023_to_2022 = columns_2023.difference(columns_2022)

print("Columns in ipl2022 but not in ipl2023:", diff_2022_to_2023)
print("Columns in ipl2023 but not in ipl2022:", diff_2023_to_2022)
# Both tables now share the same columns; the only exception is 'WinningTeam',
# which is absent from 2023 because it is the value to be predicted.

Columns in ipl2022 but not in ipl2023: {'WinningTeam'}
Columns in ipl2023 but not in ipl2022: set()
