In [799]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [800]:
# Load the ball-by-ball CSV. low_memory=False makes pandas parse the file in one
# pass, so each column gets a single inferred dtype instead of per-chunk guesses.
ipl_bbb = pd.read_csv('IPL_ball_by_ball_updated till 2024.csv',low_memory=False)

In [801]:
# Load the 2024 salary workbook (first sheet, pandas default) into a DataFrame.
ipl_salary = pd.read_excel('IPL SALARIES 2024.xlsx')

## Understanding the data

In [803]:
# Preview the first five rows of the salary table.
ipl_salary.head()

Unnamed: 0,Player,Salary,Rs,international,iconic
0,Abhishek Porel,20 lakh,20,0,
1,Anrich Nortje,6.5 crore,650,1,
2,Axar Patel,9 crore,900,0,
3,David Warner,6.25 crore,625,1,
4,Ishant Sharma,50 lakh,50,0,


In [804]:
# Total runs and wickets for every (season, innings, striker, bowler) combination.
# The aggregations are given as the string 'sum' rather than the builtin `sum`:
# pandas currently maps the builtin to its own groupby sum, but passing the
# builtin is deprecated (FutureWarning since pandas 2.1) and will change meaning.
grouped_data = (
    ipl_bbb
    .groupby(['Season', 'Innings No', 'Striker', 'Bowler'])
    .agg({'runs_scored': 'sum', 'wicket_confirmation': 'sum'})
    .reset_index()
)

In [805]:
# Season totals: runs per striker and wickets per bowler, summed over the
# finer-grained rows of grouped_data. as_index=False keeps the grouping keys
# as ordinary columns.
player_runs = grouped_data.groupby(['Season', 'Striker'], as_index=False)['runs_scored'].sum()
player_wickets = grouped_data.groupby(['Season', 'Bowler'], as_index=False)['wicket_confirmation'].sum()

In [806]:
# Rows of the 2023 season, highest run totals first. Season is compared against
# the string '2023' (the column also holds labels such as '2007/08').
player_runs.query("Season == '2023'").sort_values('runs_scored', ascending=False)

Unnamed: 0,Season,Striker,runs_scored
2423,2023,Shubman Gill,890
2313,2023,F du Plessis,730
2311,2023,DP Conway,672
2433,2023,V Kohli,639
2443,2023,YBK Jaiswal,625
...,...,...,...
2404,2023,RP Meredith,0
2372,2023,Mohsin Khan,0
2307,2023,DG Nalkande,0
2429,2023,TU Deshpande,0


## Top three run-getters and top three wicket-takers in each IPL season

In [808]:
# Three highest run totals and three highest wicket totals of every season.
top_run_getters = (
    player_runs
    .groupby('Season')
    .apply(lambda x: x.nlargest(3, 'runs_scored'))
    .reset_index(drop=True)
)
top_wicket_takers = (
    player_wickets
    .groupby('Season')
    .apply(lambda x: x.nlargest(3, 'wicket_confirmation'))
    .reset_index(drop=True)
)
# nlargest selects the *highest* wicket counts, so the frame is named
# top_wicket_takers; `bottom_wicket_takers` remains as an alias of the same
# object for any cell that still refers to the old name.
bottom_wicket_takers = top_wicket_takers
print("Top Three Run Getters:")
print(top_run_getters)
print("Top Three Wicket Takers:")
print(top_wicket_takers)

Top Three Run Getters:
     Season          Striker  runs_scored
0   2007/08         SE Marsh          616
1   2007/08        G Gambhir          534
2   2007/08    ST Jayasuriya          514
3      2009        ML Hayden          572
4      2009     AC Gilchrist          495
5      2009   AB de Villiers          465
6   2009/10     SR Tendulkar          618
7   2009/10        JH Kallis          572
8   2009/10         SK Raina          528
9      2011         CH Gayle          608
10     2011          V Kohli          557
11     2011     SR Tendulkar          553
12     2012         CH Gayle          733
13     2012        G Gambhir          590
14     2012         S Dhawan          569
15     2013       MEK Hussey          733
16     2013         CH Gayle          720
17     2013          V Kohli          639
18     2014       RV Uthappa          660
19     2014         DR Smith          566
20     2014       GJ Maxwell          552
21     2015        DA Warner          562
22     2015

## Creating a new dataframe with relevant columns

In [810]:
# Two-column lookup: the match id of every delivery and the calendar year of its
# date (dates are parsed day-first).
ipl_year_id = pd.DataFrame(
    {
        "id": ipl_bbb["Match id"],
        "year": pd.to_datetime(ipl_bbb["Date"], dayfirst=True).dt.year,
    }
)

In [811]:
# ipl_bbbc is an independent copy of ipl_bbb (DataFrame.copy is deep by default),
# so columns added to ipl_bbbc do not appear on ipl_bbb.
ipl_bbbc= ipl_bbb.copy()

In [812]:
# Calendar year of each delivery, parsed day-first from the Date column
# (ipl_bbbc carries the same Date values as ipl_bbb, of which it is a copy).
ipl_bbbc['year'] = (
    pd.to_datetime(ipl_bbbc["Date"], dayfirst=True)
    .dt.year
)

In [813]:
# Preview the columns used in the rest of the analysis (first five rows).
ipl_bbbc[["Match id", "year", "runs_scored","wicket_confirmation","Bowler",'Striker']].head()

Unnamed: 0,Match id,year,runs_scored,wicket_confirmation,Bowler,Striker
0,335982,2008,0,0,P Kumar,SC Ganguly
1,335982,2008,0,0,P Kumar,BB McCullum
2,335982,2008,0,0,P Kumar,BB McCullum
3,335982,2008,0,0,P Kumar,BB McCullum
4,335982,2008,0,0,P Kumar,BB McCullum


## Fitting the most appropriate distribution

In [815]:
import scipy.stats as st

def get_best_distribution(data, dist_names=None, verbose=True):
    """Fit several ``scipy.stats`` distributions and return the best-scoring one.

    Every candidate is fitted to ``data`` with its ``fit`` method and then scored
    with a one-sample Kolmogorov-Smirnov test against the fitted distribution.
    The candidate with the largest p-value is reported as the best fit.

    Candidates that are missing from the installed SciPy, that fail to fit, or
    that yield a NaN p-value are skipped instead of aborting the whole run.

    Note: the p-values come from parameters estimated on the same sample, so
    they are a ranking heuristic rather than an exact goodness-of-fit test.

    Parameters
    ----------
    data : array-like
        One-dimensional numeric sample.
    dist_names : iterable of str, optional
        Names of ``scipy.stats`` distributions to try. Defaults to the built-in
        candidate list below.
    verbose : bool, default True
        Print the p-value of every candidate and a summary of the winner.

    Returns
    -------
    tuple
        ``(best_dist, best_p, best_params)``: name of the winning distribution,
        its K-S p-value, and the fitted parameter tuple.

    Raises
    ------
    ValueError
        If ``data`` is empty, contains non-finite values, or no candidate could
        be fitted.
    """
    if dist_names is None:
        dist_names = ['alpha','beta','betaprime','burr12','crystalball',
                      'dgamma','dweibull','erlang','exponnorm','f','fatiguelife',
                      'gamma','gengamma','gumbel_l','johnsonsb','kappa4',
                      'lognorm','nct','norm','norminvgauss','powernorm','rice',
                      'recipinvgauss','t','trapz','truncnorm']

    sample = np.asarray(data, dtype=float).ravel()
    if sample.size == 0:
        raise ValueError("data is empty; nothing to fit")
    if not np.all(np.isfinite(sample)):
        raise ValueError("data contains NaN or infinite values")

    p_values = {}
    params = {}
    for dist_name in dist_names:
        # Distribution names come and go between SciPy releases; skip unknown ones.
        dist = getattr(st, dist_name, None)
        if dist is None or not hasattr(dist, "fit"):
            if verbose:
                print("skipping "+dist_name+": not available in this SciPy version")
            continue
        try:
            param = dist.fit(sample)
            # Applying the Kolmogorov-Smirnov test against the fitted distribution
            D, p = st.kstest(sample, dist.cdf, args=param)
        except (ValueError, RuntimeError, ArithmeticError) as exc:
            if verbose:
                print("skipping "+dist_name+": fit failed ("+str(exc)+")")
            continue
        if np.isnan(p):
            # NaN never compares greater/less, so it would make max() unreliable.
            if verbose:
                print("skipping "+dist_name+": p value is NaN")
            continue
        params[dist_name] = param
        p_values[dist_name] = p
        if verbose:
            print("p value for "+dist_name+" = "+str(p))

    if not p_values:
        raise ValueError("none of the candidate distributions could be fitted")

    # Largest p-value wins; ties go to the earliest candidate in dist_names.
    best_dist = max(p_values, key=p_values.get)
    best_p = p_values[best_dist]
    if verbose:
        print("\nBest fitting distribution: "+str(best_dist))
        print("Best p value: "+ str(best_p))
        print("Parameters for the best fit: "+ str(params[best_dist]))
    return best_dist, best_p, params[best_dist]

In [816]:
# Runs per striker per calendar year; the grouping keys stay as columns.
total_run_each_year = ipl_bbbc.groupby(["year", "Striker"], as_index=False)["runs_scored"].sum()

In [817]:
# Latest year first and, within a year, highest run total first.
total_run_each_year = total_run_each_year.sort_values(
    by=["year", "runs_scored"],
    ascending=[False, False],
)
print(total_run_each_year)

      year          Striker  runs_scored
2549  2024       RD Gaikwad          509
2589  2024          V Kohli          500
2470  2024  B Sai Sudharsan          418
2502  2024         KL Rahul          406
2555  2024          RR Pant          398
...    ...              ...          ...
58    2008         L Balaji            0
66    2008   M Muralitharan            0
75    2008         MM Patel            0
107   2008      S Sreesanth            0
136   2008           U Kaul            0

[2598 rows x 3 columns]


#### Top three batsmen and their distribution in the last three IPL tournaments.

In [819]:
# For each of the first three distinct years in total_run_each_year, keep the
# strikers found in the first three rows of that year. This yields "latest three
# years, three highest scorers" because the frame is ordered by year and runs,
# both descending, before this cell runs.
list_top_batsman_last_three_year = {
    season: (
        total_run_each_year.loc[total_run_each_year["year"] == season, "Striker"]
        .iloc[:3]
        .unique()
        .tolist()
    )
    for season in total_run_each_year["year"].unique()[:3]
}

In [820]:
# Display the {year: [striker, ...]} mapping built in the previous cell.
list_top_batsman_last_three_year

{2024: ['RD Gaikwad', 'V Kohli', 'B Sai Sudharsan'],
 2023: ['Shubman Gill', 'F du Plessis', 'DP Conway'],
 2022: ['JC Buttler', 'KL Rahul', 'Q de Kock']}

In [821]:
import warnings
# Suppress all warnings from this point on.
warnings.filterwarnings('ignore')
# Runs per striker per match, over every match in ipl_bbbc.
runs = (
    ipl_bbbc
    .groupby(['Striker','Match id'])[['runs_scored']]
    .sum()
    .reset_index()
)

# NOTE(review): `runs` is not filtered by year, so each fit below uses all of the
# player's matches in the dataset; the printed year only identifies the season in
# which the player ranked in the top three. Confirm this is the intended sample.
for key, season_strikers in list_top_batsman_last_three_year.items():
    for Striker in season_strikers:
        print("************************")
        print("year:", key, " Batsman:", Striker)
        striker_rows = runs["Striker"] == Striker
        get_best_distribution(runs.loc[striker_rows, "runs_scored"])
        print("\n\n")

************************
year: 2024  Batsman: RD Gaikwad
p value for alpha = 2.599259711013304e-20
p value for beta = 0.02041902689492403
p value for betaprime = 0.019503763598668566
p value for burr12 = 0.46882020698395865
p value for crystalball = 0.24953646987270484
p value for dgamma = 0.1570743843120962
p value for dweibull = 0.20046582403736823
p value for erlang = 1.893799588395604e-06
p value for exponnorm = 0.4644304230917985
p value for f = 1.3560920695663998e-07
p value for fatiguelife = 1.304427037367869e-14
p value for gamma = 0.005830868576003678
p value for gengamma = 0.015331622187826577
p value for gumbel_l = 0.05546236480086586
p value for johnsonsb = 4.646964117947127e-13
p value for kappa4 = 0.006363220770325362
p value for lognorm = 1.1719355665219537e-16
p value for nct = 0.5881570496217807
p value for norm = 0.24953651809309751
p value for norminvgauss = 0.5538573365184996
p value for powernorm = 0.1788753268739086
p value for rice = 0.18287532184336575
p value f

#### Top three bowlers and their distribution in the last three IPL tournaments.

In [823]:
# Wickets per bowler per calendar year; the grouping keys stay as columns.
total_wicket_each_year = ipl_bbbc.groupby(["year", "Bowler"], as_index=False)["wicket_confirmation"].sum()

In [824]:
# Latest year first and, within a year, highest wicket total first.
total_wicket_each_year = total_wicket_each_year.sort_values(
    by=["year", "wicket_confirmation"],
    ascending=[False, False],
)
print(total_wicket_each_year)

      year             Bowler  wicket_confirmation
1836  2024           HV Patel                   19
1875  2024       Mukesh Kumar                   15
1822  2024     Arshdeep Singh                   14
1842  2024          JJ Bumrah                   14
1876  2024  Mustafizur Rahman                   14
...    ...                ...                  ...
16    2008           CL White                    0
41    2008             K Goel                    0
43    2008          LPC Silva                    0
60    2008       Pankaj Singh                    0
90    2008        VS Yeligati                    0

[1929 rows x 3 columns]


In [825]:
# For each of the first three distinct years in total_wicket_each_year, keep the
# bowlers found in the first three rows of that year. The result is "latest three
# years, three highest wicket totals" because the frame is ordered by year and
# wickets, both descending, before this cell runs.
list_top_bowler_last_three_year = {
    season: (
        total_wicket_each_year.loc[total_wicket_each_year["year"] == season, "Bowler"]
        .iloc[:3]
        .unique()
        .tolist()
    )
    for season in total_wicket_each_year["year"].unique()[:3]
}
list_top_bowler_last_three_year

{2024: ['HV Patel', 'Mukesh Kumar', 'Arshdeep Singh'],
 2023: ['MM Sharma', 'Mohammed Shami', 'Rashid Khan'],
 2022: ['YS Chahal', 'PWH de Silva', 'K Rabada']}

In [826]:
import warnings
# Suppress all warnings from this point on.
warnings.filterwarnings('ignore')
# Wickets per bowler per match, over every match in ipl_bbbc.
wickets = (
    ipl_bbbc
    .groupby(['Bowler','Match id'])[['wicket_confirmation']]
    .sum()
    .reset_index()
)

# NOTE(review): `wickets` is not filtered by year, so each fit below uses all of
# the bowler's matches in the dataset; the printed year only identifies the season
# in which the bowler ranked in the top three. Confirm this is the intended sample.
for key, season_bowlers in list_top_bowler_last_three_year.items():
    for bowler in season_bowlers:
        print("************************")
        print("year:", key, " Bowler:", bowler)
        bowler_rows = wickets["Bowler"] == bowler
        get_best_distribution(wickets.loc[bowler_rows, "wicket_confirmation"])
        print("\n\n")

************************
year: 2024  Bowler: HV Patel
p value for alpha = 0.0002993252328930706
p value for beta = 2.777571908776589e-19
p value for betaprime = 1.7052883875145053e-30
p value for burr12 = 5.427998338605459e-15
p value for crystalball = 1.1109118198587684e-05
p value for dgamma = 4.375428528574276e-05
p value for dweibull = 1.8553295107771936e-05
p value for erlang = 5.473635282991912e-24
p value for exponnorm = 0.0002813279943461815
p value for f = 1.9012983291282487e-09
p value for fatiguelife = 1.9734428958773156e-05
p value for gamma = 1.470787431589663e-16
p value for gengamma = 1.4345058849022962e-16
p value for gumbel_l = 4.541523588271283e-05
p value for johnsonsb = 2.827201329331457e-51
p value for kappa4 = 9.177530010006471e-23
p value for lognorm = 5.2162358572043325e-22
p value for nct = 0.0001960277304576293
p value for norm = 1.1109124960635979e-05
p value for norminvgauss = 3.811196478020768e-05
p value for powernorm = 3.2186417463058256e-05
p value for r

## Fit the most appropriate distribution for the allotted player - AR Patel

In [828]:
# Maps each of the first three distinct years in total_wicket_each_year to
# ['AR Patel'] when he has a bowling row in that year, or to [] when he has none.
axar_patel_bowl = {}
bowler_is_axar = total_wicket_each_year["Bowler"] == "AR Patel"

for i in total_wicket_each_year["year"].unique()[:3]:
    # Rows of year `i` that belong to AR Patel
    axar_patel_data = total_wicket_each_year[bowler_is_axar & (total_wicket_each_year["year"] == i)]
    # Distinct bowler names among those rows (at most the single name 'AR Patel')
    axar_patel_bowl[i] = axar_patel_data["Bowler"].unique().tolist()

# Show the mapping
print(axar_patel_bowl)

{2024: ['AR Patel'], 2023: ['AR Patel'], 2022: ['AR Patel']}


In [829]:
import warnings
warnings.filterwarnings('ignore')

# Wickets per bowler per match
wickets = (
    ipl_bbbc
    .groupby(['Bowler', 'Match id'])[['wicket_confirmation']]
    .sum()
    .reset_index()
)

# One fit per year listed in axar_patel_bowl. The sample passed to
# get_best_distribution is every match AR Patel bowled in, whatever the year.
for year, bowlers in axar_patel_bowl.items():
    for bowler in bowlers:
        if bowler != "AR Patel":
            continue
        print("************************")
        print("year:", year, " Bowler:", bowler)
        bowler_rows = wickets["Bowler"] == bowler
        get_best_distribution(wickets.loc[bowler_rows, "wicket_confirmation"])
        print("\n\n")

************************
year: 2024  Bowler: AR Patel
p value for alpha = 9.940012950298595e-19
p value for beta = 1.73089555773241e-31
p value for betaprime = 6.890602231402487e-28
p value for burr12 = 5.648773934180763e-20
p value for crystalball = 1.212835491802816e-07
p value for dgamma = 1.1945105340885417e-10
p value for dweibull = 4.834475183349857e-09
p value for erlang = 3.7936798489477985e-39
p value for exponnorm = 6.847790425577837e-20
p value for f = 5.648773934180763e-20
p value for fatiguelife = 6.574170250938941e-36
p value for gamma = 6.040604015697529e-29
p value for gengamma = 2.1676360523609773e-25
p value for gumbel_l = 1.2777338489201446e-08
p value for johnsonsb = 5.648773936049154e-20
p value for kappa4 = 4.002495349263793e-32
p value for lognorm = 2.021269059381295e-21
p value for nct = 2.4980624812823503e-11
p value for norm = 1.2128358700898914e-07
p value for norminvgauss = 0.0
p value for powernorm = 1.0293563251350029e-10
p value for rice = 1.0187539089487

In [830]:
# Maps each of the first three distinct years in total_run_each_year to
# ['AR Patel'] when he has a batting row in that year, or to [] when he has none.
axar_patel_bat = {}
striker_is_axar = total_run_each_year["Striker"] == "AR Patel"

for i in total_run_each_year["year"].unique()[:3]:
    # Rows of year `i` that belong to AR Patel
    axar_patel_data1 = total_run_each_year[striker_is_axar & (total_run_each_year["year"] == i)]
    # Distinct striker names among those rows (at most the single name 'AR Patel')
    axar_patel_bat[i] = axar_patel_data1["Striker"].unique().tolist()

# Show the mapping
print(axar_patel_bat)

{2024: ['AR Patel'], 2023: ['AR Patel'], 2022: ['AR Patel']}


In [831]:
import warnings
warnings.filterwarnings('ignore')

# Runs per striker per match. The result is bound to `runs`, the frame the loop
# below reads; binding it to `wickets` would overwrite the per-match bowling
# frame with batting data and leave the loop dependent on an earlier cell.
runs = ipl_bbbc.groupby(['Striker', 'Match id'])[['runs_scored']].sum().reset_index()

# One fit per year listed in axar_patel_bat. The sample passed to
# get_best_distribution is every match AR Patel batted in, whatever the year.
for year, strikers in axar_patel_bat.items():
    for striker in strikers:
        if striker == "AR Patel":
            print("************************")
            print("year:", year, "batsman:", striker)
            get_best_distribution(runs[runs["Striker"] == striker]["runs_scored"])
            print("\n\n")


************************
year: 2024 batsman: AR Patel
p value for alpha = 1.4283049006330874e-19
p value for beta = 0.08501064188004714
p value for betaprime = 9.200747163367085e-11
p value for burr12 = 0.4240145784486461
p value for crystalball = 0.029775015720014397
p value for dgamma = 0.008447321543132325
p value for dweibull = 0.0067651510035502405
p value for erlang = 0.0012434310409705773
p value for exponnorm = 0.44275294718405667
p value for f = 1.0828276463613638e-16
p value for fatiguelife = 0.22678195858041206
p value for gamma = 0.01941581626513733
p value for gengamma = 2.3537360809311073e-06
p value for gumbel_l = 4.928627051090389e-06
p value for johnsonsb = 0.2513706078100967
p value for kappa4 = 2.0042162915949264e-21
p value for lognorm = 1.3560827213335702e-29
p value for nct = 0.3161622541605552
p value for norm = 0.029775039515882007
p value for norminvgauss = 0.38491885655137925
p value for powernorm = 0.011133425872538627
p value for rice = 0.011334555411159908


## Relationship between the performance of a player and the salary he gets

In [833]:
# Rows of the per-year batting totals that belong to 2024
R2024 = total_run_each_year.loc[total_run_each_year['year'] == 2024]

In [834]:
# Rows of the per-year bowling totals that belong to 2024
W2024 = total_wicket_each_year.loc[total_wicket_each_year['year'] == 2024]

In [835]:
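# The fuzzy name matching in the next cells needs these third-party packages
# (install once, then leave the lines commented out):
#pip install fuzzywuzzy
#pip install python-Levenshtein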

In [836]:
from fuzzywuzzy import process

# Working copies, so the matching column is not added to the source frames
df_salary = ipl_salary.copy()
df_runs = R2024.copy()

def match_names_runs(name, names_list, min_score=87):
    """Return the entry of ``names_list`` that best matches ``name``.

    The candidate is chosen with fuzzywuzzy's ``process.extractOne`` and is
    returned only when its score is at least ``min_score``.

    Parameters
    ----------
    name : str
        Player name to look up.
    names_list : list of str
        Candidate names to match against.
    min_score : int, default 87
        Minimum fuzzywuzzy score for a candidate to be accepted.

    Returns
    -------
    str or None
        The best-matching candidate, or ``None`` when ``name`` is not a string
        (e.g. a missing value), when there are no candidates, or when the best
        score is below ``min_score``.
    """
    if not isinstance(name, str) or len(names_list) == 0:
        return None
    best = process.extractOne(name, names_list)
    if best is None:
        # extractOne yields None when no candidate qualifies
        return None
    match, score = best
    return match if score >= min_score else None

# Build the candidate list once instead of once per salary row
striker_names = df_runs['Striker'].tolist()

# New column in df_salary holding the matched name from df_runs (or None)
# NOTE(review): nothing prevents two salary rows from matching the same striker;
# check Matched_Player for duplicates before relying on the merged frame.
df_salary['Matched_Player'] = df_salary['Player'].apply(lambda x: match_names_runs(x, striker_names))

# Inner merge (pandas default): salary rows whose Matched_Player has no
# counterpart in Striker are dropped
df_merged_runs = pd.merge(df_salary, df_runs, left_on='Matched_Player', right_on='Striker')

In [837]:
from fuzzywuzzy import process

# Working copies, so the matching column is not added to the source frames
df_salary = ipl_salary.copy()
df_wickets = W2024.copy()

def match_names_wickets(name, names_list, min_score=87):
    """Return the entry of ``names_list`` that best matches ``name``.

    The candidate is chosen with fuzzywuzzy's ``process.extractOne`` and is
    returned only when its score is at least ``min_score``.

    Parameters
    ----------
    name : str
        Player name to look up.
    names_list : list of str
        Candidate names to match against.
    min_score : int, default 87
        Minimum fuzzywuzzy score for a candidate to be accepted.

    Returns
    -------
    str or None
        The best-matching candidate, or ``None`` when ``name`` is not a string
        (e.g. a missing value), when there are no candidates, or when the best
        score is below ``min_score``.
    """
    if not isinstance(name, str) or len(names_list) == 0:
        return None
    best = process.extractOne(name, names_list)
    if best is None:
        # extractOne yields None when no candidate qualifies
        return None
    match, score = best
    return match if score >= min_score else None

# Build the candidate list once instead of once per salary row
bowler_names = df_wickets['Bowler'].tolist()

# New column in df_salary holding the matched name from df_wickets (or None)
# NOTE(review): nothing prevents two salary rows from matching the same bowler;
# check Matched_Player for duplicates before relying on the merged frame.
df_salary['Matched_Player'] = df_salary['Player'].apply(lambda x: match_names_wickets(x, bowler_names))

# Inner merge (pandas default): salary rows whose Matched_Player has no
# counterpart in Bowler are dropped
df_merged_wickets = pd.merge(df_salary, df_wickets, left_on='Matched_Player', right_on='Bowler')

In [838]:
# Column dtypes and non-null counts of the salary/runs merge.
df_merged_runs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39 entries, 0 to 38
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Player          39 non-null     object 
 1   Salary          39 non-null     object 
 2   Rs              39 non-null     int64  
 3   international   39 non-null     int64  
 4   iconic          0 non-null      float64
 5   Matched_Player  39 non-null     object 
 6   year            39 non-null     int32  
 7   Striker         39 non-null     object 
 8   runs_scored     39 non-null     int64  
dtypes: float64(1), int32(1), int64(3), object(4)
memory usage: 2.7+ KB


In [839]:
# Column dtypes and non-null counts of the salary/wickets merge.
df_merged_wickets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34 entries, 0 to 33
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Player               34 non-null     object 
 1   Salary               34 non-null     object 
 2   Rs                   34 non-null     int64  
 3   international        34 non-null     int64  
 4   iconic               0 non-null      float64
 5   Matched_Player       34 non-null     object 
 6   year                 34 non-null     int32  
 7   Bowler               34 non-null     object 
 8   wicket_confirmation  34 non-null     int64  
dtypes: float64(1), int32(1), int64(3), object(4)
memory usage: 2.4+ KB


In [840]:
# First five merged rows: compare Player with Matched_Player to spot-check the
# fuzzy matches.
df_merged_runs.head()

Unnamed: 0,Player,Salary,Rs,international,iconic,Matched_Player,year,Striker,runs_scored
0,Abhishek Porel,20 lakh,20,0,,Abishek Porel,2024,Abishek Porel,202
1,Axar Patel,9 crore,900,0,,AR Patel,2024,AR Patel,149
2,Kuldeep Yadav,2 crore,200,0,,Kuldeep Yadav,2024,Kuldeep Yadav,36
3,Lalit Yadav,65 lakh,65,0,,Lalit Yadav,2024,Lalit Yadav,10
4,Mukesh Kumar,5.5 crore,550,0,,Mukesh Kumar,2024,Mukesh Kumar,0


In [841]:
# First five merged rows: compare Player with Matched_Player to spot-check the
# fuzzy matches.
df_merged_wickets.head()

Unnamed: 0,Player,Salary,Rs,international,iconic,Matched_Player,year,Bowler,wicket_confirmation
0,Axar Patel,9 crore,900,0,,AR Patel,2024,AR Patel,9
1,Kuldeep Yadav,2 crore,200,0,,Kuldeep Yadav,2024,Kuldeep Yadav,12
2,Lalit Yadav,65 lakh,65,0,,Lalit Yadav,2024,Lalit Yadav,0
3,Mukesh Kumar,5.5 crore,550,0,,Mukesh Kumar,2024,Mukesh Kumar,15
4,Mukesh Choudhary,20 lakh,20,0,,Mukesh Choudhary,2024,Mukesh Choudhary,0


In [842]:
# Pearson correlation (the Series.corr default, spelled out here) between the
# Rs salary column and the runs_scored column of the merged frame
correlation = df_merged_runs['Rs'].corr(
    df_merged_runs['runs_scored'],
    method='pearson',
)

print("Correlation between Salary and Runs:", correlation)

Correlation between Salary and Runs: 0.3349654749323617


In [843]:
# Pearson correlation (the Series.corr default, spelled out here) between the
# Rs salary column and the wicket_confirmation column of the merged frame
correlation = df_merged_wickets['Rs'].corr(
    df_merged_wickets['wicket_confirmation'],
    method='pearson',
)

print("Correlation between Salary and Wickets:", correlation)

Correlation between Salary and Wickets: 0.2127466075152879
