In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import warnings

# Ignore all warnings
warnings.filterwarnings("ignore")

In [2]:
games_data = pd.read_csv(
    "../../data/processed/cleaned_data_deliveries.csv",
    low_memory=False
)

In [3]:
games_data.drop(columns=["Unnamed: 0"],inplace=True)
games_data.tail()

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batter,bowler,non_striker,batsman_runs,...,toss_decision,winner,result,result_margin,target_runs,target_overs,super_over,method,umpire1,umpire2
260915,1426312,2,KKR,SRH,9,5,SS Iyer,AK Markram,VR Iyer,1,...,bat,KKR,wickets,8.0,114.0,20.0,N,No Issues,J Madanagopal,Nitin Menon
260916,1426312,2,KKR,SRH,9,6,VR Iyer,AK Markram,SS Iyer,1,...,bat,KKR,wickets,8.0,114.0,20.0,N,No Issues,J Madanagopal,Nitin Menon
260917,1426312,2,KKR,SRH,10,1,VR Iyer,Shahbaz Ahmed,SS Iyer,1,...,bat,KKR,wickets,8.0,114.0,20.0,N,No Issues,J Madanagopal,Nitin Menon
260918,1426312,2,KKR,SRH,10,2,SS Iyer,Shahbaz Ahmed,VR Iyer,1,...,bat,KKR,wickets,8.0,114.0,20.0,N,No Issues,J Madanagopal,Nitin Menon
260919,1426312,2,KKR,SRH,10,3,VR Iyer,Shahbaz Ahmed,SS Iyer,1,...,bat,KKR,wickets,8.0,114.0,20.0,N,No Issues,J Madanagopal,Nitin Menon


In [4]:
games_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260920 entries, 0 to 260919
Data columns (total 33 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   match_id          260920 non-null  int64  
 1   inning            260920 non-null  int64  
 2   batting_team      260920 non-null  object 
 3   bowling_team      260920 non-null  object 
 4   over              260920 non-null  int64  
 5   ball              260920 non-null  int64  
 6   batter            260920 non-null  object 
 7   bowler            260920 non-null  object 
 8   non_striker       260920 non-null  object 
 9   batsman_runs      260920 non-null  int64  
 10  extra_runs        260920 non-null  int64  
 11  total_runs        260920 non-null  int64  
 12  extras_type       14125 non-null   object 
 13  is_wicket         260920 non-null  int64  
 14  player_dismissed  12950 non-null   object 
 15  dismissal_kind    12950 non-null   object 
 16  fielder           93

# Batting Stats

In [5]:
# df = games_data[["batter","bowler","non_striker"]]
batter_unique = pd.Series(games_data["batter"].unique())
non_striker_unique = pd.Series(games_data["non_striker"].unique())
bowler_unique = pd.Series(games_data["bowler"].unique())
print(batter_unique.shape[0]+bowler_unique.shape[0]+non_striker_unique.shape[0])
players_list =pd.concat([batter_unique, non_striker_unique,bowler_unique])
print(players_list.shape)
players_list = players_list.unique()
print(players_list.shape)

1866
(1866,)
(732,)


In [6]:
def bowling_stats_generator(player,batting_stats):
    stats=[]
    try:
        bowler_grouped_data = games_data.groupby("bowler").get_group(player)
        #no of balls
        no_of_balls = bowler_grouped_data.shape[0] - bowler_grouped_data[(bowler_grouped_data["extras_type"]=="noballs") | (bowler_grouped_data["extras_type"]=="wides") ].shape[0]
        stats.append(no_of_balls)
        #no of overs
        no_of_overs = no_of_balls / 6
        stats.append(no_of_overs)
        #no of runs
        total_runs = bowler_grouped_data["batsman_runs"].sum() + bowler_grouped_data[(bowler_grouped_data["extras_type"]=="noballs") | (bowler_grouped_data["extras_type"]=="wides") ].shape[0]
        stats.append(total_runs)
        #economy
        economy = total_runs/no_of_overs
        stats.append(economy)
        #no of wickets
        no_of_wickets = bowler_grouped_data[bowler_grouped_data["dismissal_kind"].isna()==False].shape[0]
        stats.append(no_of_wickets)
        #bowling avg
        Bolwing_avg = total_runs/no_of_wickets
        stats.append(Bolwing_avg)
        #strike rate
        try:
            strike_rate = no_of_balls/no_of_wickets
            stats.append(strike_rate)
        except ZeroDivisionError:
            stats.append(np.nan)
        #bundary
        boundary_percentage = ((bowler_grouped_data[bowler_grouped_data["batsman_runs"]== 4].shape[0])+(bowler_grouped_data[bowler_grouped_data["batsman_runs"]== 6].shape[0]))/no_of_balls
        boundary_percentage = boundary_percentage * 100
        stats.append(boundary_percentage)
    except KeyError:
        stats = [np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan]
    final_stats= pd.concat([pd.Series(batting_stats),pd.Series(stats)],ignore_index=True)
    return(final_stats)
# for x in players_list:
#     bowling_stats_generator(x)

In [7]:
def batting_stats_generator(player):
    stats = []
    try:
        grouped_data = games_data.groupby("batter").get_group(player)
        #Batting avg
        try:
            total_runs = grouped_data["batsman_runs"].sum()
            total_diss = grouped_data[grouped_data["player_dismissed"]==player].shape[0]
            batting_avg = total_runs/total_diss
            stats.append(batting_avg)
        except ZeroDivisionError:
            stats.append(np.nan) 
        
        #Strike Rate
        total_balls = grouped_data.shape[0] - grouped_data["extras_type"].value_counts().get("wides",0)
        strike_rate = round((total_runs/total_balls)*100,2)
        stats.append(strike_rate)
        
        #Total Runs
        stats.append(total_runs)
        
        #boundary Percentage
        no_of_boundary =grouped_data["batsman_runs"].value_counts().get(4,0)*4+grouped_data["batsman_runs"].value_counts().get(6,0)*6
        boundary_percentage =round((no_of_boundary/total_runs)*100,2)
        stats.append(boundary_percentage)
        
        #dot Ball
        try:
            total_dot_balls = grouped_data["batsman_runs"].value_counts().get(0,0)-grouped_data["extras_type"].value_counts().get("wides",0)
            dot_ball_percentage  = round((total_dot_balls/total_balls)*100,2)
            stats.append(dot_ball_percentage)
        except ZeroDivisionError:
            stats.append(np.nan) 
    except KeyError:
        stats = [np.nan,np.nan,np.nan,np.nan,np.nan]
    return(bowling_stats_generator(player,stats))
# for x in players_list:
#     batting_stats_generator(x)

In [8]:
# df = pd.DataFrame(columns=['Name', 'Batting_avg', 'Batting_strikerate','Batting_total_runs','Batting_boundary','Batting_dot',
#                           'Bowling_balls','Bowling_over','Bowling_runs','Bowling_economy','Bowling_wickets','Bowling_avg','Bowling_strikerate'])

In [9]:
all_rows = []
for x in players_list:
    row = batting_stats_generator(x)  # Get a list from the function
    all_rows.append(row)  # Collect the list

stacked_rows = np.vstack(all_rows)

# Create a DataFrame from the stacked array
df1 = pd.DataFrame(stacked_rows, columns=['Batting_avg', 'Batting_strikerate','Batting_total_runs','Batting_boundary','Batting_dot',
                          'Bowling_balls','Bowling_over','Bowling_runs','Bowling_economy','Bowling_wickets','Bowling_total_runs','Bowling_avg','Bowling_strikerate'])

print(df1)

     Batting_avg  Batting_strikerate  Batting_total_runs  Batting_boundary  \
0      25.942308              106.81              1349.0             59.30   
1      28.254902              131.60              2882.0             67.73   
2      11.375000               71.09                91.0             35.16   
3      28.739130              122.98              1322.0             54.46   
4       9.142857               77.11                64.0             62.50   
..           ...                 ...                 ...               ...   
727          NaN                 NaN                 NaN               NaN   
728          NaN                 NaN                 NaN               NaN   
729          NaN                 NaN                 NaN               NaN   
730          NaN                 NaN                 NaN               NaN   
731          NaN                 NaN                 NaN               NaN   

     Batting_dot  Bowling_balls  Bowling_over  Bowling_runs  Bo

In [11]:
df1.head()
df1.shape

(732, 13)

In [12]:
df1.insert(0, 'Player_name', players_list)

In [13]:
df1.head()

Unnamed: 0,Player_name,Batting_avg,Batting_strikerate,Batting_total_runs,Batting_boundary,Batting_dot,Bowling_balls,Bowling_over,Bowling_runs,Bowling_economy,Bowling_wickets,Bowling_total_runs,Bowling_avg,Bowling_strikerate
0,SC Ganguly,25.942308,106.81,1349.0,59.3,47.43,276.0,46.0,363.0,7.891304,12.0,30.25,23.0,14.130435
1,BB McCullum,28.254902,131.6,2882.0,67.73,43.2,,,,,,,,
2,RT Ponting,11.375,71.09,91.0,35.16,53.12,,,,,,,,
3,DJ Hussey,28.73913,122.98,1322.0,54.46,37.12,317.0,52.833333,473.0,8.952681,10.0,47.3,31.7,17.350158
4,Mohammad Hafeez,9.142857,77.11,64.0,62.5,60.24,60.0,10.0,64.0,6.4,2.0,32.0,30.0,11.666667


In [14]:
df = df1

In [15]:
df.replace([np.inf, -np.inf], np.nan, inplace=True)

In [18]:
df.isna().sum()

Player_name             0
Batting_avg           112
Batting_strikerate     59
Batting_total_runs     59
Batting_boundary       82
Batting_dot            59
Bowling_balls         202
Bowling_over          202
Bowling_runs          202
Bowling_economy       202
Bowling_wickets       202
Bowling_total_runs    277
Bowling_avg           277
Bowling_strikerate    202
dtype: int64

In [20]:
all_nan_rows = df.isna().all(axis=1)  # axis=1 checks across rows

# Print the boolean series showing rows where all values are NaN
print(all_nan_rows)

# Check if there is at least one row with all NaN values
if all_nan_rows.any():
    print("There is at least one row where all values are NaN.")
else:
    print("No row has all NaN values.")

0      False
1      False
2      False
3      False
4      False
       ...  
727    False
728    False
729    False
730    False
731    False
Length: 732, dtype: bool
No row has all NaN values.


# K-Means

In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [22]:
X_train, X_test = train_test_split(df, test_size=0.2, random_state=42)

print("\nTraining Data:")
print(X_train)
print("\nTesting Data:")
print(X_test)


Training Data:
       Player_name  Batting_avg  Batting_strikerate  Batting_total_runs  \
683      RA Shaikh          NaN                 NaN                 NaN   
250      B Sumanth    35.000000               94.59                35.0   
336        CA Lynn    34.973684              140.63              1329.0   
260     MA Agarwal    23.412281              133.32              2669.0   
439   Vishnu Vinod     9.333333               98.25                56.0   
..             ...          ...                 ...                 ...   
71        DJ Bravo    22.941176              129.57              1560.0   
106       AB Dinda     2.888889               54.17                26.0   
270      S Aravind    19.666667              103.51                59.0   
435       TS Mills     2.000000               57.14                 8.0   
102  Sohail Tanvir    12.000000              124.14                36.0   

     Batting_boundary  Batting_dot  Bowling_balls  Bowling_over  Bowling_runs  \
68

In [24]:
df.columns

Index(['Player_name', 'Batting_avg', 'Batting_strikerate',
       'Batting_total_runs', 'Batting_boundary', 'Batting_dot',
       'Bowling_balls', 'Bowling_over', 'Bowling_runs', 'Bowling_economy',
       'Bowling_wickets', 'Bowling_total_runs', 'Bowling_avg',
       'Bowling_strikerate'],
      dtype='object')

In [25]:
numeric_columns = df.select_dtypes(include=[np.number]).columns
non_numeric_columns = df[['Player_name']]  # Keep the Player_name column

scaler = StandardScaler()
df_scaled = scaler.fit_transform(df[numeric_columns])

# Step 3: Create a new DataFrame with scaled features
df_scaled = pd.DataFrame(df_scaled, columns=numeric_columns)

# Step 4: Recombine the scaled DataFrame with the non-numeric columns
final_df = pd.concat([non_numeric_columns, df_scaled], axis=1)

print(final_df)

         Player_name  Batting_avg  Batting_strikerate  Batting_total_runs  \
0         SC Ganguly     0.737071            0.016195            0.825302   
1        BB McCullum     0.937418            0.630404            2.298913   
2         RT Ponting    -0.524939           -0.868822           -0.383963   
3          DJ Hussey     0.979368            0.416831            0.799348   
4    Mohammad Hafeez    -0.718316           -0.719667           -0.409917   
..               ...          ...                 ...                 ...   
727      M Siddharth          NaN                 NaN                 NaN   
728         MP Yadav          NaN                 NaN                 NaN   
729         S Joseph          NaN                 NaN                 NaN   
730       N Thushara          NaN                 NaN                 NaN   
731      V Kaverappa          NaN                 NaN                 NaN   

     Batting_boundary  Batting_dot  Bowling_balls  Bowling_over  Bowling_ru