In [22]:
import numpy as np
import pandas as pd
from scipy import stats
import math 

In [7]:
# Load the raw statistics from the CSV file that sits next to the notebook.
csv_path = 'cricket_data.csv'
df = pd.read_csv(csv_path)

In [8]:
# Preview the first rows of the freshly loaded frame to sanity-check the parse.
df.head()

Unnamed: 0,Year,Player_Name,Matches_Batted,Not_Outs,Runs_Scored,Highest_Score,Batting_Average,Balls_Faced,Batting_Strike_Rate,Centuries,...,Matches_Bowled,Balls_Bowled,Runs_Conceded,Wickets_Taken,Best_Bowling_Match,Bowling_Average,Economy_Rate,Bowling_Strike_Rate,Four_Wicket_Hauls,Five_Wicket_Hauls
0,2024,Ruturaj Gaikwad,2,0,61,46,30.5,51,119.61,0,...,2,0,0,0,0,0,0,0,0,0
1,2023,Ruturaj Gaikwad,16,1,590,92,42.14,400,147.5,0,...,16,0,0,0,0,0,0,0,0,0
2,2022,Ruturaj Gaikwad,14,0,368,99,26.29,291,126.46,0,...,14,0,0,0,0,0,0,0,0,0
3,2021,Ruturaj Gaikwad,16,2,635,101*,45.35,466,136.26,1,...,16,0,0,0,0,0,0,0,0,0
4,2020,Ruturaj Gaikwad,6,2,204,72,51.0,169,120.71,0,...,6,0,0,0,0,0,0,0,0,0


In [12]:
# Some scores carry a trailing "*" (e.g. "101*" — presumably a not-out marker;
# confirm against the data source). Remove it as a literal character, then
# convert to numbers; anything still non-numeric becomes NaN.
highest_score_text = df["Highest_Score"].astype(str).str.replace("*", "", regex=False)
df["Highest_Score"] = pd.to_numeric(highest_score_text, errors="coerce")

In [None]:
# Columns treated as numeric statistics throughout the notebook.
numeric_cols = [
    "Matches_Batted",
    "Not_Outs",
    "Runs_Scored",
    "Highest_Score",
    "Batting_Average",
    "Balls_Faced",
    "Batting_Strike_Rate",
    "Centuries",
    "Matches_Bowled",
    "Balls_Bowled",
    "Runs_Conceded",
    "Wickets_Taken",
    "Bowling_Average",
    "Economy_Rate",
    "Bowling_Strike_Rate",
    "Four_Wicket_Hauls",
    "Five_Wicket_Hauls",
]

# Coerced copy of those columns: values that cannot be parsed as numbers
# (the raw file contains placeholders such as "No stats") turn into NaN.
df_num = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

# One row per column of df_num, one column per descriptive statistic.
# Every statistic comes straight from pandas with its default settings.
summary = pd.DataFrame({
    "Mean": df_num.mean(),
    "Median": df_num.median(),
    # mode() can return several rows when values tie; keep only the first row.
    "Mode": df_num.mode().iloc[0],
    "Min": df_num.min(),
    "Max": df_num.max(),
    "StdDev": df_num.std(),
    "Variance": df_num.var(),
    "Skewness": df_num.skew(),
    "Kurtosis": df_num.kurt(),
    "Count": df_num.count(),
    "Missing": df_num.isna().sum(),
})

# Lift the display limits so the whole summary table is rendered.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

display(summary)

Unnamed: 0,Mean,Median,Mode,Min,Max,StdDev,Variance,Skewness,Kurtosis,Count,Missing
Matches_Batted,9.008842,10.0,14.0,0.0,19.0,5.53156,30.598152,-0.180364,-1.44327,1131,41
Not_Outs,1.486295,1.0,0.0,0.0,10.0,1.697092,2.880122,1.477457,2.631718,1131,41
Runs_Scored,132.136163,42.0,0.0,0.0,973.0,171.030647,29251.482328,1.452353,1.631771,1131,41
Highest_Score,33.212202,23.0,0.0,0.0,140.0,32.43882,1052.277055,0.772784,-0.488659,1131,41
Batting_Average,17.427339,14.55,0.0,0.0,101.0,16.531119,273.277884,0.994776,1.056924,1131,41
Balls_Faced,98.55084,38.0,0.0,0.0,640.0,122.665593,15046.847634,1.370688,1.145453,1131,41
Batting_Strike_Rate,102.224527,118.9,0.0,0.0,400.0,59.236857,3509.005215,-0.206537,0.446162,1131,41
Centuries,0.04244,0.0,0.0,0.0,4.0,0.26932,0.072534,9.052678,104.752103,1131,41
Matches_Bowled,9.049558,10.0,14.0,0.0,19.0,5.516159,30.428011,-0.186948,-1.438314,1130,42
Balls_Bowled,92.946903,27.0,0.0,0.0,408.0,118.389409,14016.052094,1.056126,-0.323809,1130,42


In [None]:
from scipy.stats import gmean

# Compare the arithmetic and geometric mean of one column on random samples
# of increasing size (25%, 50%, 75% and 100% of the non-missing values).
col = "Runs_Scored"

df[col] = pd.to_numeric(df[col], errors='coerce')

subsets = [0.25, 0.5, 0.75, 1.0]
mean_comparison = {}

for frac in subsets:
    # Same seed for every fraction so the samples are reproducible.
    subset = df[col].dropna().sample(frac=frac, random_state=42)
    # Only strictly positive values are passed to gmean, so the two means
    # are computed over different sets of rows whenever zeros are present.
    positive_values = subset[subset > 0]
    mean_comparison[f"{int(frac*100)}% data"] = {
        "arithmetic_mean": subset.mean(),
        "geometric_mean": gmean(positive_values),
    }

pd.DataFrame(mean_comparison).T


Unnamed: 0,arithmetic_mean,geometric_mean
25% data,133.720848,56.704387
50% data,130.263251,56.215978
75% data,133.880896,55.220643
100% data,132.136163,56.295166


In [None]:

# Column projections: a single column, then a pair of columns.
subset1 = df.loc[:, ["Runs_Scored"]]

subset2 = df.loc[:, ["Player_Name", "Matches_Batted"]]

# Number of rows in the frame.
print("Total Rows:", df.shape[0])

print("First row:\n", df.head(1))

print("Last row:\n", df.tail(1))

# head() and tail() return five rows when called without an argument.
print(df.head())

print(df.tail())

# Fixed seed so the random preview is the same on every run.
print(df.sample(n=5, random_state=42))

# .loc slices by label and includes the end label, so this keeps every row
# whose index label is up to and including 9, restricted to three columns.
subset3 = df.loc[:9, ["Player_Name", "Runs_Scored", "Wickets_Taken"]]
print(subset3)


Total Rows: 1172
First row:
    Year      Player_Name Matches_Batted Not_Outs  Runs_Scored  Highest_Score  \
0  2024  Ruturaj Gaikwad              2        0         61.0           46.0   

  Batting_Average Balls_Faced Batting_Strike_Rate Centuries Half_Centuries  \
0            30.5          51              119.61         0              0   

  Fours Sixes Catches_Taken Stumpings Matches_Bowled Balls_Bowled  \
0     8     1             0         0              2            0   

  Runs_Conceded Wickets_Taken Best_Bowling_Match Bowling_Average Economy_Rate  \
0             0             0                  0               0            0   

  Bowling_Strike_Rate Four_Wicket_Hauls Five_Wicket_Hauls  
0                   0                 0                 0  
Last row:
           Year             Player_Name Matches_Batted  Not_Outs  Runs_Scored  \
1171  No stats  Jhathavedh Subramanyan       No stats  No stats          NaN   

      Highest_Score Batting_Average Balls_Faced Batting_Str

In [23]:

# Convert every statistic column of df to a numeric dtype in place;
# values that cannot be parsed become NaN.
for col in numeric_cols:
    coerced = pd.to_numeric(df[col], errors='coerce')
    df[col] = coerced

# Numeric-only view of the data used by the hand-written statistics below.
df_num = df.loc[:, numeric_cols]

def manual_count(col):
    """Return how many entries of ``col`` are not missing.

    An entry is treated as missing when ``pd.isna`` reports it as such
    (NaN, None, ...). ``col`` is any iterable of scalar values.
    """
    present = 0
    for value in col:
        if pd.isna(value):
            continue
        present += 1
    return present

def manual_mean(col):
    """Return the arithmetic mean of the non-missing entries of ``col``.

    Missing entries (as judged by ``pd.isna``) are ignored. When nothing
    remains after dropping them, ``np.nan`` is returned.
    """
    present = [value for value in col if not pd.isna(value)]
    if not present:
        return np.nan
    total = sum(present)
    return total / len(present)

def manual_median(col):
    """Return the median of the non-missing entries of ``col``.

    The values are sorted; for an odd count the middle value is returned,
    for an even count the average of the two middle values. Returns
    ``np.nan`` when there are no non-missing entries.
    """
    ordered = [value for value in col if not pd.isna(value)]
    ordered.sort()
    size = len(ordered)
    if size == 0:
        return np.nan
    half, is_odd = divmod(size, 2)
    if is_odd:
        return ordered[half]
    # Even count: average the two values that straddle the middle.
    return (ordered[half - 1] + ordered[half]) / 2

def manual_mode(col):
    """Return the most frequent non-missing value of ``col``.

    Missing entries (as judged by ``pd.isna``) are ignored. When several
    values share the highest frequency, the smallest of them is returned,
    so the result does not depend on the order in which values appear.
    Returns ``np.nan`` when there are no non-missing entries.
    """
    freq = {}
    for x in col:
        if not pd.isna(x):
            freq[x] = freq.get(x, 0) + 1
    if not freq:
        return np.nan
    highest = max(freq.values())
    # Deterministic tie-break: smallest value among the most frequent ones.
    return min(value for value, seen in freq.items() if seen == highest)

def manual_min(col):
    """Return the smallest non-missing entry of ``col``.

    Missing entries (as judged by ``pd.isna``) are ignored. Returns
    ``np.nan`` when there are no non-missing entries.
    """
    col_clean = [x for x in col if not pd.isna(x)]
    # A single linear scan is enough; sorting the whole list is unnecessary.
    return min(col_clean) if col_clean else np.nan

def manual_max(col):
    """Return the largest non-missing entry of ``col``.

    Missing entries (as judged by ``pd.isna``) are ignored. Returns
    ``np.nan`` when there are no non-missing entries.
    """
    col_clean = [x for x in col if not pd.isna(x)]
    # A single linear scan is enough; sorting the whole list is unnecessary.
    return max(col_clean) if col_clean else np.nan

def manual_variance(col, ddof=0):
    """Return the variance of the non-missing entries of ``col``.

    Parameters
    ----------
    col : iterable of scalars
        Input values; entries flagged by ``pd.isna`` are ignored.
    ddof : int, default 0
        Delta degrees of freedom. The sum of squared deviations is divided
        by ``n - ddof``. The default ``0`` gives the population variance
        (the previous, hard-coded behaviour); ``1`` gives the sample
        variance.

    Returns
    -------
    float
        The variance, or ``np.nan`` when there are no non-missing entries
        or when ``n - ddof`` is not positive.
    """
    col_clean = [x for x in col if not pd.isna(x)]
    n = len(col_clean)
    divisor = n - ddof
    if n == 0 or divisor <= 0:
        return np.nan
    # The list is already free of missing values, so the mean is computed
    # directly instead of filtering a second time.
    mean = sum(col_clean) / n
    return sum((x - mean) ** 2 for x in col_clean) / divisor

def manual_std(col, ddof=0):
    """Return the standard deviation of the non-missing entries of ``col``.

    This is the square root of ``manual_variance(col, ddof)``; see that
    function for the meaning of ``ddof``. Returns ``np.nan`` whenever the
    variance is ``np.nan``.
    """
    var = manual_variance(col, ddof=ddof)
    return math.sqrt(var) if not pd.isna(var) else np.nan

def manual_skewness(col):
    """Return the skewness of the non-missing entries of ``col``.

    Computed as the third central moment (divided by ``n``) over the cube
    of ``manual_std``; no bias correction is applied. Returns ``np.nan``
    when fewer than two non-missing entries exist or when the standard
    deviation is zero.
    """
    observed = [value for value in col if not pd.isna(value)]
    size = len(observed)
    if size < 2:
        return np.nan
    center = manual_mean(observed)
    spread = manual_std(observed)
    if spread == 0:
        # All values identical: the ratio is undefined.
        return np.nan
    third_moment = sum((value - center) ** 3 for value in observed) / size
    return third_moment / (spread ** 3)

def manual_kurtosis(col):
    """Return the excess kurtosis of the non-missing entries of ``col``.

    Computed as the fourth central moment (divided by ``n``) over the
    fourth power of ``manual_std``, minus 3; no bias correction is
    applied. Returns ``np.nan`` when fewer than two non-missing entries
    exist or when the standard deviation is zero.
    """
    observed = [value for value in col if not pd.isna(value)]
    size = len(observed)
    if size < 2:
        return np.nan
    center = manual_mean(observed)
    spread = manual_std(observed)
    if spread == 0:
        # All values identical: the ratio is undefined.
        return np.nan
    fourth_moment = sum((value - center) ** 4 for value in observed) / size
    return fourth_moment / (spread ** 4) - 3

def manual_missing(col):
    """Return how many entries of ``col`` are missing according to ``pd.isna``."""
    absent = 0
    for value in col:
        if pd.isna(value):
            absent += 1
    return absent

# Label -> hand-written statistic. Insertion order fixes both the order in
# which the functions are called and the column order of the final table.
manual_stats = {
    "Count": manual_count,
    "Missing": manual_missing,
    "Mean": manual_mean,
    "Median": manual_median,
    "Mode": manual_mode,
    "Min": manual_min,
    "Max": manual_max,
    "Variance": manual_variance,
    "StdDev": manual_std,
    "Skewness": manual_skewness,
    "Kurtosis": manual_kurtosis,
}

summary_manual = {}
for col in numeric_cols:
    # The manual functions iterate over plain Python values.
    column_data = df_num[col].tolist()
    summary_manual[col] = {
        label: compute(column_data) for label, compute in manual_stats.items()
    }

# The dict is keyed by column name, so transpose to get one row per column.
summary_manual_df = pd.DataFrame(summary_manual).T

# Lift the display limits so the whole table is rendered.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
display(summary_manual_df)


Unnamed: 0,Count,Missing,Mean,Median,Mode,Min,Max,Variance,StdDev,Skewness,Kurtosis
Matches_Batted,1131.0,41.0,9.008842,10.0,14.0,0.0,19.0,30.571098,5.529114,-0.180124,-1.442198
Not_Outs,1131.0,41.0,1.486295,1.0,0.0,0.0,10.0,2.877575,1.696342,1.475497,2.614797
Runs_Scored,1131.0,41.0,132.136163,42.0,0.0,0.0,973.0,29225.618949,170.95502,1.450426,1.619266
Highest_Score,1131.0,41.0,33.212202,23.0,0.0,0.0,140.0,1051.346659,32.424476,0.771758,-0.491802
Batting_Average,1131.0,41.0,17.427339,14.55,0.0,0.0,101.0,273.036259,16.523809,0.993457,1.046957
Balls_Faced,1131.0,41.0,98.55084,38.0,0.0,0.0,640.0,15033.543613,122.611352,1.368869,1.135095
Batting_Strike_Rate,1131.0,41.0,102.224527,118.9,0.0,0.0,400.0,3505.902646,59.210663,-0.206263,0.438891
Centuries,1131.0,41.0,0.04244,0.0,0.0,0.0,4.0,0.072469,0.269201,9.040668,104.28428
Matches_Bowled,1130.0,42.0,9.049558,10.0,14.0,0.0,19.0,30.401084,5.513718,-0.1867,-1.437263
Balls_Bowled,1130.0,42.0,92.946903,27.0,0.0,0.0,408.0,14003.648508,118.337012,1.054724,-0.327683
