# IPL Data Enrichment & Analysis

This notebook performs a comprehensive analysis of the IPL dataset, including:
1. Data Loading & Merging
2. Feature Engineering (Impact Scores, MVP Score)
3. Aggregation (Season Stats)
4. Insights (Top 15 tables)
5. Machine Learning (MVP Prediction & Match Outcome Prediction)

In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# Set paths
# DATA_DIR is the folder holding the raw CSVs. It can be overridden with the
# IPL_DATA_DIR environment variable so the notebook runs on other machines
# without editing this cell; the original local path remains the default.
DATA_DIR = os.environ.get("IPL_DATA_DIR", r"c:\ML REPO GITHUB\IPL-RR-DATASET")
# Enriched CSVs are written to a sub-folder of the data directory.
OUTPUT_DIR = os.path.join(DATA_DIR, "UPDATED CSV")
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [3]:
def load_data(data_dir=None):
    """Read the four raw CSV tables used by the notebook.

    Each file is first read with pandas' default encoding; if that raises
    ``UnicodeDecodeError`` the same file is re-read as latin-1. The fallback
    is applied uniformly to every file rather than to the stats file only.

    Parameters
    ----------
    data_dir : str or path-like, optional
        Folder containing ``matches.csv``, ``players.csv``,
        ``competitions.csv`` and ``player_match_stats.csv``. Defaults to the
        module-level ``DATA_DIR``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(matches, players, stats, competitions)`` in that order.
    """
    print("Loading data...")
    base_dir = DATA_DIR if data_dir is None else data_dir

    def _read_csv(filename):
        # Try the default (UTF-8) decoding first, then fall back to latin-1.
        path = os.path.join(base_dir, filename)
        try:
            return pd.read_csv(path)
        except UnicodeDecodeError:
            return pd.read_csv(path, encoding='latin1')

    matches = _read_csv("matches.csv")
    players = _read_csv("players.csv")
    competitions = _read_csv("competitions.csv")
    stats = _read_csv("player_match_stats.csv")

    return matches, players, stats, competitions

def process_data(matches, players, stats, competitions):
    """Join the raw tables and derive per-player, per-match metrics.

    Parameters
    ----------
    matches : pandas.DataFrame
        Must contain ``comp_id``, ``match_id`` and the match columns selected
        below (venue, teams, toss and result fields).
    players : pandas.DataFrame
        Must contain ``player_id``, ``batting_type``, ``bowling_type`` and
        ``nationality``.
    stats : pandas.DataFrame
        One row per player per match with the batting, bowling and fielding
        counters referenced below, plus ``strike_rate``.
    competitions : pandas.DataFrame
        Must contain ``comp_id`` and ``season``.

    Returns
    -------
    pandas.DataFrame
        ``stats`` left-joined with match and player attributes, plus the
        derived columns ``batting_impact``, ``bowling_impact``,
        ``fielding_points``, ``mvp_score``, ``boundary_pct``,
        ``balls_bowled_est`` and ``dot_ball_pct``. The input frames are not
        modified.
    """
    print("Processing data...")

    # Merge Matches with Competitions to get Season
    matches = pd.merge(matches, competitions[['comp_id', 'season']], on='comp_id', how='left')

    # Merge Stats with Matches
    match_cols = ['match_id', 'season', 'venue_name', 'home_team', 'away_team',
                  'toss_win_team_id', 'toss_opted', 'win_team_id', 'result']
    merged = pd.merge(stats, matches[match_cols], on='match_id', how='left')

    # Merge with Players
    player_cols = ['player_id', 'batting_type', 'bowling_type', 'nationality']
    merged = pd.merge(merged, players[player_cols], on='player_id', how='left')

    # Cleaning: every counter is coerced to a number and missing/unparseable
    # values become 0, the same treatment strike_rate gets below.
    fill_zeros = ['runs_scored', 'balls_faced', 'no_of_sixes', 'no_of_fours', 'overs_bowled',
                  'runs_conceded', 'wicket_taken', 'maiden_overs_bowled', 'dot_balls_bowled',
                  'number_of_catches_taken', 'number_of_stumping']
    for col in fill_zeros:
        if col in merged.columns:
            merged[col] = pd.to_numeric(merged[col], errors='coerce').fillna(0)

    # --- Metric Calculations ---

    # Batting Impact: strike-rate-weighted runs plus a bonus of 1 per four and 2 per six.
    merged['strike_rate'] = pd.to_numeric(merged['strike_rate'], errors='coerce').fillna(0)
    merged['batting_impact'] = (
        (merged['runs_scored'] * merged['strike_rate'] / 100) +
        (merged['no_of_fours'] * 1) +
        (merged['no_of_sixes'] * 2)
    )

    # Bowling Impact: 20 per wicket + 1 per dot ball - runs conceded per over.
    # Computed column-wise instead of with a row-wise apply(); rows with no
    # overs bowled score 0, and an empty frame yields an empty column.
    bowled = merged['overs_bowled'] > 0
    # .where(bowled) blanks the zero-over rows so they never act as a divisor.
    runs_per_over = merged['runs_conceded'] / merged['overs_bowled'].where(bowled)
    merged['bowling_impact'] = (
        (merged['wicket_taken'] * 20) + (merged['dot_balls_bowled'] * 1) - runs_per_over
    ).where(bowled, 0)

    # MVP Score: 8 per catch and 12 per stumping, added to both impact scores.
    merged['fielding_points'] = (merged['number_of_catches_taken'] * 8) + (merged['number_of_stumping'] * 12)
    merged['mvp_score'] = merged['batting_impact'] + merged['bowling_impact'] + merged['fielding_points']

    # Boundary %: share of balls faced that were hit for four or six.
    merged['boundary_pct'] = np.where(
        merged['balls_faced'] > 0,
        (merged['no_of_fours'] + merged['no_of_sixes']) / merged['balls_faced'] * 100,
        0,
    )

    # Dot Ball %
    # NOTE(review): this treats overs_bowled as a decimal number of overs. If
    # the source uses cricket notation (e.g. 3.4 = 3 overs + 4 balls) the ball
    # estimate, and the per-over rate above, are slightly off - confirm.
    merged['balls_bowled_est'] = merged['overs_bowled'] * 6
    merged['dot_ball_pct'] = np.where(
        merged['balls_bowled_est'] > 0,
        merged['dot_balls_bowled'] / merged['balls_bowled_est'] * 100,
        0,
    )

    return merged

In [4]:
def create_season_stats(match_stats):
    """Aggregate per-match player rows into one row per player per season.

    Parameters
    ----------
    match_stats : pandas.DataFrame
        Per-match player rows containing the grouping keys (``season``,
        ``player_id``, ``player_name``, ``nationality``, ``batting_type``,
        ``bowling_type``) and the columns ``match_id``, ``runs_scored``,
        ``balls_faced``, ``wicket_taken``, ``overs_bowled``,
        ``runs_conceded`` and ``mvp_score``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct key combination with season totals and:

        * ``batting_avg`` - runs per match played (not runs per dismissal).
        * ``true_strike_rate`` - runs per 100 balls faced, 0 if none faced.
        * ``bowling_avg`` - runs conceded per wicket, ``inf`` with no wickets.
        * ``economy`` - runs conceded per over, 0 if no overs bowled.
        * ``consistency_score`` - ``100 / std(mvp_score)``; 0 when the
          standard deviation is 0 or undefined (a single match).
    """
    print("Aggregating season stats...")
    group_keys = ['season', 'player_id', 'player_name', 'nationality', 'batting_type', 'bowling_type']
    # dropna=False keeps players whose nationality / batting_type /
    # bowling_type is missing; the default would silently drop every such
    # row from the season table.
    season_grp = match_stats.groupby(group_keys, dropna=False)

    summary = season_grp.agg(
        matches=('match_id', 'count'),
        total_runs=('runs_scored', 'sum'),
        total_balls_faced=('balls_faced', 'sum'),
        total_wickets=('wicket_taken', 'sum'),
        total_overs=('overs_bowled', 'sum'),
        total_runs_conceded=('runs_conceded', 'sum'),
        total_mvp=('mvp_score', 'sum'),
        mvp_std=('mvp_score', 'std')
    ).reset_index()

    # Each ratio is guarded so a zero denominator maps to a sentinel value.
    summary['batting_avg'] = np.where(summary['matches'] > 0, summary['total_runs'] / summary['matches'], 0)
    summary['true_strike_rate'] = np.where(summary['total_balls_faced'] > 0, (summary['total_runs'] / summary['total_balls_faced']) * 100, 0)
    summary['bowling_avg'] = np.where(summary['total_wickets'] > 0, summary['total_runs_conceded'] / summary['total_wickets'], np.inf)
    summary['economy'] = np.where(summary['total_overs'] > 0, summary['total_runs_conceded'] / summary['total_overs'], 0)
    # A NaN std (single match) fails the > 0 test and therefore scores 0.
    summary['consistency_score'] = np.where(summary['mvp_std'] > 0, 100 / summary['mvp_std'], 0)

    return summary

In [5]:
# Run the pipeline end to end: load the raw tables, derive the per-match
# player metrics, then roll them up per player and season.
matches, players, stats, competitions = load_data()
enriched_match_stats = process_data(matches, players, stats, competitions)
season_stats = create_season_stats(enriched_match_stats)

# Save CSVs: both tables go to OUTPUT_DIR without the pandas index column.
csv_exports = (
    (enriched_match_stats, "enriched_match_player_stats_v1.csv"),
    (season_stats, "enriched_player_season_stats_v1.csv"),
)
for frame, filename in csv_exports:
    frame.to_csv(os.path.join(OUTPUT_DIR, filename), index=False)
print("CSVs Saved.")

Loading data...
Processing data...
Aggregating season stats...
CSVs Saved.


In [6]:
# Show every column when printing the leaderboards below.
pd.set_option('display.max_columns', None)

# Leaderboard 1: highest season MVP totals.
mvp_columns = ['player_name', 'season', 'matches', 'total_mvp']
top_mvp = season_stats.sort_values('total_mvp', ascending=False)[mvp_columns].head(15)
print("\n--- TOP 15: MVP SCORE ---")
print(top_mvp)

# Leaderboard 2: most consistent players, limited to seasons with at least
# 10 matches.
consistency_columns = ['player_name', 'season', 'total_mvp', 'consistency_score']
regulars = season_stats[season_stats['matches'] >= 10]
top_consistency = regulars.sort_values('consistency_score', ascending=False)[consistency_columns].head(15)
print("\n--- TOP 15: CONSISTENCY SCORE ---")
print(top_consistency)


--- TOP 15: MVP SCORE ---
              player_name season  matches    total_mvp
7277      Nicholas Pooran   2024       67  4624.904200
4400      Nicholas Pooran   2023       58  4049.103900
2764            Tim David   2022       63  3531.487733
7708      Abhishek Sharma   2024       47  3452.367367
7069        Andre Russell   2024       52  3155.623787
10803        Jason Holder   2025       51  3137.927936
4454           Imad Wasim   2023       50  3128.863897
10857     Nicholas Pooran   2025       46  3045.102400
6963       Faf du Plessis   2024       46  3036.170900
7435     Heinrich Klaasen   2024       48  2983.908800
7174          Travis Head   2024       36  2978.709057
10883     Shimron Hetmyer   2025       56  2974.772800
1966        Rilee Rossouw   2022       41  2881.398500
11264     Abhishek Sharma   2025       31  2876.875000
7816   Rahmanullah Gurbaz   2024       52  2847.686400

--- TOP 15: CONSISTENCY SCORE ---
              player_name season   total_mvp  consistency_

In [8]:

print("\n--- Machine Learning: MVP Prediction ---")

# Keep only rows where the target and every raw feature column are present.
# The trailing .copy() yields an independent frame, so the column
# assignments below do not trigger SettingWithCopyWarning.
required_cols = ['mvp_score', 'batting_order', 'venue_name', 'season', 'batting_type', 'bowling_type']
model_df = enriched_match_stats.dropna(subset=required_cols).copy()

# One encoder per categorical input; each maps string labels to integer codes.
le_venue = LabelEncoder()
le_season = LabelEncoder()
le_role = LabelEncoder()

# Batting and bowling type are fused into a single "style" label per player.
player_style = model_df['batting_type'].astype(str) + "_" + model_df['bowling_type'].astype(str)

model_df['venue_encoded'] = le_venue.fit_transform(model_df['venue_name'].astype(str))
model_df['season_encoded'] = le_season.fit_transform(model_df['season'].astype(str))
model_df['style_encoded'] = le_role.fit_transform(player_style)

features = ['batting_order', 'venue_encoded', 'style_encoded', 'season_encoded']
target = 'mvp_score'

X, y = model_df[features], model_df[target]

# 80/20 random row split with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

regressor = RandomForestRegressor(n_estimators=100, random_state=42)
regressor.fit(X_train, y_train)

# Score on the held-out rows.
y_pred = regressor.predict(X_test)
mvp_r2 = r2_score(y_test, y_pred)
print(f"MVP Prediction - R2 Score: {mvp_r2:.2f}")




--- Machine Learning: MVP Prediction ---
MVP Prediction - R2 Score: -0.12


In [9]:
print("\n--- Machine Learning: Match Winner Prediction ---")

# Collapse the per-player rows to one row per distinct combination of the
# match-level columns, dropping rows with any missing value. The explicit
# .copy() makes the result an independent frame so the column assignments
# below cannot raise SettingWithCopyWarning (same approach as the MVP
# prediction cell).
match_outcomes = (
    enriched_match_stats[['match_id', 'toss_win_team_id', 'toss_opted', 'win_team_id', 'venue_name']]
    .drop_duplicates()
    .dropna()
    .copy()
)

# 1 when the toss winner chose to bat, 0 otherwise.
# NOTE(review): this relies on the exact label 'Batting'; any other spelling
# or casing in toss_opted is encoded as 0 - confirm against the data.
match_outcomes['toss_decision_encoded'] = (match_outcomes['toss_opted'] == 'Batting').astype(int)

# Target: 1 when the team that won the toss also won the match.
match_outcomes['toss_winner_won'] = (match_outcomes['toss_win_team_id'] == match_outcomes['win_team_id']).astype(int)

le_venue_clf = LabelEncoder()
match_outcomes['venue_encoded'] = le_venue_clf.fit_transform(match_outcomes['venue_name'].astype(str))

X_clf = match_outcomes[['toss_decision_encoded', 'venue_encoded']]
y_clf = match_outcomes['toss_winner_won']

# 80/20 random split with a fixed seed for repeatability.
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_clf, y_clf, test_size=0.2, random_state=42)
clf = GradientBoostingClassifier(random_state=42)
clf.fit(X_train_c, y_train_c)

# The reported accuracy is for the binary "toss winner won" target.
print(f"Match Winner Accuracy: {accuracy_score(y_test_c, clf.predict(X_test_c)):.2f}")


--- Machine Learning: Match Winner Prediction ---
Match Winner Accuracy: 0.54
