In [2]:
import pandas as pd

# 1. Load all CSVs
player_df = pd.read_csv('Player.csv')
team_df = pd.read_csv('Team.csv')
season_df = pd.read_csv('Season.csv')
match_df = pd.read_csv('Match.csv')
player_match_df = pd.read_csv('Player_Match.csv')
matches_df = pd.read_csv('matches.csv')  # Optional, different schema

# 2. Merge Match with Season
match_season_df = pd.merge(match_df, season_df, left_on='Season_Id', right_on='Season_Id', how='left')

# 3. Add Team Names (for both teams in a match)
match_season_team_df = pd.merge(
    match_season_df, team_df, left_on='Team_Name_Id', right_on='Team_Id', how='left', suffixes=('', '_team')
)
match_season_team_df = pd.merge(
    match_season_team_df, team_df, left_on='Opponent_Team_Id', right_on='Team_Id', how='left', suffixes=('', '_opponent')
)

# 4. Add Player Info to Player_Match
player_match_player_df = pd.merge(
    player_match_df, player_df, left_on='Player_Id', right_on='Player_Id', how='left'
)
# 5. Add Team Info to Player_Match
player_match_player_team_df = pd.merge(
    player_match_player_df, team_df, left_on='Team_Id', right_on='Team_Id', how='left'
)

# 6. Merge Player_Match (with player and team info) to Match (with all info)
final_df = pd.merge(
    player_match_player_team_df, match_season_team_df, left_on='Match_Id', right_on='Match_Id', how='left', suffixes=('_player', '_match')
)

# 7. (Optional) If you want to also include the Kaggle-style matches.csv, you can merge on date/team/venue if needed.

# 8. Save to final CSV
final_df.to_csv('combined_ipl_data.csv', index=False)

print("Combined CSV shape:", final_df.shape)


Combined CSV shape: (12694, 42)


In [3]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import StackingClassifier, StackingRegressor
from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import accuracy_score, mean_absolute_error

In [4]:
from datetime import datetime

# Custom date parser function
def custom_date_parser(date_str):
    return datetime.strptime(date_str, '%d-%b-%y')

df = pd.read_csv('combined_ipl_data.csv', parse_dates=['Match_Date'], date_parser=custom_date_parser)

  df = pd.read_csv('combined_ipl_data.csv', parse_dates=['Match_Date'], date_parser=custom_date_parser)


In [5]:
df = pd.read_csv('combined_ipl_data.csv', parse_dates=['Match_Date'])

# Feature Engineering Pipeline
def create_features(df):
    # Team Momentum Features
    for team in ['Team_Name_Id', 'Opponent_Team_Id']:
        df[f'{team}_last5_wins'] = df.groupby(team)['Match_Winner_Id'].transform(
            lambda x: x.rolling(5, min_periods=1).mean()
        )
        df[f'{team}_last3_runs'] = df.groupby(team)['total_runs'].transform(
            lambda x: x.rolling(3, min_periods=1).mean()
        )
    
    # Player Form Features
    df['player_last5_avg'] = df.groupby('Player_Id')['player_runs'].transform(
        lambda x: x.rolling(5, min_periods=1).mean()
    )
    df['player_last5_wickets'] = df.groupby('Player_Id')['player_wickets'].transform(
        lambda x: x.rolling(5, min_periods=1).mean()
    )
    
    # Venue-specific Features
    df['venue_avg_score'] = df.groupby('Venue_Name')['total_runs'].transform('mean')
    
    # Time Delta Features
    df['days_since_last_match'] = df.groupby('Team_Name_Id')['Match_Date'].diff().dt.days
    
    return df

# Preprocessing
numerical_features = ['venue_avg_score', 'Team_Name_Id_last5_wins', 
                     'player_last5_avg', 'days_since_last_match']
categorical_features = ['Venue_Name', 'Team_Name', 'Opponent_Team_Name']

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

# Ensemble Model Architecture
match_winner_estimators = [
    ('xgb', XGBClassifier(use_label_encoder=False)),
    ('lgbm', LGBMClassifier()),
    ('nn', MLPClassifier(hidden_layer_sizes=(64, 32)))
]

score_estimators = [
    ('xgb', XGBRegressor()),
    ('lgbm', LGBMRegressor()),
    ('nn', MLPRegressor(hidden_layer_sizes=(64, 32)))
]

# Final Models
match_winner_model = Pipeline([
    ('preprocessor', preprocessor),
    ('ensemble', StackingClassifier(
        estimators=match_winner_estimators,
        final_estimator=MLPClassifier(hidden_layer_sizes=(32, 16))
    ))
])

score_prediction_model = Pipeline([
    ('preprocessor', preprocessor),
    ('ensemble', StackingRegressor(
        estimators=score_estimators,
        final_estimator=MLPRegressor(hidden_layer_sizes=(32, 16))
    ))
])

# Player Performance Model
player_performance_model = Pipeline([
    ('preprocessor', ColumnTransformer([
        ('num', StandardScaler(), ['player_last5_avg', 'days_since_last_match']),
        ('cat', OneHotEncoder(), ['Batting_Hand', 'Bowling_Skill'])
    ])),
    ('regressor', XGBRegressor())
])

# Training Framework
def train_models(df):
    # Time Series Split
    tscv = TimeSeriesSplit(n_splits=5)
    
    # Match Winner Prediction
    X = df.drop(['Match_Winner_Id', 'total_runs'], axis=1)
    y_winner = df['Match_Winner_Id']
    
    # Score Prediction
    y_score = df['total_runs']
    
    # Train models
    match_winner_model.fit(X, y_winner)
    score_prediction_model.fit(X, y_score)
    
    # Player Performance Model
    player_X = df[['Player_Id', 'Batting_Hand', 'Bowling_Skill', 
                 'player_last5_avg', 'days_since_last_match']]
    player_y = df[['player_runs', 'player_wickets']]
    player_performance_model.fit(player_X, player_y)
    
    return match_winner_model, score_prediction_model, player_performance_model

# Feature Importance Analysis
def analyze_features(model):
    ohe_categories = preprocessor.named_transformers_['cat'].categories_
    feature_names = numerical_features + \
        [f"cat_{i}" for i in range(sum(len(cat) for cat in ohe_categories))]
    
    importances = np.mean([est.feature_importances_ 
                          for est in model.named_steps['ensemble'].estimators_], axis=0)
    
    return pd.DataFrame({'feature': feature_names, 'importance': importances})

# Prediction Interface
def predict_match(team1, team2, venue, players):
    # Create input DataFrame
    input_data = pd.DataFrame({
        'Team_Name_Id': [team1],
        'Opponent_Team_Id': [team2],
        'Venue_Name': [venue],
        # Add other features based on historical data
    })
    
    # Generate predictions
    winner_prob = match_winner_model.predict_proba(input_data)
    score_pred = score_prediction_model.predict(input_data)
    
    # Player predictions
    player_preds = []
    for player in players:
        player_data = df[df['Player_Id'] == player].iloc[-1]
        pred = player_performance_model.predict(player_data)
        player_preds.append({
            'player': player,
            'predicted_runs': pred[0],
            'predicted_wickets': pred[1]
        })
    
    return {
        'winner_probability': winner_prob,
        'score_range': (score_pred - 20, score_pred + 20),
        'player_predictions': player_preds
    }

  df = pd.read_csv('combined_ipl_data.csv', parse_dates=['Match_Date'])
