In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score 
#-------------
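# Render matplotlib figures inline and silence all warnings for cleaner output.
# NOTE: the blanket 'ignore' filter below also hides pandas/scikit-learn warnings raised by later cells.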
%matplotlib inline 
import warnings
warnings.filterwarnings('ignore')

In [34]:
# Read the match-level and ball-by-ball tables from the working directory.
matches = pd.read_csv('matches.csv')
deliveries = pd.read_csv('deliveries.csv')

# Expose the match identifier under the same name ('match_id') that the
# deliveries table is joined on later. DataFrame.rename ignores mapping keys
# that are not present, so no explicit column check is required.
matches = matches.rename(columns={'id': 'match_id'})

# Standardize team names to account for franchise rebrandings
def fix_names(df, col_name):
    """Rewrite legacy franchise names in one column to their current names.

    The column is overwritten on ``df`` itself (the caller's frame is
    modified) and the same frame object is returned. Values that are not
    keys of the mapping, including missing values, are left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``col_name``.
    col_name : str
        Name of the column holding team names.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``col_name`` updated.
    """
    legacy_to_current = dict([
        ('Delhi Daredevils', 'Delhi Capitals'),
        ('Deccan Chargers', 'Sunrisers Hyderabad'),
        ('Pune Warriors', 'Rising Pune Supergiant'),
        ('Kings XI Punjab', 'Punjab Kings'),
    ])
    renamed_column = df[col_name].replace(legacy_to_current)
    df[col_name] = renamed_column
    return df

# Normalize franchise names in every column that stores a team name, so the
# same franchise is spelled identically across both tables.
matches = fix_names(matches, 'team1')
matches = fix_names(matches, 'team2')
matches = fix_names(matches, 'winner')
matches = fix_names(matches, 'toss_winner')
deliveries = fix_names(deliveries, 'batting_team')
deliveries = fix_names(deliveries, 'bowling_team')

# Filter for active IPL franchises only
current_teams = [
    'Chennai Super Kings', 'Delhi Capitals', 'Gujarat Titans', 'Kolkata Knight Riders',
    'Lucknow Super Giants', 'Mumbai Indians', 'Punjab Kings', 'Rajasthan Royals',
    'Royal Challengers Bangalore', 'Sunrisers Hyderabad'
]

# Keep a row only when BOTH sides are in `current_teams`.
# .copy() makes each result an independent frame: later cells add columns to
# `matches`, and assigning into a boolean-mask slice is what triggers pandas'
# SettingWithCopyWarning (currently hidden by the global warnings filter).
matches = matches[matches['team1'].isin(current_teams) & matches['team2'].isin(current_teams)].copy()
deliveries = deliveries[deliveries['batting_team'].isin(current_teams) & deliveries['bowling_team'].isin(current_teams)].copy()

print(f"Data Pipeline Ready. Matches: {matches.shape[0]}, Deliveries: {deliveries.shape[0]}")

Data Pipeline Ready. Matches: 965, Deliveries: 230282


In [35]:
# --- Feature Engineering for Pre-Match Predictions (Random Forest Logic) ---
# Objective: Quantify team strength and venue biases using historical data.
# NOTE: every statistic below is computed over the whole `matches` table, so it
# also covers the matches that the training cell later holds out for testing.

# 1. Calculate Historical Win % (Proxy for Team Strength)
# Series.add(..., fill_value=0) keeps a team that appears in only one of the
# team1/team2 columns; a plain `+` aligns on the index and would yield NaN.
team_matches = matches['team1'].value_counts().add(matches['team2'].value_counts(), fill_value=0)
# Reindex so a team with no wins gets an explicit 0 instead of going missing
# (a missing entry would later be filled with the 0.5 default).
team_wins = matches['winner'].value_counts().reindex(team_matches.index, fill_value=0)
team_win_rate = (team_wins / team_matches).to_dict()

# Map metrics to the main dataframe (0.5 is the fallback for unmapped teams)
matches['t1_win_rate'] = matches['team1'].map(team_win_rate).fillna(0.5)
matches['t2_win_rate'] = matches['team2'].map(team_win_rate).fillna(0.5)

# 2. Calculate Venue Bias (Batting First Advantage)
# Share of matches per venue whose `result` equals 'runs'.
venue_stats = matches['result'].eq('runs').groupby(matches['venue']).mean().to_dict()
matches['venue_bat_first_win_rate'] = matches['venue'].map(venue_stats).fillna(0.5)

# 3. Initialize Encoders for Categorical Variables
encoder = LabelEncoder()           # team names (shared by team1/team2/toss_winner/winner)
venue_encoder = LabelEncoder()     # venue names
toss_dec_encoder = LabelEncoder()  # toss decision values

# Fit encoders on the global set of teams and venues
all_teams = pd.concat([matches['team1'], matches['team2']]).unique()
encoder.fit(all_teams)
venue_encoder.fit(matches['venue'])
toss_dec_encoder.fit(matches['toss_decision'])

print("Pre-Match Feature Engineering Complete.")

Pre-Match Feature Engineering Complete.


In [38]:
# --- Feature Engineering for Live Chase Predictions (Logistic Regression Logic) ---
# Objective: Calculate real-time match state (Equation, CRR, RRR) ball-by-ball.

# 1. Calculate Target Scores
# Select 'total_runs' BEFORE aggregating so only that column is summed
# (summing the whole frame also tries to aggregate every text column).
total_score_df = deliveries.groupby(['match_id', 'inning'])['total_runs'].sum().reset_index()
# .copy() so the new 'target' column is written to an independent frame.
total_score_df = total_score_df[total_score_df['inning'] == 1].copy()
total_score_df['target'] = total_score_df['total_runs'] + 1

# Merge target scores into match data (inner joins: matches without
# first-innings deliveries are dropped)
match_df = matches.merge(total_score_df[['match_id', 'target']], on='match_id')
delivery_df = match_df.merge(deliveries, on='match_id')

# 2. Filter for 2nd Innings (Chase Scenarios)
# .copy() because several columns are added to this frame below.
delivery_df = delivery_df[delivery_df['inning'] == 2].copy()

# 3. Derive Dynamic State Variables
delivery_df['current_score'] = delivery_df.groupby('match_id')['total_runs'].cumsum()
delivery_df['runs_left'] = delivery_df['target'] - delivery_df['current_score']

# Balls left out of a fixed 120-ball innings, floored at 0.
# NOTE(review): over*6 + ball presumably assumes `over` is 0-indexed and `ball`
# is 1-indexed — TODO confirm against deliveries.csv.
delivery_df['balls_bowled'] = (delivery_df['over'] * 6) + delivery_df['ball']
delivery_df['balls_left'] = (120 - delivery_df['balls_bowled']).clip(lower=0)

# Calculate Wickets Left
# Turn 'player_dismissed' into a 0/1 flag (1 when a value is present), then
# accumulate it per match.
delivery_df['player_dismissed'] = delivery_df['player_dismissed'].notna().astype('int')
wickets = delivery_df.groupby('match_id')['player_dismissed'].cumsum()
delivery_df['wickets_left'] = 10 - wickets

# 4. Calculate Rate Metrics (CRR & RRR) with ZERO DIVISION SAFETY
# If balls_bowled is 0, CRR is 0 (to avoid infinity)
delivery_df['crr'] = np.where(delivery_df['balls_bowled'] == 0, 0, (delivery_df['current_score'] * 6) / delivery_df['balls_bowled'])

# If balls_left is 0, RRR is 0 (to avoid infinity)
delivery_df['rrr'] = np.where(delivery_df['balls_left'] == 0, 0, (delivery_df['runs_left'] * 6) / delivery_df['balls_left'])

# 5. Define Target Variable (1 if Batting Team Wins)
def result_check(row):
    """Return 1 when the row's 'batting_team' equals its 'winner', otherwise 0."""
    batting_side_won = row['batting_team'] == row['winner']
    if batting_side_won:
        return 1
    return 0

# Label every delivery: 1 if the batting team went on to win the match.
# This overwrites the 'result' column that came in from the matches table.
delivery_df['result'] = delivery_df.apply(result_check, axis=1)

# Select and Finalize Features
final_df = delivery_df[[
    'batting_team', 'bowling_team', 'city', 'runs_left', 
    'balls_left', 'wickets_left', 'target', 'crr', 'rrr', 'result'
]]

# Rename 'target' to 'total_runs_x' to match training pipeline expectations
final_df = final_df.rename(columns={'target': 'total_runs_x'})

# Shuffle and clean
# A fixed random_state makes the row order (and therefore the downstream
# train/test split and reported accuracy) reproducible across runs.
final_df = final_df.sample(frac=1, random_state=42)
final_df = final_df.dropna()
# Double check to remove any remaining infinite values just in case
final_df = final_df[np.isfinite(final_df['crr']) & np.isfinite(final_df['rrr'])]

print("Live Match Feature Engineering Complete.")

Live Match Feature Engineering Complete.


In [39]:
# --- Training the Hybrid Inference Engine ---
# Two models are fitted in this cell:
#   * Random Forest       -> pre-match / 1st-innings prediction from match-level features
#   * Logistic Regression -> 2nd-innings prediction from the ball-by-ball chase state

# --- 1. Train Random Forest (Pre-Match) ---
# Only matches with both a winner and a toss winner recorded can be encoded.
matches_rf = matches.dropna(subset=['winner', 'toss_winner']).copy()

# Swap each categorical column for the integer codes of its fitted encoder.
matches_rf = matches_rf.assign(
    team1=encoder.transform(matches_rf['team1']),
    team2=encoder.transform(matches_rf['team2']),
    toss_winner=encoder.transform(matches_rf['toss_winner']),
    venue=venue_encoder.transform(matches_rf['venue']),
    toss_decision=toss_dec_encoder.transform(matches_rf['toss_decision']),
    winner=encoder.transform(matches_rf['winner']),
)

rf_feature_columns = [
    'team1', 'team2', 'venue', 'toss_winner', 'toss_decision',
    't1_win_rate', 't2_win_rate', 'venue_bat_first_win_rate',
]
X_rf = matches_rf[rf_feature_columns]
y_rf = matches_rf['winner']  # encoded id of the winning team

X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(
    X_rf, y_rf, test_size=0.2, random_state=42
)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_rf, y_train_rf)

rf_accuracy = accuracy_score(y_test_rf, rf_model.predict(X_test_rf))
print(f"Random Forest Accuracy (Pre-Match): {rf_accuracy*100:.2f}%")

# --- 2. Train Logistic Regression (Live Chase) ---
# NOTE: final_df holds one row per delivery and the split below is per row,
# so deliveries of a single match can land in both the train and test parts.
y_log = final_df['result']
X_log = final_df.drop(columns='result')
X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(
    X_log, y_log, test_size=0.2, random_state=1
)

# One-hot encode the three text columns inside the pipeline; every other
# column is passed through unchanged.
categorical_columns = ['batting_team', 'bowling_team', 'city']
trf = ColumnTransformer(
    [('trf', OneHotEncoder(sparse_output=False, drop='first'), categorical_columns)],
    remainder='passthrough',
)

pipe = Pipeline(steps=[
    ('step1', trf),
    ('step2', LogisticRegression(solver='liblinear')),
])
pipe.fit(X_train_log, y_train_log)

log_accuracy = accuracy_score(y_test_log, pipe.predict(X_test_log))
print(f"Logistic Regression Accuracy (Live Chase): {log_accuracy*100:.2f}%")

Random Forest Accuracy (Pre-Match): 48.19%
Logistic Regression Accuracy (Live Chase): 79.85%


In [42]:
def predict_match_state(inning, team1, team2, venue, city, toss_winner, toss_decision,
                        current_score=0, balls_done=0, wickets_lost=0, target=None):
    """
    Hybrid inference function returning a formatted win-probability string.

    - Route 1 (inning == 1): Random Forest on pre-match features (historical data).
    - Route 2 (inning == 2): Logistic Regression on the live chase equation.

    Parameters
    ----------
    inning : int
        1 selects the pre-match model, 2 selects the live-chase model.
    team1, team2 : str
        Team names. For inning 2, team1 is used as the batting side and
        team2 as the bowling side.
    venue : str
        Venue name; used by the inning-1 route only.
    city : str
        City name; used by the inning-2 route only.
    toss_winner, toss_decision : str
        Toss information; used by the inning-1 route only.
    current_score, balls_done, wickets_lost : int
        Live state of the chase; used by the inning-2 route only.
    target : int or None
        Score the batting side must reach; required when inning == 2.

    Returns
    -------
    str
        The formatted prediction, or a message starting with "Error" when
        the inputs are unusable or a model/encoder raises.
    """

    # --- SCENARIO 1: FIRST INNINGS (Pre-Match Logic) ---
    if inning == 1:
        try:
            # Encode the categorical inputs with the fitted encoders
            t1_id = encoder.transform([team1])[0]
            t2_id = encoder.transform([team2])[0]
            ven_id = venue_encoder.transform([venue])[0]
            t_win_id = encoder.transform([toss_winner])[0]
            t_dec_id = toss_dec_encoder.transform([toss_decision])[0]

            # Historical rates, falling back to 0.5 when nothing is known
            t1_rate = team_win_rate.get(team1, 0.5)
            t2_rate = team_win_rate.get(team2, 0.5)
            ven_bias = venue_stats.get(venue, 0.5)

            input_vector = pd.DataFrame(
                [[t1_id, t2_id, ven_id, t_win_id, t_dec_id, t1_rate, t2_rate, ven_bias]],
                columns=['team1', 'team2', 'venue', 'toss_winner', 'toss_decision', 't1_win_rate', 't2_win_rate', 'venue_bat_first_win_rate']
            )

            probs = rf_model.predict_proba(input_vector)[0]
            # predict_proba columns are ordered like rf_model.classes_, which
            # holds only the labels seen while fitting. Look the column up
            # instead of indexing by the encoded team id, which is wrong (or
            # out of range) whenever some team id is absent from classes_.
            class_column = {label: idx for idx, label in enumerate(rf_model.classes_)}
            prob_t1 = probs[class_column[t1_id]] if t1_id in class_column else 0.0
            prob_t2 = probs[class_column[t2_id]] if t2_id in class_column else 0.0

            # Normalize so the two teams' probabilities sum to 100%
            total = prob_t1 + prob_t2
            if total <= 0:
                # Nothing to normalize; avoid dividing by zero.
                return "Error in 1st Innings Prediction: model assigned zero probability to both teams"
            return f" 1st Innings Prediction (Historical):\n   {team1}: {(prob_t1/total)*100:.1f}%\n   {team2}: {(prob_t2/total)*100:.1f}%"

        except Exception as e:
            # Deliberate best-effort: report the failure as text (e.g. a name
            # the encoders were not fitted on) instead of raising.
            return f"Error in 1st Innings Prediction: {e}"

    # --- SCENARIO 2: SECOND INNINGS (Live Chase Logic) ---
    elif inning == 2:
        if target is None:
            return "Error: Target score required for 2nd Innings."

        try:
            # Derive the same state features the training frame contains
            runs_left = target - current_score
            # Floor at 0, matching the clamp applied to the training data
            balls_left = max(0, 120 - balls_done)
            wickets_left = 10 - wickets_lost
            crr = current_score * 6 / balls_done if balls_done > 0 else 0
            rrr = runs_left * 6 / balls_left if balls_left > 0 else 0

            # Single-row frame with the column names the pipeline was fitted on
            input_df = pd.DataFrame({
                'batting_team': [team1],  # team1 is the batting side on this route
                'bowling_team': [team2],
                'city': [city],
                'runs_left': [runs_left],
                'balls_left': [balls_left],
                'wickets_left': [wickets_left],
                'total_runs_x': [target],
                'crr': [crr],
                'rrr': [rrr]
            })

            result_prob = pipe.predict_proba(input_df)
            win_prob = result_prob[0][1]   # class 1: batting team wins
            loss_prob = result_prob[0][0]  # class 0: batting team does not win

            return f" 2nd Innings Prediction (Live Equation):\n   {team1}: {win_prob*100:.1f}%\n   {team2}: {loss_prob*100:.1f}%"

        except Exception as e:
            # Deliberate best-effort, same as the 1st-innings route.
            return f"Error in 2nd Innings Prediction: {e}"

    # Any other inning value used to fall through and return None silently.
    return f"Error: Unsupported inning {inning!r}; expected 1 or 2."

# --- Test The Architecture ---
# Smoke-test both routes of predict_match_state with one hand-picked scenario each.

print("--- Test 1: MI vs CSK (Match Start) ---")
# Pre-match route: only the team, venue and toss arguments are used.
print(predict_match_state(
    inning=1,
    team1='Mumbai Indians',
    team2='Chennai Super Kings',
    venue='Wankhede Stadium',
    city='Mumbai',
    toss_winner='Mumbai Indians',
    toss_decision='bat',
))

print("\n--- Test 2: RCB vs KKR (Tight Chase) ---")
# Live-chase route: team1 is the batting side. The city must be spelled the way
# it appears in the CSV ('Bangalore' or 'Bengaluru').
print(predict_match_state(
    inning=2,
    team1='Royal Challengers Bangalore',
    team2='Kolkata Knight Riders',
    venue='M Chinnaswamy Stadium',
    city='Bangalore',
    toss_winner='Kolkata Knight Riders',
    toss_decision='field',
    current_score=160,
    balls_done=100,
    wickets_lost=4,
    target=200,
))

--- Test 1: MI vs CSK (Match Start) ---
 1st Innings Prediction (Historical):
   Mumbai Indians: 27.6%
   Chennai Super Kings: 72.4%

--- Test 2: RCB vs KKR (Tight Chase) ---
 2nd Innings Prediction (Live Equation):
   Royal Challengers Bangalore: 58.6%
   Kolkata Knight Riders: 41.4%
