In [5]:
# 📒 ODI Score & Winning Prediction Notebook (Polished)

# 1. Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, r2_score, accuracy_score, confusion_matrix, classification_report

# 2. Load Data

df = pd.read_csv('ODI_Match_Data.csv')
print(f"Dataset shape: {df.shape}")
print(df.columns)

# 3. Data Preprocessing

# Basic cleaning
columns_needed = ['match_id', 'venue', 'innings', 'batting_team', 'bowling_team', 
                  'over', 'ball', 'runs_off_bat', 'extras', 'player_dismissed', 'bowler', 'batsman']
df = df[columns_needed]

# Add 'total_runs' column
df['total_runs'] = df['runs_off_bat'] + df['extras']

# Add 'is_wicket' column
df['is_wicket'] = df['player_dismissed'].notnull().astype(int)

# 4. Aggregating Every 5 Overs

def create_features(data):
    feature_list = []
    
    for (match_id, innings), group in data.groupby(['match_id', 'innings']):
        batting_team = group['batting_team'].iloc[0]
        venue = group['venue'].iloc[0]
        total_runs = 0
        wickets = 0
        
        for over in range(0, 50, 5):
            interval = group[(group['over'] >= over) & (group['over'] < over + 5)]
            runs = interval['total_runs'].sum()
            wkts = interval['is_wicket'].sum()
            
            total_runs += runs
            wickets += wkts
            
            feature_list.append({
                'match_id': match_id,
                'innings': innings,
                'batting_team': batting_team,
                'venue': venue,
                'over_interval': f'{over}-{over+5}',
                'cumulative_runs': total_runs,
                'cumulative_wickets': wickets
            })
            
    return pd.DataFrame(feature_list)

features_df = create_features(df)
print(features_df.head())

# 5. Encode Categorical Variables

le_team = LabelEncoder()
le_venue = LabelEncoder()

features_df['batting_team_enc'] = le_team.fit_transform(features_df['batting_team'])
features_df['venue_enc'] = le_venue.fit_transform(features_df['venue'])

# 6. Prepare First Innings Data (Linear Regression)

first_innings = features_df[features_df['innings'] == 1]

X_lr = first_innings[['cumulative_runs', 'cumulative_wickets', 'batting_team_enc', 'venue_enc']]
y_lr = first_innings['cumulative_runs']

X_train_lr, X_test_lr, y_train_lr, y_test_lr = train_test_split(X_lr, y_lr, test_size=0.2, random_state=42)

# 7. Linear Regression Model

lr_model = LinearRegression()
lr_model.fit(X_train_lr, y_train_lr)

y_pred_lr = lr_model.predict(X_test_lr)

# 8. Evaluation of Linear Regression

print("--- Linear Regression Evaluation ---")
print("Mean Absolute Error:", mean_absolute_error(y_test_lr, y_pred_lr))
print("R2 Score:", r2_score(y_test_lr, y_pred_lr))

plt.figure(figsize=(8,5))
plt.scatter(y_test_lr, y_pred_lr, alpha=0.7, color='green')
plt.plot([y_test_lr.min(), y_test_lr.max()], [y_test_lr.min(), y_test_lr.max()], 'r--')
plt.xlabel("Actual Runs")
plt.ylabel("Predicted Runs")
plt.title("Actual vs Predicted Runs (First Innings)")
plt.grid(True)
plt.show()

# 9. Prepare Second Innings Data (Naive Bayes)

second_innings = features_df[features_df['innings'] == 2]

# Assume target score is the max runs scored by first innings team in that match.
target_scores = first_innings.groupby('match_id')['cumulative_runs'].max()

second_innings = second_innings.merge(target_scores, on='match_id', how='left', suffixes=('', '_target'))
second_innings['target_score'] = second_innings['cumulative_runs_target']

# Create Label for Win/Loss (Win if cumulative_runs >= target)
second_innings['win'] = (second_innings['cumulative_runs'] >= second_innings['target_score']).astype(int)

X_nb = second_innings[['cumulative_runs', 'cumulative_wickets', 'batting_team_enc', 'venue_enc', 'target_score']]
y_nb = second_innings['win']

X_train_nb, X_test_nb, y_train_nb, y_test_nb = train_test_split(X_nb, y_nb, test_size=0.2, random_state=42)

# 10. Naive Bayes Model

nb_model = GaussianNB()
nb_model.fit(X_train_nb, y_train_nb)

y_pred_nb = nb_model.predict(X_test_nb)

# 11. Evaluation of Naive Bayes

print("\n--- Naive Bayes Evaluation ---")
print("Accuracy:", accuracy_score(y_test_nb, y_pred_nb))
print("Classification Report:\n", classification_report(y_test_nb, y_pred_nb))

cm = confusion_matrix(y_test_nb, y_pred_nb)
sns.heatmap(cm, annot=True, fmt='d', cmap='coolwarm')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# 12. Death Overs Specialist Analysis (45-50 Overs)

death_overs = df[(df['over'] >= 45) & (df['over'] <= 50)]

# Best Death Over Batsmen
best_batsmen = death_overs.groupby('batsman')['runs_off_bat'].sum().sort_values(ascending=False).head(10)
print("\n🏏 Best Death Over Batsmen (Top 10):")
print(best_batsmen)

# Plotting
best_batsmen.plot(kind='barh', color='skyblue')
plt.xlabel('Runs in Death Overs')
plt.title('Top 10 Death Over Batsmen')
plt.gca().invert_yaxis()
plt.grid(True)
plt.show()

# Best Death Over Bowlers
best_bowlers = death_overs.groupby('bowler')['is_wicket'].sum().sort_values(ascending=False).head(10)
print("\n🎯 Best Death Over Bowlers (Top 10):")
print(best_bowlers)

# Plotting
best_bowlers.plot(kind='barh', color='salmon')
plt.xlabel('Wickets in Death Overs')
plt.title('Top 10 Death Over Bowlers')
plt.gca().invert_yaxis()
plt.grid(True)
plt.show()

# 13. Overall Best Performers

# Best Overall Batsmen
overall_batsmen = df.groupby('batsman')['runs_off_bat'].sum().sort_values(ascending=False).head(10)
print("\n🌟 Best Overall Batsmen (All Overs)")
print(overall_batsmen)

# Best Overall Bowlers
overall_bowlers = df.groupby('bowler')['is_wicket'].sum().sort_values(ascending=False).head(10)
print("\n🌟 Best Overall Bowlers (All Overs)")
print(overall_bowlers)

# 14. Conclusion
print("\n✅ First Innings Score Prediction (Linear Regression) Done")
print("✅ Second Innings Winning Prediction (Naive Bayes) Done")
print("✅ Death Overs Specialist & Overall Best Player Analysis Done")

# End of Notebook 🚀


  df = pd.read_csv('ODI_Match_Data.csv')


Dataset shape: (1265103, 23)
Index(['match_id', 'season', 'start_date', 'venue', 'innings', 'ball',
       'batting_team', 'bowling_team', 'striker', 'non_striker', 'bowler',
       'runs_off_bat', 'extras', 'wides', 'noballs', 'byes', 'legbyes',
       'penalty', 'wicket_type', 'player_dismissed', 'other_wicket_type',
       'other_player_dismissed', 'cricsheet_id'],
      dtype='object')


KeyError: "['over', 'batsman'] not in index"