In [9]:
# ODI Match Analysis and Prediction (2015-2025)

# 1. Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_absolute_error, r2_score, accuracy_score

# 2. Load and Filter Data
#file_path = '/mnt/data/ODI_Match_Data.csv'
df = pd.read_csv('ODI_Match_Data.csv')

df['start_date'] = pd.to_datetime(df['start_date'], errors='coerce')
df = df[(df['start_date'].dt.year >= 2015) & (df['start_date'].dt.year <= 2025)]

print(f"Filtered dataset shape: {df.shape}")

# 3. Feature Engineering
## 3.1 Total Runs and Death Over Tagging
df['total_runs'] = df['runs_off_bat'] + df['extras']
df['is_death_over'] = df['over'].apply(lambda x: 1 if x >= 41 else 0)

## 3.2 Match Summary
match_summary = df.groupby(['match_id', 'inning']).agg({
    'total_runs': 'sum',
    'player_dismissed': 'count',
    'batsman': 'count'
}).rename(columns={'player_dismissed': 'wickets_lost', 'batsman': 'balls_faced'}).reset_index()

# 4. Score Prediction (Regression)
## 4.1 Prepare Pre-40 Over Data
df_before_40 = df[df['over'] <= 40]
pre_40_summary = df_before_40.groupby(['match_id', 'inning']).agg({
    'total_runs': 'sum',
    'player_dismissed': 'count'
}).rename(columns={'total_runs': 'runs_till_40', 'player_dismissed': 'wickets_till_40'}).reset_index()

model_data = pd.merge(pre_40_summary, match_summary[['match_id', 'inning', 'total_runs']], on=['match_id', 'inning'])

## 4.2 Train and Evaluate
X = model_data[['runs_till_40', 'wickets_till_40']]
y = model_data['total_runs']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

regressor = RandomForestRegressor(n_estimators=100, random_state=42)
regressor.fit(X_train, y_train)

y_pred = regressor.predict(X_test)

print("\nScore Prediction Evaluation:")
print(f"MAE (Mean Absolute Error): {mean_absolute_error(y_test, y_pred):.2f}")
print(f"R2 Score: {r2_score(y_test, y_pred):.2f}")

# 5. Winning Prediction (Classification)
## 5.1 Prepare Winner Data
match_scores = df.groupby(['match_id', 'inning', 'batting_team']).agg({'total_runs': 'sum'}).reset_index()
winner_df = match_scores.loc[match_scores.groupby('match_id')['total_runs'].idxmax()]
winner_df = winner_df[['match_id', 'batting_team']].rename(columns={'batting_team': 'winner'})

match_with_winner = pd.merge(match_scores, winner_df, on='match_id')
match_with_winner['win_label'] = (match_with_winner['batting_team'] == match_with_winner['winner']).astype(int)

## 5.2 Train and Evaluate
X = match_with_winner[['total_runs']]
y = match_with_winner['win_label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

classifier = RandomForestClassifier(n_estimators=100, random_state=42)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

print("\nWinning Prediction Evaluation:")
print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")

# 6. Best Players Analysis
## 6.1 Best Batsman Overall
batsman_stats = df.groupby('batsman').agg({
    'runs_off_bat': 'sum',
    'batsman': 'count'
}).rename(columns={'batsman': 'balls_faced'})

batsman_stats['strike_rate'] = (batsman_stats['runs_off_bat'] / batsman_stats['balls_faced']) * 100
top_batsmen = batsman_stats[batsman_stats['balls_faced'] >= 300].sort_values(by='runs_off_bat', ascending=False).head(10)

print("\nTop 10 Batsmen:")
print(top_batsmen)

## 6.2 Best Bowler Overall
bowler_stats = df.groupby('bowler').agg({
    'player_dismissed': 'count',
    'total_runs': 'sum'
})

bowler_stats['bowling_avg'] = bowler_stats['total_runs'] / bowler_stats['player_dismissed']
top_bowlers = bowler_stats[bowler_stats['player_dismissed'] >= 50].sort_values(by='player_dismissed', ascending=False).head(10)

print("\nTop 10 Bowlers:")
print(top_bowlers)

# 7. Death Overs Specialist
## 7.1 Death Over Batsman
death_overs = df[df['over'] >= 41]
death_batsman = death_overs.groupby('batsman').agg({
    'runs_off_bat': 'sum',
    'batsman': 'count'
})

death_batsman['strike_rate'] = (death_batsman['runs_off_bat'] / death_batsman['batsman']) * 100
top_death_batsmen = death_batsman[death_batsman['batsman'] >= 50].sort_values(by='strike_rate', ascending=False).head(10)

print("\nTop Death Over Batsmen:")
print(top_death_batsmen)

## 7.2 Death Over Bowler
death_bowler = death_overs.groupby('bowler').agg({
    'player_dismissed': 'count'
}).sort_values(by='player_dismissed', ascending=False).head(10)

print("\nTop Death Over Bowlers:")
print(death_bowler)


  df = pd.read_csv('ODI_Match_Data.csv')


Filtered dataset shape: (562390, 23)


KeyError: 'over'

In [7]:
print(df.columns.tolist())


['match_id', 'season', 'start_date', 'venue', 'innings', 'ball', 'batting_team', 'bowling_team', 'striker', 'non_striker', 'bowler', 'runs_off_bat', 'extras', 'wides', 'noballs', 'byes', 'legbyes', 'penalty', 'wicket_type', 'player_dismissed', 'other_wicket_type', 'other_player_dismissed', 'cricsheet_id']
