In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# Load dataset
import gdown

gdown.download('https://drive.google.com/uc?id=1CqkArlz9WJvP-JvTmqE7U6c0BOs20DXh', 'ODI_Match_Data.csv', quiet=False)
df = pd.read_csv('ODI_Match_Data.csv')

#######################################################################


import pandas as pd

# Define chunk size (adjust depending on memory, 100,000 is usually safe)
chunksize = 100_000

# Create an empty list to hold chunks
chunks = []

# Read file in chunks
for chunk in pd.read_csv("ODI_Match_Data.csv", chunksize=chunksize, low_memory=False):
    chunks.append(chunk)

# Concatenate all chunks into one DataFrame
df = pd.concat(chunks, ignore_index=True)

# ✅ Now df contains your full dataset loaded in memory-friendly way
print("✅ CSV loaded successfully with shape:", df.shape)



###################################################################

# Feature Engineering
df['total_runs'] = df['runs_off_bat'].fillna(0) + df['extras'].fillna(0)
df['over'] = df['ball'].astype(str).str.extract(r'(\d+)\.').astype(float)

# Aggregate to over level
grouped = df.groupby(['match_id', 'innings', 'over']).agg({
    'total_runs': 'sum',
    'wicket_type': lambda x: x.notna().sum()
}).reset_index()

grouped.rename(columns={'total_runs': 'runs_this_over', 'wicket_type': 'wickets_this_over'}, inplace=True)

# Cumulative features
grouped['cumulative_runs'] = grouped.groupby(['match_id', 'innings'])['runs_this_over'].cumsum()
grouped['cumulative_wickets'] = grouped.groupby(['match_id', 'innings'])['wickets_this_over'].cumsum()

# Rolling run rate (last 5 overs)
grouped['run_rate_last_5'] = grouped.groupby(['match_id', 'innings'])['runs_this_over'].rolling(5, min_periods=1).mean().reset_index(0, drop=True)

# Final score for each innings
final_scores = grouped.groupby(['match_id', 'innings'])['cumulative_runs'].max().reset_index()
final_scores.rename(columns={'cumulative_runs': 'final_score'}, inplace=True)

# Merge back to each over's features
grouped = grouped.merge(final_scores, on=['match_id', 'innings'])

# Filter after 25th over
train_df = grouped[grouped['over'] >= 25]

# Define features and target
X = train_df[['cumulative_runs', 'cumulative_wickets', 'over', 'run_rate_last_5']]
y = train_df['final_score']

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Model
model = XGBRegressor(n_estimators=200, learning_rate=0.08, max_depth=6, random_state=42)
model.fit(X_train, y_train)

# Evaluate
preds = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))
r2 = r2_score(y_test, preds)

print("✅ RMSE:", rmse)
print("✅ R² Score:", r2)

# Save model
joblib.dump(model, "xgb_score_predictor.pkl")
print("✅ Model saved as xgb_score_predictor.pkl")


Downloading...
From (original): https://drive.google.com/uc?id=1CqkArlz9WJvP-JvTmqE7U6c0BOs20DXh
From (redirected): https://drive.google.com/uc?id=1CqkArlz9WJvP-JvTmqE7U6c0BOs20DXh&confirm=t&uuid=0ae3847d-8455-4996-983a-21f544a1d1ce
To: C:\Users\ASUS\ODI_Match_Data.csv
100%|███████████████████████████████████████████████████████████████████████████████| 168M/168M [00:07<00:00, 21.2MB/s]
  df = pd.read_csv('ODI_Match_Data.csv')


✅ CSV loaded successfully with shape: (1265103, 23)


TypeError: incompatible index of inserted column with frame index

In [3]:
import pandas as pd
import numpy as np
import joblib
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import gdown

# Step 1: Download CSV from Google Drive
gdown.download('https://drive.google.com/uc?id=1CqkArlz9WJvP-JvTmqE7U6c0BOs20DXh', 'ODI_Match_Data.csv', quiet=False)

# Step 2: Load CSV using chunks to avoid memory crash
chunksize = 100_000
chunks = []
for chunk in pd.read_csv("ODI_Match_Data.csv", chunksize=chunksize, low_memory=False):
    chunks.append(chunk)

df = pd.concat(chunks, ignore_index=True)
print("✅ CSV loaded successfully with shape:", df.shape)

# Step 3: Feature Engineering
df['total_runs'] = df['runs_off_bat'].fillna(0) + df['extras'].fillna(0)
df['over'] = df['ball'].astype(str).str.extract(r'(\d+)\.').astype(float)

# Aggregate at over level
grouped = df.groupby(['match_id', 'innings', 'over']).agg({
    'total_runs': 'sum',
    'wicket_type': lambda x: x.notna().sum()
}).reset_index()

grouped.rename(columns={'total_runs': 'runs_this_over', 'wicket_type': 'wickets_this_over'}, inplace=True)

# Cumulative runs and wickets
grouped['cumulative_runs'] = grouped.groupby(['match_id', 'innings'])['runs_this_over'].cumsum()
grouped['cumulative_wickets'] = grouped.groupby(['match_id', 'innings'])['wickets_this_over'].cumsum()

# ✅ FIX: Compute rolling run rate correctly
rolling_rr = (
    grouped.groupby(['match_id', 'innings'])['runs_this_over']
    .rolling(5, min_periods=1)
    .mean()
    .reset_index()
    .rename(columns={'runs_this_over': 'run_rate_last_5'})
)

# Merge back correctly using match_id, innings, and index (which is 'over')
grouped = grouped.merge(rolling_rr, on=['match_id', 'innings', 'over'])

# Get final scores for each innings
final_scores = grouped.groupby(['match_id', 'innings'])['cumulative_runs'].max().reset_index()
final_scores.rename(columns={'cumulative_runs': 'final_score'}, inplace=True)

# Merge final score
grouped = grouped.merge(final_scores, on=['match_id', 'innings'])

# Filter for overs >= 25
train_df = grouped[grouped['over'] >= 25]

# Select features and target
X = train_df[['cumulative_runs', 'cumulative_wickets', 'over', 'run_rate_last_5']]
y = train_df['final_score']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = XGBRegressor(n_estimators=200, learning_rate=0.08, max_depth=6, random_state=42)
model.fit(X_train, y_train)

# Evaluate model
preds = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))
r2 = r2_score(y_test, preds)

print("✅ RMSE:", round(rmse, 2))
print("✅ R² Score:", round(r2, 4))

# Save model
joblib.dump(model, "xgb_score_predictor.pkl")
print("✅ Model saved as xgb_score_predictor.pkl")


Downloading...
From (original): https://drive.google.com/uc?id=1CqkArlz9WJvP-JvTmqE7U6c0BOs20DXh
From (redirected): https://drive.google.com/uc?id=1CqkArlz9WJvP-JvTmqE7U6c0BOs20DXh&confirm=t&uuid=8dc7beb2-86e8-413c-8ad9-e3fe979d0c16
To: C:\Users\ASUS\ODI_Match_Data.csv
100%|███████████████████████████████████████████████████████████████████████████████| 168M/168M [00:05<00:00, 31.2MB/s]


✅ CSV loaded successfully with shape: (1265103, 23)


KeyError: 'over'

In [1]:
###############FINAL SCORE PREDICTION#######################


import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import gdown

# Step 1: Download data from Google Drive
gdown.download('https://drive.google.com/uc?id=1CqkArlz9WJvP-JvTmqE7U6c0BOs20DXh', 'ODI_Match_Data.csv', quiet=False)

# Step 2: Load large CSV in chunks (memory efficient)
chunksize = 100_000
chunks = []
for chunk in pd.read_csv("ODI_Match_Data.csv", chunksize=chunksize, low_memory=False):
    chunks.append(chunk)
df = pd.concat(chunks, ignore_index=True)
print("✅ CSV loaded successfully with shape:", df.shape)

# Step 3: Feature Engineering
df['total_runs'] = df['runs_off_bat'].fillna(0) + df['extras'].fillna(0)
df['over'] = df['ball'].astype(str).str.extract(r'(\d+)\.').astype(float)

# Aggregate per over
grouped = df.groupby(['match_id', 'innings', 'over']).agg({
    'total_runs': 'sum',
    'wicket_type': lambda x: x.notna().sum()
}).reset_index()

grouped.rename(columns={
    'total_runs': 'runs_this_over',
    'wicket_type': 'wickets_this_over'
}, inplace=True)

# Cumulative calculations
grouped['cumulative_runs'] = grouped.groupby(['match_id', 'innings'])['runs_this_over'].cumsum()
grouped['cumulative_wickets'] = grouped.groupby(['match_id', 'innings'])['wickets_this_over'].cumsum()

# Rolling run rate (last 5 overs) – FIXED VERSION
rolling_rr = (
    grouped
    .groupby(['match_id', 'innings'])[['runs_this_over']]
    .apply(lambda x: x.rolling(5, min_periods=1).mean())
    .reset_index(level=[0,1], drop=True)
    .rename(columns={'runs_this_over': 'run_rate_last_5'})
)
grouped['run_rate_last_5'] = rolling_rr.values

# Final innings score (target)
final_scores = grouped.groupby(['match_id', 'innings'])['cumulative_runs'].max().reset_index()
final_scores.rename(columns={'cumulative_runs': 'final_score'}, inplace=True)
grouped = grouped.merge(final_scores, on=['match_id', 'innings'])

# Use data after 25th over only
train_df = grouped[grouped['over'] >= 25]

# Step 4: Model Training
X = train_df[['cumulative_runs', 'cumulative_wickets', 'over', 'run_rate_last_5']]
y = train_df['final_score']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = XGBRegressor(n_estimators=200, learning_rate=0.08, max_depth=6, random_state=42)
model.fit(X_train, y_train)

# Step 5: Evaluation
preds = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))
r2 = r2_score(y_test, preds)

print(f"✅ RMSE: {rmse:.2f}")
print(f"✅ R² Score: {r2:.4f}")

# Step 6: Save model
joblib.dump(model, "xgb_score_predictor.pkl")
print("✅ Model saved as xgb_score_predictor.pkl")


Downloading...
From (original): https://drive.google.com/uc?id=1CqkArlz9WJvP-JvTmqE7U6c0BOs20DXh
From (redirected): https://drive.google.com/uc?id=1CqkArlz9WJvP-JvTmqE7U6c0BOs20DXh&confirm=t&uuid=e8a9ae21-55f4-4284-98c5-b91e7db846da
To: C:\Users\ASUS\ODI_Match_Data.csv
100%|███████████████████████████████████████████████████████████████████████████████| 168M/168M [00:06<00:00, 25.7MB/s]


✅ CSV loaded successfully with shape: (1265103, 23)
✅ RMSE: 31.14
✅ R² Score: 0.7113
✅ Model saved as xgb_score_predictor.pkl


In [5]:
import pandas as pd
import gdown
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import joblib

# ✅ Step 1: Download CSV from Google Drive
gdown.download('https://drive.google.com/uc?id=1CqkArlz9WJvP-JvTmqE7U6c0BOs20DXh', 'ODI_Match_Data.csv', quiet=False)

# ✅ Step 2: Load in chunks
chunksize = 100_000
chunks = []

# Select only required columns for this task
columns_needed = [
    'match_id', 'innings', 'venue', 'batting_team', 'bowling_team',
    'runs_off_bat', 'extras'
]

for chunk in pd.read_csv("ODI_Match_Data.csv", usecols=columns_needed, chunksize=chunksize, low_memory=False):
    chunks.append(chunk)

df = pd.concat(chunks, ignore_index=True)
print("✅ Data loaded with shape:", df.shape)

# ✅ Step 3: Clean and engineer features
df['venue'] = df['venue'].fillna("Unknown")
df['batting_team'] = df['batting_team'].fillna("Unknown")
df['bowling_team'] = df['bowling_team'].fillna("Unknown")

# Toss decision logic
df['toss_decision'] = df['innings'].apply(lambda x: 'bat' if x == 1 else 'bowl')

# Total runs
df['total_runs'] = df['runs_off_bat'] + df['extras']
match_scores = df.groupby(['match_id', 'innings'])['total_runs'].sum().unstack()
match_scores['match_result'] = (match_scores[2] > match_scores[1]).astype(int)

# Merge with toss and venue info
meta = df[['match_id', 'venue', 'batting_team', 'innings']].drop_duplicates()
toss_info = meta[meta['innings'] == 1].copy()
toss_info = toss_info.rename(columns={'batting_team': 'toss_winner'})
toss_info['toss_decision'] = 'bat'  # Assumption
toss_info = toss_info[['match_id', 'venue', 'toss_winner', 'toss_decision']]
toss_info = toss_info.merge(match_scores[['match_result']], left_on='match_id', right_index=True)

# ✅ Step 4: Encode and train
X = pd.get_dummies(toss_info[['venue', 'toss_winner', 'toss_decision']], drop_first=True)
y = toss_info['match_result']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = DecisionTreeClassifier(max_depth=4, random_state=42)
model.fit(X_train, y_train)

# Save model
joblib.dump(model, "toss_decision_model.pkl")

# ✅ Step 5: Evaluate
print("\n✅ Classification Report:\n")
print(classification_report(y_test, model.predict(X_test)))


Downloading...
From (original): https://drive.google.com/uc?id=1CqkArlz9WJvP-JvTmqE7U6c0BOs20DXh
From (redirected): https://drive.google.com/uc?id=1CqkArlz9WJvP-JvTmqE7U6c0BOs20DXh&confirm=t&uuid=959c3519-b56c-41f6-8738-3c023bb75581
To: C:\Users\ASUS\ODI_Match_Data.csv
100%|███████████████████████████████████████████████████████████████████████████████| 168M/168M [00:07<00:00, 21.4MB/s]


✅ Data loaded with shape: (1265103, 7)

✅ Classification Report:

              precision    recall  f1-score   support

           0       0.56      0.97      0.71       258
           1       0.70      0.09      0.16       218

    accuracy                           0.57       476
   macro avg       0.63      0.53      0.43       476
weighted avg       0.62      0.57      0.45       476

