# In [None]:
# sports_winner_prediction.py

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

from google.colab import drive

# Mount Google Drive into the Colab runtime; the dataset path below is
# located under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

# -------------------------
# 1. Load Data
# -------------------------
# Location of the input CSV on the mounted drive.
DATASET_CSV = "/content/drive/My Drive/project/ipl_full_info(2008-2024).csv"

print("Loading data...")
df = pd.read_csv(DATASET_CSV)
print("Initial shape:", df.shape)

# -------------------------
# 2. Data Cleaning
# -------------------------
# Single chained pass: remove exact duplicate rows, discard rows that lack
# either team name or the season, then rewrite the two columns that need a
# type fix.
# NOTE(review): Season is cut to its first four characters before the int
# cast, presumably to cope with labels such as "2007/08" -- confirm that
# mapping such a label to its starting year is what the analysis wants.
df = (
    df.drop_duplicates()
    .dropna(subset=["Batting team", "Bowling team", "Season"])
    .assign(
        # Values that cannot be parsed as dates become NaT instead of raising.
        Date=lambda frame: pd.to_datetime(frame["Date"], errors="coerce"),
        Season=lambda frame: frame["Season"].astype(str).str[:4].astype(int),
    )
)

# -------------------------
# 3. Aggregate Match-Level Data
# -------------------------
# Collapse the frame to one row per Match_ID. The descriptive columns and the
# two innings totals keep the first value seen for the match; the three
# phase-score columns keep the per-match maximum.
first_value_columns = [
    "Season",
    "Batting team",
    "Bowling team",
    "Venue",
    "Toss_Winner",
    "First_Innings_Score",
    "Second_Innings_Score",
]
max_value_columns = [
    "Powerplay_Scores",
    "Middle_Overs_Scores",
    "Death_Overs_Scores",
]
aggregation = {
    **dict.fromkeys(first_value_columns, "first"),
    **dict.fromkeys(max_value_columns, "max"),
}
match_df = df.groupby("Match_ID").agg(aggregation).reset_index()

# Derive winner: the "Batting team" value is used when the first-innings
# score is strictly greater than the second-innings score; in every other
# case the "Bowling team" value is used.
# NOTE(review): "every other case" includes equal scores and rows where a
# score is missing (the comparison is False) -- confirm that labelling those
# matches as a bowling-team win is acceptable.
first_innings_higher = (
    match_df["First_Innings_Score"] > match_df["Second_Innings_Score"]
)
match_df["Winner"] = np.where(
    first_innings_higher,
    match_df["Batting team"],
    match_df["Bowling team"],
)

print("Processed match-level data:", match_df.shape)

# -------------------------
# 4. Encode Categorical Columns (use separate encoders)
# -------------------------
# One LabelEncoder per categorical concept so each can be inverted on its own.
team_encoder = LabelEncoder()
venue_encoder = LabelEncoder()
toss_encoder = LabelEncoder()
winner_encoder = LabelEncoder()

# The team encoder is shared by the batting and bowling columns, so it must be
# fitted on the names from BOTH of them. Fitting on "Batting team" alone makes
# transform() raise ValueError (unseen label) for any team that only ever
# appears in "Bowling team". When both columns already hold the same set of
# names the resulting codes are unchanged, because the encoder sorts the
# unique labels it is given.
all_team_names = pd.concat(
    [match_df["Batting team"], match_df["Bowling team"]],
    ignore_index=True,
)
team_encoder.fit(all_team_names)

match_df["Batting team_enc"] = team_encoder.transform(match_df["Batting team"])
match_df["Bowling team_enc"] = team_encoder.transform(match_df["Bowling team"])
match_df["Venue_enc"] = venue_encoder.fit_transform(match_df["Venue"])
match_df["Toss_Winner_enc"] = toss_encoder.fit_transform(match_df["Toss_Winner"])
match_df["Winner_enc"] = winner_encoder.fit_transform(match_df["Winner"])

# -------------------------
# 5. Winner Prediction (Classification)
# -------------------------
# Columns fed to the classifier; the target is the encoded winner.
classification_features = [
    "Season",
    "Batting team_enc",
    "Bowling team_enc",
    "Venue_enc",
    "Toss_Winner_enc",
    "Second_Innings_Score",
    "Powerplay_Scores",
    "Middle_Overs_Scores",
    "Death_Overs_Scores",
]
X_cls = match_df[classification_features]
y_cls = match_df["Winner_enc"]

# Hold out 20% of the matches for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X_cls,
    y_cls,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training chain.
clf = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Report hold-out metrics (labels in the report are the encoded integers).
print("\n--- Winner Prediction ---")
holdout_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", holdout_accuracy)
holdout_report = classification_report(y_test, y_pred)
print("Classification Report:\n", holdout_report)



from sklearn.metrics import ConfusionMatrixDisplay

# Confusion matrix of the winner classifier, with team names on the axes.
#
# The full range of encoded winner labels is passed explicitly. Without
# `labels=`, confusion_matrix only includes classes that occur in y_test or
# y_pred, so a team missing from the hold-out split would make the matrix
# smaller than `winner_encoder.classes_` and the display labels would no
# longer line up with the matrix (plotting then fails on the size mismatch).
winner_labels = np.arange(len(winner_encoder.classes_))
cm = confusion_matrix(y_test, y_pred, labels=winner_labels)

disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=winner_encoder.classes_)  # actual team names
disp.plot(cmap="Blues", xticks_rotation=90)
plt.title("Winner Prediction Confusion Matrix (Teams)")
plt.show()


# -------------------------
# 6. Score Forecasting (Regression)
# -------------------------
# The regressor reuses the classifier's feature columns; the target is the
# first-innings score.
# NOTE(review): the features include Second_Innings_Score and the phase
# scores -- confirm these are meant to be available when forecasting the
# first-innings total.
X_reg = X_cls.copy()
y_reg = match_df["First_Innings_Score"]

# Same 80/20 split settings as the classification step (these names are
# rebound to the regression split from here on).
X_train, X_test, y_train, y_test = train_test_split(
    X_reg,
    y_reg,
    test_size=0.2,
    random_state=42,
)

reg = RandomForestRegressor(n_estimators=200, random_state=42).fit(X_train, y_train)
y_pred = reg.predict(X_test)

print("\n--- Score Forecasting ---")
holdout_mse = mean_squared_error(y_test, y_pred)
print("MSE:", holdout_mse)
holdout_r2 = r2_score(y_test, y_pred)
print("R2 Score:", holdout_r2)

# Actual vs. predicted scores on the hold-out set.
plt.scatter(x=y_test, y=y_pred, alpha=0.6)
plt.xlabel("Actual Score")
plt.ylabel("Predicted Score")
plt.title("First Innings Score Forecasting")
plt.show()

# -------------------------
# 7. Predict New Example
# -------------------------
# Build a single-row frame with the same columns, in the same order, as the
# training features. Each categorical value goes through the encoder that was
# fitted for its column; transform() raises ValueError if the name was not
# present in the training data.
sample_cls = pd.DataFrame({
    "Season": [2024],
    "Batting team_enc": team_encoder.transform(["Chennai Super Kings"]),
    "Bowling team_enc": team_encoder.transform(["Mumbai Indians"]),
    "Venue_enc": venue_encoder.transform(["Wankhede Stadium"]),
    "Toss_Winner_enc": toss_encoder.transform(["Mumbai Indians"]),
    "Second_Innings_Score": [0],
    "Powerplay_Scores": [65],
    "Middle_Overs_Scores": [85],
    "Death_Overs_Scores": [50]
})

# Classifier output is an encoded label; map it back to the team name.
pred_winner = clf.predict(sample_cls)
print("\nPredicted Winner:", winner_encoder.inverse_transform(pred_winner))

# The regressor was trained on the same feature columns, so the same row is
# reused for the score forecast.
sample_reg = sample_cls.copy()
pred_score = reg.predict(sample_reg)
# Round to the nearest whole run; a bare int() would truncate (e.g. 199.9
# would be reported as 199).
print("Forecasted First Innings Score:", int(round(pred_score[0])))
