In [None]:
# sports_winner_prediction_enhanced.py

# -------------------------
# 0. Setup
# -------------------------
# Install necessary libraries if you haven't already
# !pip install pandas numpy matplotlib seaborn scikit-learn meteostat tqdm

!pip install meteostat
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from datetime import datetime
from meteostat import Point, Daily
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import ConfusionMatrixDisplay

# Suppress future warnings for cleaner output
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Google Drive is only mounted when running inside Google Colab. On a local
# machine the `google.colab` package does not exist, so the import is guarded
# and the CSV is read from the current working directory instead.
try:
    from google.colab import drive
except ImportError:
    drive = None
    DATA_PATH = "ipl_full_info(2008-2024).csv"
else:
    drive.mount('/content/drive')
    DATA_PATH = "/content/drive/My Drive/project/ipl_full_info(2008-2024).csv"

# -------------------------
# 1. Load Data
# -------------------------
print("Loading data...")
# Edit DATA_PATH above if the CSV lives somewhere else.
df = pd.read_csv(DATA_PATH)
print("Initial shape:", df.shape)

# -------------------------
# 2. Data Cleaning
# -------------------------
print("Cleaning data...")
df = df.drop_duplicates()
df = df.dropna(subset=["Batting team", "Bowling team", "Season", "Date", "Venue", "Toss_Winner"])

# `errors="coerce"` turns unparsable date strings into NaT. Those rows passed
# the dropna above (they were non-null strings), so drop them again here;
# a NaT date cannot be used for the weather lookup later on.
df["Date"] = pd.to_datetime(df["Date"], errors="coerce")
df = df.dropna(subset=["Date"])

# Clean Season column: keep the first four characters of the season label
# and store them as an integer year.
df["Season"] = df["Season"].astype(str).str[:4].astype(int)

# -------------------------
# 3. Aggregate Match-Level Data
# -------------------------
print("Aggregating match-level data...")
# Collapse the input to one row per match by taking the first value of each
# column within a Match_ID group.
# NOTE(review): this assumes the first row of each match describes the first
# innings (so "Batting team" is the side batting first) - confirm against the
# dataset layout.
match_df = df.groupby("Match_ID").agg({
    "Season": "first",
    "Date": "first",
    "Batting team": "first",
    "Bowling team": "first",
    "Venue": "first",
    "Toss_Winner": "first",
    "Toss_Decision": "first",
    "First_Innings_Score": "first",
    "Second_Innings_Score": "first",
}).reset_index()

# The winner is inferred purely from the two innings totals, so it can only be
# derived when both totals are present and different. Matches with a missing
# score or level scores would otherwise all be labelled as won by the bowling
# team, and a missing first-innings score would also put NaN into the
# regression target.
first_score = match_df["First_Innings_Score"]
second_score = match_df["Second_Innings_Score"]
has_result = first_score.notna() & second_score.notna() & (first_score != second_score)
n_undecided = int((~has_result).sum())
if n_undecided:
    print(f"Dropping {n_undecided} matches with missing or level scores (winner cannot be derived).")
match_df = match_df[has_result].reset_index(drop=True)

# Derive winner: higher first-innings total -> side batting first, else the
# side bowling first.
match_df["Winner"] = np.where(
    match_df["First_Innings_Score"] > match_df["Second_Innings_Score"],
    match_df["Batting team"],
    match_df["Bowling team"]
)
print("Processed match-level data shape:", match_df.shape)

# -------------------------
# 4. Feature Engineering: Weather Integration
# -------------------------
print("Integrating historical weather data... (This may take a few minutes)")

# Lookup table: venue name -> (latitude, longitude).
# Keys are matched exactly against the "Venue" strings via dict lookup, so a
# venue whose spelling differs from the dataset (or that is missing here) gets
# no coordinates and therefore no weather data.
# NOTE(review): confirm these keys match the spellings used in the CSV.
venue_coords = {
    'Wankhede Stadium': (18.9388, 72.8258),
    'MA Chidambaram Stadium': (13.063, 80.2784),
    'Eden Gardens': (22.5646, 88.3433),
    'M Chinnaswamy Stadium': (12.9786, 77.5996),
    'Arun Jaitley Stadium': (28.6367, 77.243),
    'Narendra Modi Stadium': (23.0931, 72.592),
    'Punjab Cricket Association IS Bindra Stadium': (30.706, 76.739),
    'Rajiv Gandhi International Stadium': (17.4069, 78.5508),
    'Sawai Mansingh Stadium': (26.8954, 75.8055),
    # Extend this table with further stadiums (and their coordinates) as needed.
    'Dubai International Cricket Stadium': (25.0442, 55.2186),
    'Sharjah Cricket Stadium': (25.3421, 55.4121),
    'Sheikh Zayed Stadium': (24.4369, 54.4936),
    'Maharashtra Cricket Association Stadium': (18.6383, 73.725),
    'Dr DY Patil Sports Academy': (19.091, 73.0122),
    'Brabourne Stadium': (18.9348, 72.8252),
}

# Function to get historical weather data using Meteostat
def _missing_weather():
    """Return a fresh weather record whose two fields are both NaN."""
    # NaN (not None) keeps the resulting DataFrame columns numeric.
    return pd.Series({'avg_temp': np.nan, 'precipitation': np.nan})


def get_historical_weather(row):
    """Look up the daily weather for one match via Meteostat.

    Args:
        row: Mapping-like match record providing ``'Venue'`` (looked up in
            ``venue_coords``) and ``'Date'`` (used as both start and end of
            the one-day query).

    Returns:
        A ``pd.Series`` with ``'avg_temp'`` (Meteostat ``tavg``) and
        ``'precipitation'`` (Meteostat ``prcp``). Both entries are NaN when
        the venue has no coordinates, the date is missing, the query returns
        no rows, or the lookup fails.
    """
    coords = venue_coords.get(row['Venue'])
    match_date = row['Date']
    # Skip the remote call when there is nothing valid to query with.
    if coords is None or pd.isna(match_date):
        return _missing_weather()

    try:
        data = Daily(Point(*coords), match_date, match_date).fetch()
        if not data.empty:
            return pd.Series({
                'avg_temp': data['tavg'].iloc[0],
                'precipitation': data['prcp'].iloc[0],
            })
    except Exception as exc:
        # Weather is a best-effort enrichment, so a failed lookup must not
        # abort the run - but it should be visible. The message is kept
        # free of per-row details so identical failures are reported once by
        # the default warning filter instead of once per match.
        warnings.warn(
            f"Meteostat lookup failed ({type(exc).__name__}: {exc}); "
            "weather left as missing.",
            RuntimeWarning,
        )
    return _missing_weather()

# Apply the function with a progress bar
tqdm.pandas(desc="Fetching Weather")
weather_data = match_df.progress_apply(get_historical_weather, axis=1)

# Failed lookups may come back as None, which yields object-dtype columns;
# force both columns to numeric so mean()/fillna() below behave predictably.
weather_data = weather_data.apply(pd.to_numeric, errors="coerce")

# Join weather data. Any weather columns left over from a previous run of this
# cell are removed first so re-running does not create duplicate columns.
match_df = pd.concat(
    [match_df.drop(columns=list(weather_data.columns), errors="ignore"), weather_data],
    axis=1,
)

# Fill missing values with the column mean. Assign the result back instead of
# calling fillna(inplace=True) on a column selection: that chained in-place
# form is deprecated and does not update the frame under Copy-on-Write.
for weather_col in ("avg_temp", "precipitation"):
    match_df[weather_col] = match_df[weather_col].fillna(match_df[weather_col].mean())
print("Weather data integration complete.")

# -------------------------
# 5. Encode Categorical Columns
# -------------------------
print("Encoding categorical features...")

# Collect every team name that occurs in any team-valued column, so each
# team encoder knows the full vocabulary and transform() cannot hit an
# unseen label.
all_teams = pd.concat([
    match_df['Batting team'],
    match_df['Bowling team'],
    match_df['Toss_Winner'],
    match_df['Winner'],
]).unique()

# Three separate encoders share the same team vocabulary.
team_encoder = LabelEncoder().fit(all_teams)
winner_encoder = LabelEncoder().fit(all_teams)
toss_winner_encoder = LabelEncoder().fit(all_teams)

# Venue and toss decision have their own vocabularies, learned on first use.
venue_encoder = LabelEncoder()
toss_decision_encoder = LabelEncoder()

# Add the integer-coded columns used as model inputs/targets.
batting_codes = team_encoder.transform(match_df["Batting team"])
match_df["Batting team_enc"] = batting_codes
bowling_codes = team_encoder.transform(match_df["Bowling team"])
match_df["Bowling team_enc"] = bowling_codes
match_df["Venue_enc"] = venue_encoder.fit_transform(match_df["Venue"])
match_df["Toss_Winner_enc"] = toss_winner_encoder.transform(match_df["Toss_Winner"])
match_df["Winner_enc"] = winner_encoder.transform(match_df["Winner"])
match_df["Toss_Decision_enc"] = toss_decision_encoder.fit_transform(match_df["Toss_Decision"])

# -------------------------
# 6. Toss Decision Prediction Model (New)
# -------------------------
print("\n--- Building Toss Decision Prediction Model ---")

# Inputs: venue, season and the two weather columns.
# Target: the integer-coded toss decision produced by toss_decision_encoder.
toss_feature_cols = ["Venue_enc", "Season", "avg_temp", "precipitation"]
X_toss = match_df[toss_feature_cols]
y_toss = match_df["Toss_Decision_enc"]

# 80/20 hold-out split with a fixed seed for reproducibility.
X_train_t, X_test_t, y_train_t, y_test_t = train_test_split(
    X_toss,
    y_toss,
    test_size=0.2,
    random_state=42,
)

toss_clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_t, y_train_t)
y_pred_t = toss_clf.predict(X_test_t)

toss_accuracy = accuracy_score(y_test_t, y_pred_t)
print(f"Toss Decision Model Accuracy: {toss_accuracy:.2%}")
# Show which label each integer code stands for.
print("Toss Decision Labels:", list(toss_decision_encoder.classes_))

# -------------------------
# 7. Winner Prediction (Classification) - Updated with Weather
# -------------------------
print("\n--- Building Winner Prediction Model (with Weather) ---")

# Pre-match features only; the innings scores are deliberately not included.
winner_feature_cols = [
    "Season",
    "Batting team_enc",
    "Bowling team_enc",
    "Venue_enc",
    "Toss_Winner_enc",
    "Toss_Decision_enc",
    "avg_temp",
    "precipitation",
]
X_cls = match_df[winner_feature_cols]
y_cls = match_df["Winner_enc"]

# 80/20 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_cls,
    y_cls,
    test_size=0.2,
    random_state=42,
)

clf = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_train, y_train)
y_pred = clf.predict(X_test)

winner_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {winner_accuracy:.2%}")

# -------------------------
# 8. Score Forecasting (Regression) - Updated with Weather
# -------------------------
print("\n--- Building Score Forecasting Model (with Weather) ---")

# Reuse the winner model's feature matrix (as an independent copy); the
# target is the first-innings total.
X_reg = X_cls.copy()
y_reg = match_df["First_Innings_Score"]

# 80/20 hold-out split with a fixed seed for reproducibility.
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X_reg,
    y_reg,
    test_size=0.2,
    random_state=42,
)

reg = RandomForestRegressor(n_estimators=200, random_state=42).fit(X_train_r, y_train_r)
y_pred_r = reg.predict(X_test_r)

# Report hold-out error and goodness of fit.
score_mse = mean_squared_error(y_test_r, y_pred_r)
score_r2 = r2_score(y_test_r, y_pred_r)
print("MSE:", score_mse)
print("R2 Score:", score_r2)

# -------------------------
# 9. Comprehensive Prediction for a Future Match
# -------------------------
def get_future_weather(lat, lon, target_date_str):
    """Fetch the forecast for one day from the Open-Meteo API.

    Args:
        lat: Latitude of the venue.
        lon: Longitude of the venue.
        target_date_str: Date as ``YYYY-MM-DD``; used as both start and end
            date of the request.

    Returns:
        ``(avg_temp, precipitation)`` taken from the API's
        ``temperature_2m_mean`` and ``precipitation_sum`` daily series. If the
        request fails, the response has an unexpected shape, or the API has no
        value for that date, the means of ``match_df['avg_temp']`` and
        ``match_df['precipitation']`` are returned instead.
    """
    url = "https://api.open-meteo.com/v1/forecast"
    params = {
        "latitude": lat,
        "longitude": lon,
        "daily": "temperature_2m_mean,precipitation_sum",
        "timezone": "auto",
        "start_date": target_date_str,
        "end_date": target_date_str,
    }
    try:
        # A timeout keeps a stalled connection from hanging the prediction.
        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()
        daily = response.json()['daily']
        avg_temp = daily['temperature_2m_mean'][0]
        precipitation = daily['precipitation_sum'][0]
        # A null entry would otherwise be returned as None and break the
        # caller's numeric formatting and the model input.
        if avg_temp is None or precipitation is None:
            raise ValueError(f"no forecast values available for {target_date_str}")
        return avg_temp, precipitation
    except (requests.RequestException, KeyError, IndexError, TypeError, ValueError) as exc:
        print("Could not fetch future weather, using historical averages.")
        print(f"   (reason: {type(exc).__name__}: {exc})")
        return match_df['avg_temp'].mean(), match_df['precipitation'].mean()


def _predict_winner_between(features, batting_team, bowling_team):
    """Return whichever of the two playing teams the winner model favours."""
    # `clf` is a multi-class model over every team in the training data, so a
    # plain predict() can name a team that is not playing in this match.
    # Compare only the probabilities of the two contestants instead.
    probabilities = dict(zip(clf.classes_, clf.predict_proba(features)[0]))
    batting_code, bowling_code = winner_encoder.transform([batting_team, bowling_team])
    batting_prob = probabilities.get(batting_code, 0.0)
    bowling_prob = probabilities.get(bowling_code, 0.0)
    if batting_prob == 0.0 and bowling_prob == 0.0:
        # The model gives neither contestant any weight; fall back to its
        # unrestricted prediction rather than picking one arbitrarily.
        return winner_encoder.inverse_transform(clf.predict(features))[0]
    return batting_team if batting_prob >= bowling_prob else bowling_team


def predict_full_match_scenario(team1, team2, venue, date_str, season=2025):
    """
    Predicts toss decision, winner, and score for a future match.

    The toss decision is predicted once from venue, season and forecast
    weather. Then, for each team in turn, the scenario "this team wins the
    toss and makes that decision" is fed to the winner and score models and
    the results are printed.

    Args:
        team1: Name of the first team; must be known to ``team_encoder``.
        team2: Name of the second team; must be known to ``team_encoder``.
        venue: Venue name; must have an entry in ``venue_coords`` and be
            known to ``venue_encoder``.
        date_str: Match date as ``YYYY-MM-DD``, passed to
            ``get_future_weather``.
        season: Season value fed to the models as a feature.

    Returns:
        ``None`` after printing the scenarios, or an ``"Error: ..."`` string
        (which is also printed) when the venue or a team is unknown.
    """
    print(f"\n🔮 Predicting Scenario: {team1} vs {team2} at {venue} on {date_str}")

    def _fail(message):
        # Callers ignore the return value, so the message is printed as well
        # as returned; otherwise the failure would go unnoticed.
        print(message)
        return message

    # Step 0: Validate inputs up front so the encoders cannot raise ValueError
    # on labels they have never seen.
    coords = venue_coords.get(venue)
    if coords is None:
        return _fail("Error: Venue coordinates not found.")
    if venue not in set(venue_encoder.classes_):
        return _fail(f"Error: Venue '{venue}' was not seen in the training data.")
    known_teams = set(team_encoder.classes_)
    unknown_teams = [team for team in (team1, team2) if team not in known_teams]
    if unknown_teams:
        return _fail(f"Error: Unknown team(s): {', '.join(unknown_teams)}.")

    # Step 1: Get Venue Coordinates and Weather Forecast
    lat, lon = coords
    avg_temp, precipitation = get_future_weather(lat, lon, date_str)
    # Guard against a missing forecast value: it would break both the
    # formatting below and the model input.
    if pd.isna(avg_temp):
        avg_temp = match_df['avg_temp'].mean()
    if pd.isna(precipitation):
        precipitation = match_df['precipitation'].mean()
    print(f"🌤️ Forecast: Temp ~{avg_temp:.1f}°C, Precipitation ~{precipitation:.1f}mm")

    # Step 2: Encode Venue and Predict Toss Decision
    venue_enc = venue_encoder.transform([venue])[0]
    toss_pred_features = pd.DataFrame({
        "Venue_enc": [venue_enc],
        "Season": [season],
        "avg_temp": [avg_temp],
        "precipitation": [precipitation]
    })
    predicted_decision_enc = toss_clf.predict(toss_pred_features)[0]
    predicted_decision = toss_decision_encoder.inverse_transform([predicted_decision_enc])[0]
    print(f"🪙 Predicted Toss Decision: Captain who wins will likely choose to **{predicted_decision.upper()}**.")

    # Any label other than 'bat' is treated as choosing to field. The
    # comparison ignores case and surrounding whitespace in the label.
    toss_winner_bats = str(predicted_decision).strip().lower() == 'bat'

    # Step 3: Simulate scenarios for who wins toss and predict winner/score
    for toss_winner_team in (team1, team2):
        print("-" * 30)
        print(f"Scenario: **{toss_winner_team}** wins the toss.")

        other_team = team2 if toss_winner_team == team1 else team1
        if toss_winner_bats:
            batting_team, bowling_team = toss_winner_team, other_team
        else:
            batting_team, bowling_team = other_team, toss_winner_team

        # Prepare features for main models (column order matches training).
        features = pd.DataFrame({
            "Season": [season],
            "Batting team_enc": team_encoder.transform([batting_team]),
            "Bowling team_enc": team_encoder.transform([bowling_team]),
            "Venue_enc": [venue_enc],
            "Toss_Winner_enc": toss_winner_encoder.transform([toss_winner_team]),
            "Toss_Decision_enc": [predicted_decision_enc],
            "avg_temp": [avg_temp],
            "precipitation": [precipitation]
        })

        # Predict Winner (restricted to the two teams actually playing)
        winner = _predict_winner_between(features, batting_team, bowling_team)

        # Forecast Score
        score_pred = reg.predict(features)[0]

        print(f"   - First to Bat: **{batting_team}**")
        print(f"   - Predicted 1st Innings Score: **~{int(score_pred)}**")
        print(f"   - Predicted Match Winner: **{winner}** 🏆")


# --- Example Usage ---
# Each entry is (team1, team2, venue, date_str). The date must be a future or
# recent date for the forecast API.
example_fixtures = [
    ("Chennai Super Kings", "Mumbai Indians", "Wankhede Stadium", "2025-04-10"),
    ("Royal Challengers Bangalore", "Kolkata Knight Riders", "Eden Gardens", "2025-04-12"),
]

for first_team, second_team, ground, match_day in example_fixtures:
    predict_full_match_scenario(
        team1=first_team,
        team2=second_team,
        venue=ground,
        date_str=match_day,
    )




# -------------------------
# 10. Interactive Prediction with User Input (NEW SECTION)
# -------------------------
def _prompt_until_valid(prompt, is_valid, error_message):
    """Keep asking with `prompt` until `is_valid(answer)` is true; return the answer."""
    while True:
        # Surrounding whitespace is a common typing slip and never part of a
        # valid team, venue or date, so it is stripped before validation.
        answer = input(prompt).strip()
        if is_valid(answer):
            return answer
        print(error_message)


def _is_iso_date(text):
    """Return True when `text` parses as a YYYY-MM-DD date."""
    try:
        datetime.strptime(text, '%Y-%m-%d')
    except ValueError:
        return False
    return True


print("\n" + "="*50)
print("🏏 IPL Match Scenario Predictor 🏏")
print("="*50)

# --- Display available options to the user ---
print("\nAvailable Teams:")
known_teams = list(team_encoder.classes_)
print(", ".join(known_teams))

print("\nAvailable Venues (with coordinates):")
# A venue is only usable when it has coordinates (for the forecast) AND was
# seen by the venue encoder (for the models); offering anything else would
# make the prediction fail after the user has typed everything in.
trained_venues = set(venue_encoder.classes_)
known_venues = [name for name in venue_coords if name in trained_venues]
print(", ".join(known_venues))

if len(known_teams) < 2 or not known_venues:
    # Without two teams and one usable venue the prompts below could never be
    # satisfied and would loop forever.
    print("Not enough known teams or venues to run an interactive prediction.")
else:
    # --- Get and validate user input ---
    user_team1 = _prompt_until_valid(
        "\nEnter Team 1 (from the list above): ",
        lambda name: name in known_teams,
        "Invalid team name. Please check spelling and try again.",
    )
    user_team2 = _prompt_until_valid(
        "Enter Team 2 (must be different from Team 1): ",
        lambda name: name in known_teams and name != user_team1,
        "Invalid team name or same as Team 1. Please try again.",
    )
    user_venue = _prompt_until_valid(
        "Enter Venue (from the list above): ",
        lambda name: name in known_venues,
        "Invalid venue name. Please check spelling and try again.",
    )
    user_date = _prompt_until_valid(
        "Enter match date (YYYY-MM-DD): ",
        _is_iso_date,
        "Incorrect date format. Please use YYYY-MM-DD.",
    )

    # --- Run prediction with user input ---
    predict_full_match_scenario(
        team1=user_team1,
        team2=user_team2,
        venue=user_venue,
        date_str=user_date
    )