In [1]:
import pandas as pd
import numpy as np

from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, classification_report

def parse_str_to_num(val):
    """
    Coerce a raw table cell into a number.

    * ``int``/``float`` inputs are handed back untouched.
    * Anything else is stringified and stripped. If the text contains a
      colon (e.g. ``'675:51:00'``), the first two colon-separated fields are
      read as goals for / goals against and their difference is returned;
      any further fields are ignored.
    * Text without a colon is converted with ``float``.

    Returns ``np.nan`` whenever the text cannot be parsed as described.
    """
    if isinstance(val, (int, float)):
        return val

    text = str(val).strip()
    first, colon, remainder = text.partition(':')

    try:
        if not colon:
            # No colon at all: the whole cell should be a plain number.
            return float(text)
        goals_for = float(first)
        # Only the field between the first and second colon matters.
        goals_against = float(remainder.partition(':')[0])
    except ValueError:
        return np.nan

    return goals_for - goals_against

# --- Step 1: Load Data ---
def load_data(
    perf_path='/kaggle/input/champions-league-dataset-1955-2023/UCL_AllTime_Performance_Table.csv',
    finals_path='/kaggle/input/champions-league-dataset-1955-2023/UCL_Finals_1955-2023.csv',
):
    """
    Load the all-time performance table and the finals table from CSV.

    Parameters
    ----------
    perf_path : str or path-like
        Location of the all-time performance CSV. Defaults to the Kaggle
        input path this notebook was written against.
    finals_path : str or path-like
        Location of the finals CSV. Defaults to the Kaggle input path.

    Returns
    -------
    (perf_df, finals_df) : tuple of pandas.DataFrame
        ``perf_df`` has every text-typed column other than ``'Team'`` passed
        through ``parse_str_to_num``; ``finals_df`` is returned as read.
    """
    perf_df = pd.read_csv(perf_path)
    finals_df = pd.read_csv(finals_path)

    print("Performance Data Columns:", perf_df.columns.tolist())
    print("Finals Data Columns:", finals_df.columns.tolist())

    # Convert every text-typed column except 'Team' to numeric values.
    # Checking with the dtype predicates (rather than ``dtype == object``)
    # also catches columns that pandas stores with a dedicated string dtype,
    # which would otherwise be skipped silently.
    for col in perf_df.columns:
        if col == 'Team':
            continue
        column = perf_df[col]
        if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
            perf_df[col] = column.apply(parse_str_to_num)

    return perf_df, finals_df

# --- Step 2: Build a Team-Level Dataset from Finals ---
def build_team_dataset(finals_df, perf_df):
    """
    Expand the finals table into one row per finalist.

    Every final contributes up to two records: the team in the 'Winners'
    column (Label = 1) and the team in the 'Runners-up' column (Label = 0).
    Each record is the first row of ``perf_df`` whose 'Team' equals the
    finalist's name, extended with 'Season' and 'Label'. A finalist with no
    matching performance row is skipped and a warning is printed.

    Returns a DataFrame built from the collected records, in finals order
    with the winner before the runner-up.
    """
    records = []
    finals_iter = tqdm(finals_df.iterrows(), total=len(finals_df), desc="Processing Finals")
    for _, final in finals_iter:
        season = final['Season']
        # Read both finalist names up front; the runner-up column in the
        # finals CSV is spelled 'Runners-up' (lowercase 'u').
        finalists = (
            (final['Winners'], 1, 'winner'),
            (final['Runners-up'], 0, 'runner-up'),
        )

        for team, label, role in finalists:
            matches = perf_df[perf_df['Team'] == team]
            if matches.empty:
                print(f"Warning: Performance stats not found for {role} '{team}' in season {season}")
                continue

            record = matches.iloc[0].to_dict()
            record['Team'] = team
            record['Season'] = season
            record['Label'] = label
            records.append(record)

    return pd.DataFrame(records)

# --- Step 3: Prepare Features and Labels ---
def prepare_features(team_data):
    """
    Split the team-level dataset into a feature matrix and a label vector.

    The identifier columns 'Team' and 'Season' and the target 'Label' are
    excluded from the features; every remaining column is kept in its
    original order. Missing feature values are replaced with 0.

    Returns ``(X, y)`` where ``y`` is the 'Label' column.
    """
    excluded = {'Team', 'Season', 'Label'}
    feature_cols = [name for name in team_data.columns if name not in excluded]

    features = team_data[feature_cols].fillna(0)
    labels = team_data['Label']

    print("Features used:", feature_cols)
    return features, labels

# --- Step 4: Train and Evaluate Models ---
def train_models(X, y):
    """
    Fit several classifiers on an 80/20 split and print their test accuracy.

    Decision Tree, KNN, Random Forest and Gradient Boosting are tuned with a
    3-fold accuracy grid search on the training split; Gaussian Naive Bayes
    is fitted with its defaults. The tuned Random Forest and Gradient
    Boosting estimators are then combined in a soft-voting ensemble, which
    is fitted on the training split, evaluated (accuracy plus a
    classification report) and returned for later prediction.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    def tune(estimator, param_grid):
        # Grid-search on the training split and hand back the best estimator.
        search = GridSearchCV(estimator, param_grid, cv=3, scoring='accuracy')
        search.fit(X_train, y_train)
        return search.best_estimator_

    def report(message, fitted_model):
        # Score a fitted model on the held-out split and print its accuracy.
        predictions = fitted_model.predict(X_test)
        print(message, accuracy_score(y_test, predictions))
        return predictions

    print("Training basic models...")
    best_tree = tune(
        DecisionTreeClassifier(random_state=42),
        {'max_depth': [2, 5, 10], 'min_samples_split': [2, 5, 10]},
    )
    report("Decision Tree Accuracy:", best_tree)

    naive_bayes = GaussianNB()
    naive_bayes.fit(X_train, y_train)
    report("Naive Bayes Accuracy:", naive_bayes)

    best_knn = tune(KNeighborsClassifier(), {'n_neighbors': [3, 5, 7]})
    report("KNN Accuracy:", best_knn)

    print("\nTraining advanced models...")
    best_forest = tune(
        RandomForestClassifier(random_state=42),
        {'n_estimators': [50, 100], 'max_depth': [3, 5, None]},
    )
    report("Random Forest Accuracy:", best_forest)

    best_boosting = tune(
        GradientBoostingClassifier(random_state=42),
        {'n_estimators': [50, 100], 'learning_rate': [0.05, 0.1, 0.2]},
    )
    report("Gradient Boosting Accuracy:", best_boosting)

    # Soft-voting ensemble built from the two tuned tree ensembles.
    voting_clf = VotingClassifier(
        estimators=[('rf', best_forest), ('gb', best_boosting)],
        voting='soft',
    )
    voting_clf.fit(X_train, y_train)
    voting_pred = report("Voting Classifier Accuracy:", voting_clf)
    print("\nDetailed Voting Classifier Report:")
    print(classification_report(y_test, voting_pred))

    return voting_clf

# --- Prediction for This Year's Final ---
def predict_final(model, new_data):
    """
    Score new rows with a fitted model.

    ``new_data`` is a DataFrame holding the same feature columns the model
    was trained on; missing values are replaced with 0 before scoring.

    Returns ``(prediction, proba)``: the outputs of ``model.predict`` and
    ``model.predict_proba`` on the filled frame.
    """
    filled = new_data.fillna(0)
    return model.predict(filled), model.predict_proba(filled)

# --- Main Pipeline ---
def main():
    """
    Run the whole pipeline: load data, build the finalist dataset, train the
    models, then print a prediction for a proxy "this year's final".

    Current-season data is not available, so the two teams with the highest
    'Pt.' in the performance table stand in for the finalists.
    """
    # 1. Load datasets
    perf_df, finals_df = load_data()

    # 2. Build a combined team-level dataset from finals
    team_data = build_team_dataset(finals_df, perf_df)
    print("Combined team dataset shape:", team_data.shape)
    print(team_data.head())

    # 3. Prepare features and labels
    X, y = prepare_features(team_data)

    # 4. Train and evaluate models
    ensemble_model = train_models(X, y)

    # --- Predicting This Year's Final ---
    # Proxy finalists: the two teams with the highest 'Pt.'.
    top_two = perf_df.sort_values(by='Pt.', ascending=False).head(2)
    print("\nAutomatically chosen finalists based on highest points:")
    print(top_two[['Team', 'Pt.']])

    # Take the feature columns from the training matrix itself, so the
    # prediction input always matches what the model was fitted on instead
    # of relying on a second, hand-maintained list of column names.
    feature_cols = list(X.columns)
    new_data = top_two[feature_cols].reset_index(drop=True)

    prediction, proba = predict_final(ensemble_model, new_data)

    print("\nPredictions for this year's final teams:")
    for team_name, predicted_label in zip(top_two['Team'], prediction):
        print(f"{team_name} Prediction (1 = predicted win): {predicted_label}")

    print("\nPrediction Probabilities (columns: [Prob for runner-up, Prob for winner]):")
    print(proba)

    # Pick the team with the highest probability for class 1 (win).
    winner_index = int(np.argmax(proba[:, 1]))
    predicted_winner = top_two.iloc[winner_index]['Team']
    win_probability = proba[winner_index, 1]

    print(f"\nPredicted Winner for this year's final: {predicted_winner} with a win probability of {win_probability:.2%}")
    
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()


Performance Data Columns: ['#', 'Team', 'M.', 'W', 'D', 'L', 'goals', 'Dif', 'Pt.']
Finals Data Columns: ['Season', 'Country', 'Winners', 'Score', 'Runners-up', 'Country.1', 'Venue', 'Attend\xadance', 'Notes']


Processing Finals:   0%|          | 0/69 [00:00<?, ?it/s]

Combined team dataset shape: (128, 11)
     #         Team   M.    W   D    L   goals  Dif  Pt.   Season  Label
0  1.0  Real Madrid  486  291  85  110  1021.0  533  533  1955–56      1
1  1.0  Real Madrid  486  291  85  110  1021.0  533  533  1956–57      1
2  1.0  Real Madrid  486  291  85  110  1021.0  533  533  1957–58      1
3  6.0     AC Milan  265  127  69   69   415.0  171  171  1957–58      0
4  1.0  Real Madrid  486  291  85  110  1021.0  533  533  1958–59      1
Features used: ['#', 'M.', 'W', 'D', 'L', 'goals', 'Dif', 'Pt.']
Training basic models...
Decision Tree Accuracy: 0.4230769230769231
Naive Bayes Accuracy: 0.4230769230769231
KNN Accuracy: 0.3076923076923077

Training advanced models...
Random Forest Accuracy: 0.5769230769230769
Gradient Boosting Accuracy: 0.4230769230769231
Voting Classifier Accuracy: 0.5384615384615384

Detailed Voting Classifier Report:
              precision    recall  f1-score   support

           0       0.50      0.25      0.33        12
     