In [1]:
# ==========================================
# ⚽ Football Match Outcome Prediction (Jupyter Version)
# ==========================================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score, confusion_matrix

# ========= File paths =========
# Relative paths: the three CSVs are looked up from the current working
# directory of the notebook/kernel.
FIFA_CSV = "fifa2020-2024.csv"
RESULTS_CSV = "result1.csv"
SCORERS_CSV = "goalscorers.csv"

# ========= Load data =========
# All three files are required. A missing file aborts the run with a
# RuntimeError naming it; the original FileNotFoundError is chained as the
# cause so the full traceback stays available.
try:
    df_fifa = pd.read_csv(FIFA_CSV)
    df_results = pd.read_csv(RESULTS_CSV)
    goals_df = pd.read_csv(SCORERS_CSV)
except FileNotFoundError as e:
    raise RuntimeError(f"Required CSV missing: {e.filename}") from e
else:
    # Only reached when every read succeeded.
    print("‚úÖ CSVs loaded successfully!")

# ========= Preprocess =========
# Parse the 'date' column of each frame; unparseable values become NaT.
df_fifa['date'] = df_fifa['date'].pipe(pd.to_datetime, errors='coerce')
df_results['date'] = df_results['date'].pipe(pd.to_datetime, errors='coerce')
goals_df['date'] = goals_df['date'].pipe(pd.to_datetime, errors='coerce')

# Order rankings and results chronologically with a fresh 0..n-1 index.
df_fifa = df_fifa.sort_values(by='date', ignore_index=True)
df_results = df_results.sort_values(by='date', ignore_index=True)

# Identify correct points column: 'total.points' wins when both spellings exist.
if {'total.points', 'total_points'}.isdisjoint(df_fifa.columns):
    raise RuntimeError("FIFA CSV must have 'total.points' or 'total_points' column.")
fifa_points_col = 'total.points' if 'total.points' in df_fifa.columns else 'total_points'

# Handle goal booleans: compare the upper-cased text form of each value
# against the accepted "true" spellings; everything else becomes False.
goals_df['own_goal'] = (
    goals_df['own_goal'].astype(str).str.upper().isin(('TRUE', '1', 'YES'))
)
goals_df['penalty'] = (
    goals_df['penalty'].astype(str).str.upper().isin(('TRUE', '1', 'YES'))
)

# 'minute' is optional; when present, non-numeric entries become NaN.
if 'minute' in goals_df:
    goals_df['minute'] = goals_df['minute'].pipe(pd.to_numeric, errors='coerce')

# ========= Merge FIFA data with match results =========
# For every match, attach the most recent FIFA ranking row (rank + points)
# dated on or before the match date, once for the home team and once for the
# away team.
#
# merge_asof requires both frames to be sorted by the merge key and rejects
# null keys. The dates were parsed with errors='coerce', so unparseable values
# are NaT at this point; drop those rows first. The stable sort keeps the
# existing order of rows that share a date, so already-sorted input is
# left exactly as it was.
_matches_dated = df_results.dropna(subset=['date']).sort_values('date', kind='stable')
_fifa_dated = df_fifa.dropna(subset=['date']).sort_values('date', kind='stable')

# Two views of the ranking table, with rank/points renamed per side so the
# home and away merges do not collide on column names.
fifa_home = _fifa_dated.rename(
    columns={'rank': 'home_rank', fifa_points_col: 'home_points'}
)[['date', 'team', 'home_rank', 'home_points']]
fifa_away = _fifa_dated.rename(
    columns={'rank': 'away_rank', fifa_points_col: 'away_points'}
)[['date', 'team', 'away_rank', 'away_points']]

# Home side: match on team name, take the latest ranking at or before kickoff.
# The right-hand 'team' column is renamed so the away merge can add its own.
df_merged = pd.merge_asof(
    _matches_dated, fifa_home,
    left_on='date', right_on='date',
    left_by='home_team', right_by='team',
    direction='backward'
).rename(columns={'team': 'home_team_fifa_name'})

# Away side: same lookup keyed on the away team.
df_merged = pd.merge_asof(
    df_merged, fifa_away,
    left_on='date', right_on='date',
    left_by='away_team', right_by='team',
    direction='backward'
).rename(columns={'team': 'away_team_fifa_name'})

# Keep only matches where both teams received a ranking row.
required_fifa_cols = ['home_rank', 'home_points', 'away_rank', 'away_points']
df_merged = df_merged.dropna(subset=required_fifa_cols)

# ========= Feature Engineering =========
# Rows without a recorded score cannot supply a target: the comparison below
# would evaluate to False and label them as "not a home win", and their score
# difference would be NaN. Drop them before deriving anything. The copy makes
# the column assignments below act on an independent frame.
df_merged = df_merged.dropna(subset=['home_score', 'away_score']).copy()

# Strength gaps between the two sides (home minus away).
df_merged['rank_difference'] = df_merged['home_rank'] - df_merged['away_rank']
df_merged['points_difference'] = df_merged['home_points'] - df_merged['away_points']

# Targets: binary home win (draws and away wins are both 0) and goal margin.
df_merged['home_win'] = (df_merged['home_score'] > df_merged['away_score']).astype(int)
df_merged['score_difference'] = df_merged['home_score'] - df_merged['away_score']

# One-hot encode the tournament when available; otherwise add a constant
# placeholder column so a 'tournament_'-prefixed feature still exists.
if 'tournament' in df_merged.columns:
    df_merged = pd.get_dummies(df_merged, columns=['tournament'], prefix='tournament')
else:
    df_merged['tournament_missing'] = 1

# 'neutral' defaults to 0 when the column is absent and is stored as an integer.
if 'neutral' not in df_merged.columns:
    df_merged['neutral'] = 0
df_merged['neutral'] = df_merged['neutral'].astype(int)

# ========= Select Features =========
# Model inputs: the three engineered columns followed by every column whose
# name starts with 'tournament_', in the order they appear in df_merged.
engineered_features = ['rank_difference', 'points_difference', 'neutral']
tournament_features = list(
    filter(lambda col: col.startswith('tournament_'), df_merged.columns)
)
features = [*engineered_features, *tournament_features]

# Independent copies of the feature matrix and both targets.
X = df_merged.loc[:, features].copy()
y_clf = df_merged.loc[:, 'home_win'].copy()
y_reg = df_merged.loc[:, 'score_difference'].copy()

# ========= Train/Test Split =========
# One shuffled 80/20 split shared by both targets so the classification and
# regression models see the same train and test rows.
(
    X_train, X_test,
    y_train_clf, y_test_clf,
    y_train_reg, y_test_reg,
) = train_test_split(X, y_clf, y_reg, random_state=42, test_size=0.2)

# ========= Standardize =========
# Fit the scaler on the training rows only, then apply that same fitted
# transform to both splits. Results are wrapped back into DataFrames so the
# feature names are retained.
scaler = StandardScaler().fit(X_train)
X_train_scaled = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

# ========= Train Models =========
# Classifier predicts home_win; regressor predicts score_difference.
# Both use a fixed seed.
model_clf = LogisticRegression(max_iter=1000, random_state=42)
model_reg = RandomForestRegressor(random_state=42)

model_clf.fit(X=X_train_scaled, y=y_train_clf)
model_reg.fit(X=X_train_scaled, y=y_train_reg)

print("‚úÖ Models trained successfully!")

# ========= Evaluate Models =========
# Both models are scored on the standardized test split.

# --- Regression Evaluation (Score Difference Prediction) ---
y_pred_reg = model_reg.predict(X_test_scaled)
mse = mean_squared_error(y_true=y_test_reg, y_pred=y_pred_reg)
rmse = np.sqrt(mse)
r2 = r2_score(y_true=y_test_reg, y_pred=y_pred_reg)

# One print call per report; sep="\n" puts each piece on its own line.
print(
    "\nüéØ Random Forest Regression Performance:",
    f"  MSE  : {mse:.4f}",
    f"  RMSE : {rmse:.4f}",
    f"  R¬≤   : {r2:.4f}",
    sep="\n",
)

# --- Classification Evaluation (Home Win Prediction) ---
y_pred_clf = model_clf.predict(X_test_scaled)
acc = accuracy_score(y_true=y_test_clf, y_pred=y_pred_clf)
f1 = f1_score(y_true=y_test_clf, y_pred=y_pred_clf)
cm = confusion_matrix(y_true=y_test_clf, y_pred=y_pred_clf)

print(
    "\nüèÜ Logistic Regression Classification Performance:",
    f"  Accuracy : {acc:.4f}",
    f"  F1 Score : {f1:.4f}",
    "  Confusion Matrix:",
    cm,
    sep="\n",
)

# ========= Show top rows for reference =========
# display() is supplied by the IPython/Jupyter environment.
print("\nüìä Sample merged data preview:")
display(df_merged.head(5))


‚úÖ CSVs loaded successfully!
‚úÖ Models trained successfully!

üéØ Random Forest Regression Performance:
  MSE  : 3.3756
  RMSE : 1.8373
  R¬≤   : 0.2960

üèÜ Logistic Regression Classification Performance:
  Accuracy : 0.7314
  F1 Score : 0.7186
  Confusion Matrix:
[[616 175]
 [251 544]]

üìä Sample merged data preview:


Unnamed: 0,date,home_team,away_team,home_score,away_score,city,country,neutral,home_team_fifa_name,home_rank,...,tournament_Three Nations Cup,tournament_Tri Nation Tournament,tournament_Tri-Nations Series,tournament_UEFA Euro,tournament_UEFA Euro qualification,tournament_UEFA Nations League,tournament_UNCAF Cup,tournament_Unity Cup,tournament_WAFF Championship,tournament_Windward Islands Tournament
0,2015-09-05,Chile,Paraguay,3,2,Santiago,Chile,0,Chile,43.0,...,False,False,False,False,False,False,False,False,False,False
1,2015-09-05,Ukraine,Belarus,3,1,Lviv,Ukraine,0,Ukraine,25.0,...,False,False,False,False,True,False,False,False,False,False
2,2015-09-05,Tanzania,Nigeria,0,0,Dar es Salaam,Tanzania,0,Tanzania,113.0,...,False,False,False,False,False,False,False,False,False,False
3,2015-09-05,Switzerland,Slovenia,3,2,Basel,Switzerland,0,Switzerland,15.0,...,False,False,False,False,True,False,False,False,False,False
4,2015-09-05,Spain,Slovakia,2,0,Oviedo,Spain,0,Spain,3.0,...,False,False,False,False,True,False,False,False,False,False
