In [20]:
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler

In [6]:
df = pd.read_csv('/content/team_features.csv')
df.head()

Unnamed: 0,date,team,opponent,team_score,opponent_score,win,neutral,avg_goals_last_5,win_rate_last_5,avg_goals_conceded_last_5,goal_diff,avg_goal_diff_last_5,days_since_last_match
0,2010-01-02,Iran,North Korea,1.0,0.0,1,True,,,,1.0,,
1,2010-01-02,Mali,Qatar,0.0,0.0,0,False,,,,0.0,,
2,2010-01-02,North Korea,Iran,0.0,1.0,0,True,,,,-1.0,,
3,2010-01-02,Qatar,Mali,0.0,0.0,0,False,,,,0.0,,
4,2010-01-02,Syria,Zimbabwe,6.0,0.0,1,True,,,,6.0,,


In [7]:
df.isna().sum()

Unnamed: 0,0
date,0
team,0
opponent,0
team_score,0
opponent_score,0
win,0
neutral,0
avg_goals_last_5,309
win_rate_last_5,309
avg_goals_conceded_last_5,309


In [8]:
FEATURES = ["avg_goals_last_5", "win_rate_last_5", "neutral", "avg_goals_conceded_last_5",
            "avg_goal_diff_last_5"]

TARGET = "win"
df_model = df.dropna(subset=FEATURES + ["win"]).copy()

In [9]:
scaler = StandardScaler()
df_model[FEATURES] = scaler.fit_transform(df_model[FEATURES])

In [10]:
train_df = df_model[df_model["date"] < "2019-01-01"]
test_df  = df_model[df_model["date"] >= "2019-01-01"]

X_train = train_df[FEATURES]
y_train = train_df[TARGET]

X_test = test_df[FEATURES]
y_test = test_df[TARGET]

In [11]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_proba))

Accuracy: 0.6233005333133028
ROC AUC: 0.6213775419608948


In [12]:
coef_df = pd.DataFrame({
    "feature": FEATURES,
    "coefficient": model.coef_[0]
}).sort_values(by="coefficient", ascending=False)

coef_df

Unnamed: 0,feature,coefficient
4,avg_goal_diff_last_5,0.14374
0,avg_goals_last_5,0.093878
1,win_rate_last_5,0.028106
2,neutral,-0.020492
3,avg_goals_conceded_last_5,-0.136348


In [13]:
def predict_win_probability(team_features, model):
    """
    team_features: pandas DataFrame with 1 row
    model: trained sklearn pipeline
    """
    return model.predict_proba(team_features)[0, 1]


In [14]:
latest_features = df_model.sort_values(by="date").groupby("team").tail(1).set_index("team")
latest_features

Unnamed: 0_level_0,date,opponent,team_score,opponent_score,win,neutral,avg_goals_last_5,win_rate_last_5,avg_goals_conceded_last_5,goal_diff,avg_goal_diff_last_5,days_since_last_match
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Rhodes,2011-06-27,Jersey,0.0,2.0,0,1.531517,0.764645,2.406431,-0.386087,-2.0,0.701109,1.0
Kiribati,2011-09-05,Tahiti,1.0,17.0,0,1.531517,-1.223277,-1.500392,9.015116,-16.0,-6.565510,2.0
Western Sahara,2012-06-09,Occitania,1.0,3.0,0,1.531517,1.361022,0.453019,2.054610,-2.0,-0.529528,1.0
Saint Pierre and Miquelon,2012-09-28,New Caledonia,1.0,16.0,0,1.531517,-1.382311,-1.500392,9.593652,-15.0,-7.034324,2.0
Provence,2013-02-13,Monaco,6.0,1.0,1,-0.652947,4.104355,0.062337,1.132569,5.0,1.685619,249.0
...,...,...,...,...,...,...,...,...,...,...,...,...
Estonia,2025-11-18,Cyprus,4.0,2.0,1,-0.652947,-0.905209,-1.500392,1.349520,2.0,-1.408554,5.0
Ecuador,2025-11-18,New Zealand,2.0,0.0,1,1.531517,-0.905209,-0.719028,-1.036939,2.0,0.138532,5.0
India,2025-11-18,Bangladesh,0.0,1.0,0,-0.652947,-1.143760,-1.500392,-0.169136,-1.0,-0.564689,35.0
Laos,2025-11-19,Vietnam,0.0,2.0,0,-0.652947,-0.666659,-0.719028,2.000373,-2.0,-1.689843,36.0


In [15]:
df_model.columns

Index(['date', 'team', 'opponent', 'team_score', 'opponent_score', 'win',
       'neutral', 'avg_goals_last_5', 'win_rate_last_5',
       'avg_goals_conceded_last_5', 'goal_diff', 'avg_goal_diff_last_5',
       'days_since_last_match'],
      dtype='object')

In [18]:
def simulate_match(team_a, team_b, team_table, model):
  row_a = team_table.loc[team_a]
  row_b = team_table.loc[team_b]

  feature_values = {
    "avg_goals_last_5": row_a["avg_goals_last_5"] - row_b["avg_goals_last_5"],
    "win_rate_last_5": row_a["win_rate_last_5"] - row_b["win_rate_last_5"],
    "neutral": 1, # Neutral is a constant feature and should be handled accordingly
    "avg_goals_conceded_last_5": row_a["avg_goals_conceded_last_5"] - row_b["avg_goals_conceded_last_5"],
    "avg_goal_diff_last_5": row_a["avg_goal_diff_last_5"] - row_b["avg_goal_diff_last_5"]
  }

  # Ensure the feature_diff DataFrame has columns in the same order as FEATURES
  # FEATURES list is defined as: ['avg_goals_last_5', 'win_rate_last_5', 'neutral', 'avg_goals_conceded_last_5', 'avg_goal_diff_last_5']
  feature_diff = pd.DataFrame([feature_values], columns=FEATURES)

  print(feature_diff)
  win_prob = model.predict_proba(feature_diff)[0, 1]
  print(win_prob)
  return team_a if np.random.rand() < win_prob else team_b

In [19]:
print(simulate_match("Brazil", "Germany", latest_features, model))

   avg_goals_last_5  win_rate_last_5  neutral  avg_goals_conceded_last_5  \
0          0.477101        -0.781365        1                   0.216951   

   avg_goal_diff_last_5  
0              0.140644  
0.3773353192655765
Germany


In [21]:
joblib.dump(model, 'soccer_model.pkl')
joblib.dump(scaler, 'soccer_scaler.pkl')
print("Model and Scaler saved as .pkl files")

Model and Scaler saved as .pkl files
