In [39]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
%matplotlib inline

In [40]:
url = 'https://raw.githubusercontent.com/bechosen-spec/Women-Football-Result-Prediction/main/cleaned_results.csv'
df = pd.read_csv(url)

In [41]:
df.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1969-11-01,Italy,France,1,0,Euro,Novara,Italy,False
1,1969-11-01,Denmark,England,4,3,Euro,Aosta,Italy,True
2,1969-11-02,England,France,2,0,Euro,Turin,Italy,True
3,1969-11-02,Italy,Denmark,3,1,Euro,Turin,Italy,False
4,1975-08-25,Thailand,Australia,3,2,AFC Championship,Hong Kong,Hong Kong,True


In [42]:
df['date'] = pd.to_datetime(df['date'])
df['year'] = df['date'].dt.year
df.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,year
0,1969-11-01,Italy,France,1,0,Euro,Novara,Italy,False,1969
1,1969-11-01,Denmark,England,4,3,Euro,Aosta,Italy,True,1969
2,1969-11-02,England,France,2,0,Euro,Turin,Italy,True,1969
3,1969-11-02,Italy,Denmark,3,1,Euro,Turin,Italy,False,1969
4,1975-08-25,Thailand,Australia,3,2,AFC Championship,Hong Kong,Hong Kong,True,1975


In [43]:
features_df = df.drop(columns=["home_score", "away_score", "date"])
y_home = df["home_score"]
y_away = df["away_score"]

In [44]:
!pip install category_encoders


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [45]:
import category_encoders

In [46]:
### Frequency Encoding
features_df_copy = features_df.copy()
columns_list = features_df.columns

#encode all columns
for col in columns_list:
  #encoder
  encoder = category_encoders.CountEncoder(cols = col, normalize=True)

  ##fit and transform
  features_df[col] = encoder.fit_transform(features_df[col])

features_df.head()

Unnamed: 0,home_team,away_team,tournament,city,country,neutral,year
0,0.022318,0.021908,0.004095,0.000205,0.016994,0.556511,0.000819
1,0.02457,0.02068,0.004095,0.000205,0.016994,0.443489,0.000819
2,0.022113,0.021908,0.004095,0.000614,0.016994,0.443489,0.000819
3,0.022318,0.030917,0.004095,0.000614,0.016994,0.556511,0.000819
4,0.010442,0.013309,0.054873,0.011671,0.011671,0.443489,0.002048


## Hypertuning parameters for home score

In [47]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [48]:
X_train, X_test, y_train, y_test = train_test_split(features_df, y_home, test_size=0.2, random_state=42)

In [49]:
model = RandomForestRegressor(random_state=42)

In [50]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

In [51]:
print("Mean-squared error:", mse)

Mean-squared error: 4.04388146928473


In [52]:
r2 = r2_score(y_test, y_pred)
print("R-squared score:", r2)

R-squared score: 0.44605494687083147


In [53]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

In [54]:
#Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]}
scoring = {
    'MSE': make_scorer(mean_squared_error),
    'R2': make_scorer(r2_score)}
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, refit='MSE', cv=5)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_mse = grid_search.best_score_
best_r2 = grid_search.cv_results_['mean_test_R2'][grid_search.best_index_]
best_model = RandomForestRegressor(**best_params)
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Best parameters:", best_params)
print("Best MSE:", best_mse)
print("Best R2:", best_r2)
print("MSE on test set:", mse)
print("R2 on test set:", r2)

Best parameters: {'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}
Best MSE: 4.526806521611325
Best R2: 0.39976987276085557
MSE on test set: 4.077894802786777
R2 on test set: 0.4413956812674743


In [55]:
model = RandomForestRegressor(n_estimators=100,max_depth=5, min_samples_split=10, min_samples_leaf=1, random_state=42)

In [56]:
import joblib


model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean-squared error:", mse)
print("R-squared score:", r2)

# Export the model
# joblib.dump(model, 'home_score_model.pkl')

Mean-squared error: 4.0753167965730155
R-squared score: 0.44174882559178175


In [57]:
X_train, X_test, y_train, y_test = train_test_split(features_df, y_away, test_size=0.2, random_state=42)

In [58]:
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean-squared error:", mse)
r2 = r2_score(y_test, y_pred)
print("R-squared score:", r2)

Mean-squared error: 2.8864719026174868
R-squared score: 0.24106995899253936


## Hypertuning parameters for away score

In [59]:
import time

In [60]:
#Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]}
scoring = {
    'MSE': make_scorer(mean_squared_error),
    'R2': make_scorer(r2_score)}
print(time.time())
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, refit='MSE', cv=5)
grid_search.fit(X_train, y_train)
print(time.time())
best_params = grid_search.best_params_
best_mse = grid_search.best_score_
best_r2 = grid_search.cv_results_['mean_test_R2'][grid_search.best_index_]
best_model = RandomForestRegressor(**best_params)
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Best parameters:", best_params)
print("Best MSE:", best_mse)
print("Best R2:", best_r2)
print("MSE on test set:", mse)
print("R2 on test set:", r2)

1687287263.8685
1687287805.8813906
Best parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best MSE: 2.8407984736759015
Best R2: 0.27435718364502354
MSE on test set: 2.899961061815753
R2 on test set: 0.23752330117327802


In [61]:
model = RandomForestRegressor(n_estimators=100,max_depth=None, min_samples_split=2, min_samples_leaf=1, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean-squared error:", mse)
print("R-squared score:", r2)

# Export the model
# joblib.dump(model, 'away_score_model.pkl')

Mean-squared error: 2.8864719026174868
R-squared score: 0.24106995899253936


## Predicting the score

In [62]:
def prediction(X):
    hX_train, hX_test, hy_train, hy_test = train_test_split(features_df, y_home, test_size=0.2, random_state=42)
    aX_train, aX_test, ay_train, ay_test = train_test_split(features_df, y_away, test_size=0.2, random_state=42)
    hmodel = RandomForestRegressor(n_estimators=100,max_depth=5, min_samples_split=10, min_samples_leaf=1, random_state=42)
    amodel = RandomForestRegressor(n_estimators=100,max_depth=None, min_samples_split=2, min_samples_leaf=1, random_state=42)
    hmodel.fit(hX_train, hy_train)
    amodel.fit(aX_train, ay_train)
    hy_pred = hmodel.predict(X)
    ay_pred = amodel.predict(X)
    print("Score:", np.round(hy_pred).astype(int)
, ":",np.round(ay_pred).astype(int)
 )


In [63]:
X=features_df.iloc[0]
X=pd.DataFrame(X)
a=X.values
values= a.flatten().tolist()
values

[0.02231777231777232,
 0.02190827190827191,
 0.004095004095004095,
 0.00020475020475020476,
 0.016994266994266993,
 0.5565110565110565,
 0.000819000819000819]

In [64]:
input_row = pd.DataFrame([values], columns=['home_team', 'away_team', 'tournament', 'city', 'country','neutral','year'])


In [65]:
prediction(input_row)

Score: [1] : [1]


In [66]:
prediction(input_row)

Score: [1] : [1]


In [67]:
prediction(input_row)

Score: [1] : [1]


In [68]:
prediction(input_row)

Score: [1] : [1]


Predicting the score

In [86]:
import pandas as pd
import category_encoders
import joblib

# Define the input features
features = ["Italy", "England", "Euro", "Novara", "Italy", False, 1969]

# Convert the features to a DataFrame
features_df = pd.DataFrame([features], columns=["home_team", "away_team", "tournament", "city", "country", "neutral", "year"])

# Load the trained models
home_model = joblib.load("/content/away_score_model.pkl")
away_model = joblib.load("/content/home_score_model.pkl")

# Create a function to encode features using frequency encoding
def encode_features(features):
    features_df = pd.DataFrame([features], columns=["home_team", "away_team", "tournament", "city", "country", "neutral", "year"])

    # Frequency Encoding
    features_df_copy = features_df.copy()
    columns_list = features_df.columns

    # Encode all columns
    for col in columns_list:
        # Encoder
        encoder = category_encoders.CountEncoder(cols=col, normalize=True)

        # Fit and transform
        features_df[col] = encoder.fit_transform(features_df[col])

    return features_df

# Encode the input features
features_df_encoded = encode_features(features)

# Make predictions using the models
home_score = home_model.predict(features_df_encoded)
away_score = away_model.predict(features_df_encoded)

# Print the predicted scores
print(features[0], int(home_score[0]), ":", int(away_score[0]), features[1])

# Determine the winner
if home_score[0] > away_score[0]:
    print(features[0], "wins!")
else:
    print(features[1], "wins!")


Italy 2 : 1 England
Italy wins!
