# Football Match Result Prediction Using Gaussian NB
### Daniyal Mehraeen

### Part a)

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.naive_bayes import BernoulliNB
from sklearn.preprocessing import LabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Load the five raw tables; the first CSV column of each file becomes the index.
teams, team_attributes, matches, players, player_attributes = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "./Team.csv",
        "./Team_Attributes.csv",
        "./match.csv",
        "./player.csv",
        "./player_Attributes.csv",
    )
)

In [None]:
# Preview the first rows of the teams table.
teams.head()

In [None]:
# Preview the first rows of the team attributes table.
team_attributes.head()

In [None]:
# Preview the first rows of the matches table.
matches.head()

In [None]:
# Preview the first rows of the players table.
players.head()

In [None]:
# Preview the first rows of the player attributes table.
player_attributes.head()

In [None]:
# Reserve "match_result" as the first column of matches; it starts out as None.
matches.insert(loc=0, column="match_result", value=None)

In [None]:
# Label every match from the home team's point of view by comparing goal counts.
home_goals = matches["home_team_goal"]
away_goals = matches["away_team_goal"]

for outcome, outcome_mask in (
    ("Win", home_goals > away_goals),
    ("Draw", home_goals == away_goals),
    ("Lose", home_goals < away_goals),
):
    matches.loc[outcome_mask, "match_result"] = outcome

In [None]:
# Preview matches again, now including the match_result column.
matches.head()

### Part b)

#### Preprocessing the data

In [None]:
# Converting dates in all tables into whole days elapsed since 1970-01-01

time_origin = datetime(1970, 1, 1)
DATE_FORMAT = "%Y-%m-%d %H:%M:%S"


def _days_since_origin(date_strings):
    """Return the number of whole days between ``time_origin`` and each date.

    ``date_strings`` is a Series of strings formatted as ``DATE_FORMAT``.
    Parsing is vectorized instead of calling ``strptime`` once per row.
    """
    parsed = pd.to_datetime(date_strings, format=DATE_FORMAT)
    return (parsed - time_origin).dt.days


player_attributes["date"] = _days_since_origin(player_attributes["date"])

matches["date"] = _days_since_origin(matches["date"])

players["birthday"] = _days_since_origin(players["birthday"])

team_attributes["date"] = _days_since_origin(team_attributes["date"])

In [None]:
# Percentage of missing values per column, relative to the total number of rows.
# (Dividing by count() would use only the non-null rows and overstate the share.)
player_attributes.isna().mean() * 100

In [None]:
# Remove the attacking_work_rate column from player_attributes in place.
player_attributes.drop(columns=["attacking_work_rate"], inplace=True)

In [None]:
# Replace missing numeric player attributes with the mean of their column.
numerical_columns = player_attributes.select_dtypes(include=[float, int]).columns
column_means = player_attributes[numerical_columns].mean()
player_attributes[numerical_columns] = player_attributes[numerical_columns].fillna(column_means)

In [None]:
# Show which dribbling classes the rows with a missing dribbling score belong to,
# followed by the overall class distribution for comparison.
missing_dribbling = team_attributes["buildUpPlayDribbling"].isna()
print(team_attributes.loc[missing_dribbling, "buildUpPlayDribblingClass"].value_counts())
print()
print(team_attributes["buildUpPlayDribblingClass"].value_counts())

# Mean dribbling score of every dribbling class
mean_values = team_attributes.groupby("buildUpPlayDribblingClass")["buildUpPlayDribbling"].mean()

# Fill only the buildUpPlayDribbling column, using the mean of each row's own
# class. A frame-wide fillna(mean_values["Little"]) would also overwrite NaNs in
# every other column with an unrelated dribbling score.
team_attributes["buildUpPlayDribbling"] = team_attributes["buildUpPlayDribbling"].fillna(
    team_attributes["buildUpPlayDribblingClass"].map(mean_values)
)

In [None]:
# Count the missing values in each column of matches.
matches.isna().sum()

In [None]:
# Work on a copy so the enrichment steps below leave matches untouched.
data = matches.copy()

Adding, for each match, the team attributes recorded closest to the match date (for both the home and the away team)

In [None]:
def get_latest_team_rating(team_id, date):
    """Return the attribute row of ``team_id`` whose date is closest to ``date``.

    Rows are looked up in the module-level ``team_attributes`` frame. "Closest"
    is the smallest absolute difference, so the chosen record may be dated
    before or after ``date``.
    """
    belongs_to_team = team_attributes["team_api_id"] == team_id
    candidate_rows = team_attributes[belongs_to_team]
    date_gap = (candidate_rows["date"] - date).abs()
    return candidate_rows.loc[date_gap.idxmin()]

# Names of the new per-match columns, one per team attribute column
home_team_columns_to_edit = ["home_team_" + col for col in team_attributes.columns]
away_team_columns_to_edit = ["away_team_" + col for col in team_attributes.columns]

# Create empty placeholder columns directly after each team's id column.
# The home columns are inserted first, so the away position is computed on the
# already-widened frame.
for id_column, new_columns in (
    ("home_team_api_id", home_team_columns_to_edit),
    ("away_team_api_id", away_team_columns_to_edit),
):
    insert_position = list(data.columns).index(id_column) + 1
    for offset, col_name in enumerate(new_columns):
        data.insert(insert_position + offset, col_name, None)


# Fill the placeholders with the attribute record closest in date to each match
data[home_team_columns_to_edit] = data.apply(
    lambda match: get_latest_team_rating(team_id=match["home_team_api_id"], date=match["date"]),
    axis=1,
    result_type='expand',
)
data[away_team_columns_to_edit] = data.apply(
    lambda match: get_latest_team_rating(team_id=match["away_team_api_id"], date=match["date"]),
    axis=1,
    result_type='expand',
)


Adding, for each match, the attributes of every player recorded closest to the match date

In [None]:
# Attach the columns of the players table (minus its "id") to every attribute record
player_details = players.drop(columns="id")
player_attributes = player_attributes.merge(
    player_details,
    how="inner",
    on=["player_api_id", "player_fifa_api_id"],
)

In [None]:
# Preview player_attributes after merging in the players table.
player_attributes.head()

In [None]:
# Adding the player stats of home teams
num_team_players = 11

for i in range(num_team_players):

    # Column of `data` that holds the id of this home player
    column_name = f"home_player_{i+1}"
    prefix = f"{column_name}_"

    # Every player column gets the player's prefix. "date" exists in both
    # frames, so the merge suffixes it: date_x is the match date (restored to
    # "date") and date_y is the date of the player's attribute record.
    renamed_col_dict = {col: prefix + col for col in player_attributes.columns if col != "date"}
    renamed_col_dict["date_x"] = "date"
    renamed_col_dict["date_y"] = prefix + "date"

    # Pair each match with every attribute record of this player.
    # The inner join discards matches whose player id has no attribute record.
    data = pd.merge(
        data, player_attributes, left_on=column_name, right_on='player_api_id', how='inner'
    ).rename(columns=renamed_col_dict)

    # Keep, per match, only the record whose date is closest to the match date
    data["threshold"] = np.abs(data["date"] - data[prefix + "date"])
    data = data.loc[data.groupby("match_id")["threshold"].idxmin()]

# "threshold" is only a selection helper; remove it so it does not end up
# among the model features.
data = data.drop(columns="threshold")

data.head()


In [None]:
# Adding player stats of away teams
num_team_players = 11

for i in range(num_team_players):

    # Column of `data` that holds the id of this away player
    column_name = f"away_player_{i+1}"
    prefix = f"{column_name}_"

    # Every player column gets the player's prefix. "date" exists in both
    # frames, so the merge suffixes it: date_x is the match date (restored to
    # "date") and date_y is the date of the player's attribute record.
    renamed_col_dict = {col: prefix + col for col in player_attributes.columns if col != "date"}
    renamed_col_dict["date_x"] = "date"
    renamed_col_dict["date_y"] = prefix + "date"

    # Pair each match with every attribute record of this player.
    # The inner join discards matches whose player id has no attribute record.
    data = pd.merge(
        data, player_attributes, left_on=column_name, right_on='player_api_id', how='inner'
    ).rename(columns=renamed_col_dict)

    # Keep, per match, only the record whose date is closest to the match date
    data["threshold"] = np.abs(data["date"] - data[prefix + "date"])
    data = data.loc[data.groupby("match_id")["threshold"].idxmin()]

# "threshold" is only a selection helper; remove it so it does not end up
# among the model features.
data = data.drop(columns="threshold")


data.head()


Removing all nonsensical columns such as IDs, names, and extra dates

In [None]:
# Total number of missing values across the whole data frame.
data.isna().sum().sum()

#### Extracting new features

In [55]:
# Function to calculate the share of a team's earlier home matches with a given result
def calculate_result_ratio(row, result, default=0.0):
    """Return the fraction of the home team's earlier home matches ending in ``result``.

    Earlier matches are the rows of the module-level ``data`` frame that have
    the same ``home_team_api_id`` as ``row`` and a strictly smaller ``date``.

    Args:
        row: One match (a row of ``data``).
        result: The ``match_result`` value to count, e.g. ``"Win"``.
        default: Value returned when the team has no earlier home match.

    Returns:
        A float in [0, 1], or ``float(default)`` when there is no history.
        The current match's own result is never used: it is the prediction
        target, so reading it here would leak the label into the feature and
        mix booleans with floats in the resulting column.
    """
    is_earlier_home_match = (
        (data["home_team_api_id"] == row["home_team_api_id"])
        & (data["date"] < row["date"])
    )
    previous_results = data.loc[is_earlier_home_match, "match_result"]

    if previous_results.empty:
        return float(default)

    return float((previous_results == result).mean())

# Compute, for every match, the "Win" ratio of the home team's earlier home matches
data['home_team_win_ratio'] = data.apply(calculate_result_ratio, axis=1, args=("Win",))

Discretizing data using quantiles for naive bayes

In [None]:
def discretize_quantiles(df, num_bins=5):
    """Return a copy of ``df`` with its numeric columns replaced by quantile bin codes.

    Every numeric column except ``"date"`` is passed through ``pd.qcut`` with
    ``num_bins`` quantiles and replaced by the integer bin index. Duplicate
    bin edges are dropped, so a column may end up with fewer than ``num_bins``
    distinct codes. Non-numeric columns and ``"date"`` are left unchanged, and
    ``df`` itself is not modified.
    """
    discretized_df = df.copy()

    # A pandas Index has no .remove(); build a plain list and leave out "date"
    # (this also works when the frame has no "date" column at all).
    numerical_cols = [
        col for col in df.select_dtypes(include=[np.number]).columns if col != "date"
    ]

    for col in numerical_cols:
        discretized_df[col] = pd.qcut(df[col], num_bins, labels=False, duplicates="drop")

    return discretized_df

# Build the quantile-discretized version of the data set (10 bins) and preview it.
discretized_data = discretize_quantiles(data, num_bins=10)
discretized_data.head()

In [None]:
# Underscore-separated name tokens that mark identifier, name and date columns.
# Matching whole tokens (rather than substrings) keeps real features whose name
# merely contains "id", e.g. "..._sliding_tackle" or "..._defenceTeamWidth".
non_feature_tokens = {"id", "name", "date"}

# match_result is derived directly from these two columns, so leaving them in
# would hand the label to the classifiers.
label_source_columns = {"home_team_goal", "away_team_goal"}

columns_to_drop = [
    col
    for col in data.columns
    if non_feature_tokens & set(col.split("_")) or col in label_source_columns
]

data = data.drop(columns=columns_to_drop)
discretized_data = discretized_data.drop(columns=columns_to_drop)

data.shape

### Part d)

In [None]:
from sklearn.preprocessing import LabelEncoder

encoded_data = data.copy()
encoded_discretized_data = discretized_data.copy()

# One fitted encoder per text column, kept so that the integer codes
# (e.g. of match_result) can be mapped back to their labels later.
label_encoders = {}

for column in encoded_data.columns:
    if encoded_data[column].dtype != 'object':
        continue

    le = LabelEncoder()

    # Fit on the raw values once, then reuse the same mapping for the
    # discretized frame so both frames share identical codes.
    encoded_data[column] = le.fit_transform(data[column])
    encoded_discretized_data[column] = le.transform(discretized_data[column])
    label_encoders[column] = le


In [None]:
# Hold out the rows with the largest season value as test data; train on the rest
max_value = encoded_data["season"].max()

in_test_season = encoded_data["season"] == max_value
in_test_season_discretized = encoded_discretized_data["season"] == max_value

test_data = encoded_data[in_test_season]
train_data = encoded_data[~in_test_season]

test_data_discretized = encoded_discretized_data[in_test_season_discretized]
train_data_discretized = encoded_discretized_data[~in_test_season_discretized]

In [None]:
# Separate the features from the target column for the plain data set ...
y_train = train_data["match_result"]
y_test = test_data["match_result"]
X_train = train_data.drop(columns="match_result")
X_test = test_data.drop(columns="match_result")

# ... and for the discretized data set
y_train_gnb = train_data_discretized["match_result"]
y_test_gnb = test_data_discretized["match_result"]
X_train_gnb = train_data_discretized.drop(columns="match_result")
X_test_gnb = test_data_discretized.drop(columns="match_result")

In [None]:
fig, axes = plt.subplots(1, 3, figsize=(12, 4))

# One histogram of the encoded match result per panel: full data, train, test
panels = (
    (encoded_data["match_result"], 'Dataset Match Result Freq.'),
    (y_train, 'Train Data Match Result Freq.'),
    (y_test, 'Test Data Match Result Freq.'),
)

for axis, (labels, title) in zip(axes, panels):
    sns.histplot(data=labels, ax=axis)
    axis.set_title(title)

# Keep the subplots from overlapping, then render the figure
plt.tight_layout()
plt.show()

### Part e)

In [None]:
# Gaussian Naive Bayes, trained and evaluated on the discretized split
naive_bayes = GaussianNB()
naive_bayes.fit(X_train_gnb, y_train_gnb)

# Predict the held-out season and score the predictions
nb_predictions = naive_bayes.predict(X_test_gnb)
nb_accuracy = accuracy_score(y_test_gnb, nb_predictions)

print(f"Naive Bayes Accuracy: {nb_accuracy*100: .2f}%")

In [None]:
# L1-regularized logistic regression (liblinear solver) on the non-discretized split
logistic_regression = LogisticRegression(solver="liblinear", penalty="l1", max_iter=100000)
logistic_regression.fit(X_train, y_train)

# Predict the held-out season and score the predictions
lr_predictions = logistic_regression.predict(X_test)
lr_accuracy = accuracy_score(y_true=y_test, y_pred=lr_predictions)

print(f"Logistic Regression Accuracy: {lr_accuracy*100: .2f}%")

### Part f)

In [None]:
X = encoded_discretized_data.drop(columns=["match_result"])
y = encoded_discretized_data["match_result"]

# Step 1: One-hot encode the discretized features so every column is binary,
# which is the input Bernoulli Naive Bayes models
encoder = OneHotEncoder()
X_encoded = encoder.fit_transform(X)

# Step 2: Convert the sparse encoded matrix to a dense array
X_encoded = X_encoded.toarray()

# Step 3: Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Step 4: Train the one-vs-rest classifier with Bernoulli Naive Bayes.
# The class labels are passed as-is: OneVsRestClassifier binarizes them
# internally and predicts the single highest-scoring class. Passing a
# pre-binarized label matrix would switch it to multilabel mode, where a
# sample can receive no label or several labels at once.
classifier = OneVsRestClassifier(BernoulliNB())
classifier.fit(X_train, y_train)

# Step 5: Make predictions on the test set
y_pred = classifier.predict(X_test)

# Step 6: Calculate the accuracy of the predicted labels
bernoulli_accuracy = accuracy_score(y_test, y_pred)
print(f"Bernoulli Naive Bayes Accuracy: {bernoulli_accuracy*100: .2f}%")

In [None]:
X = encoded_discretized_data.drop(columns="match_result")
y = encoded_discretized_data["match_result"]

# Split X_encoded (the one-hot feature matrix already present in the session)
# and the labels into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    random_state=42,
    test_size=0.2,
)

# Fit Multinomial Naive Bayes and predict the test set
classifier = MultinomialNB()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Score the predictions
multinomial_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Multinomial Naive Bayes Accuracy: {multinomial_accuracy*100: .2f}%")