# Project Football

In this project we checked historical data about football matches.

_SOL_: I made a change about some business questions.

## Data Collection

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# get data from kaggle
football_train_df = pd.read_csv('./data/football_train.csv')
football_target_df = pd.read_csv('./data/football_target.csv')

# print shapes
print("train shape:",football_train_df.shape)
print("target shape:",football_target_df.shape)


train shape: (110938, 190)
target shape: (110938, 3)


In [None]:
football_target_df.set_index('id', inplace=True)
football_train_df.set_index('id', inplace=True)

In [None]:
# join football_train_df and football_target_df (only score column)
football_target_df = football_target_df[['score']]
football_train_df = football_train_df.join(football_target_df)
print(football_train_df.shape)

(110938, 190)


## Data Preprocessing

### Data Cleaning

We remove, fill, replace, ...

In [None]:
# remove every team that has less than 10 matches
teams_less_10_matches = football_train_df['home_team_name'].value_counts() < 10
teams_less_10_matches = teams_less_10_matches[teams_less_10_matches].index

# remove teams from teams_less_10_matches
football_train_df = football_train_df[~football_train_df['home_team_name'].isin(teams_less_10_matches)]
football_train_df = football_train_df[~football_train_df['away_team_name'].isin(teams_less_10_matches)]
print(football_train_df.shape)

(79055, 190)


In [None]:
# Extract diccionary with the league_id and league_name and drop league_name from the dataframes
league_dict = football_train_df[['league_id', 'league_name']].drop_duplicates().set_index('league_id').to_dict()['league_name']
print(football_train_df.shape)


(110938, 190)


Se eliminaron los datos del coach debido a que habian muy poquitos que cumplian con esto y solamente añadia ruido \
Eliminamos todos los equipos que tuvieran menos de 10 partidos en el historico \
Eliminamos aquellas filas que tengan menos del 50% de información disponible


### Filtros opcionales (ejecutar sin eliminar los equipos)

In [None]:
# remove every column has "coach" in his name
football_train_df = football_train_df[football_train_df.columns.drop(list(football_train_df.filter(regex='coach', axis=1)))]
print(football_train_df.shape)

(110938, 168)


In [None]:
# remove every row has more than 50% of missing values
football_train_df = football_train_df.dropna(thresh=0.5*len(football_train_df), axis=1)
print(football_train_df.shape)


In [None]:
# remove every row has more than 70% of missing values
football_train_df = football_train_df.dropna(thresh=0.7*len(football_train_df.columns), axis=0)
print(football_train_df.shape)

In [None]:
# code to remove duplicates
football_train_df = football_train_df.drop_duplicates()

In [None]:
# imprimir columnas
football_train_df.columns.to_list()

['target',
 'home_team_name',
 'away_team_name',
 'match_date',
 'league_name',
 'league_id',
 'is_cup',
 'home_team_coach_id',
 'away_team_coach_id',
 'home_team_history_match_date_1',
 'home_team_history_match_date_2',
 'home_team_history_match_date_3',
 'home_team_history_match_date_4',
 'home_team_history_match_date_5',
 'home_team_history_match_date_6',
 'home_team_history_match_date_7',
 'home_team_history_match_date_8',
 'home_team_history_match_date_9',
 'home_team_history_match_date_10',
 'home_team_history_is_play_home_1',
 'home_team_history_is_play_home_2',
 'home_team_history_is_play_home_3',
 'home_team_history_is_play_home_4',
 'home_team_history_is_play_home_5',
 'home_team_history_is_play_home_6',
 'home_team_history_is_play_home_7',
 'home_team_history_is_play_home_8',
 'home_team_history_is_play_home_9',
 'home_team_history_is_play_home_10',
 'home_team_history_is_cup_1',
 'home_team_history_is_cup_2',
 'home_team_history_is_cup_3',
 'home_team_history_is_cup_4',
 'h

### Data Preparation

In [None]:
from sklearn.preprocessing import LabelEncoder

# change datetime format
# filter date columns and change datatype
for col in football_train_df.filter(regex='date', axis=1).columns:
    football_train_df[col] = pd.to_datetime(football_train_df[col])

# Set league_name as category
football_train_df['league_name'] = football_train_df['league_name'].astype('category')

# Label encoding target
le = LabelEncoder()
football_train_df['target'] = le.fit_transform(football_train_df['target'])

# Label encoding is_coup
football_train_df['is_cup'] = le.fit_transform(football_train_df['is_cup'])

# Separate score column # - # in two columns home_score and away_score and merge it with football_train_df
football_train_df['home_score'] = football_train_df['score'].str.split('-', expand=True)[0].astype(int)
football_train_df['away_score'] = football_train_df['score'].str.split('-', expand=True)[1].astype(int)
football_train_df.drop('score', axis=1, inplace=True)



## Feature Engineering


In [None]:
# Feature Engineering
football_reduced_df = football_train_df[[
    'home_team_name',
    'away_team_name',
    'match_date',
    'league_name',
    'is_cup',
    'home_score',
    'away_score',
    'target']].copy()

# Calculate last home team match
# find the days between the last match of the home team
football_reduced_df['home_days_betwent_last_match'] = (football_reduced_df['match_date'] - football_train_df['home_team_history_match_date_1']).dt.days
football_reduced_df['home_days_betwent_last_match'].head(10)

# home average goals in last 10 matches (columns home_team_history_goal_1 to home_team_history_goal_10)
football_reduced_df['home_avg_home_goal_last_10'] = football_train_df.filter(regex='home_team_history_goal', axis=1).mean(axis=1)
football_reduced_df['home_avg_opponent_goal_last_10'] = football_train_df.filter(regex='home_team_history_opponent_goal', axis=1).mean(axis=1)

football_reduced_df['away_avg_home_goal_last_10'] = football_train_df.filter(regex='away_team_history_goal', axis=1).mean(axis=1)
football_reduced_df['away_avg_opponent_goal_last_10'] = football_train_df.filter(regex='away_team_history_opponent_goal', axis=1).mean(axis=1)
# Aqui podemos hacer feature engineering para obtener la cantidad de partidos jugados por cada equipo

# Habra algun mes en donde el rendimiento se vea aectado?
# create columns by month based on date
football_reduced_df['month'] = football_reduced_df['match_date'].dt.month

# how many matches did win the home/away team in the last 10 matches
home_history_goals = football_train_df.filter(regex='home_team_history_goal', axis=1).to_numpy()
home_history_oponent_goals = football_train_df.filter(regex='home_team_history_opponent_goal', axis=1).to_numpy()
football_reduced_df['home_wins_last_10'] = (home_history_goals > home_history_oponent_goals).sum(axis=1)
football_reduced_df['home_dawn_last_10'] = (home_history_goals == home_history_oponent_goals).sum(axis=1)
football_reduced_df['home_lose_last_10'] = (home_history_goals < home_history_oponent_goals).sum(axis=1)

away_history_goals = football_train_df.filter(regex='away_team_history_goal', axis=1).to_numpy()
away_history_oponent_goals = football_train_df.filter(regex='away_team_history_opponent_goal', axis=1).to_numpy()
football_reduced_df['away_wins_last_10'] = (away_history_goals > away_history_oponent_goals).sum(axis=1)
football_reduced_df['away_dawn_last_10'] = (away_history_goals == away_history_oponent_goals).sum(axis=1)
football_reduced_df['away_lose_last_10'] = (away_history_goals < away_history_oponent_goals).sum(axis=1)

# mean team rating and oponent rating
football_reduced_df['home_avg_rating_last_10'] = football_train_df.filter(regex='home_team_history_rating', axis=1).mean(axis=1)
football_reduced_df['home_avg_opponent_rating_last_10'] = football_train_df.filter(regex='home_team_history_opponent_rating', axis=1).mean(axis=1)

football_reduced_df['away_avg_rating_last_10'] = football_train_df.filter(regex='away_team_history_rating', axis=1).mean(axis=1)
football_reduced_df['away_avg_opponent_rating_last_10'] = football_train_df.filter(regex='away_team_history_opponent_rating', axis=1).mean(axis=1)

print(football_reduced_df.shape)


(79055, 24)


In [None]:
football_reduced_df.columns

Index(['home_team_name', 'away_team_name', 'match_date', 'league_name',
       'is_cup', 'home_score', 'away_score', 'target',
       'home_days_betwent_last_match', 'home_avg_home_goal_last_10',
       'home_avg_opponent_goal_last_10', 'away_avg_home_goal_last_10',
       'away_avg_opponent_goal_last_10', 'month', 'home_wins_last_10',
       'home_dawn_last_10', 'home_lose_last_10', 'away_wins_last_10',
       'away_dawn_last_10', 'away_lose_last_10', 'home_avg_rating_last_10',
       'home_avg_opponent_rating_last_10', 'away_avg_rating_last_10',
       'away_avg_opponent_rating_last_10'],
      dtype='object')

## Model Selection

In [None]:
# get columns datatype
football_reduced_df.dtypes

# join football_reduced_df and football_train_df
football_train_df = football_train_df.join(football_reduced_df[[
       'home_days_betwent_last_match', 'home_avg_home_goal_last_10',
       'home_avg_opponent_goal_last_10', 'away_avg_home_goal_last_10',
       'away_avg_opponent_goal_last_10', 'month', 'home_wins_last_10',
       'home_dawn_last_10', 'home_lose_last_10', 'away_wins_last_10',
       'away_dawn_last_10', 'away_lose_last_10', 'home_avg_rating_last_10',
       'home_avg_opponent_rating_last_10', 'away_avg_rating_last_10',
       'away_avg_opponent_rating_last_10']])


In [None]:

print(football_train_df.shape)
#football_reduced_df.dropna(inplace=True)
print(football_reduced_df.shape)

(110938, 207)
(110938, 24)


In [None]:
# Split into test dataset and train dataset
from sklearn.model_selection import train_test_split


X = football_reduced_df.drop([
    'home_team_name',
    'away_team_name',
    'target',
    'match_date',
    'league_name',
    'home_score',
    'away_score'], axis=1)

Y = football_reduced_df['target']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)


### Test Linear Regression


### Test K-Nearest Neighbors

In [None]:
# Test KKN for classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

knn = KNeighborsClassifier(n_neighbors=20)
knn.fit(X_train, Y_train)

y_hat = knn.predict(X_test)
accuracy_score(Y_test, y_hat)

ValueError: Input X contains NaN.
KNeighborsClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

### Random Forest

In [None]:
# Random forest
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, Y_train)

y_hat = rf.predict(X_test)
accuracy_score(Y_test, y_hat)

0.46486829577911776

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


#scores = cross_val_score(rf, X, Y, cv=5, scoring='accuracy')
#print(scores)
# Random forest with original dropped data
#aux_df = football_train_df.dropna().copy()
se = StandardScaler()

aux_df = football_train_df.copy()
for col in aux_df.filter(regex='date', axis=1).columns:
    aux_df.drop(col, axis=1, inplace=True)

X = aux_df.drop([
    'home_team_name',
    'away_team_name',
    #'target',
    'league_name',
    'home_score',
    'away_score'], axis=1)

X = X.fillna(0)
Y = X['target']
X.drop('target', axis=1, inplace=True)

X = se.fit_transform(X)
Y = le.fit_transform(Y)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=42)


In [None]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=200)
rf.fit(X_train, Y_train)

y_hat = rf.predict(X_test)
accuracy_score(Y_test, y_hat)


0.4889129259058951

In [None]:
# Cross validation Random FOrest
from sklearn.model_selection import cross_val_score

scores = cross_val_score(rf, X, Y, cv=5, scoring='accuracy')
print(scores)

[1.         0.99986602 1.         1.         0.99993301]


In [None]:
print(type(scores))

<class 'numpy.ndarray'>


In [None]:
scores = cross_val_score(rf, X, Y, cv=5, scoring='accuracy')
print(scores)

|     | reducced | full_dropna_without_coach |fd_fill0_without_coach | Merged |
|-----|----------|---------------------------|-----------------------|--------|
| kn  | 0.44     |      0.39                 |          0.42         |        |
| rt  | 0.46     |      0.47                 |          0.46         |        |
|CV-kn|          |                           |          0.41         |        |
|CV-rt|          |                           |       0.99 - 1        |        |

In [None]:
print(X.shape)

(74641, 143)


In [None]:
knn = KNeighborsClassifier(n_neighbors=200)
knn.fit(X_train, Y_train)

y_hat = knn.predict(X_test)
accuracy_score(Y_test, y_hat)

0.42410311880295654

In [None]:
# knn cross validation
scores = cross_val_score(knn, X, Y, cv=5, scoring='accuracy')
print(scores)

[0.40980642 0.40159432 0.40507771 0.42825563 0.4249732 ]


In [None]:
scores.mean()

0.4139414548979496

### Test XGBoost for Regression

In [None]:
# code for a XGBoost regressor
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

xgb = XGBClassifier()

param_grid = {
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7, 9],
    'n_estimators': [50, 100, 200, 500],
    'subsample': [0.5, 0.7, 0.9, 1.0],
    'colsample_bytree': [0.5, 0.7, 0.9, 1.0],
    'gamma': [0, 0.1, 0.2, 0.3, 0.4],
    'reg_alpha': [0, 0.1, 1, 10],
    'reg_lambda': [0, 0.1, 1, 10]
}

random_search = RandomizedSearchCV(xgb, param_grid, n_iter=20, scoring='accuracy', cv=2, verbose=1, n_jobs=-1)
random_search.fit(X_train, Y_train)

print(f"Best parameters found by Grid Search: {random_search.best_params_}")
#y_hat = xgb.predict(X_test)
#accuracy_score(Y_test, y_hat)

Fitting 2 folds for each of 20 candidates, totalling 40 fits
Best parameters found by Grid Search: {'subsample': 0.9, 'reg_lambda': 10, 'reg_alpha': 0.1, 'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.1, 'gamma': 0.4, 'colsample_bytree': 0.5}


In [None]:
from xgboost import XGBClassifier

params = {'subsample': 0.9, 'reg_lambda': 10, 'reg_alpha': 0.1, 'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.1, 'gamma': 0.4, 'colsample_bytree': 0.5}
xgb = XGBClassifier(params)
xgb.fit(X_train, Y_train)

y_hat = xgb.predict(X_test)
accuracy_score(Y_test, y_hat)

0.4830539030106364

Un modelo aleatorio es mejor?

In [None]:
# generate pandas df with random integer values 0, 1 or 2
import numpy as np
df = pd.DataFrame(np.random.randint(0, 3, size=football_target_df.shape), columns=['random'])
print(football_train_df.shape)

# Count random column and target column and calculate accuracy
accuracy_score(df['random'], football_train_df['target'])


(110938, 191)


0.3338801853287422

In [None]:
# Red neuronal con Tenserflow
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# One hot encoding in Y
Y_train = to_categorical(Y_train)
Y_test = to_categorical(Y_test)

print(X_train.shape)
print(Y_train.shape)

ann = Sequential()
ann.add(Dense(units=8, activation='tanh', input_dim=X_train.shape[1]))
ann.add(Dense(units=8, activation='softmax'))
ann.add(Dense(units=8, activation='relu'))
ann.add(Dense(units=8, activation='softmax'))
ann.add(Dense(units=8, activation='relu'))
ann.add(Dense(units=8, activation='relu'))
ann.add(Dense(units=8, activation='tanh'))
ann.add(Dense(units=3, activation='relu'))

ann.compile( optimizer='adam',
            loss='categorical_crossentropy',
            metrics=['accuracy'] )

ann.fit(X_train, Y_train, epochs=20, batch_size=32)


(71149, 180)
(71149, 3)
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x78f5b2bcab90>

In [None]:
loss, accuracy = ann.evaluate(X_test, Y_test)
print("Accuracy:", accuracy)

Accuracy: 0.4786238372325897


### Models Comparison

In [None]:
# compare metrics

## Model Training

In [None]:
# prepare data for the model

# scaling data - additional encoder

# feature selection

# create model with hyperparameters

# train model

## Model Evaluation

In [None]:
# test metrics classical

# show plots about results

In [None]:
# scatter plot comparing real and predicted values

# line plot about real vs predicted

# box plot about real vs predicted

In [None]:
# pickle the model
import pickle

# save the model to disk
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

***

# Business Questions

1. What is the risk level for unpayments based on age and region?


In [None]:
# generate a table or a plot or a business metric

2. What is average amount of unpayments? what is the segment of values where unpayments are appearin?

In [None]:
# generate a table or a plot or a business metric