In [24]:
import numpy as np
import pandas as pd
import plotly.express as px

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import tensorflow as tf

In [27]:
data = pd.read_csv('/content/results.csv')

In [28]:
data

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1969-11-01,Italy,France,1,0,Euro,Novara,Italy,False
1,1969-11-01,Denmark,England,4,3,Euro,Aosta,Italy,True
2,1969-11-02,England,France,2,0,Euro,Turin,Italy,True
3,1969-11-02,Italy,Denmark,3,1,Euro,Turin,Italy,False
4,1975-08-25,Thailand,Australia,3,2,AFC Championship,Hong Kong,Hong Kong,True
...,...,...,...,...,...,...,...,...,...
4309,2022-02-22,France,Netherlands,3,1,Tournoi de France,Le Havre,France,False
4310,2022-02-23,New Zealand,Czech Republic,0,0,SheBelieves Cup,Frisco,United States,True
4311,2022-02-23,United States,Iceland,5,0,SheBelieves Cup,Frisco,United States,False
4312,2022-02-23,Sweden,Italy,1,1,Algarve Cup,Lagos,Portugal,True


In [29]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4314 entries, 0 to 4313
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   date        4314 non-null   object
 1   home_team   4314 non-null   object
 2   away_team   4314 non-null   object
 3   home_score  4314 non-null   int64 
 4   away_score  4314 non-null   int64 
 5   tournament  4314 non-null   object
 6   city        4314 non-null   object
 7   country     4314 non-null   object
 8   neutral     4314 non-null   bool  
dtypes: bool(1), int64(2), object(6)
memory usage: 274.0+ KB


In [30]:
data['year'] = data['date'].apply(lambda x: x[0:4])
data['month'] = data['date'].apply(lambda x: x[5:7])

data = data.drop('date', axis=1)

In [31]:
data

Unnamed: 0,home_team,away_team,home_score,away_score,tournament,city,country,neutral,year,month
0,Italy,France,1,0,Euro,Novara,Italy,False,1969,11
1,Denmark,England,4,3,Euro,Aosta,Italy,True,1969,11
2,England,France,2,0,Euro,Turin,Italy,True,1969,11
3,Italy,Denmark,3,1,Euro,Turin,Italy,False,1969,11
4,Thailand,Australia,3,2,AFC Championship,Hong Kong,Hong Kong,True,1975,08
...,...,...,...,...,...,...,...,...,...,...
4309,France,Netherlands,3,1,Tournoi de France,Le Havre,France,False,2022,02
4310,New Zealand,Czech Republic,0,0,SheBelieves Cup,Frisco,United States,True,2022,02
4311,United States,Iceland,5,0,SheBelieves Cup,Frisco,United States,False,2022,02
4312,Sweden,Italy,1,1,Algarve Cup,Lagos,Portugal,True,2022,02


In [32]:
data['home_victory'] = (data['home_score'] > data['away_score']).astype(np.int)

data = data.drop(['home_score', 'away_score'], axis=1)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  """Entry point for launching an IPython kernel.


In [33]:
data['neutral'] = data['neutral'].astype(np.int)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  """Entry point for launching an IPython kernel.


In [34]:
data

Unnamed: 0,home_team,away_team,tournament,city,country,neutral,year,month,home_victory
0,Italy,France,Euro,Novara,Italy,0,1969,11,1
1,Denmark,England,Euro,Aosta,Italy,1,1969,11,1
2,England,France,Euro,Turin,Italy,1,1969,11,1
3,Italy,Denmark,Euro,Turin,Italy,0,1969,11,1
4,Thailand,Australia,AFC Championship,Hong Kong,Hong Kong,1,1975,08,1
...,...,...,...,...,...,...,...,...,...
4309,France,Netherlands,Tournoi de France,Le Havre,France,0,2022,02,1
4310,New Zealand,Czech Republic,SheBelieves Cup,Frisco,United States,1,2022,02,0
4311,United States,Iceland,SheBelieves Cup,Frisco,United States,0,2022,02,1
4312,Sweden,Italy,Algarve Cup,Lagos,Portugal,1,2022,02,0


In [35]:
def onehot_encode(df, columns, prefixes):
    df = df.copy()
    for column, prefix in zip(columns, prefixes):
        dummies = pd.get_dummies(df[column], prefix=prefix)
        df = pd.concat([df, dummies], axis=1)
        df = df.drop(column, axis=1)
    return df

In [36]:
data = onehot_encode(
    data,
    ['home_team', 'away_team', 'tournament', 'city', 'country'],
    ['home', 'away', 'tourn', 'city', 'country']
)

In [37]:
data

Unnamed: 0,neutral,year,month,home_victory,home_Albania,home_Algeria,home_American Samoa,home_Andorra,home_Angola,home_Anguilla,...,country_Turkey,country_US Virgin Islands,country_Uganda,country_Ukraine,country_United Kingdom,country_United States,country_Vietnam,country_Wales,country_Zambia,country_Zimbabwe
0,0,1969,11,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1969,11,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1969,11,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1969,11,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1975,08,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4309,0,2022,02,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4310,1,2022,02,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4311,0,2022,02,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4312,1,2022,02,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
y = data.loc[:, 'home_victory']
X = data.drop('home_victory', axis=1)

In [39]:
scaler = StandardScaler()

X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=67)

In [41]:
X.shape

(4314, 1540)

In [42]:
y.mean()

0.5319888734353269

In [44]:
inputs = tf.keras.Input(shape=(1540,))
x = tf.keras.layers.Dense(64, activation='relu')(inputs)
x = tf.keras.layers.Dense(64, activation='relu')(x)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)

model = tf.keras.Model(inputs, outputs)


model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[
        'accuracy',
        tf.keras.metrics.AUC(name='auc')
    ]
)


batch_size = 32
epochs = 20

history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[tf.keras.callbacks.ReduceLROnPlateau()]
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [45]:
np.argmax(history.history['val_auc'])

6

In [46]:
fig = px.line(
    history.history,
    y=['loss', 'val_loss'],
    labels={'x': "Epoch", 'y': "Loss"},
    title="Loss Over Time"
)

fig.show()

In [47]:
fig = px.line(
    history.history,
    y=['auc', 'val_auc'],
    labels={'x': "Epoch", 'y': "AUC"},
    title="AUC Over Time"
)

fig.show()

In [48]:
model.evaluate(X_test, y_test)



[1.5231720209121704, 0.6733590960502625, 0.7181618213653564]