In [54]:
import pandas as pd
import numpy as np
import tensorflow.keras as K
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

In [55]:
football_df = pd.read_csv("data/clean_data.csv")
football_df.head()

Unnamed: 0.1,Unnamed: 0,home_players,away_players,away coach,away name,home coach,home name,league,location,referee,season,stadium,winner
0,30,"['Iker Casillas', 'Raul Bravo', 'Ronaldo', 'Ro...","['Ricardo Sanzol', 'Enrique Corrales', 'Savo M...",Javier Aguirre,CA Osasuna,Garcia Remón,Real Madrid,Primera División,Madrid,Rafael Ramirez Dominguez,2004/2005,Santiago Bernabeu,Home
1,123,"['Antionio Doblas', 'Melli', 'Edú', 'Marcos As...","['Pepe Reina', 'Armando Sa', 'José Mari', 'Jua...",Manuel Pellegrini,FC Villarreal,Lorenzo Serra Ferrer,Real Betis,Primera División,Sevilla,Esquinas Torres,2004/2005,Ruiz de Lopera,Home
2,154,"['Asier Riesgo', 'Javier Garrido', 'Kahveci Ni...","['Javier Sanchez Broto', 'David Belenguer', 'R...",Enrique Sánchez Flores,FC Getafe,José Mari Amorrortu,Real Sociedad,Primera División,San Sebastian,Manuel Mejuto Gonzales,2004/2005,Anoeta,Draw
3,219,"['Victor Valdés', 'Juliano Belletti', ""Samuel ...","['Leonardo Franco', 'Pablo', 'Fernando Torres'...",César Ferrando,Atletico Madrid,Frank Rijkaard,FC Barcelona,Primera División,Viña del Mar,Manuel Mejuto Gonzales,2004/2005,Sausalito,Away
4,260,"['Vallejo Juantxo Elia', 'Rafael Clavero', 'Ri...","['Victor Valdés', 'Juliano Belletti', ""Samuel ...",Frank Rijkaard,FC Barcelona,Javier Aguirre,CA Osasuna,Primera División,Pamplona,David Fernandez Fernandez Borbalan,2004/2005,El Sadar,Away


In [56]:
NUM_WORDS_PLAYERS = 12500
NUM_WORDS_COACHES = 5000
NUM_WORDS_TEAMS = 5000
NUM_WORDS_STADIUM = 2000
NUM_WORDS_SEASON = 100
NUM_WORDS_REFEREE = 5000

In [57]:
tokenizer_players = K.preprocessing.text.Tokenizer(NUM_WORDS_PLAYERS)
tokenizer_coaches = K.preprocessing.text.Tokenizer(NUM_WORDS_COACHES)
tokenizer_teams = K.preprocessing.text.Tokenizer(NUM_WORDS_TEAMS)
tokenizer_stadium = K.preprocessing.text.Tokenizer(NUM_WORDS_STADIUM)
tokenizer_season = K.preprocessing.text.Tokenizer(NUM_WORDS_SEASON)
tokenizer_referee = K.preprocessing.text.Tokenizer(NUM_WORDS_REFEREE)

In [5]:
keep = 0.3
football_df = football_df[:int(len(football_df)*0.3)]
football_df.describe()

Unnamed: 0.1,Unnamed: 0
count,6683.0
mean,5570.31019
std,2160.539083
min,30.0
25%,3852.5
50%,5660.0
75%,7426.5
max,9154.0


In [58]:
def row_to_array(row):
    clean_row = row.replace("[","").replace("]","").replace("'","").split(",")
    return ", ".join(clean_row)

home_players = football_df["home_players"].apply(row_to_array)
away_players = football_df["away_players"].apply(row_to_array)
players = pd.concat([home_players, away_players],ignore_index=True)
players[1]

'Antionio Doblas,  Melli,  Edú,  Marcos Assunçao,  Juanito,  David Rivas,  Washington Tais,  Juan José Cañas,  Fernando,  Joaquín,  Ricardo Oliveira'

In [59]:
tokenizer_players.fit_on_texts(players)
tokenizer_players.word_index

{'david': 1,
 'marco': 2,
 'christian': 3,
 'diego': 4,
 'daniel': 5,
 'kevin': 6,
 'de': 7,
 'sergio': 8,
 'antonio': 9,
 'andrea': 10,
 'michael': 11,
 'carlos': 12,
 'nicolas': 13,
 'juan': 14,
 'pablo': 15,
 'martin': 16,
 'javier': 17,
 'thomas': 18,
 'mario': 19,
 'jonathan': 20,
 'daniele': 21,
 'manuel': 22,
 'roberto': 23,
 'james': 24,
 'fernando': 25,
 'alessandro': 26,
 'ivan': 27,
 'francesco': 28,
 'paul': 29,
 'steven': 30,
 'luca': 31,
 'john': 32,
 'luis': 33,
 'alberto': 34,
 'lucas': 35,
 'lopez': 36,
 'robert': 37,
 'cristian': 38,
 'victor': 39,
 'mathieu': 40,
 'marc': 41,
 'garcia': 42,
 'jose': 43,
 'jean': 44,
 'fabio': 45,
 'benjamin': 46,
 'josé': 47,
 'sebastian': 48,
 'pedro': 49,
 'van': 50,
 'federico': 51,
 'angel': 52,
 'silva': 53,
 'alvaro': 54,
 'simone': 55,
 'stefan': 56,
 'mikel': 57,
 'alexander': 58,
 'ruben': 59,
 'andre': 60,
 'ben': 61,
 'ricardo': 62,
 'davide': 63,
 'jordan': 64,
 'jeremy': 65,
 'anthony': 66,
 'jan': 67,
 'simon': 68,
 'st

In [60]:
home_players = tokenizer_players.texts_to_matrix(home_players)
away_players = tokenizer_players.texts_to_matrix(away_players)
away_players

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [61]:
coaches = pd.concat([football_df["away coach"], football_df["home coach"]], ignore_index=True)
coaches[:5]

0            Javier Aguirre
1         Manuel Pellegrini
2    Enrique Sánchez Flores
3            César Ferrando
4            Frank Rijkaard
dtype: object

In [62]:
tokenizer_coaches.fit_on_texts(coaches)
home_coaches = tokenizer_coaches.texts_to_matrix(football_df["home coach"])
away_coaches = tokenizer_coaches.texts_to_matrix(football_df["away coach"])
home_coaches

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [63]:
teams = pd.concat([football_df["away name"], football_df["home name"]], ignore_index=True)
tokenizer_teams.fit_on_texts(teams)

home_teams = tokenizer_teams.texts_to_matrix(football_df["home name"])
away_teams = tokenizer_teams.texts_to_matrix(football_df["away name"])
home_teams

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [64]:
stadiums = football_df["stadium"]
tokenizer_stadium.fit_on_texts(stadiums)

stadiums = tokenizer_stadium.texts_to_matrix(stadiums)
stadiums

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [65]:
season = football_df["season"]
tokenizer_season.fit_on_texts(season)

season = tokenizer_season.texts_to_matrix(season)
season

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

In [66]:
referee = football_df["referee"]
tokenizer_referee.fit_on_texts(referee)

referee = tokenizer_referee.texts_to_matrix(referee)
referee

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## Model

In [89]:
home_players_input = layers.Input(shape=(NUM_WORDS_PLAYERS,), name="home_players")
away_players_input = layers.Input(shape=(NUM_WORDS_PLAYERS,), name="away_players")
home_coaches_input = layers.Input(shape=(NUM_WORDS_COACHES,), name="home_coaches")
away_coaches_input = layers.Input(shape=(NUM_WORDS_COACHES,), name="away_coaches")
home_teams_input = layers.Input(shape=(NUM_WORDS_TEAMS,), name="home_teams")
away_teams_input = layers.Input(shape=(NUM_WORDS_TEAMS,), name="away_teams")
stadiums_input = layers.Input(shape=(NUM_WORDS_STADIUM,), name="stadiums")
season_input = layers.Input(shape=(NUM_WORDS_SEASON,), name="season")
referee_input = layers.Input(shape=(NUM_WORDS_REFEREE,), name="referee")

In [90]:
#players_emb = layers.Embedding(NUM_WORDS_PLAYERS,512)
#coaches_emb = layers.Embedding(NUM_WORDS_COACHES,256)
#team_emb = layers.Embedding(NUM_WORDS_TEAMS,256)

#home_player_features = players_emb(home_players_input)
#away_player_features = players_emb(away_players_input)
#home_coaches_features = coaches_emb(home_coaches_input)
#away_coaches_features = coaches_emb(away_coaches_input)
#home_teams_features = team_emb(home_teams_input)
#away_teams_features = team_emb(away_teams_input)
#stadiums_features = layers.Embedding(NUM_WORDS_STADIUM,64)(stadiums_input)
#season_features = layers.Embedding(NUM_WORDS_SEASON,32)(season_input)
#referee_features = layers.Embedding(NUM_WORDS_REFEREE,64)(referee_input)

players_dense = layers.Dense(512,activation="relu",kernel_regularizer=K.regularizers.l2(0.01))
coaches_dense = layers.Dense(64,activation="relu")
team_dense = layers.Dense(128,activation="relu")

home_player_features = players_dense(home_players_input)
away_player_features = players_dense(away_players_input)
home_coaches_features = coaches_dense(home_coaches_input)
away_coaches_features = coaches_dense(away_coaches_input)
home_teams_features = team_dense(home_teams_input)
away_teams_features = team_dense(away_teams_input)
stadiums_features = layers.Dense(64, activation="relu")(stadiums_input)
season_features = layers.Dense(32, activation="relu")(season_input)
referee_features = layers.Dense(32, activation="relu")(referee_input)

In [91]:
#players_lstm = layers.LSTM(512)
#coaches_lstm = layers.LSTM(64)
#team_lstm = layers.LSTM(256)

#home_player_features = players_lstm(home_player_features)
#away_player_features = players_lstm(away_player_features)
#home_coaches_features = coaches_lstm(home_coaches_features)
#away_coaches_features = coaches_lstm(away_coaches_features)
#home_teams_features = team_lstm(home_teams_features)
#away_teams_features = team_lstm(away_teams_features)
#stadiums_features = layers.LSTM(64)(stadiums_features)
#season_features = layers.LSTM(32)(season_features)
#referee_features = layers.LSTM(64)(referee_features)

In [92]:
x = layers.concatenate([home_player_features, away_player_features, home_coaches_features, away_coaches_features, 
                        home_teams_features, away_teams_features, stadiums_features, 
                        season_features, referee_features])

In [93]:
#x = layers.Dense(2048, activation="relu")(x)
x = layers.Dropout(0.5)(x)
x = layers.Dense(64, activation="relu")(x)
predictions = layers.Dense(3, activation="softmax")(x)

In [94]:
model = K.models.Model(inputs=[home_players_input, away_players_input, home_coaches_input, away_coaches_input, 
                               home_teams_input, away_teams_input, stadiums_input, season_input, referee_input],
                      outputs=predictions, name="Football Prediction")

In [95]:
model.compile(optimizer="adam",loss="categorical_crossentropy",metrics=["accuracy"])
model.summary()

Model: "Football Prediction"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
home_players (InputLayer)       [(None, 12500)]      0                                            
__________________________________________________________________________________________________
away_players (InputLayer)       [(None, 12500)]      0                                            
__________________________________________________________________________________________________
home_coaches (InputLayer)       [(None, 5000)]       0                                            
__________________________________________________________________________________________________
away_coaches (InputLayer)       [(None, 5000)]       0                                            
________________________________________________________________________________

In [96]:
#print(len(home_players), len(away_players),len(home_coaches), len(away_coaches), len(home_teams), len(away_teams),len(stadiums),len(season),len(referee))
#X = np.concatenate([home_players, away_players, home_coaches, away_coaches, home_teams, away_teams, stadiums, season, referee], axis=1)
y = pd.get_dummies(football_df["winner"])

home_players_train, home_players_test, y_train, y_test = train_test_split(home_players,y, test_size=0.2, random_state=42)
away_players_train, away_players_test, y_train, y_test = train_test_split(away_players,y, test_size=0.2, random_state=42)

home_coaches_train, home_coaches_test, y_train, y_test = train_test_split(home_coaches,y, test_size=0.2, random_state=42)
away_coaches_train, away_coaches_test, y_train, y_test = train_test_split(away_coaches,y, test_size=0.2, random_state=42)

home_teams_train, home_teams_test, y_train, y_test = train_test_split(home_teams,y, test_size=0.2, random_state=42)
away_teams_train, away_teams_test, y_train, y_test = train_test_split(away_teams,y, test_size=0.2, random_state=42)

stadiums_train, stadiums_test, y_train, y_test = train_test_split(stadiums,y, test_size=0.2, random_state=42)
season_train, season_test, y_train, y_test = train_test_split(season,y, test_size=0.2, random_state=42)
referee_train, referee_test, y_train, y_test = train_test_split(referee,y, test_size=0.2, random_state=42)

In [97]:
early_stopping = K.callbacks.EarlyStopping("val_acc",restore_best_weights=True,patience=20,verbose=1)
reduce_lr = K.callbacks.ReduceLROnPlateau("val_acc",patience=5,verbose=1)

model.fit(x=[home_players_train, away_players_train, home_coaches_train, away_coaches_train,
            home_teams_train, away_teams_train, stadiums_train, season_train, referee_train],y=y_train,
          validation_data=([home_players_test, away_players_test, home_coaches_test, away_coaches_test,
            home_teams_test, away_teams_test, stadiums_test, season_test, referee_test],y_test),
          epochs=50, batch_size=512, callbacks=[early_stopping, reduce_lr])

Train on 17821 samples, validate on 4456 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 00006: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 00011: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 00016: ReduceLROnPlateau reducing learning rate to 1.0000000656873453e-06.
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50

Epoch 00021: ReduceLROnPlateau reducing learning rate to 1.0000001111620805e-07.
Epoch 00021: early stopping


<tensorflow.python.keras.callbacks.History at 0x7fb770902940>

In [98]:
loss,acc = model.evaluate(x=[home_players_test, away_players_test, home_coaches_test, away_coaches_test,
            home_teams_test, away_teams_test, stadiums_test, season_test, referee_test],y=y_test)



[1.2124243197792095, 0.51817775]

In [None]:
model.save(f"model.{loss}loss.")