In [1]:

import tensorflow as tf 
import numpy as np 
import pandas as pd 
import os
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline 
from tensorflow.keras.preprocessing import text , sequence 
from tensorflow.keras.layers import Dense , LSTM   , Input ,Bidirectional ,Dropout
from tensorflow.keras.models import Model 

In [2]:
# load the data 
path = 'C:/Users/DELL/Downloads/football-match-probability-prediction'
train_data = pd.read_csv(os.path.join(path , 'train.csv'))
test_data = pd.read_csv(os.path.join(path , 'test.csv'))

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
# converting date to useful features 
def convert_date(data): 
    data['match_date'] = pd.to_datetime(data['match_date'])
    
    for i in range(1,11) : 
        data[f'home_team_history_match_date_{i}'] = pd.to_datetime(data[f'home_team_history_match_date_{i}'])
        data[f'away_team_history_match_date_{i}'] = pd.to_datetime(data[f'home_team_history_match_date_{i}'])
        
    for i in range(1,11) : 
        data[f'home_time_difference_between_two_matches_{i}'] = (data['match_date'] - data[f'home_team_history_match_date_{i}'] ).dt.days
        data[f'away_time_difference_between_two_matches_{i}'] = (data['match_date'] - data[f'away_team_history_match_date_{i}'] ).dt.days
        data = data.drop(f'home_team_history_match_date_{i}' , axis =1)
        data = data.drop(f'away_team_history_match_date_{i}' , axis =1)
        
    data = data.drop('match_date' , axis =1)
    return data

In [4]:
# we get the ratio between the days and number of matches played in this days 
def matches_span(data):
    data['home_team_exhaustion'] = data['home_time_difference_between_two_matches_10'] / 10 
    data['away_team_exhaustion'] = data['away_time_difference_between_two_matches_10'] / 10
    return data 

In [5]:
# get the average scoring for the two teams throw the 10 matches 
# and the average opponent scoring aganist the two teams 
def average_scoring(data) :
    data['home_team_average_scoreing'] = 0 
    data['away_team_average_scoreing'] = 0 
    data['home_opponent_average_scoreing'] = 0
    data['away_opponent_average_scoreing'] = 0
    for i in range(1,11) : 
        data['home_team_average_scoreing'] += data[f'home_team_history_goal_{i}']
        data['away_team_average_scoreing'] += data[f'home_team_history_goal_{i}']
    
        data['home_opponent_average_scoreing'] += data[f'home_team_history_opponent_goal_{i}']
        data['away_opponent_average_scoreing'] += data[f'away_team_history_opponent_goal_{i}']
    
    data['home_team_average_scoreing'] /= 10 
    data['away_team_average_scoreing'] /= 10 
    data['home_opponent_average_scoreing'] /= 10
    data['away_opponent_average_scoreing'] /= 10
    return data
    

In [6]:
# get the ratio of winning for each team 
def who_win(data) : 
    data['home_team_average_winning_rate'] = 0
    data['away_team_average_winning_rate'] = 0
    for i in range(1 , 11 ) : 
        data[f'home_target_match{i}'] = data[f'home_team_history_goal_{i}'] - data[f'home_team_history_opponent_goal_{i}']
        data[f'away_target_match{i}'] = data[f'away_team_history_goal_{i}'] - data[f'away_team_history_opponent_goal_{i}']
    for i in range(1,11) : 
        data[f'home_target_match{i}'] =  data[f'home_target_match{i}'].apply(lambda X : 1 if X > 0 else 0)
        data[f'away_target_match{i}'] =  data[f'away_target_match{i}'].apply(lambda X : 1 if X > 0 else 0)
    for i in range(1 ,11 ) : 
        data['home_team_average_winning_rate'] += data[f'home_target_match{i}'] 
        data['away_team_average_winning_rate'] += data[f'away_target_match{i}']
        data = data.drop(f'home_target_match{i}', axis = 1 )
        data = data.drop(f'away_target_match{i}', axis = 1 )
    data['home_team_average_winning_rate'] /= 10
    data['away_team_average_winning_rate'] /= 10
    return data
        

In [7]:
# this pipeline deal with the missing data and scale the data 
steps = [('imputer', SimpleImputer(strategy = 'mean')), ('scaler', StandardScaler())]
pip = Pipeline(steps) 

In [8]:
# process the text columns 
text_columns = ['home_team_name' , 'away_team_name' , 'league_name']
max_f = 1000
max_len = 5 
def process_text(data) : 
    tock = text.Tokenizer(max_f)
    tock.fit_on_texts(data)
    data = tock.texts_to_sequences(data)
    data = sequence.pad_sequences(data , maxlen = max_len)
    return data


In [9]:
def train_data_process(train_data) : 
    # rid of any rows with out the teams name because useless
    train_data = train_data[ (train_data['home_team_name'].isna() == False ) & (train_data['away_team_name'].isna() == False ) ].reset_index(drop=True)
    train_data = convert_date(train_data)
    train_data['is_cup'] = train_data['is_cup'].apply(lambda X : 0 if X == False else 1)
    train_data = matches_span(train_data)
    train_data = average_scoring(train_data)
    train_data = who_win(train_data)
    # map the target columns to labels 
    train_labels = pd.get_dummies(train_data['target'] )
    train_data = train_data.drop('target' , axis = 1) 
    #scaling the data 
    scaled_train_data = pip.fit_transform(train_data.iloc[: , 5:])
    # process the text_data 
    processed_train_text = np.concatenate([process_text(train_data[col]) for col in text_columns ] , axis = 1)
    # concat the scaled data with the processed text 
    train_processed_data = np.concatenate([scaled_train_data ,processed_train_text ] , axis = 1 )
    return train_processed_data , train_labels
    

In [10]:
X_train , y_train = train_data_process(train_data)

In [11]:
def test_data_process(test_data_f) : 
    test_data_f = convert_date(test_data_f)
    test_data_f['is_cup'] = test_data_f['is_cup'].apply(lambda X : 0 if X == False else 1)
    test_data_f = matches_span(test_data_f)
    test_data_f = average_scoring(test_data_f)
    test_data_f = who_win(test_data_f)
    scaled_test_data = pip.transform(test_data_f.iloc[: , 5:])
    processed_test_text = np.concatenate([process_text(test_data_f[col]) for col in text_columns ] , axis = 1)
    test_processed_data = np.concatenate([scaled_test_data ,processed_test_text ] , axis = 1 )
    return test_processed_data 

In [13]:
X_test = test_data_process(test_data)

In [14]:
from sklearn.model_selection import train_test_split
X_train , X_val , y_train, y_val = train_test_split(X_train , y_train , test_size = .1 , random_state =123 ) 

In [15]:
# add dummy axis to the data to match the model expected shape 
X_train = X_train[: ,: ,np.newaxis]
X_val = X_val[: ,: ,np.newaxis]

In [16]:
inputs = Input(shape=X_train.shape[1:])

X = Bidirectional(LSTM(16, return_sequences=True, activation='tanh'))(inputs)
X = tf.keras.layers.BatchNormalization()(X)
X = Bidirectional(LSTM(8, return_sequences=True, activation='tanh'))(X)
X = tf.keras.layers.BatchNormalization()(X)
X = tf.keras.layers.Flatten()(X)
X = Dense(64 , activation ='tanh')(X)
X = Dense(3 ,activation ='softmax')(X)
model = Model(inputs , X)

In [19]:
model.compile(optimizer ='adam' , loss ='categorical_crossentropy' , metrics=['accuracy'])

In [20]:
hist = model.fit(X_train, y_train,  validation_data =(X_val , y_val),epochs = 30 ,batch_size =64 )

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
  51/1561 [..............................] - ETA: 6:08 - loss: 0.9921 - accuracy: 0.5107

KeyboardInterrupt: 

In [22]:
preds = model.predict(X_test)

In [23]:
sub = pd.DataFrame({
    'id': test_data['id'],
    'away': preds[:, 0],
    'draw': preds[:, 1],
    'home': preds[:, 2]
})
sub

Unnamed: 0,id,away,draw,home
0,17761448,0.363812,0.249806,0.386382
1,17695487,0.252411,0.314528,0.433061
2,17715496,0.372683,0.270427,0.356891
3,17715493,0.589213,0.268732,0.142055
4,17715492,0.195901,0.286983,0.517116
...,...,...,...,...
72706,18450246,0.299968,0.343341,0.356691
72707,18164889,0.159855,0.222598,0.617547
72708,18449018,0.672934,0.232117,0.094948
72709,17958831,0.179605,0.261150,0.559245


In [24]:
sub.to_csv('sub_2.csv' , index =False)