In [16]:
import os
import pandas as pd
from scipy.stats import chi2_contingency

In [5]:
# --- Column-name configuration -------------------------------------------------
OUTPUT_COL = 'target'
ID_COL = 'id'

# Columns describing the match itself (as opposed to the teams' past matches).
CURRENT_COLS = [
    'home_team_name',
    'away_team_name',
    'match_date',
    'league_name',
    'league_id',
    'is_cup',
    'home_team_coach_id',
    'away_team_coach_id',
]

# Name stems of the per-team history columns; the numbered columns built below
# append an index 1..10 to stems like these.
HIST_HOME_COLS = [
    f'home_team_history_{field}_'
    for field in (
        'match_date', 'is_play_home', 'is_cup', 'goal', 'opponent_goal',
        'rating', 'opponent_rating', 'coach', 'league_id',
    )
]
# Same stems for the away team: only the leading 'home' prefix is swapped.
HIST_AWAY_COLS = ['away' + stem[len('home'):] for stem in HIST_HOME_COLS]

# Every date-typed column: the match date plus 10 history dates per side.
DATE_COLS = [
    'match_date',
    *[f'home_team_history_match_date_{i}' for i in range(1, 11)],
    *[f'away_team_history_match_date_{i}' for i in range(1, 11)],
]

DROP_COLS = [*DATE_COLS, ID_COL, 'home_team_name', 'away_team_name', 'league_name']

DROP_NA_COL = ['home_team_name']

# Columns listed for min-max scaling: the current ids first, then the numbered
# history columns ordered by field (league_id, coach), side (home, away), index.
MIN_MAX_SCALER_COLS = ['league_id', 'home_team_coach_id', 'away_team_coach_id'] + [
    f'{side}_team_history_{field}_{i}'
    for field in ('league_id', 'coach')
    for side in ('home', 'away')
    for i in range(1, 11)
]

In [6]:
# Dataset directory (a relative path) and the names of the two CSV files in it.
DATASET_PATH = '../dataset/Predictive modeling - football-match-probability-prediction/'
TRAIN_FILE, TEST_FILE = 'train.csv', 'test.csv'

# Full paths to the train and test files.
TRAIN_PATH, TEST_PATH = (
    os.path.join(DATASET_PATH, file_name) for file_name in (TRAIN_FILE, TEST_FILE)
)

In [7]:
# Read the whole file in one pass (low_memory=False) so each column's dtype is
# inferred from all rows at once instead of chunk by chunk.
# NOTE(review): the saved output of this cell shows a warning pointing at the
# read_csv call, presumably pandas' mixed-type DtypeWarning; confirm it is gone
# after a fresh run.
df_train = pd.read_csv(TRAIN_PATH, low_memory=False)

  df_train = pd.read_csv(TRAIN_PATH)


In [11]:
# Independent copy holding only the target plus the current-match columns.
df_train_sub = df_train.loc[:, [OUTPUT_COL, *CURRENT_COLS]].copy()

In [13]:
# Preview the first rows of the target + current-match subset.
df_train_sub.head()

Unnamed: 0,target,home_team_name,away_team_name,match_date,league_name,league_id,is_cup,home_team_coach_id,away_team_coach_id
0,away,Newell's Old Boys,River Plate,2019-12-01 00:45:00,Superliga,636,False,468196.0,468200.0
1,home,Real Estelí,Deportivo Las Sabanas,2019-12-01 01:00:00,Primera Division,752,False,516788.0,22169161.0
2,draw,UPNFM,Marathón,2019-12-01 01:00:00,Liga Nacional,734,False,2510608.0,456313.0
3,away,León,Morelia,2019-12-01 01:00:00,Liga MX,743,False,1552508.0,465797.0
4,home,Cobán Imperial,Iztapa,2019-12-01 01:00:00,Liga Nacional,705,False,429958.0,426870.0


In [14]:
# Column dtypes and non-null counts, to spot columns with missing values.
df_train_sub.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110938 entries, 0 to 110937
Data columns (total 9 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   target              110938 non-null  object 
 1   home_team_name      110937 non-null  object 
 2   away_team_name      110937 non-null  object 
 3   match_date          110938 non-null  object 
 4   league_name         110937 non-null  object 
 5   league_id           110938 non-null  int64  
 6   is_cup              110937 non-null  object 
 7   home_team_coach_id  89217 non-null   float64
 8   away_team_coach_id  89123 non-null   float64
dtypes: float64(2), int64(1), object(6)
memory usage: 7.6+ MB


In [15]:
# Summary statistics of the numeric columns.
# NOTE(review): league_id and the coach ids look like identifiers, so their
# mean/std are presumably not meaningful; confirm.
df_train_sub.describe()

Unnamed: 0,league_id,home_team_coach_id,away_team_coach_id
count,110938.0,89217.0,89123.0
mean,910.457012,12871340.0,12915520.0
std,549.120985,15784790.0,15803440.0
min,2.0,2.0,2.0
25%,444.0,459612.0,459565.0
50%,947.0,1552728.0,1552806.0
75%,1293.0,32431160.0,32795390.0
max,2039.0,37568490.0,37568470.0


## 1. Feature reduction

In [19]:
# Chi-square test of independence between the match outcome and one candidate column.
ref_col = 'home_team_coach_id'
thr = 0.05  # p-value threshold used for the keep/drop decision

# Contingency table: one row per outcome class, one column per distinct value of ref_col.
obs = pd.crosstab(index=df_train_sub[OUTPUT_COL], columns=df_train_sub[ref_col])
chi_test = chi2_contingency(obs)

print(
    f'Significant difference --> remain {ref_col} column'
    if chi_test.pvalue < thr
    else f'Not significant difference --> drop {ref_col} column'
)


Significant difference --> remain home_team_coach_id column


In [20]:
# Chi-square test of independence between the match outcome and one candidate column.
ref_col = 'away_team_coach_id'
thr = 0.05  # p-value threshold used for the keep/drop decision

# Contingency table: one row per outcome class, one column per distinct value of ref_col.
obs = pd.crosstab(index=df_train_sub[OUTPUT_COL], columns=df_train_sub[ref_col])
chi_test = chi2_contingency(obs)

print(
    f'Significant difference --> remain {ref_col} column'
    if chi_test.pvalue < thr
    else f'Not significant difference --> drop {ref_col} column'
)


Significant difference --> remain away_team_coach_id column


## 2. Preprocessing

In [21]:
# Drop rows that have a missing value in any column listed in DROP_NA_COL;
# missing values in all other columns are left in place.
df_train = df_train.dropna(subset = DROP_NA_COL)

In [23]:
# Number of rows per target class.
df_train[OUTPUT_COL].value_counts()

target
home    48113
away    35173
draw    27651
Name: count, dtype: int64

In [24]:
def oversample(df, target_col, random_state=None):
    """Balance the classes of ``target_col`` by random oversampling.

    Every original row with a non-missing target is kept. Each class that has
    fewer rows than the largest class is topped up with rows drawn from that
    same class (with replacement) until it reaches the size of the largest
    class. Rows whose target is missing are not part of any class and are
    dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data.
    target_col : str
        Name of the column holding the class labels.
    random_state : int, numpy.random.Generator, numpy.random.RandomState or None
        Forwarded to ``DataFrame.sample`` to make the draw reproducible.
        ``None`` (the default) gives a different sample on every call.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index in which every class has as many
        rows as the largest class of the input.
    """
    class_sizes = df[target_col].value_counts()
    if class_sizes.empty:
        # No labelled rows at all: nothing to balance, return an empty frame
        # with the same columns instead of failing on max() of nothing.
        return df[df[target_col].notna()].reset_index(drop=True)

    target_size = int(class_sizes.max())

    parts = []
    for label, size in class_sizes.items():
        class_rows = df[df[target_col] == label]
        # Keep the originals, so no information is lost by resampling.
        parts.append(class_rows)
        deficit = target_size - int(size)
        if deficit > 0:
            # Only the missing number of rows is drawn with replacement.
            parts.append(
                class_rows.sample(n=deficit, replace=True, random_state=random_state)
            )

    return pd.concat(parts, axis=0).reset_index(drop=True)

In [25]:
# Balance the target classes by oversampling.
# NOTE: this rebinds df_train, so re-running the cell resamples the already
# oversampled frame rather than the originally loaded rows.
# NOTE(review): if a validation split is later taken from df_train, duplicated
# rows may end up on both sides of the split; confirm against the later cells.
df_train = oversample(df_train, OUTPUT_COL)

In [26]:
# Re-check the number of rows per target class after oversampling.
df_train[OUTPUT_COL].value_counts()

target
away    48113
draw    48113
home    48113
Name: count, dtype: int64

## 3. Feature engineering