In [1]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline, FeatureUnion
from data_preprocessing import *
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, log_loss
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold, KFold

In [2]:
X_train = pd.read_csv('./preprocessed_data/train_set_stage2.csv', index_col=0)
X_test = pd.read_csv('./preprocessed_data/test_set_stage2.csv', index_col=0) 

In [3]:
target_col = ['FTR']

teams_cols =['HomeTeam','AwayTeam']

teams_ratio_cols = ['HomeTeamWinRatio', 'AwayTeamWinRatio']

teams_ratio_cat_cols = ['HomeTeamWinRatio_Cat', 'AwayTeamWinRatio_Cat']

last_year_postion_cols = ['HomeTeamLastYearPosition', 'AwayTeamLastYearPosition']

total_cols = ['HomeTeamGoalsScored','AwayTeamGoalsScored','HomeTeamGoalsLost','AwayTeamGoalsLost','HomeTeamShootsMade', 
              'AwayTeamShootsMade','HomeTeamTargetShootsMade','AwayTeamTargetShootsMade','HomeTeamCorners','AwayTeamCorners',
              'HomeTeamTotalPoints','AwayTeamTotalPoints']

total_cat_cols = ['HomeTeamTargetShootsMade_Cat', 'AwayTeamTargetShootsMade_Cat', 'HomeTeamGoalsScored_Cat',
                  'AwayTeamGoalsScored_Cat', 'HomeTeamGoalsLost_Cat','AwayTeamGoalsLost_Cat', 'HomeTeamShootsMade_Cat',
                  'AwayTeamShootsMade_Cat','HomeTeamCorners_Cat', 'AwayTeamCorners_Cat', 'HomeTeamTotalPoints_Cat',
                  'AwayTeamTotalPoints_Cat',]

last_matches_results_cols = ['HomeTeamLast1Match','AwayTeamLast1Match', 'HomeTeamLast2Match', 'AwayTeamLast2Match',
                             'HomeTeamLast3Match', 'AwayTeamLast3Match', 'HomeTeamLast4Match','AwayTeamLast4Match', 
                             'HomeTeamLast5Match', 'AwayTeamLast5Match',]

last_matches_points_cols = ['HomeTeamPointsFromLast3Matches','AwayTeamPointsFromLast3Matches', 
                            'HomeTeamPointsFromLast5Matches','AwayTeamPointsFromLast5Matches', 
                            'HomeTeamPointsFromLast10Matches','AwayTeamPointsFromLast10Matches']

binary_cols = ['HomeTeamWinStreak3', 'HomeTeamWinStreak5', 'HomeTeamLossStreak3','HomeTeamLossStreak5', 
               'AwayTeamWinStreak3', 'AwayTeamWinStreak5','AwayTeamLossStreak3', 'AwayTeamLossStreak5',
               'IsHomeTeamRegulars', 'IsAwayTeamRegulars', 'IsHomeTeamRookie', 'IsAwayTeamRookie']

diff_cols = ['HomeTeamGoalsDifference', 'AwayTeamGoalsDifference','TotalGoalsDifference','DifferenceTotalPoints',
             'Difference1MatchPoints', 'Difference3MatchesPoints','Difference5MatchesPoints','Difference10MatchesPoints',
             'DifferenceInShoots', 'DifferenceInTargetShoots', 'DifferenceInCorners','DifferenceInLastYearPosition'] 

diff_cat_cols = ['HomeTeamGoalsDifference_Cat','AwayTeamGoalsDifference_Cat', 'TotalGoalsDifference_Cat',
                 'DifferenceTotalPoints_Cat', 'Difference10MatchesPoints_Cat','DifferenceInShoots_Cat',
                 'DifferenceInTargetShoots_Cat','DifferenceInCorners_Cat']

In [4]:
y_train = np.array(X_train[target_col]).flatten()
y_test = np.array(X_test[target_col]).flatten()

In [5]:
# home_team_names = np.unique(X_train['HomeTeam'])
# away_team_names = np.unique(X_train['AwayTeam'])
# team_names=[home_team_names, away_team_names]

In [6]:
kfold = KFold(n_splits=5, shuffle=False, random_state=None)

In [7]:
# all transformes from module data_preprocessing.py
base_pipeline = Pipeline([
    ('select_cols', DataFrameSelector([* binary_cols, *teams_ratio_cols]) ),
])

minmax_scaling_pipeline = Pipeline([
    ('select_cols', DataFrameSelector([*last_matches_points_cols, *last_matches_results_cols, *last_year_postion_cols]) ),
    ('minmax_scaler', MinMaxScaler() )
])

standard_scaling_pipeline = Pipeline([
    ('select_cols', DataFrameSelector([*diff_cols]) ),
    ('standard_scaler', StandardScaler() )
])

# label enocoding team names
ordinal_encoder_pipeline = Pipeline([
    ('select_cols', DataFrameSelector([*teams_cols]) ),
#     ('ordinal_encoder', OrdinalEncoder() ),
    ('ordinal_encoder', OrdinalEncoder(categories=team_names) ),
    ('minmax_scaler', MinMaxScaler() )
])

# process two features to the same scale(leaving dependencies between them)
goals_scored_pipeline = Pipeline([
    ('select_cols', DataFrameSelector([total_cols[0], total_cols[1]]) ),
    ('two_column_scaler', TwoColumnScaler(scaler=MinMaxScaler() ))
])

goals_lost_pipeline = Pipeline([
    ('select_cols', DataFrameSelector([total_cols[2], total_cols[3]]) ),
    ('two_column_scaler', TwoColumnScaler(scaler=MinMaxScaler() ))
])

shoot_made_pipeline = Pipeline([
    ('select_cols', DataFrameSelector([total_cols[4], total_cols[5]]) ),
    ('two_column_scaler', TwoColumnScaler(scaler=MinMaxScaler() ))
])

total_shoot_made_pipeline = Pipeline([
    ('select_cols', DataFrameSelector([total_cols[6], total_cols[7]]) ),
    ('two_column_scaler', TwoColumnScaler(scaler=MinMaxScaler() ))
])

corners_pipeline = Pipeline([
    ('select_cols', DataFrameSelector([total_cols[8], total_cols[9]]) ),
    ('two_column_scaler', TwoColumnScaler(scaler=MinMaxScaler() ))
])

total_points_pipeline = Pipeline([
    ('select_cols', DataFrameSelector([total_cols[10], total_cols[11]]) ),
    ('two_column_scaler', TwoColumnScaler(scaler=MinMaxScaler() ))
])

In [8]:
base_process_pipeline = FeatureUnion(transformer_list=[
                                    ('base_pipeline ', base_pipeline),
                                    ('minmax_scaling_pipeline', minmax_scaling_pipeline),
                                    ('standard_scaling_pipeline', standard_scaling_pipeline),
                                    ('ordinal_encoder_pipeline', ordinal_encoder_pipeline),
                                    ('goals_scored_pipeline', goals_scored_pipeline),
                                    ('goals_lost_pipeline', goals_lost_pipeline),
                                    ('shoot_made_pipeline', shoot_made_pipeline),
                                    ('total_shoot_made_pipeline', total_shoot_made_pipeline),
                                    ('corners_pipeline', corners_pipeline),
                                    ('total_points_pipeline', total_points_pipeline),
])

In [9]:
clf_rf = RandomForestClassifier(random_state=42, n_estimators=600, max_depth=19, verbose=0)

clf_xgb = XGBClassifier(subsample=0.8, 
                        scale_pos_weight=0.8,
                        reg_alpha=1e-05,
                        n_estimators=700,
                        min_child_weight=3,
                        max_depth=8,
                        learning_rate=0.01,
                        gamma=0.4,
                        colsample_bytree=0.7,
                        random_state=42,
                        verbose=0)

clf_ada = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),
                             n_estimators=30,
                             learning_rate=0.6,
                             random_state=42)


clf_cat = CatBoostClassifier(random_state=42, n_estimators=400, max_depth=9, verbose=0)

pipe_rf = Pipeline([
    ('base_process_pipeline', base_process_pipeline),
    ('feature_seletion', ImportantFeaturesSelector(clf_rf) ),
    ('classifier', clf_rf )
])

pipe_xgb = Pipeline([
    ('base_process_pipeline', base_process_pipeline),
    ('feature_seletion', ImportantFeaturesSelector(clf_xgb) ),
    ('classifier', clf_xgb)
])

pipe_ada = Pipeline([
    ('base_process_pipeline', base_process_pipeline),
    ('feature_seletion', ImportantFeaturesSelector(clf_ada) ),
    ('classifier', clf_ada)
])

pipe_cat = Pipeline([
    ('base_process_pipeline', base_process_pipeline),
    ('feature_seletion', ImportantFeaturesSelector(clf_cat) ),
    ('classifier', clf_cat)
])

In [10]:
# stack_clf = StackingClassifier(estimators=[('rf',pipe_rf), ('xgb', pipe_xgb), ('ada', pipe_ada), ('cat', pipe_cat)],
#                                final_estimator = LogisticRegression(C=1),
#                                cv=kfold,
#                                n_jobs=1,
#                                verbose=1)

In [11]:
# stack_clf.fit(X_train, y_train)
# y_pred = stack_clf.predict(X_test)

# print(metrics.accuracy_score(y_test, y_pred))  
# print(metrics.precision_score(y_test, y_pred))
# print(metrics.recall_score(y_test, y_pred))
# print(metrics.f1_score(y_test, y_pred))
# print(metrics.roc_auc_score(y_test, y_pred))

### Linear pipeline

In [12]:
# all transformes from data_preprocessing.py
base_cat_pipeline = Pipeline([
    ('select_cols', DataFrameSelector([*binary_cols]) ),
])

home_team_encoding_pipeline = Pipeline([
    ('encoding', TargetMeanEncodingTransformer(X_train, teams_cols[0], *target_col) ),
])

away_team_encoding_pipeline = Pipeline([
    ('encoding', TargetMeanEncodingTransformer(X_train, teams_cols[1], *target_col) ),
])

minmax_scaling_pipeline = Pipeline([
    ('select_cols', DataFrameSelector([*teams_ratio_cat_cols, *last_matches_points_cols, *last_matches_results_cols,
                                       *last_year_postion_cols, *diff_cat_cols, *total_cat_cols]) ),
    ('minmax_scaler', MinMaxScaler() )
])

In [13]:
cat_process_pipeline = FeatureUnion(transformer_list=[
                                   ('home_teams_encoding', home_team_encoding_pipeline),
                                   ('away_teams_encoding', away_team_encoding_pipeline),
                                   ('base_pipeline ', base_cat_pipeline),
                                   ('minmax_scaling_pipeline', minmax_scaling_pipeline),
])

In [14]:
clf_svc = SVC(kernel='linear', C=0.1, probability=True, random_state=42, verbose=0, max_iter=10000)

clf_lr = LogisticRegression(random_state=42, C=0.1, max_iter=10000, verbose=0)

clf_knn = KNeighborsClassifier(n_neighbors=9, metric='manhattan', leaf_size=35)

clf_rbf = SVC(kernel='rbf', gamma=0.01, C=100, probability=True, max_iter=10000, random_state=42)

pipe_svc = Pipeline([
    ('base_process_pipeline', cat_process_pipeline),
    ('feature_seletion', ImportantFeaturesSelector(clf_svc) ),
    ('classifier', clf_svc )
])

pipe_lr = Pipeline([
    ('base_process_pipeline', cat_process_pipeline),
    ('feature_seletion', ImportantFeaturesSelector(clf_lr) ),
    ('classifier', clf_lr )
])

pipe_knn = Pipeline([
    ('base_process_pipeline', cat_process_pipeline),
    ('feature_seletion', ImportantFeaturesSelector(clf_knn) ),
    ('classifier', clf_knn )
])

pipe_rbf = Pipeline([
    ('base_process_pipeline', cat_process_pipeline),
    ('feature_seletion', ImportantFeaturesSelector(clf_rbf) ),
    ('classifier', clf_rbf )
])

In [15]:
stack_clf = StackingClassifier(estimators=[('svc',pipe_svc), ('lr', pipe_lr), ('knn', pipe_knn), ('rbf', pipe_rbf),
                                          ('rf',pipe_rf), ('xgb', pipe_xgb), ('ada', pipe_ada), ('cat', pipe_cat)],
                                   final_estimator = LogisticRegression(C=1),
                                   cv=kfold,
                                   n_jobs=1,
                                   verbose=10)

In [16]:
stack_clf.fit(X_train, y_train)
y_pred = stack_clf.predict(X_test)

print(metrics.accuracy_score(y_test, y_pred))  
print(metrics.precision_score(y_test, y_pred))
print(metrics.recall_score(y_test, y_pred))
print(metrics.f1_score(y_test, y_pred))
print(metrics.roc_auc_score(y_test, y_pred))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.7min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  3.4min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  5.1min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  6.8min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  8.5min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  8.5min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    1.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elap

ValueError: Found unknown categories ['Ipswich', 'Bradford', 'Coventry'] in column 0 during transform