In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import random
from sklearn.metrics import accuracy_score
from sklearn import metrics

In [2]:
train = pd.read_csv('train.csv', low_memory=False)

In [3]:
test  = pd.read_csv('test.csv')

In [4]:
def missing_values_table(df):
    mis_val = df.isnull().sum()
    mis_val_percent = 100 * df.isnull().sum() / len(df)
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
    mis_val_table_ren_columns = mis_val_table.rename(
    columns = {0 : 'Missing Values', 1 : '% of Total Values'})
    mis_val_table_ren_columns = mis_val_table_ren_columns[
        mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
    '% of Total Values', ascending=False).round(1)
    print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"      
        "There are " + str(mis_val_table_ren_columns.shape[0]) +
            " columns that have missing values.")
    return mis_val_table_ren_columns

In [5]:
def data_preprocessing_handler(df):
    train = 0
    train = df.loc[:]
    
    train = train[train['is_cup'].notna()]
    
    train['league_name'] = pd.factorize(train['league_name'])[0]+1
    
    target = {
        'home': 1,
        'away': 2,
        'draw': 3,
    }
    try:
        train['target'] = train['target'].map(target)
    except:
        pass
    
    teams = dict.fromkeys(list(train['home_team_name'].value_counts().keys()) + list(train['away_team_name'].value_counts().keys()))
    dic = {}
    for i, team in enumerate(teams):    
        dic[team] = i
    
    train['home_team_name']=train['home_team_name'].map(dic)
    train['away_team_name']=train['away_team_name'].map(dic)
    train['away_team_name']=train['away_team_name'].astype('int64')
    train['away_team_name']=train['away_team_name'].astype('int64')
    
    train = train[train['home_team_name'] != train['away_team_name']]
    
    train['date'] = pd.to_datetime(train['match_date'])
    datees = [i for i in train.columns if 'match_date' in i]
    is_cups = [i for i in train.columns if '_is_cup' in i]
    
    train[datees] = train[datees].fillna(np.random.choice(list(range(1,13))))
    
    for i in train:
        if 'match_date' in i:
            try:
                train[i] = pd.to_datetime(train[i]).dt.month.astype(int)        
            except:
                pass
    
    for i in train:
        if 'goal' in i:
            train[i] = train[i].fillna(0)
    
    ratings = [c for c in train if '_rating_' in c]
    for r in [ratings]:
        for i in train[r]:
            train[i] = train[i].fillna(value=train[r][i].mean())      
    
    home_coach_ids = [c for c in train if ('home_team_history_coach' in c)] + ['home_team_coach_id']
    away_coach_ids = [c for c in train if ('away_team_history_coach' in c)] + ['away_team_coach_id']
            
    train['home_team_coach_id'] = train[['date', 'home_team_name'] + home_coach_ids].groupby(['home_team_name'], sort=True)['home_team_coach_id'].apply(lambda x: x.ffill())
    train['away_team_coach_id'] = train[['date', 'away_team_name'] + away_coach_ids].groupby(['away_team_name'], sort=True)['away_team_coach_id'].apply(lambda x: x.ffill())    
    train[['home_team_coach_id', 'away_team_coach_id']] = train[['home_team_coach_id', 'away_team_coach_id']].fillna(0) 
    train[home_coach_ids+away_coach_ids] = train[home_coach_ids+away_coach_ids].fillna(0) 
    
    league_ids = [c for c in train if ('league_id' in c)]
    train[league_ids] = train[league_ids].ffill(axis = 1).bfill(axis = 1)
    
    cols = [i for i in train.columns if (not 'history_league_id_' in i) and (not '_team_history_coach_' in i)]

    train = train[cols]
    
    all_is_cup = [i for i in train if 'history_is_cup' in i]
    train[all_is_cup] = train[all_is_cup].fillna(value=np.random.choice(train['is_cup']))
    
    is_cup_dummies =  pd.get_dummies(train['is_cup'], prefix_sep='_',prefix='is_cup')
    train = pd.concat([train, is_cup_dummies], axis=1)
    
    train.drop('is_cup', axis=1, inplace=True)
    
    is_play_home = [i for i in train.columns if 'is_play_home' in i]
    train[is_play_home] = train[is_play_home].fillna(np.random.choice([0, 1.0])) 
    
    away_team_history_goal = [i for i in train.columns if 'away_team_history_goal' in i]
    home_team_history_goal = [i for i in train.columns if 'home_team_history_goal' in i]

    history_opponent_goal = [i for i in train.columns if 'history_opponent_goal' in i]

    home_team_history_rating = [i for i in train.columns if 'home_team_history_rating' in i]
    away_team_history_rating = [i for i in train.columns if 'away_team_history_rating' in i]

    home_team_history_opponent_rating = [i for i in train.columns if 'home_team_history_opponent_rating' in i]
    away_team_history_opponent_rating = [i for i in train.columns if 'away_team_history_opponent_rating' in i]
    
    return train[['id', 'target', 'home_team_name', 'away_team_name', 'league_id', 'league_name', 'date', 'match_date', 'home_team_coach_id', 'away_team_coach_id', 'is_cup_False', 'is_cup_True'] + home_team_history_goal + away_team_history_goal + home_team_history_rating + away_team_history_rating + history_opponent_goal + home_team_history_opponent_rating + away_team_history_opponent_rating] 

In [6]:
test['target'] = 0

In [7]:
train = data_preprocessing_handler(train)

In [8]:
test = data_preprocessing_handler(test)

In [9]:
x_train = train.loc[:, train.columns != 'target']
x_test = test.loc[:, train.columns != 'target']

In [10]:
x_test.shape

(72711, 91)

In [12]:
x_train.shape

(110935, 91)

In [13]:
test['target'] = 0

In [15]:
y_train = train[['target', 'match_date']]
y_test = test[['target', 'match_date']]

In [16]:
missing_values_table(x_train)

Your selected dataframe has 91 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values


In [17]:
missing_values_table(x_test)

Your selected dataframe has 91 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values


In [18]:
len(x_test.columns)

91

In [19]:
x_train.drop('date', axis=1, inplace=True)
x_test.drop('date', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [20]:
# from sklearn.naive_bayes import GaussianNB
# gnb = GaussianNB()

In [21]:
# gnb.fit(x_train, y_train)

In [22]:
# y_pred = gnb.predict_proba(x_test)

In [23]:
# print('Naive Bayes classifier for Gaussian Nomial accuracy: {:.4f}'.format(accuracy_score(y_test, y_pred)))

In [24]:
# y_pred.shape

In [25]:
# from sklearn.ensemble import RandomForestClassifier

In [26]:
# randomForest= RandomForestClassifier(n_estimators=100)
# randomForest.fit(x_train, y_train)
# y_pred = randomForest.predict_proba(x_test)

In [28]:
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(n_estimators=32)
clf.fit(x_train, y_train)
y_pred = clf.predict_proba(x_test)

# print('neural_network Classifier accuracy: {:.4f}'.format(accuracy_score(y_val_test, y_pred)*100))

ValueError: y should be a 1d array, got an array of shape (110935, 2) instead.

In [227]:
y_pred

array([[0.38921878, 0.30245028, 0.30833094],
       [0.38083646, 0.32704658, 0.29211695],
       [0.39448967, 0.30488529, 0.30062504],
       ...,
       [0.34584607, 0.38814637, 0.26600756],
       [0.51446987, 0.24222168, 0.24330845],
       [0.37159329, 0.32998167, 0.29842504]])

In [228]:
submission = pd.DataFrame(y_pred)

In [229]:
submission = submission.round(2)

In [231]:
submission

Unnamed: 0,0,1,2
0,0.39,0.30,0.31
1,0.38,0.33,0.29
2,0.39,0.30,0.30
3,0.28,0.40,0.32
4,0.44,0.27,0.29
...,...,...,...
72706,0.43,0.28,0.30
72707,0.48,0.23,0.29
72708,0.35,0.39,0.27
72709,0.51,0.24,0.24


In [232]:
submission['id'] = test['id']

In [233]:
submission.rename(columns={
    0: 'home',
    1: 'away',
    2: 'draw',
}).to_csv('submission.csv', index=False)

In [234]:
submission.columns

Index([0, 1, 2, 'id'], dtype='object')

In [244]:
from xgboost import XGBClassifier
xgb = XGBClassifier(n_estimators=32)
xgb.fit(x_train, y_train)
y_pred = xgb.predict_proba(x_test)





In [245]:
submission = pd.DataFrame(y_pred)

In [246]:
submission = submission.round(2)

In [247]:
submission

Unnamed: 0,0,1,2
0,0.42,0.29,0.29
1,0.24,0.42,0.34
2,0.34,0.36,0.29
3,0.11,0.65,0.24
4,0.55,0.17,0.28
...,...,...,...
72706,0.49,0.28,0.23
72707,0.61,0.08,0.32
72708,0.18,0.39,0.43
72709,0.49,0.23,0.28


In [240]:
submission['id'] = test['id']

In [241]:
submission.rename(columns={
    0: 'home',
    1: 'away',
    2: 'draw',
}).to_csv('submission.csv', index=False)

In [242]:
submission.columns

Index([0, 1, 2, 'id'], dtype='object')