In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import random
from sklearn.metrics import accuracy_score
from sklearn import metrics

In [79]:
train_m = pd.read_csv('train.csv', low_memory=False)
test_m  = pd.read_csv('test.csv')

In [123]:
train = train_m.copy()
test = test_m.copy()

In [124]:
def missing_values_table(df):
    mis_val = df.isnull().sum()
    mis_val_percent = 100 * df.isnull().sum() / len(df)
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
    mis_val_table_ren_columns = mis_val_table.rename(
    columns = {0 : 'Missing Values', 1 : '% of Total Values'})
    mis_val_table_ren_columns = mis_val_table_ren_columns[
        mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
    '% of Total Values', ascending=False).round(1)
    print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"      
        "There are " + str(mis_val_table_ren_columns.shape[0]) +
            " columns that have missing values.")
    return mis_val_table_ren_columns

In [125]:
missing_values_table(test)

Your selected dataframe has 189 columns.
There are 182 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
away_team_history_coach_10,21289,29.3
home_team_history_coach_10,21260,29.2
away_team_history_coach_9,20997,28.9
home_team_history_coach_9,20996,28.9
away_team_history_coach_8,20728,28.5
...,...,...
away_team_history_league_id_1,488,0.7
away_team_history_goal_1,488,0.7
away_team_history_is_play_home_1,488,0.7
away_team_history_opponent_goal_1,488,0.7


In [126]:
def data_preprocessing_handler(df):
    train = 0
    train = df.loc[:]
    
    train = train[train['is_cup'].notna()]
    
    train['is_cup'] = pd.factorize(train['is_cup'])[0]+1
    
    target = {
        'home': 1,
        'away': 2,
        'draw': 3,
    }        

    try:
        train['target'] = train['target'].map(target)
    except:
        pass
    teams = dict.fromkeys(list(train['home_team_name'].value_counts().keys()) + list(train['away_team_name'].value_counts().keys()))
    dic = {}
    for i, team in enumerate(teams):    
        dic[team] = i        
    
    train['home_team_name']=train['home_team_name'].map(dic)
    train['away_team_name']=train['away_team_name'].map(dic)
    train['away_team_name']=train['away_team_name'].astype('int64')
    train['away_team_name']=train['away_team_name'].astype('int64')
    
    for i in train:
        if 'goal' in i:
            train[i] = train[i].fillna(0)
    
    ratings = [c for c in train if '_rating_' in c]
    for r in [ratings]:
        for i in train[r]:
            train[i] = train[i].fillna(value=train[r][i].mean())
    
    away_team_history_goal = [i for i in train.columns if 'away_team_history_goal' in i]
    away_team_history_opponent_goal = [i for i in train.columns if 'away_team_history_opponent_goal' in i]
    away_team_history_rating = [i for i in train.columns if 'away_team_history_rating' in i]
    away_team_history_opponent_rating = [i for i in train.columns if 'away_team_history_opponent_rating' in i]

    home_team_history_goal = [i for i in train.columns if 'home_team_history_goal' in i]
    home_team_history_opponent_goal = [i for i in train.columns if 'home_team_history_opponent_goal' in i]
    home_team_history_rating = [i for i in train.columns if 'home_team_history_rating' in i]
    home_team_history_opponent_rating = [i for i in train.columns if 'home_team_history_opponent_rating' in i]

    train['home_goal'] = train[home_team_history_goal].sum(axis=1)
    train['home_opp_goal'] = train[home_team_history_opponent_goal].sum(axis=1)
    train['home_rating'] = train[home_team_history_rating].mean(axis=1)
    train['home_opp_rating'] = train[home_team_history_opponent_rating].mean(axis=1)

    train['away_goal'] = train[away_team_history_goal].sum(axis=1)
    train['away_opp_goal'] = train[away_team_history_opponent_goal].sum(axis=1)
    train['away_rating'] = train[away_team_history_rating].mean(axis=1)
    train['away_opp_rating'] = train[away_team_history_opponent_rating].mean(axis=1)
    
    train['date'] = pd.to_datetime(train['match_date'])
    datees = [i for i in train.columns if 'match_date' in i]
    is_cups = [i for i in train.columns if '_is_cup' in i]
    
    train[datees] = train[datees].fillna(np.random.choice(list(range(1,13))))
    
    for i in train:
        if 'match_date' in i:
            try:
                train[i] = pd.to_datetime(train[i]).dt.month.astype(int)        
            except:
                pass              
    
    home_coach_ids = [c for c in train if ('home_team_history_coach' in c)] + ['home_team_coach_id']
    away_coach_ids = [c for c in train if ('away_team_history_coach' in c)] + ['away_team_coach_id']
            
    train['home_team_coach_id'] = train[['date', 'home_team_name'] + home_coach_ids].groupby(['home_team_name'], sort=True)['home_team_coach_id'].apply(lambda x: x.ffill())
    train['away_team_coach_id'] = train[['date', 'away_team_name'] + away_coach_ids].groupby(['away_team_name'], sort=True)['away_team_coach_id'].apply(lambda x: x.ffill())    
    train[['home_team_coach_id', 'away_team_coach_id']] = train[['home_team_coach_id', 'away_team_coach_id']].fillna(0) 
    train[home_coach_ids+away_coach_ids] = train[home_coach_ids+away_coach_ids].fillna(0)         
        
    all_is_cup = [i for i in train if 'history_is_cup' in i]
    train[all_is_cup] = train[all_is_cup].fillna(value=np.random.choice(train['is_cup']))            
        
    return train[['id', 'target', 'home_goal', 'home_team_name', 'away_team_name', 'home_opp_goal', 'home_team_coach_id', 'home_rating', 'home_opp_rating', 'away_goal', 'away_opp_goal', 'away_team_coach_id', 'away_rating','away_opp_rating', 'date', 'is_cup']]

In [127]:
test['target'] = 0

In [128]:
train = data_preprocessing_handler(train)

In [129]:
test = data_preprocessing_handler(test)

In [130]:
x_train = train.loc[:, train.columns != 'target']
x_test = test.loc[:, train.columns != 'target']

In [131]:
x_train.shape

(110937, 15)

In [132]:
x_test.shape

(72711, 15)

In [133]:
y_train = train['target']
y_test = test['target']

In [134]:
missing_values_table(x_train)

Your selected dataframe has 15 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values


In [135]:
missing_values_table(x_test)

Your selected dataframe has 15 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values


In [136]:
len(x_test.columns)

15

In [137]:
x_train.drop('date', axis=1, inplace=True)
x_test.drop('date', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [138]:
from xgboost import XGBClassifier
xgb = XGBClassifier(n_estimators=13)
xgb.fit(x_train, y_train)
y_pred = xgb.predict_proba(x_test)





In [139]:
submission = pd.DataFrame(y_pred)
submission = submission.round(2)

In [140]:
submission

Unnamed: 0,0,1,2
0,0.45,0.24,0.30
1,0.27,0.43,0.30
2,0.40,0.30,0.30
3,0.22,0.48,0.30
4,0.49,0.20,0.31
...,...,...,...
72706,0.41,0.28,0.32
72707,0.56,0.17,0.27
72708,0.18,0.61,0.21
72709,0.56,0.19,0.25


In [100]:
submission['id'] = test['id']

In [104]:
submission.rename(columns={
    0: 'home',
    1: 'away',
    2: 'draw',
}).to_csv('submission.csv', index=False)

In [105]:
submission.columns

Index([0, 1, 2, 'id'], dtype='object')

In [147]:
import lightgbm as ltb
model = ltb.LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=32, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=41, n_jobs=-1, num_leaves=31, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=2000, subsample_freq=0)
model.fit(x_train, y_train)
y_pred = model.predict_proba(x_test)



In [151]:
submission = pd.DataFrame(y_pred)
submission = submission.round(2)
submission

Unnamed: 0,0,1,2
0,0.47,0.23,0.30
1,0.22,0.46,0.31
2,0.40,0.31,0.30
3,0.22,0.47,0.31
4,0.46,0.21,0.33
...,...,...,...
72706,0.42,0.26,0.32
72707,0.56,0.17,0.26
72708,0.15,0.67,0.17
72709,0.53,0.19,0.28


In [152]:
submission['id'] = test['id']

In [153]:
submission.rename(columns={
    0: 'home',
    1: 'away',
    2: 'draw',
}).to_csv('submission.csv', index=False)