In [158]:
import os
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import cohen_kappa_score, accuracy_score, mean_absolute_error, f1_score
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
from tqdm import tqdm
import xgboost as xgb
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import gc
from datetime import datetime

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [159]:
seed = 2020

In [160]:
df_feature = pd.read_csv('processed_data.csv')

In [161]:
from sklearn.preprocessing import LabelEncoder
for f in tqdm(df_feature.select_dtypes('object')):
    lbl = LabelEncoder()
    df_feature[f] = lbl.fit_transform(df_feature[f].astype(str))

  0%|          | 0/4424 [00:00<?, ?it/s]


In [162]:
df_test = df_feature[df_feature['label']==1].drop(columns=['label']).copy()
df_train = df_feature.copy()

In [163]:
df_test.head()

Unnamed: 0,date,Latitude,Longitude,notes_polarity,notes_subjectivity,comments_polarity,comments_subjectivity
4,205,47.93348,-119.03591,0.0,0.0,0.0,0.0
5,131,47.091118,-122.682562,0.371429,0.542857,0.0,0.5
12,129,47.685672,-122.386464,-0.2,0.0,0.0,0.0
63,129,48.767306,-122.44893,-0.225,0.444444,0.0,0.0
64,121,47.581661,-121.98741,-0.097222,0.338889,0.0,0.0


In [164]:
df_train.head()

Unnamed: 0,date,label,Latitude,Longitude,notes_polarity,notes_subjectivity,comments_polarity,comments_subjectivity
0,98,2,47.708791,-117.472206,0.0,0.0,0.0,0.0
1,64,2,47.634119,-122.646627,-0.25,0.888889,0.0,0.0
2,136,3,49.060215,-122.641648,0.0,0.0,0.0,0.0
3,109,2,48.874475,-122.58183,-0.014683,0.345437,0.5,0.5
4,205,1,47.93348,-119.03591,0.0,0.0,0.0,0.0


In [165]:
ycol = 'label'
feature_names = list(
    filter(lambda x: x not in [ycol], df_train.columns))

model = xgb.XGBClassifier(min_child_weight=6,max_depth=15,
                                objective='multi:softmax',num_class=5)
# model = xgb.XGBRegressor(num_leaves=64,
#                          max_depth=8,
#                          learning_rate=0.08,
#                          n_estimators=10000000,
#                          subsample=0.75,
#                          feature_fraction=0.75,
#                          reg_alpha=0.7,
#                          reg_lambda=1.2,
#                          random_state=seed,
#                          metric=None,
#                          tree_method='hist'
#                          )
oof = []
prediction = df_test.copy()
prediction['label'] = 0
df_importance_list = []

kfold = KFold(n_splits=5, shuffle=True, random_state=seed)
for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(df_train[feature_names])):
    X_train = df_train.iloc[trn_idx][feature_names]
    Y_train = df_train.iloc[trn_idx][ycol]

    X_val = df_train.iloc[val_idx][feature_names]
    Y_val = df_train.iloc[val_idx][ycol]

    print('\nFold_{} Training ================================\n'.format(fold_id+1))

    xgb_model = model.fit(X_train,
                          Y_train,
                          eval_set=[(X_train, Y_train), (X_val, Y_val)],
                          verbose=1000,
                          eval_metric="merror",
                          early_stopping_rounds=500)
    # xgb_model = model.fit(X_train,
    #                       Y_train,
    #                       eval_set=[(X_train, Y_train), (X_val, Y_val)],
    #                       verbose=1000,
    #                       eval_metric='mae',
    #                       early_stopping_rounds=500)

    pred_val = xgb_model.predict(
        X_val)
    df_oof = df_train.iloc[val_idx].copy()
    df_oof['label'] = pred_val
    oof.append(df_oof)

    pred_test = xgb_model.predict(
        df_test[feature_names])
    prediction['label'] += pred_test / 5

    df_importance = pd.DataFrame({
        'column': feature_names,
        'importance': xgb_model.feature_importances_,
    })
    df_importance_list.append(df_importance)

    del xgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val
    gc.collect()



[0]	validation_0-merror:0.09890	validation_1-merror:0.13220
[99]	validation_0-merror:0.00085	validation_1-merror:0.13446


[0]	validation_0-merror:0.09861	validation_1-merror:0.14689
[99]	validation_0-merror:0.00198	validation_1-merror:0.14350


[0]	validation_0-merror:0.09720	validation_1-merror:0.12994
[99]	validation_0-merror:0.00141	validation_1-merror:0.12994


[0]	validation_0-merror:0.10257	validation_1-merror:0.12655
[99]	validation_0-merror:0.00113	validation_1-merror:0.12655


[0]	validation_0-merror:0.09831	validation_1-merror:0.12896
[99]	validation_0-merror:0.00169	validation_1-merror:0.12670


In [166]:
df_importance = pd.concat(df_importance_list)
df_importance = df_importance.groupby(['column'])['importance'].agg(
    'mean').sort_values(ascending=False).reset_index()
df_importance

Unnamed: 0,column,importance
0,comments_subjectivity,0.756091
1,comments_polarity,0.082117
2,date,0.056464
3,notes_polarity,0.028197
4,Latitude,0.027964
5,notes_subjectivity,0.026464
6,Longitude,0.022704


In [167]:
df_oof = pd.concat(oof)
df_oof[ycol] = np.expm1(df_oof[ycol])
df_oof['label'] = np.expm1(df_oof['label'])
# df_oof.to_csv('xgb_oof.csv'.format(mae), index=False, encoding='utf-8')

In [168]:
prediction['label'] = np.expm1(prediction['label'])
sub = prediction.copy(deep=True)
sub.to_csv('res.csv', index=False, encoding='utf-8')

In [169]:
sub.head()

Unnamed: 0,date,Latitude,Longitude,notes_polarity,notes_subjectivity,comments_polarity,comments_subjectivity,label
4,205,47.93348,-119.03591,0.0,0.0,0.0,0.0,1.718282
5,131,47.091118,-122.682562,0.371429,0.542857,0.0,0.5,2.320117
12,129,47.685672,-122.386464,-0.2,0.0,0.0,0.0,1.718282
63,129,48.767306,-122.44893,-0.225,0.444444,0.0,0.0,1.718282
64,121,47.581661,-121.98741,-0.097222,0.338889,0.0,0.0,1.718282
