In [13]:
import os
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import cohen_kappa_score, accuracy_score, mean_absolute_error, f1_score
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
from tqdm import tqdm
import xgboost as xgb
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import gc
from datetime import datetime

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [14]:
seed = 2020

In [15]:
df_feature = pd.read_csv('processed_data.csv')

In [16]:
from sklearn.preprocessing import LabelEncoder
for f in tqdm(df_feature.select_dtypes('object')):
    lbl = LabelEncoder()
    df_feature[f] = lbl.fit_transform(df_feature[f].astype(str))

  0%|          | 0/4424 [00:00<?, ?it/s]


In [17]:
df_test = df_feature[df_feature['label']==1].drop(columns=['label']).copy()
df_train = df_feature.copy()

In [18]:
df_test.head()

Unnamed: 0,date,Latitude,Longitude,notes_polarity,notes_subjectivity,comments_polarity,comments_subjectivity,best_season
0,193,47.45051,-120.35903,0.0,0.0,0.0,0.0,0
1,220,46.210088,-119.759287,0.0,0.0,0.2,0.2,1
2,235,47.62984,-120.68945,0.0,0.0,0.2,0.2,1
3,210,47.445464,-120.267607,0.0,0.0,0.2,0.2,0
4,197,47.143276,-119.852737,0.0,0.0,0.0,0.0,0


In [19]:
df_train.head()

Unnamed: 0,date,label,Latitude,Longitude,notes_polarity,notes_subjectivity,comments_polarity,comments_subjectivity,best_season
0,193,1,47.45051,-120.35903,0.0,0.0,0.0,0.0,0
1,220,1,46.210088,-119.759287,0.0,0.0,0.2,0.2,1
2,235,1,47.62984,-120.68945,0.0,0.0,0.2,0.2,1
3,210,1,47.445464,-120.267607,0.0,0.0,0.2,0.2,0
4,197,1,47.143276,-119.852737,0.0,0.0,0.0,0.0,0


In [20]:
ycol = 'label'
feature_names = list(
    filter(lambda x: x not in [ycol], df_train.columns))

# model = xgb.XGBClassifier(min_child_weight=6,max_depth=15,
#                                 objective='multi:softmax',num_class=5)
model = xgb.XGBRegressor(num_leaves=64,
                         max_depth=8,
                         learning_rate=0.08,
                         n_estimators=10000000,
                         subsample=0.75,
                         feature_fraction=0.75,
                         reg_alpha=0.7,
                         reg_lambda=1.2,
                         random_state=seed,
                         metric=None,
                         tree_method='hist'
                         )
oof = []
prediction = df_test.copy()
prediction['label'] = 0
df_importance_list = []

kfold = KFold(n_splits=5, shuffle=True, random_state=seed)
for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(df_train[feature_names])):
    X_train = df_train.iloc[trn_idx][feature_names]
    Y_train = df_train.iloc[trn_idx][ycol]

    X_val = df_train.iloc[val_idx][feature_names]
    Y_val = df_train.iloc[val_idx][ycol]

    print('\nFold_{} Training ================================\n'.format(fold_id+1))

    # xgb_model = model.fit(X_train,
    #                       Y_train,
    #                       eval_set=[(X_train, Y_train), (X_val, Y_val)],
    #                       verbose=1000,
    #                       eval_metric="merror",
    #                       early_stopping_rounds=500)
    xgb_model = model.fit(X_train,
                          Y_train,
                          eval_set=[(X_train, Y_train), (X_val, Y_val)],
                          verbose=1000,
                          eval_metric='mae',
                          early_stopping_rounds=500)

    pred_val = xgb_model.predict(
        X_val)
    df_oof = df_train.iloc[val_idx].copy()
    df_oof['label'] = pred_val
    oof.append(df_oof)

    pred_test = xgb_model.predict(
        df_test[feature_names])
    prediction['label'] += pred_test / 5

    df_importance = pd.DataFrame({
        'column': feature_names,
        'importance': xgb_model.feature_importances_,
    })
    df_importance_list.append(df_importance)

    del xgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val
    gc.collect()



Parameters: { "feature_fraction", "num_leaves" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	validation_0-mae:1.43626	validation_1-mae:1.46577
[582]	validation_0-mae:0.04586	validation_1-mae:0.36408


Parameters: { "feature_fraction", "num_leaves" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	validation_0-mae:1.43939	validation_1-mae:1.45233
[574]	validation_0-mae:0.04712	validation_1-mae:0.36484


Parameters: { "feature_fraction", "num_leaves" } might not be used.

  This could be a false alarm, 

In [21]:
df_importance = pd.concat(df_importance_list)
df_importance = df_importance.groupby(['column'])['importance'].agg(
    'mean').sort_values(ascending=False).reset_index()
df_importance

Unnamed: 0,column,importance
0,comments_subjectivity,0.731784
1,date,0.057713
2,comments_polarity,0.049139
3,Latitude,0.036238
4,notes_subjectivity,0.03599
5,Longitude,0.032126
6,notes_polarity,0.030967
7,best_season,0.026044


In [22]:
df_oof = pd.concat(oof)
df_oof[ycol] = np.expm1(df_oof[ycol])
df_oof['label'] = np.expm1(df_oof['label'])
# df_oof.to_csv('xgb_oof.csv'.format(mae), index=False, encoding='utf-8')

In [23]:
prediction['label'] = np.expm1(prediction['label'])
sub = prediction.copy(deep=True)
sub.to_csv('res.csv', index=False, encoding='utf-8')

In [24]:
sub.head()

Unnamed: 0,date,Latitude,Longitude,notes_polarity,notes_subjectivity,comments_polarity,comments_subjectivity,best_season,label
0,193,47.45051,-120.35903,0.0,0.0,0.0,0.0,0,2.55824
1,220,46.210088,-119.759287,0.0,0.0,0.2,0.2,1,1.710949
2,235,47.62984,-120.68945,0.0,0.0,0.2,0.2,1,1.740768
3,210,47.445464,-120.267607,0.0,0.0,0.2,0.2,0,1.743153
4,197,47.143276,-119.852737,0.0,0.0,0.0,0.0,0,3.171741
