# Data Preprocessing

## Read Data & Drop Columns

In [182]:
!pip install category_encoders

Collecting category_encoders
[?25l  Downloading https://files.pythonhosted.org/packages/6e/a1/f7a22f144f33be78afeb06bfa78478e8284a64263a3c09b1ef54e673841e/category_encoders-2.0.0-py2.py3-none-any.whl (87kB)
[K     |████████████████████████████████| 92kB 5.3MB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.0.0


In [0]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive
pd.set_option('display.max_columns', 200)
%matplotlib inline

In [245]:
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [246]:
#read data
df_train_genba = pd.read_table('gdrive/My Drive/Colab Notebooks/SIGNATE/IGHD/data/train_genba.tsv')
df_train_goto = pd.read_table('gdrive/My Drive/Colab Notebooks/SIGNATE/IGHD/data/train_goto.tsv')
#concat Data
df = pd.merge(df_train_genba, df_train_goto, on = 'pj_no')
#set index
df = df.set_index('id')

  """Entry point for launching an IPython kernel.
  


In [0]:
#drop columns
df = df.drop(columns = {'pj_no',
'yoto2',
'kempei2',
'yoseki2',
'road1_hk',
'road1_sb',
'road1_fi',
'road1_mg',
'road2_hk',
'road2_sb',
'road2_fi',
'road2_mg',
'road3_hk',
'road3_sb',
'road3_fi',
'road3_mg',
'road4_hk',
'road4_sb',
'road4_fi',
'road4_mg',
'gk_sho_kyori',
'gk_chu_kyori',
'kborjs',
'toshikuiki2',
'minmenseki',
'hokakisei1',
'hokakisei2',
'hokakisei3',
'hokakisei4',
'kinshijiko',
'rosenka_hb',
'kijun_hb',
'mseki_rd_hb',
'mseki_dp_hb',
'tc_mseki_min_hb',
'tt_mseki_max_hb',
'tc_mseki_avg_hb',
'fi4m_yohi',
'fi3m_yohi',
'fi4m_kyori',
'fi3m_kyori',
'bus_yohi',
'bus_hon',
'sho_shoten',
'sho_market',
'shu_jutaku',
'shu_park',
'shu_shop',
'shu_factory',
'shu_hvline',
'shu_tower',
'shu_bochi',
'shu_sogi',
'shu_zoki',
'shu_kokyo',
'shu_highway',
'shu_kaido',
'shu_line_ari',
'shu_line_nashi',
'shu_soon',
'gk_yoc_tm',
'gk_sho_tm',
'gk_chu_tm',
'rs_e_kdate3',
'rs_e_parking',
'rs_e_zoki',
'rs_e_m_ari',
'rs_e_m_nashi',
'rs_e_tahata',
'rs_w_kdate3',
'rs_w_parking',
'rs_w_zoki',
'rs_w_m_ari',
'rs_w_m_nashi',
'rs_w_tahata',
'rs_s_kdate3',
'rs_s_parking',
'rs_s_zoki',
'rs_s_m_ari',
'rs_s_m_nashi',
'rs_s_tahata',
'rs_n_kdate3',
'rs_n_parking',
'rs_n_zoki',
'rs_n_m_ari',
'rs_n_m_nashi',
'rs_n_tahata',
'eki_nm1',
'eki_kyori1',
'bastei_nm1',
'teiho1',
'rosen_nm2',
'eki_nm2',
'bas_toho2',
'eki_kyori2',
'bastei_nm2',
'teiho2',
'pj_no',
'tt_mseki',
'road_st',
'setsudo_hi',
'kobetsu2',
'kobetsu3',
'kobetsu4',
'hy1f_date_su'})

In [0]:
#replace
df = df.replace({'（要）' : 1, 
                '（不要）' : 0, 
                '○' : 1, 
                '（有）' : 1, 
                '（無）' : 0})

## NaN preprocessing

In [0]:
#fill nan by 0
df.loc[:, ['sho_conv',
'sho_super',
'rs_e_kdate2',
'rs_w_kdate2',
'rs_s_kdate2',
'rs_n_kdate2'
]] = df.loc[:, ['sho_conv',
'sho_super',
'rs_e_kdate2',
'rs_w_kdate2',
'rs_s_kdate2',
'rs_n_kdate2'
]].fillna(0)

In [0]:
#fill nan by mean
mean_list = ['chiseki_kb_hb',
'kaoku_hb',
'tt_mseki_avg_hb',
'fukuin',
'magutchi']

for i in mean_list:
    df[i] = df[i].fillna(df[i].mean())

In [0]:
#finn nan by mode
mode_list = ['tateuri_su',
'tochiuri_su',
'joken_su',
'hy2f_date_su',
'hy3f_date_su',
'levelplan',
'setsudo_kj',
'jigata',
'hiatari']

for i in mode_list:
    df[i] = df[i].fillna(df[i].mode()[0])

In [0]:
#fillna by 0 otherwise 1
df['kobetsu1'] = df['kobetsu1'].fillna(0)
df['kobetsu1'] = df['kobetsu1'].apply(lambda x: 0 if x == 0 else 1)

## Create Features

In [0]:
#Extract City Name from 'jukyo' field
df['jukyo'] = df['jukyo'].str.replace('埼玉県', '')
df['jukyo'] = df['jukyo'].str.split('市', expand = True).drop(1, axis = 1)
df['jukyo'] = df['jukyo'].str.split('郡', expand = True)

In [0]:
# Rosen preprocessing
df['rosen_nm1'] = df['rosen_nm1'].map(lambda x: 'JR' if 'JR' in x else ('東武' if '東武' in x else ('西武' if '西武' in x else ('埼玉高速鉄道' if '埼玉高速鉄道' in x else 'others'))))

In [0]:
#Target Encoding
target_encoding_list = ['jukyo',
'yoto1',
'kempei1',
'yoseki1',
'josui',
'gesui',
'gas',
'usui',
'tateuri_su',
'tochiuri_su',
'joken_su',
'kaoku_um',
'yheki_umu',
'yheki_yohi',
'hw_status',
'toshikuiki1',
'kodochiku',
'chikukeikaku',
'keikakuroad',
'kaihatsukyoka',
't53kyoka',
'hokakyoka',
'bokachiiki',
'sho_conv',
'sho_super',
'rs_e_kdate2',
'rs_w_kdate2',
'rs_s_kdate2',
'rs_n_kdate2',
'rosen_nm1',
'bas_toho1',
'levelplan',
'setsudo_kj',
'jigata',
'hiatari',
'garage',
'kobetsu1']

for i in target_encoding_list:
    category_counts = i + '_' + 'category_counts'
    outcome_counts = i + '_' + 'outcomecounts'
    #Count each category and sum(price)
    grouped_category = df.groupby(i)[i].count().reset_index(name = category_counts)
    grouped_outcome = df.groupby(i)['keiyaku_pr'].sum().reset_index(name = outcome_counts)
    
    #Merge by key
    df = df.merge(grouped_category, how = "left", on = i)
    df = df.merge(grouped_outcome, how = "left", on = i)
    
    #Exclude by oneself
    target_mean_encoding = i + '_' + 'tme'
    df[target_mean_encoding] = (df[outcome_counts] - df['keiyaku_pr'])/(df[category_counts] - 1)
    df = df.drop(columns = {i, category_counts, outcome_counts})

In [0]:
#Standirzation
df_wo_y = df.drop('keiyaku_pr', axis = 1)
df_wo_y = (df_wo_y - df_wo_y.min()) / (df_wo_y.max() - df_wo_y.min()) 

In [0]:
df_std = pd.concat([df_wo_y, df['keiyaku_pr']], axis = 1)

### Model Set

In [0]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

In [0]:
#Train Test Split
X_train, X_test, y_train, y_test = train_test_split(df_wo_y,  df['keiyaku_pr'], test_size = 0.2, random_state = 777)

In [278]:
reg_xgb = xgb.XGBRegressor(njobs = -1)
reg_xgb_cv = GridSearchCV(reg_xgb, {'max_depth': [2,4,6], 'n_estimators': [50,100,200]}, verbose=1)
reg_xgb_cv.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  if getattr(data, 'base', None) is not None and \


Fitting 3 folds for each of 9 candidates, totalling 27 fits


  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:   37.9s finished
  if getattr(data, 'base', None) is not None and \




GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bynode=1,
                                    colsample_bytree=1, gamma=0,
                                    importance_type='gain', learning_rate=0.1,
                                    max_delta_step=0, max_depth=3,
                                    min_child_weight=1, missing=None,
                                    n_estimators=100, n_jobs=1, njobs=-1,
                                    nthread=None, objective='reg:linear',
                                    random_state=0, reg_alpha=0, reg_lambda=1,
                                    scale_pos_weight=1, seed=None, silent=None,
                                    subsample=1, verbosity=1),
             iid='warn', n_jobs=None,
             param_grid={'max_depth': [2, 4, 6],
                         'n_estimators': [50, 100, 200]},


In [281]:
reg_xgb = xgb.XGBRegressor(**reg_xgb_cv.best_params_)
reg_xgb.fit(X_train, y_train)

  if getattr(data, 'base', None) is not None and \




XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=6, min_child_weight=1, missing=None, n_estimators=200,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [287]:
#MSE
pred_train = reg_xgb.predict(X_train)
pred_test = reg_xgb.predict(X_test)
print(mean_squared_error(y_train, pred_train))
print(mean_squared_error(y_test, pred_test))

387688676.3225377
295454807260.0048


In [290]:
#MAPE
print(np.mean(np.abs((y_train - pred_train) / y_train)) * 100)
print(np.mean(np.abs((y_test - pred_test) / y_test)) * 100)

0.04104850695056502
0.22193160766195383
