In [1]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides numerical RuntimeWarnings and
# library deprecation notices -- consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [2]:
import numpy as np 
import pandas as pd
from collections import defaultdict
import time
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from collections import Counter
from sklearn.model_selection import KFold,train_test_split,GridSearchCV
from sklearn.linear_model import ElasticNet,LinearRegression,ElasticNetCV
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
import xgboost as xgb


# Widen the notebook display limits for wide/long frames.
# Fully-qualified option names are used on purpose: pandas resolves option
# names by regex, and the bare 'max_columns' / 'max_rows' patterns become
# ambiguous (OptionError) on versions that also register styler options
# ending in the same suffix.
pd.set_option('display.max_columns', 135)
pd.set_option('display.max_rows', 200)
# Apply seaborn's default styling to the matplotlib figures drawn later in the notebook.
sns.set(color_codes=True)


In [3]:
# Mount Google Drive at /content/drive so the CSV files referenced via ROUTE can be read.
# This only works inside Google Colab and asks for interactive authorization.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [7]:
# Folder on the mounted Drive that holds the prepared CSV files.
ROUTE = '/content/drive/My Drive/Colab Notebooks/Real_Estate_AVM/data/'

# Alternative inputs kept for reference (shapes as noted by the author):
# training_data = pd.read_csv(ROUTE+'train_avm.csv').iloc[:,1:] #(37687, 134)
# testing_data =  pd.read_csv(ROUTE+'test_avm.csv').iloc[:,1:] #(6301, 133)
# training_data_dummy = pd.read_csv(ROUTE+'training_data_dummy.csv') #(37687, 153)

# Dummy-encoded training table: column 0 is used as the target, the rest as features.
# NOTE(review): the author's note gave the shape as (37687, 143), but the recorded
# output of this cell is 142 -- confirm which is current.
training_data_dummy = pd.read_csv(f'{ROUTE}training_data_dummy2.csv')

# Total number of columns; used below as the exclusive upper bound of the feature slice.
N_col = len(training_data_dummy.columns)
N_col

142

In [8]:
# Reserve 30% of the rows as a hold-out set (fixed seed for repeatability).
# Features are columns 1..N_col-1; the target is column 0.
X, X_test, Y, Y_test = train_test_split(
    training_data_dummy.iloc[:, 1:N_col],
    training_data_dummy.iloc[:, 0],
    test_size=0.3,
    random_state=619,
)

In [12]:
# Rebind X / Y to *all* rows (features = columns 1..N_col-1, target = column 0).
# NOTE(review): after this cell X also contains the rows that the earlier
# train_test_split placed in X_test, so fitting on (X, Y) and then scoring on
# (X_test, Y_test) would evaluate on rows the model was trained on.
X,Y = training_data_dummy.iloc[:,1:N_col], training_data_dummy.iloc[:,0]

# Function

In [4]:
def Score(true, pred):
    """Return a composite score, the hit rate and the MAPE of a set of predictions.

    Parameters
    ----------
    true : array-like
        Actual values; also used as the denominator of the percentage error.
    pred : array-like
        Predicted values, same length as ``true``.

    Returns
    -------
    tuple
        ``(s, hit_rate, mape)`` where ``hit_rate`` is the share of samples whose
        absolute percentage error is at most 0.1 (rounded to 4 decimals),
        ``mape`` is the mean absolute percentage error, and ``s`` is
        ``hit_rate * 10000 + (1 - mape)`` when ``mape < 1``; otherwise ``s`` is
        ``hit_rate * 10000`` and a warning line is printed.
    """
    abs_pct_err = np.abs(np.array(true) - np.array(pred)) / true

    # Count the samples that land within 10% of the actual value.
    n_hits = np.sum(np.asarray(abs_pct_err) <= 0.1)
    hit_rate = round(n_hits / len(true), 4)
    mape = np.mean(abs_pct_err)

    base = hit_rate * 10000
    if mape < 1:
        return base + (1 - mape), hit_rate, mape

    # The MAPE bonus is dropped when the error is this large.
    print('Watch out mape>1')
    return base, hit_rate, mape

# Modeling

## Linear Regression

In [None]:
# 5-fold cross-validation of an ordinary least-squares baseline.
# One line is printed per fold: score, hit rate, MAPE.
kf = KFold(n_splits=5)

for fold_rows in kf.split(X):
    train_index, test_index = fold_rows
    x_train, y_train = X.iloc[train_index, :], Y.iloc[train_index]
    x_test, y_test = X.iloc[test_index, :], Y.iloc[test_index]

    reg = LinearRegression().fit(x_train, y_train)
    y_pred = reg.predict(x_test)

    s, hr, mape = Score(y_test, y_pred)
    print(f'{s}, {hr}, {mape}')

## ElasticNet

In [None]:
# Compare ElasticNet mixing ratios with 5-fold cross-validation.
# The splitter is created here so the cell does not depend on `kf` leaking
# from another cell.
kf = KFold(n_splits=5)

for L1_RATIO in [0,0.25,0.5,0.75,1]:
    print(L1_RATIO)
    for train_index, test_index in kf.split(X):
        x_train, x_test = X.iloc[train_index,:], X.iloc[test_index,:]
        y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]

        regr = ElasticNet(random_state=619, l1_ratio=L1_RATIO)
        regr.fit(x_train, y_train)

        # Score on this fold's held-out rows. Scoring on the global
        # X_test / Y_test made every fold report on the same rows, which can
        # overlap the rows the fold was fitted on.
        y_pred = regr.predict(x_test)
        s,hr,mape = Score(y_test,y_pred)
        print(f'{s}, {hr}')


## GBR

In [None]:
# Fit a Huber-loss GBR and score it on the hold-out split (X_test, Y_test).
# X / Y may have been rebound to the full data set by an earlier cell, so rows
# that belong to the hold-out are excluded from fitting; otherwise the score
# below would be computed on rows the model has already seen. If X is already
# the training split, the mask keeps every row.
fit_rows = ~X.index.isin(X_test.index)

gbr = GradientBoostingRegressor(n_estimators=16000, learning_rate=0.05,max_depth=8,max_features='sqrt',
                                min_samples_leaf=15, min_samples_split=10,
                                loss='huber', random_state =619)
gbr.fit(X.loc[fit_rows], Y.loc[fit_rows])
y_pred = gbr.predict(X_test)
Score(Y_test,y_pred)

In [None]:
# 5-fold cross-validation of the Huber-loss GBR on the full data set.
# Collects the composite score and MAPE per fold, then prints mean / std of each.
X,Y = training_data_dummy.iloc[:,1:N_col],training_data_dummy.iloc[:,0]
gbr_result_dict = defaultdict(list)
start = time.time()
kf = KFold(n_splits=5)
for idx,(train_index, test_index) in enumerate(kf.split(X)):
    x_train, x_test = X.iloc[train_index,:], X.iloc[test_index,:]
    y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]
    gbr = GradientBoostingRegressor(n_estimators=10000,learning_rate=0.05,max_depth=8, max_features='sqrt',min_samples_leaf=15, min_samples_split=10,
                                    loss='huber', random_state =619)
    gbr.fit(x_train, y_train)
    y_pred = gbr.predict(x_test)

    # Score once per fold (a second, discarded call was removed).
    s,hr,mape = Score(y_test,y_pred)

    gbr_result_dict['s'].append(s)
    gbr_result_dict['mape'].append(mape)
    print(f'{idx} done!')
end = time.time()

print('\n=====')
# Report the wall-clock time that start/end were measuring.
print(f'elapsed:{end - start:.1f}s')
s_mu, s_std = np.mean(gbr_result_dict['s']),np.std(gbr_result_dict['s'])
mape_mu, mape_std = np.mean(gbr_result_dict['mape']),np.std(gbr_result_dict['mape'])

print(f'mean:{s_mu}, sd:{s_std}')
print(f'mean:{mape_mu}, sd:{mape_std}')

### cv

In [15]:
# Tuning order noted by the author:
# n_estimators -> max_depth,min_samples_split -> min_samples_leaf -> max_features,sub_sample -> learning_rate
# In this grid only min_samples_split has more than one candidate value.
parameters = dict(
    learning_rate=[0.1],
    n_estimators=[500],
    max_depth=[8],
    min_samples_split=[8, 15, 25, 40],
)


In [None]:
# Exhaustive 5-fold grid search over `parameters` for a Huber-loss GBR,
# ranked by negative mean absolute error.
X = training_data_dummy.iloc[:, 1:N_col]
Y = training_data_dummy.iloc[:, 0]

gbr = GradientBoostingRegressor(loss='huber', random_state=619)
gsearch = GridSearchCV(
    gbr,
    param_grid=parameters,
    scoring='neg_mean_absolute_error',
    cv=5,
)
gsearch.fit(X, Y)
print(gsearch.best_params_)

In [None]:
# Print the winning setting, then display the per-candidate CV table.
print(gsearch.best_params_)
result_df = pd.DataFrame(gsearch.cv_results_)
result_df.loc[:, ['params', 'mean_test_score', 'rank_test_score']]

## XGBR

In [None]:
# Fit an XGBoost regressor and score it on the hold-out split (X_test, Y_test).
# X / Y are rebound to the full data set by other cells, so hold-out rows are
# removed from the fitting data; otherwise the score would be computed on rows
# the model was trained on. If X is already the training split, nothing is dropped.
fit_rows = ~X.index.isin(X_test.index)

xgbr = xgb.XGBRegressor(random_state =619,n_estimators=2250,learning_rate=0.05,max_depth=9)
# colsample_bytree=0.4603, gamma=0.0468, learning_rate=0.05, max_depth=3,min_child_weight=1.7817, reg_alpha=0.4640, reg_lambda=0.8571,subsample=0.5213, silent=1
xgbr.fit(X.loc[fit_rows], Y.loc[fit_rows])
y_pred = xgbr.predict(X_test)
Score(Y_test,y_pred)

In [None]:
# 5-fold cross-validation of the XGBoost regressor on the full data set.
# Collects the composite score and MAPE per fold, then prints mean / std of each.
X,Y = training_data_dummy.iloc[:,1:N_col],training_data_dummy.iloc[:,0]

# Optional feature filter (needs importance_ls from the feature-importance section):
# col = [i[0] for i in importance_ls if i[1]>100]
# X = X[col]

result_dict = defaultdict(list)
# `start` is taken again so the elapsed time below refers to this cell rather
# than to a stale value left behind by another cell.
start = time.time()
kf = KFold(n_splits=5)
for idx,(train_index, test_index) in enumerate(kf.split(X)):
    x_train, x_test = X.iloc[train_index,:], X.iloc[test_index,:]
    y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]

    xgbr = xgb.XGBRegressor(random_state =619,n_estimators=2250,learning_rate=0.05,max_depth=9,subsample=0.9,colsample_bytree=0.9)
    xgbr.fit(x_train, y_train)
    y_pred = xgbr.predict(x_test)
    s,hr,mape = Score(y_test,y_pred)

    result_dict['s'].append(s)
    result_dict['mape'].append(mape)
    print(f'{idx} done!')

end = time.time()

print('\n=====')
print(f'elapsed:{end - start:.1f}s')
s_mu, s_std = np.mean(result_dict['s']),np.std(result_dict['s'])
mape_mu, mape_std = np.mean(result_dict['mape']),np.std(result_dict['mape'])

print(f'mean:{s_mu}, sd:{s_std}')
print(f'mean:{mape_mu}, sd:{mape_std}')

### cv

In [20]:
# Grid for the XGBoost search: only max_depth has more than one candidate.
# Further knobs the author left switched off:
#   'min_child_weight': [4], 'silent': [1], 'subsample': [0.7], 'colsample_bytree': [0.7]
parameters = dict(
    learning_rate=[0.1],  # so called `eta` value
    max_depth=[6, 7, 8, 9],
    n_estimators=[500],
)

In [None]:
# Exhaustive 5-fold grid search over `parameters` for a default XGBoost
# regressor, ranked by negative mean absolute error.
# Optional restriction to the N most important features (left disabled):
# N = 150
# select_var = [i[0] for i in importance_ls[0:N]]

X = training_data_dummy.iloc[:, 1:N_col]
Y = training_data_dummy.iloc[:, 0]
# X = X[select_var]

xgbr = xgb.XGBRegressor()
gsearch = GridSearchCV(
    xgbr,
    param_grid=parameters,
    scoring='neg_mean_absolute_error',
    cv=5,
)
gsearch.fit(X, Y)
print(gsearch.best_params_)

In [None]:
# Print the winning setting, then display the per-candidate CV table.
print(gsearch.best_params_)
result_df = pd.DataFrame(gsearch.cv_results_)
result_df.loc[:, ['params', 'mean_test_score', 'rank_test_score']]

## LightGBM

In [22]:
# Parameter set the author marked as best; passed to lgb.train in the cells below.
param = dict(
    objective='mape',
    feature_fraction=0.9,
    learning_rate=0.05,
    max_bin=250,
    max_depth=80,
    min_data_in_leaf=5,
    num_iterations=1600,
    num_leaves=480,
)

In [None]:
# 5-fold cross-validation of LightGBM (settings in `param`) on the full data set.
# Collects the composite score and MAPE per fold, then prints mean / std of each.
X,Y = training_data_dummy.iloc[:,1:N_col],training_data_dummy.iloc[:,0]

result_dict = defaultdict(list)
start = time.time()
kf = KFold(n_splits=5)
for idx,(train_index, test_index) in enumerate(kf.split(X)):
    x_train, x_test = X.iloc[train_index,:], X.iloc[test_index,:]
    y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]

    train_data = lgb.Dataset(x_train, label=y_train)
    test_data = lgb.Dataset(x_test, label=y_test)
    # NOTE(review): `verbose_eval` is believed to have been removed in LightGBM 4.0;
    # on newer versions use callbacks=[lgb.log_evaluation(500)] instead -- confirm
    # against the installed version before changing.
    clf_lgbm = lgb.train( params =param, train_set=train_data, valid_sets = [train_data,test_data],verbose_eval=500)
    y_pred = clf_lgbm.predict(x_test, num_iteration=clf_lgbm.best_iteration)
    s,hr,mape = Score(y_test,y_pred)

    result_dict['s'].append(s)
    result_dict['mape'].append(mape)
    print(f'{idx} done!')
end = time.time()

print('\n=====')
# Report the wall-clock time that start/end were measuring.
print(f'elapsed:{end - start:.1f}s')
s_mu, s_std = np.mean(result_dict['s']),np.std(result_dict['s'])
mape_mu, mape_std = np.mean(result_dict['mape']),np.std(result_dict['mape'])

print(f'mean:{s_mu}, sd:{s_std}')
print(f'mean:{mape_mu}, sd:{mape_std}')

### cv

In [24]:
# Grid for the LightGBM search: learning_rate and num_iterations are varied,
# every other entry has a single candidate.
parameters = dict(
    learning_rate=[0.04, 0.05, 0.06],
    num_leaves=[450],
    max_bin=[250],
    max_depth=[80],
    feature_fraction=[0.1],
    min_data_in_leaf=[5],
    num_iterations=[1400, 1500, 1600],
)


In [None]:
# Exhaustive 5-fold grid search over `parameters` for a gbdt LGBMRegressor with
# the 'regression' objective, ranked by negative mean absolute error.
# Optional restriction to the N most important features (left disabled):
# N = 150
# select_var = [i[0] for i in importance_ls[0:N]]

X = training_data_dummy.iloc[:, 1:N_col]
Y = training_data_dummy.iloc[:, 0]
# X = X[select_var]

gbm = lgb.LGBMRegressor(boosting_type='gbdt', objective='regression')
gsearch = GridSearchCV(
    gbm,
    param_grid=parameters,
    scoring='neg_mean_absolute_error',
    cv=5,
)
gsearch.fit(X, Y)
print(gsearch.best_params_)

In [None]:
# Print the winning setting and its CV score, then display the per-candidate table.
print(gsearch.best_params_)
print(gsearch.best_score_)
result_df = pd.DataFrame(gsearch.cv_results_)
result_df.loc[:, ['params', 'mean_test_score', 'rank_test_score']]


### Feature Importance

In [26]:
# Parameter set the author marked as best; used for the feature-importance model below.
param = dict(
    objective='mape',
    feature_fraction=0.9,
    learning_rate=0.05,
    max_bin=250,
    max_depth=80,
    min_data_in_leaf=5,
    num_iterations=1600,
    num_leaves=480,
)

In [None]:
# Train LightGBM with `param` and score it on the hold-out split (X_test, Y_test).
# X / Y are rebound to the full data set by other cells, so hold-out rows are
# removed from the training Dataset; otherwise the validation metric and the
# score below would be computed on rows the model was trained on. If X is
# already the training split, nothing is dropped.
fit_rows = ~X.index.isin(X_test.index)

train_data = lgb.Dataset(X.loc[fit_rows], label=Y.loc[fit_rows])
test_data = lgb.Dataset(X_test, label=Y_test)
clf_lgbm = lgb.train( params =param, train_set=train_data, valid_sets = [train_data,test_data])
y_pred = clf_lgbm.predict(X_test, num_iteration=clf_lgbm.best_iteration)
Score(Y_test,y_pred)

In [None]:
# Plot LightGBM's importances for up to 120 features of the model trained above.
lgb.plot_importance(clf_lgbm, max_num_features=120,figsize=(10,20))
# Title fixed: the original string was the garbled "Featurertances".
plt.title("Feature importances")
plt.show()

In [None]:
# Pair every feature name with its importance and rank them, most important first.
importance = clf_lgbm.feature_importance()
feature_name = clf_lgbm.feature_name()

temp_dict = dict(zip(feature_name, importance))

# List of (feature_name, importance) tuples in descending order of importance.
importance_ls = sorted(temp_dict.items(), key=lambda pair: pair[1], reverse=True)
len(importance_ls)

In [None]:
# For each candidate feature count N, keep the N top-ranked features from
# importance_ls and evaluate LightGBM (settings in `param`) with 3-fold CV.
for N in [50,60,70,80,90,100,110,120,130,140,150]:
    print(N)
    # Slicing past the end of importance_ls simply yields every ranked feature.
    select_var = [name for name, _ in importance_ls[0:N]]

    # Use N_col as the feature-slice bound, like every other cell. The previous
    # hard-coded 153 matched an older data file and only worked because iloc
    # tolerates out-of-range slice ends.
    X,Y = training_data_dummy.iloc[:,1:N_col],training_data_dummy.iloc[:,0]
    X = X[select_var]

    result_dict = defaultdict(list)
    start = time.time()
    kf = KFold(n_splits=3)
    for idx,(train_index, test_index) in enumerate(kf.split(X)):
        x_train, x_test = X.iloc[train_index,:], X.iloc[test_index,:]
        y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]

        train_data = lgb.Dataset(x_train, label=y_train)
        test_data = lgb.Dataset(x_test, label=y_test)
        # NOTE(review): `verbose_eval` is believed to have been removed in LightGBM 4.0;
        # on newer versions use callbacks=[lgb.log_evaluation(500)] -- confirm first.
        clf_lgbm = lgb.train( params =param, train_set=train_data, valid_sets = [train_data,test_data],verbose_eval=500)
        y_pred = clf_lgbm.predict(x_test, num_iteration=clf_lgbm.best_iteration)
        s,hr,mape = Score(y_test,y_pred)

        result_dict['s'].append(s)
        result_dict['mape'].append(mape)
    end = time.time()

    print('\n=====')
    # Report the wall-clock time that start/end were measuring.
    print(f'elapsed:{end - start:.1f}s')
    s_mu, s_std = np.mean(result_dict['s']),np.std(result_dict['s'])
    mape_mu, mape_std = np.mean(result_dict['mape']),np.std(result_dict['mape'])

    print(f'mean:{s_mu}, sd:{s_std}')
    print(f'mean:{mape_mu}, sd:{mape_std}')

