In [26]:
# install xgboost
!pip install xgboost



In [27]:
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt


In [28]:
# Load the training table and encode the two zip-cluster indicator columns as 0/1 integers.
data_path = '../data/cleandata/train_Zip_onehot.csv'
df = pd.read_csv(data_path)
for zip_col in ('Zip_cluster_0', 'Zip_cluster_1'):
    # Truthy cells become 1, falsy cells become 0.
    df[zip_col] = df[zip_col].astype(bool).astype('int64')
df.head(5)

Unnamed: 0,ID,YearBuilt,SqFt,Story,Acres,Baths,Fireplaces,Value,Zip_cluster_0,Zip_cluster_1
0,1,1990,2102,1.0,0.77,2.0,1,203200,0,0
1,2,1986,1740,1.5,0.06,3.0,1,119096,0,0
2,3,1955,795,1.0,0.3,1.0,0,71666,0,0
3,4,1983,1152,1.0,0.68,2.0,0,131103,0,0
4,6,1924,1829,1.5,0.25,3.0,1,496425,1,0


In [29]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Split the training frame into the regression target and the feature matrix.
# NOTE(review): only 'Value' is removed, so 'ID' remains in X as a feature --
# confirm that is intended.
y = df['Value']
X = df.drop(columns=['Value'])

In [30]:
# Evaluation data: the feature table and the reference outcomes it is scored against.
X_valid = pd.read_csv('../data/cleandata/test_Zip_onehot.csv')
y_valid = pd.read_csv('../data/findTest/test_est.csv')


def valid_loss(X, y, **kwargs):
    """Fit an XGBoost regressor and return its mean absolute error on the validation data.

    Args:
        X: Training features passed to ``XGBRegressor.fit``.
        y: Training target passed to ``XGBRegressor.fit``.
        **kwargs: Forwarded unchanged to the ``xgb.XGBRegressor`` constructor.

    Returns:
        The mean of ``|prediction - y_valid['Outcome']|``, where the predictions
        are made on the module-level ``X_valid``.
    """
    regressor = xgb.XGBRegressor(**kwargs)
    regressor.fit(X, y)
    abs_errors = np.abs(regressor.predict(X_valid) - y_valid['Outcome'])
    return np.mean(abs_errors)


In [31]:
# Baseline: XGBoost with its default hyper-parameters (fixed seed), scored by
# mean absolute error against y_valid['Outcome'].
xgb_model = xgb.XGBRegressor(objective="reg:squarederror", random_state=42).fit(X, y)
y_pred = xgb_model.predict(X_valid)
baseline_abs_errors = np.abs(y_pred - y_valid['Outcome'])
print(np.mean(baseline_abs_errors))


18622.99467737967


In [36]:
from sklearn.model_selection import ParameterGrid

# Search space for the exhaustive grid search below.
param_grid = {
    'n_estimators': [50, 100, 200, 250],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05],
    # Candidates currently excluded from the search (uncomment to include):
    # 'subsample': [0.5, 0.7, 1],
    # 'gamma': [0, 0.1, 0.2, 0.5],
    # 'reg_alpha': [0, 0.1, 0.5, 1],
    # 'reg_lambda': [0, 0.1, 0.5, 1],
    'min_child_weight': [3, 5, 7]
}

grid = ParameterGrid(param_grid)
# Materialise the grid once so the order in which candidates are scored is
# exactly the order used for the best-candidate lookup.
candidates = list(grid)
print('total combination:', len(candidates))

# valid_loss returns a mean absolute error, so the scores are labelled MAE
# (they were previously mislabelled as RMSE).
# NOTE(review): candidates are ranked on the same X_valid / y_valid data that
# the final model is scored on later -- confirm this is acceptable, since the
# reported final error is then optimistically biased.
mae_scores = []
for param in candidates:
    print('fitting: ', param, end=' ')
    mae_scores.append(valid_loss(X, y, **param))
    print('mae:', mae_scores[-1])

# Backward-compatible alias: the scores used to be stored under the name `rmse`.
rmse = mae_scores

best_param = candidates[int(np.argmin(mae_scores))]
print('best_param:', best_param)
print('best_mae:', np.min(mae_scores))

total combination: 72
fitting:  {'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 50} rmse: 48093.50108839683
fitting:  {'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 100} rmse: 31797.104213396826
fitting:  {'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 200} rmse: 23289.665795714827
fitting:  {'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 250} rmse: 23999.408764464828
fitting:  {'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 50} rmse: 46875.65108839683
fitting:  {'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 100} rmse: 29857.783900896826
fitting:  {'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 200} rmse: 19448.231085014628
fitting:  {'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 250} rmse: 18083.836889464827
fitting:  {'learning_rate': 0.

In [33]:
# Refit on the training data with the best grid-search parameters, predict on
# X_valid and print the mean absolute error against y_valid['Outcome'].
final_params = dict(objective="reg:squarederror", random_state=42, **best_param)
xgb_model = xgb.XGBRegressor(**final_params)
xgb_model.fit(X, y)
y_pred = xgb_model.predict(X_valid)
final_abs_errors = np.abs(y_pred - y_valid['Outcome'])
print(np.mean(final_abs_errors))


12731.706595743834


In [34]:
# Build the prediction table (ID, predicted Outcome) and write it to CSV
# without the index column.
df_out = X_valid[['ID']].assign(Outcome=y_pred)
df_out.to_csv('../data/predictions/xgb.csv', index=False)

In [35]:
# Preview only the first rows instead of dumping the whole prediction table.
df_out.head(10)

Unnamed: 0,ID,Outcome
0,1,138889.375
1,2,374354.0625
2,3,100762.695312
3,4,206070.46875
4,5,142741.546875
5,6,96998.898438
6,7,244417.90625
7,8,125052.84375
8,9,200724.25
9,10,144017.375
