## Set up data set

In [1]:
import pandas as pd
import numpy as np

import sys
sys.path.append('../ames') # path to the directory
import config
from data_prep import clean, add_features, dummify

# Build the working frame in one step: run `clean` on the configured CSV
# path, then pass its result straight through `add_features`.
housing = add_features(clean(config.HOUSING_CSV))

## Create testing and training sets

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
# Feature matrix: an independent copy of the chosen columns, joined
# column-wise with the frame returned by `dummify` for the configured
# categorical variables.
chosen = housing[config.CHOSEN_VARIABLES].copy()
dummy_df = dummify(housing, config.VARS_TO_DUMMIFY, drop_first=False)
X = pd.concat([chosen, dummy_df], axis=1)

# Target: natural log of the sale price.
y = np.log(housing['SalePrice'])

# Hold out 20% of the rows for testing; the seed is fixed so the split is
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Tune and Run Model

In [1]:
from lightgbm import LGBMRegressor
# Base estimator handed to the grid search. `verbose=1` asks LightGBM for
# info-level training output; it is used instead of the `silent` flag, which
# newer LightGBM releases no longer accept as a constructor argument.
# The seed is fixed so repeated fits are reproducible.
lgb = LGBMRegressor(verbose=1, random_state=42)

In [None]:
# Candidate values for each tuned hyper-parameter. The search below is
# exhaustive: 4 * 4 * 3 * 3 = 144 combinations, each scored with 5-fold CV.
param_dist = dict(
    max_depth=[25, 50, 75, None],
    learning_rate=[0.01, 0.05, 0.1, 1],
    num_leaves=[300, 900, 1200],
    n_estimators=[200, 400, 1000],
)

# Scored by negated RMSE (higher is better); n_jobs=-1 uses all available
# cores.
grid = GridSearchCV(
    lgb,
    param_grid=param_dist,
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

In [None]:
# Pull the winning estimator and its parameter combination out of the
# fitted search; the bare expression on the last line displays the
# parameters as the cell output.
best_lgb_model = grid.best_estimator_
best_params = grid.best_params_
best_params

In [None]:
from sklearn.metrics import mean_squared_error
# Evaluate the tuned model on the held-out split. `y` was built as
# log(SalePrice), so the error reported here is on the log scale.
test_preds = best_lgb_model.predict(X_test)

# Take the square root explicitly instead of passing `squared=False`:
# that keyword is deprecated and then removed in newer scikit-learn
# releases, while this form works on every version.
rms = np.sqrt(mean_squared_error(y_test, test_preds))
print(f'RMSE: {rms}')
print(f'R2: {best_lgb_model.score(X_test, y_test)}')

## Feature Importance

In [None]:
from model_analysis import get_feature_importance, graph_importance
# Compute and plot feature importances for the tuned LightGBM estimator
# selected by the grid search, against the full feature matrix `X`.
# NOTE(review): the second argument of `graph_importance` is assumed to be
# a model label used in the chart - confirm against model_analysis.
feats = get_feature_importance(best_lgb_model, X)
fig = graph_importance(feats, 'LightGBM')
fig.show()