# HOUSE SALE PRICING PREDICTION
## Author: Yujie Fu, Zhijie Chen, Jie Yang


Hi folks! This is a beginner's notebook that covers the main steps needed to complete an introductory machine learning project, using the Kaggle House Prices competition:
https://www.kaggle.com/c/house-prices-advanced-regression-techniques

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.linear_model import LinearRegression # Linear regression
from sklearn.impute import KNNImputer # KnnImputer for missing value
from sklearn.model_selection import train_test_split # for splitting dataset into training sub dataset and validation subdataset
from sklearn.model_selection import GridSearchCV # Tune hyper-parameters
from sklearn.pipeline import Pipeline # No need to scale data manually
from scipy.stats import skew # For skewness
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PolynomialFeatures # Min-max scaling, standardized scaling
from sklearn.neural_network import MLPRegressor # Neural network
from sklearn.linear_model import Ridge # Ridge regression
from sklearn.ensemble import RandomForestRegressor # Random Forest
from xgboost import XGBRegressor # Boosting tree >> XGBRegressor
from catboost import CatBoostRegressor # Boosting tree >> Cat Boosting regressor
from sklearn.neighbors import KNeighborsRegressor # Knn
from sklearn.linear_model import Ridge #Ridge

##  DATA PREPROCESSING

Now, we need to preprocess data for the modeling part. The main steps are:

- **Looking at potential NaN**
    - numerical features : knn imputer using whole dataset (train+test), k=5
    - categorical features: treat null values as new category
- **Dealing with categorical features (e.g. Dummy coding)**
- **Handle skewness**
    - log transform (log1p) on numerical features with skewness > 0.75, as well as on the target variable SalePrice
- **Normalization (combined in data modeling)**
    - standardized scaling
    - min-max scaling
    

In [None]:
# Read the Kaggle train / test CSV files
data_train = pd.read_csv("../input/house-prices-advanced-regression-techniques/train.csv")
data_test = pd.read_csv("../input/house-prices-advanced-regression-techniques/test.csv")

# Keep the target before SalePrice is dropped from the features.
# SalePrice is highly skewed, so the models are trained on log(1 + SalePrice).
yTr = np.log1p(data_train["SalePrice"])


##  Drop Features

In [None]:
# Share of missing values per training column, compared against a drop threshold.
# FireplaceQu, Alley, MiscFeature, Fence and PoolQC are already part of the
# hard-coded drop lists further down.
drop_null_threshold = 0.3
null_percentage = (data_train.isnull().sum() / len(data_train)).to_frame(name='null_percentage')
exceeds_threshold = null_percentage['null_percentage'] > drop_null_threshold
# Names of the columns whose missing share exceeds the threshold
drop_null_list = null_percentage.index[exceeds_threshold.to_numpy()]

# A later version could drop list(drop_null_list) + ['Id'] + ['SalePrice'] instead of the fixed lists.

# Drop features: the same columns from both frames, plus the target from the training frame
sparse_cols = ['Id', 'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu']
data_train = data_train.drop(columns=sparse_cols + ['SalePrice'])
data_test = data_test.drop(columns=sparse_cols)

In [None]:
# Build one combined frame (train rows first, then test rows) so that missing
# values are imputed from the whole dataset. ignore_index gives every row a
# unique 0..n-1 label, which the later train/test split relies on.
data_whole = pd.concat([data_train, data_test], ignore_index=True)

# Categorical vs. numeric column names, judged on the combined frame itself
cat_feats = data_whole.dtypes[data_whole.dtypes == "object"].index
num_feats = data_whole.dtypes[data_whole.dtypes != "object"].index

# Fill the numeric null values with a KNN imputer (k = 5)
imputer = KNNImputer(n_neighbors=5)
data_whole_num = pd.DataFrame(
    imputer.fit_transform(data_whole.loc[:, num_feats]),
    columns=num_feats,
    index=data_whole.index,
)

# Imputed numeric columns first, untouched categorical columns after them
data_whole = pd.concat([data_whole_num, data_whole.loc[:, cat_feats]], axis=1)

# Missing categorical values are treated as their own category "NONE".
# fillna returns a new object, so the result must be assigned back; without the
# assignment get_dummies would encode missing values as all-zero rows.
data_whole[cat_feats] = data_whole[cat_feats].fillna("NONE")
data_whole = pd.get_dummies(data_whole, dtype = "Float64")

# Handle skewness: log1p-transform numeric features whose skewness exceeds the threshold
skew_threshold = 0.75

skewed_feats = data_whole.loc[:, num_feats].apply(lambda x: skew(x.dropna())) # compute skewness
skewed_feats = skewed_feats[skewed_feats > skew_threshold]
skewed_feats = skewed_feats.index.tolist()
data_whole[skewed_feats] = np.log1p(data_whole[skewed_feats]) # log1p: log(1+x)



## Change Outliers to NaN

In [None]:
# Check for outliers with the IQR rule and change them into NaN.
# NOTE(review): data_whole is assembled from data_train in an earlier cell, so
# edits made to data_train here do not reach xTr / xTe unless this cell is
# moved before that step.
def mask_iqr_outliers(values, threshold):
    """Return a copy of `values` with IQR-rule outliers replaced by NaN.

    A value counts as an outlier when it lies strictly below
    Q1 - threshold * (Q3 - Q1) or strictly above Q3 + threshold * (Q3 - Q1).
    The quartiles are computed ignoring NaN entries, so existing missing
    values do not disable the rule; they simply stay NaN in the result.

    Parameters
    ----------
    values : pd.Series
        Numeric column to screen.
    threshold : float
        Multiplier applied to the inter-quartile range.

    Returns
    -------
    pd.Series
        New series with the same index; the input is not modified.
    """
    q1, q3 = np.nanpercentile(values, [25, 75])
    margin = threshold * (q3 - q1)
    # between() is inclusive on both ends, i.e. only values strictly outside the bounds are masked
    return values.where(values.between(q1 - margin, q3 + margin))


outlier_threshold=1.7
quan_columns=[]  # columns to screen; empty for now, so the loop below does nothing
for col in quan_columns:
    # Assign the whole column back instead of chained indexing (data_train[col][rows] = None)
    data_train[col] = mask_iqr_outliers(data_train[col], outlier_threshold)

In [None]:
# Split the combined frame back into model inputs: the first len(data_train)
# row labels are the training rows, everything after them is the test set.
n_train = len(data_train)
last_train_label = n_train - 1
xTr = data_whole.loc[0:last_train_label]
xTe = data_whole.loc[n_train:]

##  DATA MODELING
- **Knn regression**
- **Random forest**
- **Boosting trees**
    - xgb boosting
    - cat boosting
- **Neural network**

You can uncomment a block of code to run the corresponding model. Personally, I highly recommend running one model at a time.

## Knn regression

In [None]:
# # Features are standardized, tries k = 1...6.
# estimator_knn = Pipeline(steps=[('scalar',StandardScaler()),('regressor', KNeighborsRegressor())])
# parameters_knn = {'regressor__n_neighbors':[1,2,3,4,5,6]}
# model_knn = GridSearchCV(estimator_knn, param_grid=parameters_knn, cv=5, scoring='neg_mean_squared_error')

# # train the model and get best parameters and scores
# model_knn.fit(xTr,yTr)
# print("Best estimator:")
# print(model_knn.best_estimator_)
# print("Best validation RMSE:")
# print(np.sqrt(-model_knn.best_score_))

## CatBoost regressor (best)

In [None]:
# estimator_cat_b = Pipeline(steps=[('scaler',StandardScaler()), ('cat_b',CatBoostRegressor(verbose=False))])
# parameters_cat_b = {'cat_b__iterations': [6000],
#                     'cat_b__learning_rate': [0.005],
#                     'cat_b__depth': [4, 6, 10],
#                     'cat_b__l2_leaf_reg': [1]}
# model_cat_b = GridSearchCV(estimator_cat_b, param_grid=parameters_cat_b, cv=5, scoring='neg_mean_squared_error')

# # train the model and get best parameters and scores
# model_cat_b.fit(xTr,yTr)
# print("Best estimator:")
# print(model_cat_b.best_estimator_)
# print("Best validation RMSE:")
# print(np.sqrt(-model_cat_b.best_score_))

## XGB boosting tree Regressor

In [None]:
# estimator_xgb = Pipeline(steps=[('scaler',MinMaxScaler()), ('xgb', XGBRegressor())])
# parameters_xgb = {'xgb__reg_alpha': [0.1,0.5],'xgb__learning_rate': [0.02,0.04], 'xgb__n_estimators' : [200,100,150], 'xgb__max_depth':[3,4,5]}
# model_xgb = GridSearchCV(estimator_xgb, param_grid=parameters_xgb, cv=5, scoring='neg_mean_squared_error')

# # train the model and get best parameters and scores
# model_xgb.fit(xTr,yTr)
# print("Best estimator:")
# print(model_xgb.best_estimator_)
# print("Best validation RMSE:")
# print(np.sqrt(-model_xgb.best_score_))

## Random Forest

In [None]:
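# # NOTE: criterion='mse' and max_features='auto' were removed in recent scikit-learn releases;
# # on those versions use criterion='squared_error' and max_features=1.0 instead.
# estimator_rf = Pipeline(steps=[('scaler',MinMaxScaler()), ('RF', RandomForestRegressor(criterion='mse', random_state=10, bootstrap=False))])
# parameters_rf = {'RF__n_estimators': [200,250], 'RF__max_features': ['auto','sqrt']}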
# model_rf = GridSearchCV(estimator_rf, param_grid=parameters_rf, cv=5, scoring='neg_mean_squared_error')

# # train the model and get best parameters and scores
# model_rf.fit(xTr,yTr)
# print("Best estimator:")
# print(model_rf.best_estimator_)
# print("Best validation RMSE:")
# print(np.sqrt(-model_rf.best_score_))

## Neural network

In [None]:
# estimator_nn = Pipeline(steps=[('scaler',StandardScaler()), ('mlp',MLPRegressor(max_iter = 3000))])
# parameters_nn = {'mlp__hidden_layer_sizes': [(300,200,300),(300,200,50),(100,100,100)], 'mlp__alpha':[1,0.1,1e-2]}
# model_nn = GridSearchCV(estimator_nn, param_grid=parameters_nn, cv=5, scoring='neg_mean_squared_error',verbose=0, n_jobs=-1)

# # train the model and get best parameters and scores
# model_nn.fit(xTr,yTr)
# print("Best estimator:")
# print(model_nn.best_estimator_)
# print("Best validation RMSE:")
# print(np.sqrt(-model_nn.best_score_))

## Ridge

In [None]:
# Ridge regression on standardized features; alpha is tuned with 5-fold CV.
ridge_steps = [
    ('scaler', StandardScaler()),
    ('Ridge', Ridge()),
]
estimator_ridge = Pipeline(steps=ridge_steps)
parameters_ridge = {'Ridge__alpha': [0.2, 0.4, 0.6, 0.8, 1.0]}
model_ridge = GridSearchCV(
    estimator_ridge,
    param_grid=parameters_ridge,
    cv=5,
    scoring='neg_mean_squared_error',
)

# Fit the search, then report the winning pipeline and its cross-validated RMSE
# (best_score_ is a negated MSE, hence the sign flip before the square root).
model_ridge.fit(xTr, yTr)
ridge_cv_rmse = np.sqrt(-model_ridge.best_score_)
print("Best estimator:", model_ridge.best_estimator_, sep="\n")
print("Best validation RMSE:", ridge_cv_rmse, sep="\n")

## Submission

In [None]:
# # Get submission.csv, be ready to submit

# # Convert SalePrice back
# final = np.expm1(model_cat_b.predict(xTe)) # np.expm1: inverse of np.log1p

# data_test = pd.read_csv("../input/house-prices-advanced-regression-techniques/test.csv")
# out_df = pd.DataFrame()
# out_df["Id"] = data_test["Id"]
# out_df["SalePrice"] = final
# out_df
# out_df.to_csv("./submission.csv",index = False)

## Exploratory procedure

Exploratory experiments kept for reference. Uncomment any commented-out lines to run them.

In [None]:
# Decision tree regressor (by Zhijie)
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

# Only max_depth is tuned here, as a reference point
param = {'max_depth': list(range(3, 8))}
estimator = DecisionTreeRegressor(random_state=22)

# 5-fold cross-validated grid search, scored by negative MSE
gc = GridSearchCV(
    estimator,
    param_grid=param,
    cv=5,
    scoring='neg_mean_squared_error',
)

gc.fit(xTr, yTr)

# Hold-out evaluation, disabled because it needs a validation split (xVal, yVal):
# clf=gc.best_estimator_.predict(xVal)
# MSE=-gc.score(xVal,yVal)
# RMSE=MSE**0.5
# RMSE

In [None]:
# Random forest regressor: tune n_estimators with 5-fold CV, then report the hold-out RMSE.
from sklearn.ensemble import RandomForestRegressor

# xVal / yVal are not defined anywhere else in the notebook, so carve a 20% hold-out
# set out of the training data here (seeded for reproducibility) and fit on the rest.
xFit, xVal, yFit, yVal = train_test_split(xTr, yTr, test_size=0.2, random_state=22)

estimator=RandomForestRegressor(random_state=22)
param={'n_estimators':[100,200,500,1000]}
gc=GridSearchCV(estimator,param_grid=param,cv=5,scoring='neg_mean_squared_error')
gc.fit(xFit,yFit)

# Predictions of the refitted best model on the hold-out rows (same log1p scale as yTr)
clf=gc.best_estimator_.predict(xVal)
# gc.score uses the configured scoring (negative MSE), so negate it to get the MSE
MSE=-gc.score(xVal,yVal)
RMSE=MSE**0.5
RMSE

In [None]:
# KNN regressor by YJ

from sklearn import datasets
from sklearn.model_selection import GridSearchCV  # grid search for the best hyper-parameter
from sklearn.neighbors import KNeighborsRegressor

# xVal / yVal are not defined anywhere else in the notebook, so carve a 20% hold-out
# set out of the training data here (seeded for reproducibility) and fit on the rest.
xFit, xVal, yFit, yVal = train_test_split(xTr, yTr, test_size=0.2, random_state=22)

parameters={'n_neighbors':[1,3,5,7,9,11,13,15]}
kng=KNeighborsRegressor()  # default settings; n_neighbors is chosen by the search below

# Find the best K with GridSearchCV (5-fold CV). No scoring is given, so the
# estimator's own score is used, which for a regressor is R^2 rather than an accuracy.
clf=GridSearchCV(kng,parameters,cv=5)
clf.fit(xFit,yFit)

print("Best CV R^2: %.2f"%clf.best_score_,"Best K:",clf.best_params_)

# Use the K selected by the search instead of a hard-coded value
kng=KNeighborsRegressor(n_neighbors=clf.best_params_['n_neighbors'])

# Fit on the training portion only, so the hold-out RMSE below is an honest estimate
kng.fit(xFit,yFit)
kng_prediction=kng.predict(xVal)
np.mean((kng_prediction - yVal)**2)**0.5