# House Prices Competition

### Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Apply seaborn's default theme and set the default figure size to 22 x 8.27 for all plots below.
sns.set(rc={'figure.figsize':(22,8.27)})

### Load Data

In [None]:
# Read both competition files; the 'Id' column becomes the row index of each frame.
train, test = (
    pd.read_csv(csv_path, index_col='Id')
    for csv_path in ('./data/train.csv', './data/test.csv')
)

In [None]:
# Preview the first rows of the training frame.
train.head()

### First outliers

In [None]:
# The five largest 'GrLivArea' values of each set, shown as a (train, test) pair.
tuple(
    frame['GrLivArea'].sort_values(ascending=False).head()
    for frame in (train, test)
)

In [None]:
# Plot living area against sale price to spot outlying houses visually.
plt.scatter(x=train['GrLivArea'], y=train['SalePrice'])

#### First method - drop outliers from train set

In [None]:
# # filter for outliers
# area_outliers_mask = (train['GrLivArea'] > 4000)

# # drop outliers from training set
# train.drop(train.loc[area_outliers_mask].index, axis=0, inplace=True)

In [None]:
# plt.scatter(train['GrLivArea'], train['SalePrice'])

#### Second method - mark all outliers

In [None]:
# Boolean masks selecting the houses with more than 4500 of living area
outlier_train_mask = train['GrLivArea'].gt(4500)
outlier_test_mask = test['GrLivArea'].gt(4500)

# 'Outlier' is a float flag column: 100 for the selected houses, 0 for all others
train['Outlier'] = np.where(outlier_train_mask, 100.0, 0.0)
test['Outlier'] = np.where(outlier_test_mask, 100.0, 0.0)

### All data

In [None]:
# Stack the train rows on top of the test rows so that every preprocessing
# step below is applied to both sets at once.
data = pd.concat([train, test], axis=0)

In [None]:
# (rows, columns) of the combined frame.
data.shape

### Drop columns with too many missing values

In [None]:
# Number of missing values per column (first ten columns only).
data.isna().sum().head(10)

In [None]:
# Heatmap of the boolean missing-value mask: one row per house, one column per feature.
sns.heatmap(data.isnull())

In [None]:
# Columns in which more than 70% of the rows are NaN carry too little information:
# collect them first, drop them in one call, then report their names.
sparse_columns = [
    name for name in data.columns
    if data[name].isna().sum() > 0.7 * data.shape[0]
]
data.drop(columns=sparse_columns, inplace=True)
for name in sparse_columns:
    print(name)

In [None]:
# Same missing-value heatmap, now without the columns dropped above.
sns.heatmap(data.isnull())

In [None]:
# Print dtype and non-null count of every remaining column.
data.info()

### Find columns with different categories in train and test sets (candidates for dropping)

In [None]:
# Collect the non-numeric columns whose set of categories in the combined data
# differs from the set of categories found in the training data alone.
cols_to_drop = []

for col in data.drop('SalePrice', axis=1).columns:
    if train[col].dtype not in ['float64', 'int64']:
        # Compare as sets: two `unique()` arrays of different lengths cannot be
        # compared element-wise, and the order of appearance must not matter.
        all_categories = set(data[col].dropna().unique())
        train_categories = set(train[col].dropna().unique())
        if all_categories != train_categories:
            cols_to_drop.append(col)
            print(col)

# data.drop(cols_to_drop, axis=1, inplace=True)

### Fill NaNs 
<b>in categorical features</b><br>
<i> - with mode</i><br>
<i> - with "Missing" if the number of missing values is too high</i><br>
<b>in numeric features</b><br>
<i> - with median</i><br>
<i> - with -999 if the number of missing values is too high</i>

In [None]:
# Feature columns (everything except the target 'SalePrice') holding at least one NaN
na_columns = data.columns[
    data.isna().any().to_numpy() & (data.columns != 'SalePrice')
]

In [None]:
# Fill NaNs column by column.
#   fewer than 40% missing: median for float64 columns, most frequent value otherwise
#   40% missing or more:    'Missing' for object columns, -999 otherwise
for col in na_columns:
    column = data[col]
    if column.isnull().sum() < 0.4 * data.shape[0]:
        if str(column.dtype) == 'float64':
            fill_value = column.median()
        else:
            # mode() returns a Series (there may be ties); fillna needs a scalar,
            # otherwise the Series is aligned on the index instead of being
            # used as the replacement value.
            fill_value = column.mode()[0]
    else:
        if str(column.dtype) == 'object':
            fill_value = 'Missing'
        else:
            fill_value = -999
    # Assign the result back instead of calling fillna(inplace=True) on
    # `data[col]`, which may operate on a temporary copy of the column.
    data[col] = column.fillna(fill_value)

In [None]:
# Missing-value heatmap after filling, to check what is still NaN.
sns.heatmap(data.isnull())

In [None]:
# Number of distinct values in every column whose dtype is neither float64 nor int64
values_arr = [
    len(data[name].unique())
    for name in data.columns
    if data[name].dtype not in ['float64', 'int64']
]
np.array(values_arr)

### Feature engineering

In [None]:
# Express the two year columns as ages relative to the year of sale.
data['HouseAge'] = data['YrSold'].sub(data['YearBuilt'])      # age of the house
data['ModelAge'] = data['YrSold'].sub(data['YearRemodAdd'])   # years since the last remodeling

# is remodeled or not (currently disabled)
# remodeled_mask = (data['YearBuilt'] != data['YearRemodAdd'])
# data['Remodeled'] = np.zeros(data.shape[0])
# data.loc[remodeled_mask, 'Remodeled'] = 1

# the raw year columns are replaced by the two age features above
data.drop(columns=['YearBuilt', 'YearRemodAdd'], inplace=True)

### Encode categorical features

In [None]:
from sklearn.preprocessing import LabelBinarizer

In [None]:
# One-hot encode every non-numeric feature column.
lb = LabelBinarizer()
for col in data.drop('SalePrice', axis=1).columns:
    if data[col].dtype not in ['float64', 'int64']:
        # after the transpose: one row per class, one column per house
        matrix = lb.fit_transform(data[col]).T
        # if the column is binary, LabelBinarizer returns a single indicator
        # instead of one indicator per class
        if matrix.shape[0] == 1:
            data[col] = matrix[0]
        else:
            # The indicator rows follow lb.classes_, not the order in which
            # the values first appear in the column, so the new column names
            # must be taken from lb.classes_ to label each indicator correctly.
            for label, indicator in zip(lb.classes_, matrix):
                data[str(col + label)] = indicator
            # remove previous column
            data.drop(col, axis=1, inplace=True)

In [None]:
# Preview the combined frame after encoding.
data.head()

In [None]:
# Remove exact duplicate rows, but only among rows that have a SalePrice.
# Rows without a SalePrice are always kept: a plain drop_duplicates() treats
# their NaN targets as equal and could silently delete rows that still need
# a prediction.
duplicate_labelled_rows = (data['SalePrice'].notna() & data.duplicated()).to_numpy()
data = data.loc[~duplicate_labelled_rows].copy()

### Train-test split

In [None]:
# Split the combined frame back into train / test by whether the target is
# present rather than by row position, so features and targets stay aligned
# even if rows were removed from `data` after the concatenation.
is_train_row = data['SalePrice'].notna().to_numpy()

# target values of the labelled rows (float64, because the combined column holds NaNs)
y_train = data.loc[is_train_row, 'SalePrice']
data.drop(['SalePrice'], axis=1, inplace=True)

train = data.loc[is_train_row]
test = data.loc[~is_train_row]

### Model Selection

In [None]:
# import models
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

In [None]:
# import instruments for model evaluation
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error as mse

In [None]:
def error(Y_real, Y_pred):
    """Return the root-mean-square error between log(Y_real) and log(Y_pred).

    Equivalent to sqrt(mean((log(Y_real) - log(Y_pred)) ** 2)).

    Parameters
    ----------
    Y_real : array-like of positive numbers
        True target values.
    Y_pred : array-like of positive numbers
        Predicted target values, same shape as ``Y_real``.

    Raises
    ------
    ValueError
        If the inputs are empty, differ in shape, or contain values that are
        not finite and strictly positive (their logarithm is undefined).
    """
    # Plain arrays: avoids index alignment when pandas objects are passed in.
    real = np.asarray(Y_real, dtype=float)
    pred = np.asarray(Y_pred, dtype=float)

    if real.shape != pred.shape:
        raise ValueError(
            f"Y_real and Y_pred must have the same shape, got {real.shape} and {pred.shape}"
        )
    if real.size == 0:
        raise ValueError("Y_real and Y_pred must not be empty")
    # A regressor can output zero or negative prices; fail with a clear message
    # instead of letting np.log produce NaN / -inf.
    for label, values in (("Y_real", real), ("Y_pred", pred)):
        if not np.all(np.isfinite(values) & (values > 0)):
            raise ValueError(f"{label} must contain only finite, strictly positive values")

    return np.sqrt(np.mean((np.log(real) - np.log(pred)) ** 2))

In [None]:
# list of models
models = [
    XGBRegressor(),
    RandomForestRegressor()
]

# K-Fold splitter
kfold = KFold(n_splits=10)

# Sum of the per-fold errors of each model, keyed by the model's class name
log_dict = {}

for train_ind, test_ind in kfold.split(train, y_train):
    X_train, Y_train = train.iloc[train_ind], y_train.iloc[train_ind]
    X_test, Y_test = train.iloc[test_ind], y_train.iloc[test_ind]

    for model in models:
        name = model.__class__.__name__

        # fit on the training part of the fold, score on the held-out part
        model.fit(X_train, Y_train)
        Y_pred = model.predict(X_test)

        err = error(Y_test, Y_pred)

        log_dict[name] = log_dict.get(name, 0) + err

# Model evaluation DataFrame: mean error over the folds.
# The fold count is read from the splitter so it cannot drift from n_splits,
# and the frame is built in one go (DataFrame.append no longer exists in
# current pandas).
n_folds = kfold.get_n_splits()
log_df = pd.DataFrame(
    [{'Model': name, 'Error': total / n_folds} for name, total in log_dict.items()],
    columns=['Model', 'Error'],
)

log_df

### Prediction of the best model

In [None]:
# Fit a fresh XGBRegressor on all training rows and predict the test rows.
best = XGBRegressor().fit(train, y_train)
predictions = best.predict(test)

In [None]:
# Plot the fitted model's feature importances against the training column names.
plt.plot(train.columns, best.feature_importances_)

In [None]:
# Display the predictions made for the test rows.
predictions

### Export results

In [None]:
# Start from the sample submission (indexed by 'Id') and overwrite its
# 'SalePrice' column with the predictions, row by row in file order.
submission_df = (
    pd.read_csv('./data/sample_submission.csv', index_col='Id')
    .assign(SalePrice=predictions)
)

In [None]:
# # manually change the price of the outlier in the test set
# outlier_mask = (test['GrLivArea'] > 5000)
# submission_df.loc[outlier_mask, 'SalePrice'] = 200000

In [None]:
from pathlib import Path

# to_csv does not create missing folders, so make sure the output directory
# exists before writing the submission file.
Path('./result').mkdir(parents=True, exist_ok=True)
submission_df.to_csv('./result/XGB_with_outliers.csv')