In [None]:
import numpy as np # linear algebra
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('max_colwidth', None)

import seaborn as sns
import matplotlib.pyplot as plt


import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# `train`  : Ames housing CSV
# `train2` : competition training CSV
# `test`   : competition test CSV
train = pd.read_csv('/kaggle/input/ames-housing-dataset/AmesHousing.csv')
train2 = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

In [None]:
# First five rows of the Ames frame as loaded.
train.head(n=5)

In [None]:
# Strip spaces from the Ames column labels, then rename the one label that
# still differs ("YearRemod/Add" -> "YearRemodAdd").
train.columns = [label.replace(' ', '') for label in train.columns]
train = train.rename(columns={"YearRemod/Add": "YearRemodAdd"})

In [None]:
# First five rows of the Ames frame after the column renaming.
train.head(n=5)

In [None]:
# First five rows of the competition test frame.
test.head(n=5)

In [None]:
# First five rows of the competition training frame.
train2.head(n=5)

In [None]:
# Row counts of the three loaded frames.
for label, frame in (("Ames Dataset SIZE", train),
                     ("Housing Dataset SIZE", train2),
                     ("Housing Test Dataset SIZE", test)):
    print(label, len(frame))

Finding duplicates in data

In [None]:
# Stack the three frames row-wise (Ames first, then competition train, then
# competition test) so duplicated rows can be searched across all of them.
data = pd.concat((train, train2, test), axis=0, sort=False)

In [None]:
# Row count of the stacked frame.
print("Housing Dataset SIZE:", data.shape[0])

In [None]:
# Drop the identifier columns and the target so that duplicate detection
# compares the remaining columns only.
useless = ['Id', 'PID', 'Order', 'SalePrice']
data = data.drop(columns=useless)


In [None]:
# Flag every row that has an identical row later in `data` (keep='last'
# leaves only the final occurrence unflagged) and collect their index labels.
dup_flags = data.duplicated(keep='last')
duplicate = data.loc[dup_flags].index
len(duplicate)

In [None]:
# Inspect the duplicate labels from position 382 onward.
duplicate[382:]

In [None]:
# Inspect the duplicate labels from position 390 onward.
duplicate[390:]

We only want to delete the duplicates up to index 2902, i.e. only the duplicated rows that come from the Ames dataset.

In [None]:
# Keep only the duplicate labels that belong to the Ames rows.
# `data` was built as [Ames, competition train, competition test], so the Ames
# rows are the first `n_ames` positions. Deriving the cut-off from the frame
# sizes replaces the hand-picked position 390, and indexing by position avoids
# confusing Ames labels with the repeated labels of the other two frames.
dup_mask = data.duplicated(keep='last').to_numpy()
n_ames = len(data) - len(train2) - len(test)
duplicate = data.index[:n_ames][dup_mask[:n_ames]]

In [None]:
# Inspect the entries of the current `duplicate` index from position 386 onward.
duplicate[386:]

In [None]:
# Remove the rows whose index labels are listed in `duplicate` from the Ames frame.
train = train.drop(index=duplicate)

In [None]:
# Row count of the Ames frame after removing the duplicated rows.
print('Length of the Ames Dataset now', train.shape[0])

In [None]:
# Combined training frame: de-duplicated Ames rows followed by the competition
# training rows.
training = pd.concat((train, train2), axis=0, sort=False)

In [None]:
# Identifier columns are dropped from the combined training frame.
useless = ['Id', 'PID', 'Order']
training = training.drop(columns=useless)

In [None]:
from scipy.stats import norm

# Fit a normal distribution to SalePrice; `mu` and `sigma` are not used by the
# plot below, which fits its own curve through `fit=norm`.
mu, sigma = norm.fit(training['SalePrice'])

# Histogram + KDE of SalePrice with the fitted normal curve overlaid.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; confirm
# the installed version still provides it.
plt.figure(figsize=(10, 5))
sns.distplot(training['SalePrice'], hist=True, kde=True, fit=norm)
plt.title('SalePrice distribution vs Normal Distribution', fontsize=11)
plt.xlabel("House Sale Price in $", fontsize=10)
plt.show()

In the literature, acceptable values for skewness are between -0.5 and 0.5, and between -2 and 2 for kurtosis. Looking at the plot, we can clearly see that the distribution is not normal but highly right-skewed. The non-normality of our distribution is also supported by the Shapiro test for normality (the p-value is small enough to reject the hypothesis of normality). Despite that, let's leave it as it is for now; we'll deal with it later in the notebook.

In [None]:
from scipy import stats

# Shapiro test on SalePrice, plus skewness and kurtosis of its absolute values.
shap = stats.shapiro(training['SalePrice'])
abs_price = abs(training['SalePrice'])
print('Skewness : %f' % abs_price.skew())
print('Kurtosis : %f' % abs_price.kurt())
print('Shapiro_Test_statistic : %f' % shap.statistic )
print('Shapiro_Test_pvalue : %f' % shap.pvalue )

In [None]:
# Pearson correlation heatmap of the numeric columns of `training`.
f, ax = plt.subplots(figsize=(52, 36))
# Select the numeric columns explicitly: DataFrame.corr in pandas >= 2.0 no
# longer drops non-numeric columns silently and fails on string columns.
mat = training.select_dtypes(include='number').corr('pearson')
# Mask the upper triangle (diagonal included) so each pair is drawn once.
mask = np.triu(np.ones_like(mat, dtype=bool))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
sns.heatmap(mat, mask=mask, cmap=cmap, vmax=1, center=0, annot = True,
            square=True, linewidths=.5, cbar_kws={"shrink": .6})
plt.show()

In [None]:
# BsmtFinSF2 vs SalePrice on the Ames frame: box, strip and violin plots side
# by side. (The original header quoted "Pearson = -0.011"; that value is not
# computed in this cell.)
fig, ax = plt.subplots(1, 3, figsize=(20, 10))
for axis, draw in zip(ax, (sns.boxplot, sns.stripplot, sns.violinplot)):
    draw(data=train, x='BsmtFinSF2', y='SalePrice', ax=axis)
plt.show()

In [None]:
# GrLivArea vs SalePrice

# Compute the Pearson coefficient from the plotted data instead of
# hard-coding it, so the legend always matches the frame being shown.
Pearson_GrLiv = train['GrLivArea'].corr(train['SalePrice'])
plt.figure(figsize = (12,6))
sns.regplot(data=train, x = 'GrLivArea', y='SalePrice', scatter_kws={'alpha':0.2})
plt.title('GrLivArea vs SalePrice', fontsize = 12)
plt.legend(['$Pearson=$ {:.2f}'.format(Pearson_GrLiv)], loc = 'best')
plt.show()

In [None]:
# YearBuilt vs SalePrice

# Compute the Pearson coefficient from the plotted data instead of
# hard-coding it, so the legend always matches the frame being shown.
Pearson_YrBlt = train['YearBuilt'].corr(train['SalePrice'])
plt.figure(figsize = (12,6))
sns.regplot(data=train, x = 'YearBuilt', y='SalePrice', scatter_kws={'alpha':0.2})
plt.title('YearBuilt vs SalePrice', fontsize = 12)
plt.legend(['$Pearson=$ {:.2f}'.format(Pearson_YrBlt)], loc = 'best')
plt.show()

In [None]:
# Median SalePrice per selling year (Ames frame), drawn as bars.
plt.figure(figsize=(15, 10))
sns.barplot(data=train, x='YrSold', y='SalePrice', estimator=np.median)
plt.xlabel('Year of Selling')
plt.ylabel('Median of Price')
plt.title('Median of Sale Price by Year')
plt.show()

In [None]:
# Separating Target and Features
# Keep the target and the test ids aside, then remove them from the frames.
target = training['SalePrice']
training2 = training.drop(columns=['SalePrice'])

test_id = test['Id']
test = test.drop(columns=['Id'])

# Concatenating train & test set
# Training rows come first, test rows last.
train_test = pd.concat((training2, test), axis=0, sort=False)

In [None]:
# Number of rows in the combined train + test feature frame.
train_test.shape[0]

In [None]:
# Per-feature count and percentage of missing values in `train_test`.
# The percentages use the actual number of rows instead of the hard-coded
# 1460 / 5459: the counts are taken over the whole frame, so dividing by 1460
# could give values above 100%. Both percentage columns are kept so that
# later cells reading either name still work.
n_rows = len(train_test)
nan = pd.DataFrame(train_test.isna().sum(), columns=['Nan_sum'])
nan['feat'] = nan.index
# .copy() so the column assignments below do not write into a slice.
nan = nan[nan['Nan_sum'] > 0].copy()
nan['Percentage'] = (nan['Nan_sum'] / n_rows) * 100
nan['Percentage of total data'] = (nan['Nan_sum'] / n_rows) * 100
nan = nan.sort_values(by=['Nan_sum'])
nan.insert(0, 'Serial No.', range(1, len(nan) + 1))
nan

In [None]:
# Bar chart of the `Percentage` column of the NaN summary, one bar per feature.
plt.figure(figsize=(20, 10))
sns.barplot(x=nan['feat'], y=nan['Percentage'])
plt.title('Features Containing Nan')
plt.xlabel('Features')
plt.ylabel('% of Missing Data')
plt.xticks(rotation=40)
plt.show()

In [None]:
# Converting non-numeric predictors stored as numbers into string
# so that they are later one-hot encoded instead of treated as quantities.
for code_col in ('MSSubClass', 'YrSold', 'MoSold', 'OverallQual', 'OverallCond'):
    train_test[code_col] = train_test[code_col].apply(str)

In [None]:
# Filling Categorical NaN (That we know how to fill due to the description file )

# Categorical features filled with a fixed category.
for col, default in (('Functional', 'Typ'),
                     ('Electrical', 'SBrkr'),
                     ('KitchenQual', 'TA')):
    train_test[col] = train_test[col].fillna(default)

# Categorical features filled with their most frequent value.
for col in ('Exterior1st', 'Exterior2nd', 'SaleType'):
    train_test[col] = train_test[col].fillna(train_test[col].mode()[0])

# Categorical features where NaN is replaced by the string 'None'.
for col in ('PoolQC', 'Alley', 'FireplaceQu', 'Fence', 'MiscFeature',
            'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
            'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    train_test[col] = train_test[col].fillna('None')

# Numeric features where NaN is replaced by 0.
for col in ('GarageArea', 'GarageCars',
            'BsmtFinSF1', 'BsmtFinSF2', 'BsmtFullBath', 'BsmtHalfBath',
            'MasVnrArea', 'BsmtUnfSF', 'TotalBsmtSF'):
    train_test[col] = train_test[col].fillna(0)

# LotFrontage is filled with the median taken from `train` (the Ames frame).
# NOTE(review): `train`, not `training`, is used here; confirm that is intended.
train_test['LotFrontage'] = train_test['LotFrontage'].fillna(train['LotFrontage'].median())

# Checking the features with NaN remained out.
# Print the feature name and its remaining NaN count; the previous version
# printed the row values at index label 1, which does not identify the feature.
for col in train_test:
    n_missing = train_test[col].isna().sum()
    if n_missing > 0:
        print(col, n_missing)

In [None]:
# Living area divided by the number of rooms, bathrooms and kitchens above ground.
train_test["SqFtPerRoom"] = train_test["GrLivArea"] / (train_test["TotRmsAbvGrd"] +
                                                       train_test["FullBath"] +
                                                       train_test["HalfBath"] +
                                                       train_test["KitchenAbvGr"])

# OverallQual and OverallCond were cast to str earlier in the notebook, so a
# plain `+` would concatenate them ("7" + "5" -> "75"). Convert them back to
# numbers so the feature is the arithmetic sum of the two scores.
train_test['Total_Home_Quality'] = (pd.to_numeric(train_test['OverallQual']) +
                                    pd.to_numeric(train_test['OverallCond']))

# Bathroom count with half baths weighted 0.5, basement included.
train_test['Total_Bathrooms'] = (train_test['FullBath'] + (0.5 * train_test['HalfBath']) +
                               train_test['BsmtFullBath'] + (0.5 * train_test['BsmtHalfBath']))

# First-floor plus second-floor area.
train_test["HighQualSF"] = train_test["1stFlrSF"] + train_test["2ndFlrSF"]
# NOTE(review): this is the sum of the two year columns, not a renovation
# flag; confirm the intended meaning of `renovated`.
train_test['renovated']=train_test['YearRemodAdd']+train_test['YearBuilt']

In [None]:
# Removing the useless variables
useless = ['GarageYrBlt', 'YearRemodAdd']
train_test = train_test.drop(columns=useless)

In [None]:
# Creating dummy variables from categorical features
train_test_dummy = pd.get_dummies(train_test)

from scipy.stats import skew

# Every column whose dtype is not `object` is considered here, which includes
# the dummy columns created just above.
column_dtypes = train_test_dummy.dtypes
numeric_features = column_dtypes[column_dtypes != object].index

# Skewness per column, largest first; columns above 0.5 are marked for the
# log transform applied in the next cell.
skewed_features = train_test_dummy[numeric_features].apply(skew).sort_values(ascending=False)
high_skew = skewed_features[skewed_features > 0.5]
skew_index = high_skew.index

In [None]:
# Normalize skewed features using log_transformation
# log1p (log(1 + x)) is applied column by column to every feature in `skew_index`.
for skewed_col in skew_index:
    train_test_dummy[skewed_col] = np.log1p(train_test_dummy[skewed_col])

Checking for NaN values after creating the dummy variables

In [None]:
# Per-feature count and percentage of missing values in `train_test_dummy`.
# Percentages use the actual row count instead of the hard-coded 1460 / 5459;
# both percentage columns are kept under their original names.
n_rows = len(train_test_dummy)
nan = pd.DataFrame(train_test_dummy.isna().sum(), columns=['Nan_sum'])
nan['feat'] = nan.index
# .copy() so the column assignments below do not write into a slice.
nan = nan[nan['Nan_sum'] > 0].copy()
nan['Percentage'] = (nan['Nan_sum'] / n_rows) * 100
nan['Perc'] = (nan['Nan_sum'] / n_rows) * 100
nan = nan.sort_values(by=['Nan_sum'])
nan.insert(0, 'Serial No.', range(1, len(nan) + 1))
nan

Checking whether any values became infinite after the log transformation

In [None]:
# Count infinite values per column and list only the columns that have any.
inf_counts = np.isinf(train_test_dummy).sum()
inf = pd.DataFrame(inf_counts, columns=['Inf_sum'])
inf['feat'] = inf.index
inf = inf[inf['Inf_sum'] > 0].sort_values(by=['Inf_sum'])
inf.insert(0, 'Serial No.', range(1, len(inf) + 1))
inf

In [None]:
import statsmodels.api as sm

# SalePrice before transformation:
# qq-plot on the left, histogram/KDE with a fitted normal curve on the right.
fig, ax = plt.subplots(1, 2, figsize=(15, 5))
qq_axis, dist_axis = ax
fig.suptitle(" qq-plot & distribution SalePrice ", fontsize=15)

# Quantiles are compared against a t distribution (distargs=(4,), fit=True).
sm.qqplot(target, stats.t, distargs=(4,), fit=True, line="45", ax=qq_axis)
sns.distplot(target, hist=True, kde=True, fit=norm, ax=dist_axis)
plt.show()

Transforming the sale price

In [None]:
# SalePrice after transformation
# The target is replaced by log(1 + SalePrice); the same two diagnostic plots
# are drawn on the transformed values.
target_log = np.log1p(target)

fig, ax = plt.subplots(1, 2, figsize=(15, 5))
qq_axis, dist_axis = ax
fig.suptitle("qq-plot & distribution SalePrice ", fontsize=15)

sm.qqplot(target_log, stats.t, distargs=(4,), fit=True, line="45", ax=qq_axis)
sns.distplot(target_log, hist=True, kde=True, fit=norm, ax=dist_axis)
plt.show()

In [None]:
import shap
from xgboost import XGBRegressor
from catboost import Pool
from sklearn.svm import SVR
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeRegressor
from mlxtend.regressor import StackingRegressor
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_log_error

In [None]:
# Show the rows of `train_test` at positions 3999-4004, i.e. around the
# position-4000 boundary used for the train/test split below.
train_test.iloc[3999:4005]

In [None]:
# Train-Test separation
# `train_test` was built as [training features, test features], and `target`
# holds one value per training row, so the first len(target) rows of the
# encoded frame are the training rows. Deriving the boundary from `target`
# replaces the hard-coded 4000, which silently misaligns the features and the
# target if the number of training rows changes.
n_train = len(target)
X_train = train_test_dummy.iloc[:n_train]
X_test = train_test_dummy.iloc[n_train:]

# Creation of the RMSE metric:
    
def rmse(y, y_pred):
    """Return the root of the mean squared error between `y` and `y_pred`."""
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)

def cv_rmse(model):
    """Cross-validated RMSE of `model` on the notebook's training data.

    Reads the globals `X_train`, `target_log` and `kf` at call time. The
    scorer returns negated mean squared errors, so they are negated again
    before the square root is taken.

    Returns the array of RMSE values produced by `cross_val_score`.
    """
    neg_mse = cross_val_score(model, X_train, target_log,
                              scoring="neg_mean_squared_error", cv=kf)
    return np.sqrt(-neg_mse)

Checking for NaN values in the training set

In [None]:
# Per-feature count and percentage of missing values in `X_train`.
# Percentages use len(X_train) instead of the hard-coded 1460 / 4000; both
# percentage columns are kept under their original names.
n_rows = len(X_train)
nan = pd.DataFrame(X_train.isna().sum(), columns=['Nan_sum'])
nan['feat'] = nan.index
# .copy() so the column assignments below do not write into a slice.
nan = nan[nan['Nan_sum'] > 0].copy()
nan['Percentage'] = (nan['Nan_sum'] / n_rows) * 100
nan['Perc'] = (nan['Nan_sum'] / n_rows) * 100
nan = nan.sort_values(by=['Nan_sum'])
nan.insert(0, 'Serial No.', range(1, len(nan) + 1))
nan

Checking for NaN values in the test set

In [None]:
# Per-feature count and percentage of missing values in `X_test`.
# Percentages use len(X_test): the previous denominators (1460 and 2919) were
# not the size of the frame being counted. Both percentage columns are kept
# under their original names.
n_rows = len(X_test)
nan = pd.DataFrame(X_test.isna().sum(), columns=['Nan_sum'])
nan['feat'] = nan.index
# .copy() so the column assignments below do not write into a slice.
nan = nan[nan['Nan_sum'] > 0].copy()
nan['Percentage'] = (nan['Nan_sum'] / n_rows) * 100
nan['Perc'] = (nan['Nan_sum'] / n_rows) * 100
nan = nan.sort_values(by=['Nan_sum'])
nan.insert(0, 'Serial No.', range(1, len(nan) + 1))
nan

In [None]:
# 11-fold cross validation (shuffled, fixed seed); `kf` is read by cv_rmse.
kf = KFold(n_splits=11, shuffle=True, random_state=42)

# Mean and standard deviation of the CV RMSE, appended model by model below.
cv_scores, cv_std = [], []

# Display names for the candidate models.
# NOTE(review): this list has 11 names, while the cells below append scores
# for only three models; confirm before pairing it with cv_scores.
baseline_models = [
    'Linear_Reg.', 'Bayesian_Ridge_Reg.', 'LGBM_Reg.', 'SVR',
    'Dec_Tree_Reg.', 'Random_Forest_Reg.', 'XGB_Reg.',
    'Grad_Boost_Reg.', 'Cat_Boost_Reg.', 'Stacked_Reg.', 'Stacked_Reg2',
]

In [None]:
# Cat Boost Regressor: cross-validate with default settings and record the
# mean / standard deviation of the fold RMSEs.
catb = CatBoostRegressor()
score_catb = cv_rmse(catb)
cv_scores.append(score_catb.mean())
cv_std.append(score_catb.std())

# Stacked Regressor: CatBoost + BayesianRidge as base regressors, CatBoost as
# meta regressor, with use_features_in_secondary=True.
stack_gen = StackingRegressor(
    regressors=(CatBoostRegressor(), BayesianRidge()),
    meta_regressor=CatBoostRegressor(),
    use_features_in_secondary=True,
)

In [None]:
# Cross-validate the first stacked model and record its mean / std RMSE.
score_stack_gen = cv_rmse(stack_gen)
cv_std.append(score_stack_gen.std())
cv_scores.append(score_stack_gen.mean())


In [None]:
# Stacked Regressor (second variant): CatBoost + XGBoost as base regressors,
# CatBoost as meta regressor, with use_features_in_secondary=True.
stack_gen2 = StackingRegressor(
    regressors=(CatBoostRegressor(), XGBRegressor()),
    meta_regressor=CatBoostRegressor(),
    use_features_in_secondary=True,
)

# Cross-validate it and record its mean / std RMSE.
score_stack_gen2 = cv_rmse(stack_gen2)
cv_std.append(score_stack_gen2.std())
cv_scores.append(score_stack_gen2.mean())



In [None]:
# Fit a default CatBoost model on the full training set (log target); the
# fitted model is used below to read feature importances.
cat = CatBoostRegressor()
cat_model = cat.fit(X_train, target_log, verbose=0, plot=True)

In [None]:
# Feature importances of the fitted CatBoost model as a table; show the first rows.
feat_imp = cat_model.get_feature_importance(prettified=True)
feat_imp.head(n=5)

In [None]:
# Plotting top 30 features' importance
# x / y are passed by keyword: newer seaborn releases no longer accept the
# data vectors as positional arguments to barplot.
plt.figure(figsize = (12,8))
sns.barplot(x=feat_imp['Importances'][:30], y=feat_imp['Feature Id'][:30], orient = 'h')
plt.show()

In [None]:
# Hyper-parameters for the final CatBoost model.
# NOTE(review): 'verbose': 200 is set here while fit() below passes
# verbose=False; presumably the fit-time value takes effect. Confirm.
# NOTE(review): no eval_set is passed to fit(); confirm that
# 'early_stopping_rounds' has any effect in that case.
params = {
    'iterations': 6000,
    'learning_rate': 0.005,
    'depth': 4,
    'l2_leaf_reg': 1,
    'eval_metric': 'RMSE',
    'early_stopping_rounds': 200,
    'verbose': 200,
    'random_seed': 42,
}

cat_f = CatBoostRegressor(**params)
cat_model_f = cat_f.fit(X_train, target_log, verbose=False, plot=True)

In [None]:
# Predict on the test features with the tuned CatBoost model and undo the
# log1p transform of the target with expm1.
test_pred = np.expm1(cat_f.predict(X_test))

# Submission frame: test ids plus the predicted prices, written to cat.csv.
submission = pd.DataFrame(test_id, columns=['Id'])
submission['SalePrice'] = test_pred
submission.head()
submission.to_csv("cat.csv", index=False, header=True)

In [None]:
# Fit the first stacked model on the full training set, predict the test set
# and undo the log1p transform of the target with expm1.
stack_f = stack_gen.fit(X_train, target_log)
test_stack = stack_gen.predict(X_test)
test_pre = np.expm1(test_stack)

# Submission frame: test ids plus the predicted prices, written to stack.csv.
submission = pd.DataFrame(test_id, columns=['Id'])
submission['SalePrice'] = test_pre
submission.to_csv("stack.csv", index=False, header=True)

In [None]:
# Fit the second stacked model on the full training set, predict the test set
# and undo the log1p transform of the target with expm1.
stack_f2 = stack_gen2.fit(X_train, target_log)
test_stack = stack_gen2.predict(X_test)
test_pre = np.expm1(test_stack)

# Submission frame: test ids plus the predicted prices, written to submission.csv.
submission = pd.DataFrame(test_id, columns=['Id'])
submission['SalePrice'] = test_pre
submission.to_csv("submission.csv", index=False, header=True)