In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error, r2_score, mean_squared_log_error, roc_auc_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from xgboost import XGBRegressor
from mlxtend.regressor import StackingCVRegressor

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the competition's training and test tables from the input directory.
training, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/house-prices-advanced-regression-techniques/train.csv',
        '../input/house-prices-advanced-regression-techniques/test.csv',
    )
)

In [None]:
# Print column dtypes and non-null counts of the training frame.
# `info()` writes text to stdout and draws nothing, so the former
# `plt.savefig('info.png')` call could only save an empty canvas; it is dropped.
training.info()


* Remove `Id`; it is not needed for the model.

In [None]:
# Drop the Id column from both frames, then show the shape of the test features.
train1 = training.drop(columns='Id')
test1 = test.drop(columns='Id')
test1.shape

Combine the training and test features so that both datasets receive the same missing-value treatment.

In [None]:
# Stack the training features (target removed) on top of the test features.
# The training features are derived from `training` directly instead of re-dropping
# 'SalePrice' from `train1`, so re-running this cell no longer raises KeyError on the
# already-removed column.
train1 = training.drop(['Id', 'SalePrice'], axis=1)
all_data = pd.concat([train1, test1]).reset_index(drop=True)
all_data.shape

# Data Exploration
**Distribution of SalePrice**

In [None]:
# Histogram of SalePrice with a KDE overlay; x tick labels are rotated by 90 degrees.
sns.displot(x='SalePrice', kde=True, data=training)
plt.xticks(rotation=90)
plt.savefig('saleprice.png', bbox_inches='tight')

In [None]:
# Box plot of SalePrice on a 9x4-inch figure, saved to disk.
plt.figure(figsize=(9, 4))
sns.boxplot(x='SalePrice', data=training)
plt.savefig('salepricebox.png', bbox_inches='tight')

In [None]:
# Correlation matrix of the numeric columns, drawn as a heatmap.
# Numeric dtypes are selected explicitly: on pandas >= 2.0 `DataFrame.corr()` no longer
# skips string columns by default and raises on them.
corrmat = training.select_dtypes(include='number').corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True, xticklabels=True, yticklabels=True, cmap='Blues', ax=ax)
plt.savefig('corr.png', bbox_inches='tight')

In [None]:
# Correlation of every numeric feature with the target.
# Numeric columns are selected explicitly (pandas >= 2.0 raises on string columns), and the
# target's self-correlation is removed by label rather than by assuming it is the last row.
tr_corr = training.select_dtypes(include='number').corr()['SalePrice'].drop('SalePrice')
# Keep the features whose absolute correlation with SalePrice exceeds 0.5, strongest first.
golden_features_list = tr_corr[abs(tr_corr) > 0.5].sort_values(ascending=False)
print("There is {} strongly correlated values with SalePrice:\n{}".format(len(golden_features_list), golden_features_list))

In [None]:
# Display every feature's correlation with SalePrice in descending order.
tr_corr.sort_values(ascending=False)

In [None]:
# Annotated heatmap of the k columns most correlated with SalePrice.
# nlargest on the SalePrice column also returns SalePrice itself, so k includes the target.
k = 11
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index
cm = np.corrcoef(training[cols].values.T)
sns.set(font_scale=1.25)
heatmap_options = dict(
    cbar=True,
    cmap='Blues',
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 10},
    yticklabels=cols.values,
    xticklabels=cols.values,
)
hm = sns.heatmap(cm, **heatmap_options)
plt.savefig('corr10.png', bbox_inches='tight')
plt.show()

### Since OverallQual has the highest correlation with SalePrice, I will create a box plot to see the SalePrice distribution for each quality value.

In [None]:
# SalePrice distribution for each OverallQual value, saved to disk.
sns.boxplot(x='OverallQual', y='SalePrice', data=training)
plt.savefig('qualprice.png', bbox_inches='tight')

### Exploring why OverallCond doesn't correlate well with SalePrice

In [None]:
# SalePrice distribution for each OverallCond value, saved to disk.
sns.boxplot(x='OverallCond', y='SalePrice', data=training)
plt.savefig('condprice.png', bbox_inches='tight')

In [None]:
# OverallQual distribution for each OverallCond value (trailing ';' hides the Axes repr).
sns.boxplot(x='OverallCond', y='OverallQual', data=training);

### Moving on to GrLivArea

In [None]:
# Scatter of above-ground living area against sale price, saved to disk.
# The former `plt.legend(...)` call is dropped: this plot has no hue and no labelled
# artists, so there was nothing for a legend to show (matplotlib only warned about it).
sns.scatterplot(data=training, x='GrLivArea', y='SalePrice')
plt.savefig('grliveprice.png', bbox_inches='tight')
plt.show()

In [None]:
# Same scatter, coloured by OverallQual; the legend's upper-left corner is anchored at the
# axes' top-right corner so it sits outside the plot area.
sns.scatterplot(x='GrLivArea', y='SalePrice', hue="OverallQual", data=training)
plt.legend(loc=2, bbox_to_anchor=(1, 1))
plt.show()

In [None]:
# Features to plot against SalePrice, one figure per feature, coloured by OverallQual.
# '1stFlrSF' was previously listed twice, which drew the identical figure twice.
features = ["GarageArea", "TotalBsmtSF", "1stFlrSF", "LotArea",
            'FullBath', 'TotRmsAbvGrd', 'YearBuilt', 'YearRemodAdd']

for feature in features:
    plt.figure(figsize=(16, 5))
    sns.scatterplot(x=feature, y="SalePrice", hue="OverallQual", data=training,
                    legend="full", linewidth=0.2, alpha=0.9)
    # Anchor the legend outside the axes, at the top-right corner.
    plt.legend(bbox_to_anchor=(1, 1), loc=2)
    plt.title(f"SalePrice vs {feature}")
    plt.show()

**Find the features with missing values**

In [None]:
# Count and percentage of missing values per column of the combined data.
total1 = all_data.isnull().sum().sort_values(ascending=False)
# The mean of the boolean null mask is the missing fraction (sum / row count).
percent1 = (all_data.isnull().mean() * 100).sort_values(ascending=False)
missing1 = pd.concat([total1, percent1], axis=1, keys=['Total', 'Percent'])
# Keep only the columns that actually have missing values.
missing1 = missing1.loc[missing1['Percent'] > 0]

fig, ax = plt.subplots(figsize=(20, 5))
sns.barplot(x=missing1.index, y='Percent', data=missing1, palette='Dark2', ax=ax)
plt.xticks(rotation=90)
plt.title('Percentage of missing values for each feature')

# `plt.show` takes no figure argument (its only parameter is the keyword `block`);
# passing `fig` positionally is rejected or misinterpreted depending on the backend.
plt.show()

**More detailed info of the missing values**

In [None]:
# Display the table of columns with missing values (count and percentage).
missing1

In [None]:
# The row count here is the number of columns that contain missing values.
missing1.shape

In [None]:
# Fill missing MSZoning with the most frequent zoning of the house's MSSubClass group.
zoning_by_subclass = all_data.groupby('MSSubClass')['MSZoning']
all_data['MSZoning'] = zoning_by_subclass.transform(lambda grp: grp.fillna(grp.mode()[0]))


In [None]:
# Preview quality, pool area and pool quality for every house that reports a pool area.
has_pool = all_data['PoolArea'] > 0
temp = all_data[has_pool]
temp2 = temp.loc[:, ['OverallQual', 'PoolArea', 'PoolQC']]
temp2

In [None]:
# Houses that report a pool area but have no pool quality recorded get the 'Fa' grade.
# Rows are selected by condition instead of by the hard-coded positions 2420, 2503 and 2599,
# so the fix cannot silently hit the wrong rows if the row order or the data changes.
# NOTE(review): the condition is expected to match exactly those three positions on the
# current data — confirm against the preview cell above.
pool_without_qc = (all_data['PoolArea'] > 0) & all_data['PoolQC'].isna()
all_data.loc[pool_without_qc, 'PoolQC'] = 'Fa'


In [None]:
# Every remaining missing PoolQC becomes the explicit 'NA' category.
pool_qc = all_data['PoolQC']
all_data['PoolQC'] = pool_qc.fillna(value='NA')

In [None]:
# Re-check missing values after the first fills.
total2 = all_data.isnull().sum().sort_values(ascending=False)
# Expressed as a percentage (x100) so the 'Percent' column means the same thing here as in
# the other missing-value tables of this notebook; it used to be a 0-1 fraction.
percent2 = (all_data.isnull().mean() * 100).sort_values(ascending=False)
missing2 = pd.concat([total2, percent2], axis=1, keys=['Total', 'Percent'])
missing2.head(40)

In [None]:
# Houses with a positive MiscVal but no MiscFeature recorded.
temp = all_data[all_data['MiscVal'] > 0]
temp2 = temp.loc[:, ['MiscFeature', 'MiscVal']]
# Missing values are real NaN, not the string 'NaN', so `== 'NaN'` never matched anything;
# `isna()` is the check that actually finds them.
temp2[temp2['MiscFeature'].isna()]

In [None]:
# Missing MiscFeature becomes the explicit 'NA' category.
misc_feature = all_data['MiscFeature']
all_data['MiscFeature'] = misc_feature.fillna(value='NA')

In [None]:
# Missing Alley becomes the explicit 'NA' category.
alley = all_data['Alley']
all_data['Alley'] = alley.fillna(value='NA')

In [None]:
# Missing Fence becomes the explicit 'NA' category, like Alley, PoolQC and MiscFeature.
# The per-MSZoning mode fill that used to run first replaced every missing value with the
# zone's most common recorded fence type, leaving the 'NA' fill with nothing to do, and it
# raised IndexError for any zone whose Fence values were all missing (empty mode).
# NOTE(review): assumes a missing Fence means "no fence" — confirm against the dataset's
# data description.
all_data['Fence'] = all_data['Fence'].fillna('NA')

In [None]:
# Houses with at least one fireplace but no FireplaceQu recorded.
temp = all_data[all_data['Fireplaces'] > 0]
temp2 = temp.loc[:, ['Fireplaces', 'FireplaceQu']]
# Missing values are real NaN, not the string 'NaN', so `== 'NaN'` never matched anything;
# `isna()` is the check that actually finds them.
temp2[temp2['FireplaceQu'].isna()]

In [None]:
# Missing FireplaceQu becomes the explicit 'NA' category.
fireplace_qu = all_data['FireplaceQu']
all_data['FireplaceQu'] = fireplace_qu.fillna(value='NA')

In [None]:
# Missing GarageType becomes the explicit 'NA' category.
garage_type = all_data['GarageType']
all_data['GarageType'] = garage_type.fillna(value='NA')

In [None]:
# Preview the garage columns for every row with no garage build year.
no_garage_year = all_data['GarageYrBlt'].isna()
temp = all_data[no_garage_year]
garage_preview_cols = ['GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond']
temp2 = temp.loc[:, garage_preview_cols]
temp2

In [None]:
# Missing garage build year becomes 0; the categorical garage columns get 'NA'.
garage_year = all_data['GarageYrBlt']
all_data['GarageYrBlt'] = garage_year.fillna(0)

garage_cat_cols = ['GarageFinish', 'GarageQual', 'GarageCond']
all_data[garage_cat_cols] = all_data[garage_cat_cols].fillna('NA')

In [None]:
# Preview garage type, car capacity and area for rows with no GarageCars value.
no_garage_cars = all_data['GarageCars'].isna()
temp = all_data[no_garage_cars]
temp2 = temp.loc[:, ['GarageType', 'GarageCars', 'GarageArea']]
temp2

In [None]:
# Fill missing GarageCars and GarageArea with the most frequent value among houses that
# share the same Neighborhood and GarageType.
for garage_col in ('GarageCars', 'GarageArea'):
    grouped = all_data.groupby(['Neighborhood', 'GarageType'])[garage_col]
    all_data[garage_col] = grouped.transform(lambda grp: grp.fillna(grp.mode()[0]))

In [None]:
# Fill missing LotFrontage with the most frequent value within the house's MSZoning group.
frontage_by_zone = all_data.groupby('MSZoning')['LotFrontage']
all_data['LotFrontage'] = frontage_by_zone.transform(lambda grp: grp.fillna(grp.mode()[0]))

In [None]:
# Missing basement square-footage values become 0, all four columns in one assignment.
bsmt_col = ['TotalBsmtSF', 'BsmtFinSF2', 'BsmtUnfSF', 'BsmtFinSF1']
all_data[bsmt_col] = all_data[bsmt_col].fillna(0)

In [None]:
# Rows with no BsmtCond recorded, narrowed to those that still report basement area.
no_bsmt_cond = all_data['BsmtCond'].isna()
temp = all_data[no_bsmt_cond]
temp2 = temp.loc[:, ['TotalBsmtSF', 'BsmtCond']]
# Not the last expression of the cell, so this result is computed but not displayed.
temp2['TotalBsmtSF'].unique()
has_basement_area = temp2['TotalBsmtSF'] != 0
temp2[has_basement_area]

In [None]:
# Houses that report basement area but have no basement condition recorded get 'TA'.
# Rows are selected by condition instead of by the hard-coded positions 2040, 2185 and 2524,
# so the fix cannot silently hit the wrong rows if the row order or the data changes.
# `> 0` (rather than `!= 0`) keeps rows with a missing TotalBsmtSF out of the selection.
# NOTE(review): the condition is expected to match exactly those three positions on the
# current data — confirm against the preview cell above.
bsmt_without_cond = (all_data['TotalBsmtSF'] > 0) & all_data['BsmtCond'].isna()
all_data.loc[bsmt_without_cond, 'BsmtCond'] = 'TA'

In [None]:
# Every remaining missing BsmtCond becomes the explicit 'NA' category.
bsmt_cond = all_data['BsmtCond']
all_data['BsmtCond'] = bsmt_cond.fillna(value='NA')

In [None]:
# Utilities and Electrical: fill with the most frequent value within the MSZoning group.
for zoned_col in ('Utilities', 'Electrical'):
    by_zone = all_data.groupby('MSZoning')[zoned_col]
    all_data[zoned_col] = by_zone.transform(lambda grp: grp.fillna(grp.mode()[0]))

# SaleType and the exterior columns: fill with the column's overall most frequent value.
for col in ('SaleType', 'Exterior1st', 'Exterior2nd'):
    most_common = all_data[col].mode()[0]
    all_data[col] = all_data[col].fillna(most_common)

In [None]:
# Missing basement bath counts and masonry veneer area become 0, in one assignment.
zero_fill_cols = ['BsmtHalfBath', 'BsmtFullBath', 'MasVnrArea']
all_data[zero_fill_cols] = all_data[zero_fill_cols].fillna(0)

In [None]:
# Fill missing KitchenQual with the column's most frequent value.
kitchen_mode = all_data['KitchenQual'].mode()[0]
all_data['KitchenQual'] = all_data['KitchenQual'].fillna(kitchen_mode)

In [None]:
# Fill missing Functional with the column's most frequent value.
functional_mode = all_data['Functional'].mode()[0]
all_data['Functional'] = all_data['Functional'].fillna(functional_mode)

In [None]:
# Missing basement categoricals become the explicit 'NA' category.
# They used to be filled with the integer 0, which mixed ints and strings in these text
# columns and was inconsistent with the 'NA' used for BsmtCond and the other categoricals.
bsmt_col = ['BsmtExposure', 'BsmtQual', 'BsmtFinType2', 'BsmtFinType1']
for col in bsmt_col:
    all_data[col] = all_data[col].fillna('NA')

In [None]:
# Missing MasVnrType becomes the 'None' category.
mas_vnr_type = all_data['MasVnrType']
all_data['MasVnrType'] = mas_vnr_type.fillna(value='None')

In [None]:
# Final check: list any column that still has missing values (count and percentage).
null_mask = all_data.isnull()
total1 = null_mask.sum().sort_values(ascending=False)
percent3 = ((null_mask.sum() / null_mask.count()) * 100).sort_values(ascending=False)
missing3 = pd.concat([total1, percent3], axis=1, keys=['Total', 'Percent'])
still_missing = missing3['Percent'] > 0
missing3 = missing3.loc[still_missing]
missing3

In [None]:
# One-hot encode the categorical columns, then split the combined frame back into its
# training and test parts.
all_data = pd.get_dummies(all_data)
# The training rows come first in `all_data`; take their count from the training frame
# instead of hard-coding 1460.
n_train = len(training)
tr = all_data.iloc[:n_train, :]
tes = all_data.iloc[n_train:, :]
tes.shape

In [None]:
# Model inputs: encoded training features and the SalePrice target.
x = tr
# Select the target by name rather than by "last column" position, so a change in the
# column order cannot silently swap in a different column.
y = training['SalePrice']

In [None]:
def NumPyRMSLE(y_true: list, y_pred: list) -> float:
    """
    The Root Mean Squared Log Error (RMSLE) metric using only NumPy.

    Computes sqrt(mean((log1p(y_pred) - log1p(y_true)) ** 2)).

    :param y_true: The ground truth labels given in the dataset (any array-like of numbers)
    :param y_pred: Our predictions (array-like of the same length as ``y_true``)
    :return: The RMSLE score as a float; NaN if any value is below -1, where log1p is undefined
    """
    # Convert up front so lists, NumPy arrays and pandas Series all take the same path.
    log_true = np.log1p(np.asarray(y_true, dtype=float))
    log_pred = np.log1p(np.asarray(y_pred, dtype=float))
    rmsle = np.sqrt(np.mean(np.square(log_pred - log_true)))
    return float(rmsle)

In [None]:
# Hold out 20% of the training data, fit every candidate model on the rest and compare
# them on the hold-out set.
SEED = 42  # one seed for the split and every stochastic estimator, so re-runs match
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=SEED, test_size=0.2)

# max_features=1.0 (use all features) is what the former 'auto' meant for a forest
# regressor; 'auto' is no longer accepted by scikit-learn >= 1.3.
RF = RandomForestRegressor(max_depth=70, max_features=1.0, n_estimators=100,
                           min_samples_leaf=1, min_samples_split=0.001,
                           random_state=SEED).fit(x_train, y_train)
DT = DecisionTreeRegressor(random_state=SEED).fit(x_train, y_train)
LR = LinearRegression().fit(x_train, y_train)
GBR = GradientBoostingRegressor(random_state=SEED).fit(x_train, y_train)
XGB = XGBRegressor(max_depth=4, min_child_weight=4, subsample=0.9,
                   colsample_bytree=0.4, eta=0.1, random_state=SEED).fit(x_train, y_train)
Las = Lasso().fit(x_train, y_train)
Rid = Ridge().fit(x_train, y_train)

# Stack RF, GBR and XGB with an XGB meta-regressor that also sees the original features.
stack_gen = StackingCVRegressor(regressors=(RF, GBR, XGB), meta_regressor=XGB,
                                use_features_in_secondary=True,
                                random_state=SEED).fit(x_train, y_train)

models = [LR, DT, RF, GBR, XGB, Las, Rid, stack_gen]
Models = ['Linear Regression','Decision Tree','Random Forest','Gradient Boosting','XgBoost', 'Lasso', 'Ridge', 'Stacked']

# Predict once per model and reuse the result for every metric (it used to be recomputed
# four times per model).
hold_out_predictions = [mod.predict(x_test) for mod in models]

RMSE = [mean_squared_error(y_test, y_hat) ** 0.5 for y_hat in hold_out_predictions]
# NOTE(review): RMSLE is NaN for a model that predicts a value below -1 — confirm whether
# the linear models can do so on this data.
RMSLE = [NumPyRMSLE(y_test, y_hat) for y_hat in hold_out_predictions]
MAPE = [mean_absolute_percentage_error(y_test, y_hat) for y_hat in hold_out_predictions]
R2_Score = [r2_score(y_test, y_hat) for y_hat in hold_out_predictions]

# RMSE and MAPE were computed but left out of the table; they are now reported too.
evaluation = pd.DataFrame({'Models': Models, 'RMSE': RMSE, 'RMSLE': RMSLE,
                           'MAPE': MAPE, 'R2_Score': R2_Score})
evaluation

In [None]:
# Predict the test set with the fitted XGBoost model and write the submission file.
# Put the test columns in the same order as the training columns before predicting.
tes = tes[tr.columns]
pred = XGB.predict(tes)
# The prediction column is named 'SalePrice', the target's spelling everywhere else in
# this notebook; it was previously misspelled 'Saleprice'.
# NOTE(review): confirm the expected header against the competition's sample_submission.csv.
submission = pd.DataFrame({'Id': test['Id'], 'SalePrice': pred})
submission.to_csv('submission.csv', index=False)