In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, root_mean_squared_error
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor, StackingRegressor
import lightgbm as lgb

## Import Data

In [None]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

## Data Visualization

In [None]:
train_df.columns

In [None]:
train_df.describe()

In [None]:
train_df.dtypes[train_df.dtypes != 'object']

In [None]:
plt.scatter(x='LotFrontage', y='SalePrice', data= train_df)

In [None]:
train_df.query('LotFrontage > 300')
#Drop 935, 1299

In [None]:
plt.scatter(x='LotArea', y='SalePrice', data= train_df)

In [None]:
train_df.query('LotArea > 55000')
#250, 314, 336, 452, 707
#maybe 1397

In [None]:
stats.zscore(train_df['LotArea']).sort_values().tail(10)

In [None]:
plt.scatter(x='OverallQual', y='SalePrice', data=train_df)

In [None]:
train_df.query('OverallQual == 8 and SalePrice > 500000')
#Maybe 524, 458, 770

In [None]:
plt.scatter(x='OverallCond', y='SalePrice', data=train_df)

In [None]:
train_df.query('OverallCond == 6 and SalePrice > 700000')
#379, 1183, 692

In [None]:
plt.scatter(x='YearBuilt', y='SalePrice', data=train_df)

In [None]:
train_df.query('YearBuilt < 2000 and SalePrice >  650000')
#186

In [None]:
plt.scatter(x='YearRemodAdd', y='SalePrice', data=train_df)

In [None]:
train_df.query('YearRemodAdd < 2000 and SalePrice >  650000')
#314

In [None]:
plt.scatter(x='MasVnrArea', y='SalePrice', data=train_df)

In [None]:
train_df.query('MasVnrArea > 1300')
#298, 1170

In [None]:
plt.scatter(x='BsmtFinSF1', y='SalePrice', data=train_df)

In [None]:
train_df.query('BsmtFinSF1 > 5000')
#1299

In [None]:
plt.scatter(x='BsmtFinSF2', y='SalePrice', data=train_df)

In [None]:
train_df.query('BsmtFinSF2 > 1300')
#323

In [None]:
plt.scatter(x='BsmtUnfSF', y='SalePrice', data=train_df)

In [None]:
train_df.query('BsmtUnfSF < 1000 & SalePrice > 700000')
#692, 1183

In [None]:
plt.scatter(x='TotalBsmtSF', y='SalePrice', data=train_df)

In [None]:
train_df.query('TotalBsmtSF > 6000')
#1299

In [None]:
# Show both floor areas side by side: the next cell filters on `1stFlrSF`,
# so that column needs a plot to justify the threshold used there.
fig, axes = plt.subplots(1, 2, figsize = (12, 4), sharey = True)
for ax, col in zip(axes, ['1stFlrSF', '2ndFlrSF']):
    ax.scatter(x=col, y='SalePrice', data=train_df)
    ax.set_xlabel(col)
axes[0].set_ylabel('SalePrice');

In [None]:
train_df.query('`1stFlrSF` > 4000')
#1299

In [None]:
plt.scatter(x='LowQualFinSF', y='SalePrice', data=train_df)

In [None]:
train_df.query('LowQualFinSF > 550')
#186

In [None]:
plt.scatter(x='GrLivArea', y='SalePrice', data=train_df)

In [None]:
train_df.query('GrLivArea > 4500')
#524, 1299

In [None]:
plt.scatter(x='BsmtFullBath', y='SalePrice', data=train_df)

In [None]:
train_df.query('BsmtFullBath == 3')
#739

In [None]:
plt.scatter(x='BsmtHalfBath', y='SalePrice', data=train_df)

In [None]:
stats.zscore(train_df['BsmtHalfBath']).unique()

In [None]:
train_df.query('BsmtHalfBath == 2')
#598, 955

In [None]:
plt.scatter(x='FullBath', y='SalePrice', data=train_df)

In [None]:
plt.scatter(x='HalfBath', y='SalePrice', data=train_df)

In [None]:
plt.scatter(x='BedroomAbvGr', y='SalePrice', data=train_df)

In [None]:
train_df.query('BedroomAbvGr == 8')
#636

In [None]:
plt.scatter(x='KitchenAbvGr', y='SalePrice', data=train_df)

In [None]:
train_df.query('KitchenAbvGr == 3')
#49, 810

In [None]:
plt.scatter(x='TotRmsAbvGrd', y='SalePrice', data=train_df)

In [None]:
train_df.query('TotRmsAbvGrd == 14')
#636

In [None]:
plt.scatter(x='Fireplaces', y='SalePrice', data=train_df)

In [None]:
plt.scatter(x='GarageYrBlt', y='SalePrice', data=train_df)

In [None]:
plt.scatter(x='GarageCars', y='SalePrice', data=train_df)

In [None]:
plt.scatter(x='GarageArea', y='SalePrice', data=train_df)

In [None]:
train_df.query('GarageArea > 1200')
#826 ,1062, 1191, 1299

In [None]:
plt.scatter(x='WoodDeckSF', y='SalePrice', data=train_df)

In [None]:
stats.zscore(train_df['WoodDeckSF']).sort_values().tail(5)
#54

In [None]:
plt.scatter(x='OpenPorchSF', y='SalePrice', data=train_df)

In [None]:
stats.zscore(train_df['OpenPorchSF']).sort_values().tail(5)
#584, 496, 1329

In [None]:
plt.scatter(x='EnclosedPorch', y='SalePrice', data=train_df)

In [None]:
train_df.query('EnclosedPorch > 500')
#198

In [None]:
plt.scatter(x='3SsnPorch', y='SalePrice', data=train_df)

In [None]:
plt.scatter(x='ScreenPorch', y='SalePrice', data=train_df)

In [None]:
plt.scatter(x='PoolArea', y='SalePrice', data=train_df)

In [None]:
train_df.query('PoolArea > 500')
#1183

In [None]:
values = [935, 1299, 250, 314, 336, 452, 707, 1397, 524, 458, 770, 379, 1183, 692, 186, 314, 298, 1170, 1299, 323, 692, 1183, 1299, 186, 524, 1299, 739, 598, 955, 636, 49, 810, 826 ,1062, 1191, 1299, 54, 584, 496, 1329, 198, 1183]
values = sorted(set(values))

In [None]:
# Keep only rows whose Id is not in the outlier list. `~mask` is the idiomatic
# negation, and `.copy()` gives an independent frame so later column
# assignments do not operate on a slice of the original data.
train_df = train_df[~train_df['Id'].isin(values)].copy()

In [None]:
pd.DataFrame(train_df.isnull().sum().sort_values(ascending = False)).head(20)

In [None]:
train_df['Alley'].unique()

In [None]:
# Missing Alley -> 'No'. Assign the result back rather than calling an
# in-place method on a column selection (chained in-place assignment).
train_df['Alley'] = train_df['Alley'].fillna('No')
test_df['Alley'] = test_df['Alley'].fillna('No')

In [None]:
sns.catplot(data = train_df, x='Alley', y='SalePrice', kind='box')

In [None]:
train_df['Fence'].unique()

In [None]:
# Missing Fence -> 'No'. Assign the result back rather than calling an
# in-place method on a column selection (chained in-place assignment).
train_df['Fence'] = train_df['Fence'].fillna('No')
test_df['Fence'] = test_df['Fence'].fillna('No')

In [None]:
sns.catplot(data = train_df, x='Fence', y='SalePrice', kind='box')

In [None]:
train_df['MasVnrType'].unique()

In [None]:
# Missing MasVnrType -> 'No'. Assign the result back rather than calling an
# in-place method on a column selection (chained in-place assignment).
train_df['MasVnrType'] = train_df['MasVnrType'].fillna('No')
test_df['MasVnrType'] = test_df['MasVnrType'].fillna('No')

In [None]:
sns.catplot(data = train_df, x='MasVnrType', y='SalePrice', kind='box')

In [None]:
# Missing MasVnrArea -> 0. Assign the result back rather than calling an
# in-place method on a column selection (chained in-place assignment).
train_df['MasVnrArea'] = train_df['MasVnrArea'].fillna(0)
test_df['MasVnrArea'] = test_df['MasVnrArea'].fillna(0)

In [None]:
train_df['FireplaceQu'].unique()

In [None]:
# Missing FireplaceQu -> 'No'. Assign the result back rather than calling an
# in-place method on a column selection (chained in-place assignment).
train_df['FireplaceQu'] = train_df['FireplaceQu'].fillna('No')
test_df['FireplaceQu'] = test_df['FireplaceQu'].fillna('No')

In [None]:
sns.catplot(data = train_df, x='FireplaceQu', y='SalePrice', kind='box')

In [None]:
train_df['Fireplaces'].describe()

In [None]:
sns.catplot(data = train_df, x='Fireplaces', y='SalePrice', kind='box')

In [None]:
# Fill with the number 0, not the string '0': a string fill turns the column
# into object dtype, which would exclude LotFrontage from the int64/float64
# column selection used later for the numeric pipeline.
# The result is assigned back instead of using a chained in-place call.
train_df['LotFrontage'] = train_df['LotFrontage'].fillna(0)
test_df['LotFrontage'] = test_df['LotFrontage'].fillna(0)

In [None]:
train_df['GarageYrBlt'].corr(train_df['YearBuilt'])

In [None]:
train_df['GarageCond'].unique()

In [None]:
# Missing GarageCond -> 'No'. Assign the result back rather than calling an
# in-place method on a column selection (chained in-place assignment).
train_df['GarageCond'] = train_df['GarageCond'].fillna('No')
test_df['GarageCond'] = test_df['GarageCond'].fillna('No')

In [None]:
sns.catplot(data = train_df, x='GarageCond', y='SalePrice', kind='box')

In [None]:
train_df['GarageType'].unique()

In [None]:
# Missing GarageType -> 'No'. Assign the result back rather than calling an
# in-place method on a column selection (chained in-place assignment).
train_df['GarageType'] = train_df['GarageType'].fillna('No')
test_df['GarageType'] = test_df['GarageType'].fillna('No')

In [None]:
sns.catplot(data = train_df, x='GarageType', y='SalePrice', kind='box')

In [None]:
train_df['GarageFinish'].unique()

In [None]:
# Missing GarageFinish -> 'No'. Assign the result back rather than calling an
# in-place method on a column selection (chained in-place assignment).
train_df['GarageFinish'] = train_df['GarageFinish'].fillna('No')
test_df['GarageFinish'] = test_df['GarageFinish'].fillna('No')

In [None]:
sns.catplot(data = train_df, x='GarageFinish', y='SalePrice', kind='box')

In [None]:
train_df['GarageQual'].unique()

In [None]:
# Missing GarageQual -> 'No'. Assign the result back rather than calling an
# in-place method on a column selection (chained in-place assignment).
train_df['GarageQual'] = train_df['GarageQual'].fillna('No')
test_df['GarageQual'] = test_df['GarageQual'].fillna('No')

In [None]:
sns.catplot(data = train_df, x='GarageQual', y='SalePrice', kind='box')

In [None]:
train_df['BsmtFinType2'].unique()

In [None]:
# Missing BsmtFinType2 -> 'Unf'. Assign the result back rather than calling an
# in-place method on a column selection (chained in-place assignment).
train_df['BsmtFinType2'] = train_df['BsmtFinType2'].fillna('Unf')
test_df['BsmtFinType2'] = test_df['BsmtFinType2'].fillna('Unf')

In [None]:
sns.catplot(data = train_df, x='BsmtFinType2', y='SalePrice', kind='box')

In [None]:
train_df['BsmtExposure'].unique()

In [None]:
# Missing BsmtExposure -> 'No'. Assign the result back rather than calling an
# in-place method on a column selection (chained in-place assignment).
# NOTE(review): check whether 'No' is already a genuine BsmtExposure level; if
# so, this merges missing values into that existing category.
train_df['BsmtExposure'] = train_df['BsmtExposure'].fillna('No')
test_df['BsmtExposure'] = test_df['BsmtExposure'].fillna('No')

In [None]:
sns.catplot(data = train_df, x='BsmtExposure', y='SalePrice', kind='box')

In [None]:
train_df['BsmtQual'].unique()

In [None]:
# Missing BsmtQual -> 'No'. Assign the result back rather than calling an
# in-place method on a column selection (chained in-place assignment).
train_df['BsmtQual'] = train_df['BsmtQual'].fillna('No')
test_df['BsmtQual'] = test_df['BsmtQual'].fillna('No')

In [None]:
sns.catplot(data = train_df, x='BsmtQual', y='SalePrice', kind='box')

In [None]:
train_df['BsmtCond'].unique()

In [None]:
# Missing BsmtCond -> 'No'. Assign the result back rather than calling an
# in-place method on a column selection (chained in-place assignment).
train_df['BsmtCond'] = train_df['BsmtCond'].fillna('No')
test_df['BsmtCond'] = test_df['BsmtCond'].fillna('No')

In [None]:
sns.catplot(data = train_df, x='BsmtCond', y='SalePrice', kind='box')

In [None]:
train_df['BsmtFinType1'].unique()

In [None]:
# Missing BsmtFinType1 -> 'Unf'. Assign the result back rather than calling an
# in-place method on a column selection (chained in-place assignment).
train_df['BsmtFinType1'] = train_df['BsmtFinType1'].fillna('Unf')
test_df['BsmtFinType1'] = test_df['BsmtFinType1'].fillna('Unf')

In [None]:
sns.catplot(data = train_df, x='BsmtFinType1', y='SalePrice', kind='box')

In [None]:
train_df['Electrical'].unique()

In [None]:
# Missing Electrical -> 'SBrkr'. Assign the result back rather than calling an
# in-place method on a column selection (chained in-place assignment).
train_df['Electrical'] = train_df['Electrical'].fillna('SBrkr')
test_df['Electrical'] = test_df['Electrical'].fillna('SBrkr')

In [None]:
sns.catplot(data = train_df, x='Electrical', y='SalePrice', kind='box')

In [None]:
# Columns removed from both frames; `Id` is additionally removed from the
# training frame only (the test frame keeps it).
unused_cols = ['PoolQC','MiscFeature','Alley','Fence','GarageYrBlt','GarageCond','BsmtFinType2']
train_df = train_df.drop(columns = unused_cols + ['Id'])
test_df = test_df.drop(columns = unused_cols)

## Feature Engineering

In [None]:
train_df['houseage'] = train_df['YrSold'] - train_df['YearBuilt']
test_df['houseage'] = test_df['YrSold'] - test_df['YearBuilt']

In [None]:
train_df['houseremodelage'] = train_df['YrSold'] - train_df['YearRemodAdd']
test_df['houseremodelage'] = test_df['YrSold'] - test_df['YearRemodAdd']

In [None]:
train_df['totalsf'] = train_df['1stFlrSF'] + train_df['2ndFlrSF'] + train_df['BsmtFinSF1'] + train_df['BsmtFinSF2']
test_df['totalsf'] = test_df['1stFlrSF'] + test_df['2ndFlrSF'] + test_df['BsmtFinSF1'] + test_df['BsmtFinSF2']

In [None]:
train_df['totalarea'] = train_df['GrLivArea'] + train_df['TotalBsmtSF']
test_df['totalarea'] = test_df['GrLivArea'] + test_df['TotalBsmtSF']

In [None]:
# Total bathroom count: full baths count as 1 and half baths as 0.5, both in
# the basement and above grade. The 0.5 weight now covers `HalfBath` as well
# as `BsmtHalfBath`; previously above-grade half baths were counted as full.
train_df['totalbaths'] = train_df['BsmtFullBath'] + train_df['FullBath'] + 0.5 * (train_df['BsmtHalfBath'] + train_df['HalfBath'])
test_df['totalbaths'] = test_df['BsmtFullBath'] + test_df['FullBath'] + 0.5 * (test_df['BsmtHalfBath'] + test_df['HalfBath'])

In [None]:
train_df['totalporchsf'] = train_df['OpenPorchSF'] + train_df['3SsnPorch'] + train_df['EnclosedPorch'] + train_df['ScreenPorch'] + train_df['WoodDeckSF']
test_df['totalporchsf'] = test_df['OpenPorchSF'] + test_df['3SsnPorch'] + test_df['EnclosedPorch'] + test_df['ScreenPorch'] + test_df['WoodDeckSF']

In [None]:
# Raw columns dropped from both frames, using one shared list so the two
# frames stay in sync.
merged_cols = [
    'YrSold', 'YearBuilt', 'YearRemodAdd',
    '2ndFlrSF', 'BsmtFinSF1', 'BsmtFinSF2',
    'BsmtFullBath', 'BsmtHalfBath', 'HalfBath',
    'OpenPorchSF', '3SsnPorch', 'EnclosedPorch', 'ScreenPorch', 'WoodDeckSF',
]
train_df = train_df.drop(columns = merged_cols)
test_df = test_df.drop(columns = merged_cols)

In [None]:
correlation_matrix = train_df.corr(numeric_only = True)
plt.figure(figsize = (20,15))
sns.heatmap(correlation_matrix, annot = True,cmap = 'coolwarm', fmt = '.2f')

In [None]:
#drop GarageArea or GarageCars 

In [None]:
train_df = train_df.drop(columns = ['GarageArea'])
test_df = test_df.drop(columns = ['GarageArea'])

In [None]:
# Distribution of the SalePrice column.
sns.histplot(data = train_df, x = 'SalePrice')

In [None]:
train_df['SalePrice'] = np.log1p(train_df['SalePrice'])

In [None]:
# Distribution of SalePrice again, now that the column holds log1p values.
sns.histplot(data = train_df, x = 'SalePrice')

## Feature Encoding

In [None]:
train_df.dtypes[train_df.dtypes != 'object']

In [None]:
train_df.dtypes[train_df.dtypes == 'object']

In [None]:
#Ordinal Encoding
ode_cols = ['LotShape', 'LandContour', 'Utilities', 'LandSlope','ExterQual','ExterCond','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','HeatingQC','KitchenQual','FireplaceQu','GarageFinish','GarageQual']

In [None]:
#One-hot encoding
ohe_cols = ['MSZoning', 'Street', 'LotConfig', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType','Foundation','Heating','CentralAir','Electrical','Functional','GarageType','PavedDrive','SaleType','SaleCondition']

In [None]:
# Numeric predictors: every int64/float64 column except the SalePrice target.
num_cols = train_df.select_dtypes(include=['int64', 'float64']).columns.drop('SalePrice')

In [None]:
num_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy = 'mean')),
    ('scaler', StandardScaler())
])

In [None]:
# Pipeline for the ordinal columns: fill missing entries with the most frequent
# value, then integer-encode; categories unseen at fit time are encoded as -1.
# NOTE(review): no explicit `categories` order is passed to OrdinalEncoder, so
# the integer codes presumably do not follow a quality ranking — confirm, and
# supply ordered category lists if rank order matters to the models.
ode_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy = 'most_frequent')),
    ('ode', OrdinalEncoder(handle_unknown = 'use_encoded_value', unknown_value = -1))
])

In [None]:
ohe_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy = 'most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown = 'ignore', sparse_output = False))
])

In [None]:
# Route each column group through its own pipeline (numeric, ordinal, one-hot).
# Columns not named in any of the three groups are kept unchanged because of
# remainder='passthrough'; n_jobs = -1 asks for parallel execution.
col_trans = ColumnTransformer(transformers = [
    ('num_p', num_pipeline, num_cols),
    ('ode_p', ode_pipeline, ode_cols),
    ('ohe_p', ohe_pipeline, ohe_cols)],
    remainder= 'passthrough', 
    n_jobs = -1)

In [None]:
pipeline = Pipeline(steps=[
    ('preprocessing', col_trans)
])

In [None]:
X = train_df.drop('SalePrice', axis = 1)
y = train_df['SalePrice']

In [None]:
x_preprocessed = pipeline.fit_transform(X)

## Training

In [None]:
# Split the raw feature rows first, then fit the preprocessing on the training
# rows only, so imputation values, scaling statistics and encoder categories
# are not learned from the hold-out rows (avoids train/test leakage).
# `pipeline` is refit here, so later `pipeline.transform(...)` calls use the
# training-only fit.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 25)
X_train = pipeline.fit_transform(X_train_raw)
X_test = pipeline.transform(X_test_raw)

In [None]:
lr = LinearRegression()

In [None]:
lr.fit(X_train, y_train)

In [None]:
y_pred_lr = lr.predict(X_test)

In [None]:
mean_squared_error(y_test, y_pred_lr)

In [None]:
root_mean_squared_error(y_test, y_pred_lr)

In [None]:
RFR = RandomForestRegressor(random_state = 13)

In [None]:
param_grid_RFR = {
    'max_depth' : [5, 10, 15],
    'n_estimators' : [100, 250, 500],
    'min_samples_split' : [3, 5 ,10]
}

In [None]:
rfr_cv = GridSearchCV(RFR, param_grid_RFR, cv=5, scoring = 'neg_root_mean_squared_error', n_jobs = -1)

In [None]:
rfr_cv.fit(X_train, y_train)

In [None]:
y_pred_rfr = rfr_cv.predict(X_test)

In [None]:
-rfr_cv.best_score_

In [None]:
y_pred_rfr = rfr_cv.predict(X_test)

In [None]:
root_mean_squared_error(y_test, y_pred_rfr)

In [None]:
rfr_cv.best_params_

In [None]:
XGB = XGBRegressor(random_state=13)

In [None]:
param_grid_XGB = {
    'learning_rate': [0.05, 0.1, 0.2],
    'n_estimators': [300],
    'max_depth': [3],
    'min_child_weight': [1,2,3],
    'gamma': [0, 0.1, 0.2],
    'subsample' : [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8,0.9,1.0]
}

In [None]:
XGB_cv = GridSearchCV(XGB, param_grid_XGB, cv=3, scoring = 'neg_root_mean_squared_error', n_jobs = -1)

In [None]:
XGB_cv.fit(X_train, y_train)

In [None]:
-XGB_cv.best_score_

In [None]:
y_pred_XGB = XGB_cv.predict(X_test)

In [None]:
root_mean_squared_error(y_test, y_pred_XGB)

In [None]:
XGB_cv.best_params_

In [None]:
# Seeded because the solver grid searched below includes 'sag', which is
# stochastic; a fixed seed keeps the grid-search results reproducible.
ridge = Ridge(random_state = 13)

In [None]:
param_grid_ridge = {
    'alpha': [0.05, 0.1, 1, 3, 5, 10],
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag']
}

In [None]:
ridge_cv = GridSearchCV(ridge, param_grid_ridge, cv=5, scoring = 'neg_root_mean_squared_error', n_jobs = -1)

In [None]:
ridge_cv.fit(X_train, y_train)

In [None]:
-ridge_cv.best_score_

In [None]:
y_pred_ridge = ridge_cv.predict(X_test)

In [None]:
root_mean_squared_error(y_test, y_pred_ridge)

In [None]:
# Seeded like the random-forest and XGBoost models above: the grid below uses
# max_features < 1, so each fit subsamples features at random.
GBR = GradientBoostingRegressor(random_state = 13)

In [None]:
param_grid_GBR = {
    'learning_rate': [0.001, 0.01, 0.1],
    'max_depth' : [12, 15, 20],
    'n_estimators' : [200, 300, 1000],
    'min_samples_leaf' : [10, 25 ,50],
    'max_features': [0.01, 0.1, 0.7]
}

In [None]:
GBR_cv = GridSearchCV(GBR, param_grid_GBR, cv=5, scoring = 'neg_root_mean_squared_error', n_jobs = -1)

In [None]:
GBR_cv.fit(X_train, y_train)

In [None]:
-GBR_cv.best_score_

In [None]:
y_pred_GBR = GBR_cv.predict(X_test)

In [None]:
root_mean_squared_error(y_test, y_pred_GBR)

In [None]:
# Fixed seed for reproducible runs, consistent with the other seeded models
# (the grid below includes the randomized 'dart' boosting type).
lgbm_regressor = lgb.LGBMRegressor(random_state = 13)

In [None]:
param_grid_lgbm = {
    'boosting_type': ['gbdt', 'dart'],
    'num_leaves': [20, 30, 40],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300]
}

In [None]:
lgbm_cv = GridSearchCV(lgbm_regressor, param_grid_lgbm, cv=3, scoring = 'neg_root_mean_squared_error', n_jobs = -1)

In [None]:
lgbm_cv.fit(X_train, y_train)

In [None]:
-lgbm_cv.best_score_

In [None]:
y_pred_lgbm = lgbm_cv.predict(X_test)

In [None]:
root_mean_squared_error(y_test, y_pred_lgbm)

In [None]:
catboost = CatBoostRegressor(loss_function='RMSE', verbose=False)

In [None]:
param_grid_cat = {
    'iterations': [100, 500, 1000],
    'depth': [4,6,8,10],
    'learning_rate': [0.01,0.05,0.1,0.5]
}

In [None]:
cat_cv = GridSearchCV(catboost, param_grid_cat, cv=3, scoring = 'neg_root_mean_squared_error', n_jobs = -1)

In [None]:
cat_cv.fit(X_train, y_train)

In [None]:
-cat_cv.best_score_

In [None]:
y_pred_cat = cat_cv.predict(X_test)

In [None]:
root_mean_squared_error(y_test, y_pred_cat)

In [None]:
vr = VotingRegressor([('gbr', GBR_cv.best_estimator_),
                      ('xgb', XGB_cv.best_estimator_),
                      ('lgbm', lgbm_cv.best_estimator_)],
                      weights = [3,2,1])

In [None]:
vr.fit(X_train, y_train)

In [None]:
y_pred_vr = vr.predict(X_test)

In [None]:
root_mean_squared_error(y_test, y_pred_vr)

In [None]:
estimators = [
    ('gbr', GBR_cv.best_estimator_),
    ('xgb', XGB_cv.best_estimator_),
    ('cat', cat_cv.best_estimator_),
    ('lgbm', lgbm_cv.best_estimator_),
    ('rfr', rfr_cv.best_estimator_),
]

In [None]:
stackreg = StackingRegressor(
    estimators = estimators,
    final_estimator = vr
)

In [None]:
stackreg.fit(X_train, y_train)

In [None]:
y_pred_stackreg = stackreg.predict(X_test)

In [None]:
root_mean_squared_error(y_test, y_pred_stackreg)

In [None]:
df_test_preprocessed = pipeline.transform(test_df)

In [None]:
# The target was transformed with np.log1p before training, so predictions
# must be mapped back with its exact inverse, np.expm1 (np.exp would leave
# every predicted price too high by 1).
y_ridge = np.expm1(ridge_cv.predict(df_test_preprocessed))

# Build the submission as a new frame rather than assigning a column into a
# slice of test_df, which triggers SettingWithCopyWarning.
df_y_ridge_out = pd.DataFrame({'Id': test_df['Id'], 'SalePrice': y_ridge})
df_y_ridge_out.to_csv('HousePredictionRidge.csv', index = False)

In [None]:
root_mean_squared_error(y_test, y_pred_lr)

In [None]:
root_mean_squared_error(y_test, y_pred_rfr)

In [None]:
root_mean_squared_error(y_test, y_pred_XGB)

In [None]:
root_mean_squared_error(y_test, y_pred_ridge)

In [None]:
root_mean_squared_error(y_test, y_pred_GBR)

In [None]:
root_mean_squared_error(y_test, y_pred_lgbm)

In [None]:
root_mean_squared_error(y_test, y_pred_cat)

In [None]:
root_mean_squared_error(y_test, y_pred_vr)

In [None]:
root_mean_squared_error(y_test, y_pred_stackreg)