![house](https://care4properties.co.uk/wp-content/uploads/2022/10/House-prices-October-2022.png)

# Import libraries

In [None]:
from sklearn.metrics import mean_squared_error
import category_encoders as ce

from lightgbm import LGBMRegressor

import pandas as pd
from scipy.stats import norm
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings

# Silence every warning category for the rest of the session so that library
# notices do not clutter the cell output.
warnings.simplefilter('ignore')

# Review data

In [None]:
# Load the three competition files: the training rows, the test rows, and the
# sample submission (its first column is used as the index).
train = pd.read_csv(
    "/kaggle/input/house-prices-advanced-regression-techniques/train.csv"
)
test = pd.read_csv(
    "/kaggle/input/house-prices-advanced-regression-techniques/test.csv"
)
submission = pd.read_csv(
    "/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv",
    index_col=0,
)

In [None]:
# Preview the first five training rows.
train.head(n=5)

In [None]:
# Preview the first five test rows.
test.head(n=5)

# Visualization

In [None]:
# Scatter every numeric feature whose absolute Pearson correlation with
# SalePrice exceeds 0.5 against the target.
# Only numeric columns are correlated (text columns cannot be), and the
# correlation is computed once instead of twice.
numeric = train.select_dtypes(include='number')
target_corr = numeric.corrwith(numeric['SalePrice'])
cols = [c for c in target_corr[target_corr.abs() > 0.5].index if c != 'SalePrice']

# Two plots per row; the row count follows the number of selected features so
# the grid never indexes past the end of `cols`.
n_grid_cols = 2
n_grid_rows = max(1, -(-len(cols) // n_grid_cols))
fig, ax = plt.subplots(
    n_grid_rows, n_grid_cols, figsize=(15, 5 * n_grid_rows), squeeze=False
)

for axis, col in zip(ax.flat, cols):
    sns.scatterplot(ax=axis, x=train[col], y=train['SalePrice'], color='purple')

# Hide any panel left over when the number of features is odd.
for axis in ax.flat[len(cols):]:
    axis.set_visible(False)

In [None]:
# Correlation heatmap of the numeric columns.
# Text columns are excluded explicitly: recent pandas no longer drops them
# silently inside DataFrame.corr().
corrmat = train.select_dtypes(include='number').corr()
plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=0.9, square=True, cmap='Purples')

In [None]:
# Independent working copy of the training frame, so the exploratory
# transformations below do not modify `train`.
trainc = train.copy(deep=True)

In [None]:
# Histogram + KDE of SalePrice with a fitted normal density overlaid.
# sns.distplot is deprecated, so the same picture is built from histplot and
# an explicit scipy normal fit.
sale_price = trainc['SalePrice'].dropna()
hist_ax = sns.histplot(sale_price, stat='density', kde=True)
support = np.linspace(sale_price.min(), sale_price.max(), 200)
hist_ax.plot(support, norm.pdf(support, *norm.fit(sale_price)), color='black')
plt.grid()

# Normal probability (Q-Q) plot on its own figure.
fig = plt.figure()
res = stats.probplot(sale_price, plot=plt)
plt.grid()

In [None]:
# Apply a natural-log transformation to SalePrice in the working copy.
trainc['SalePrice'] = np.log(trainc['SalePrice'])

# Histogram + KDE of the transformed values with a fitted normal density.
# sns.distplot is deprecated, so histplot plus an explicit scipy fit is used.
log_price = trainc['SalePrice'].dropna()
hist_ax = sns.histplot(log_price, stat='density', kde=True)
support = np.linspace(log_price.min(), log_price.max(), 200)
hist_ax.plot(support, norm.pdf(support, *norm.fit(log_price)), color='black')
plt.grid()

# Normal probability (Q-Q) plot on its own figure.
fig = plt.figure()
res = stats.probplot(log_price, plot=plt)
plt.grid()

In [None]:
# Histogram + KDE of TotalBsmtSF with a fitted normal density overlaid.
# sns.distplot is deprecated, so histplot plus an explicit scipy fit is used.
bsmt_area = trainc['TotalBsmtSF'].dropna()
hist_ax = sns.histplot(bsmt_area, stat='density', kde=True)
support = np.linspace(bsmt_area.min(), bsmt_area.max(), 200)
hist_ax.plot(support, norm.pdf(support, *norm.fit(bsmt_area)), color='black')
plt.grid()

# Normal probability (Q-Q) plot on its own figure.
fig = plt.figure()
res = stats.probplot(bsmt_area, plot=plt)
plt.grid()

In [None]:
# Flag houses with a basement and log-transform their TotalBsmtSF; rows with
# zero area stay at 0 because log(0) is undefined.
has_bsmt = trainc['TotalBsmtSF'] > 0
trainc['HasBsmt'] = has_bsmt.astype('int64')
# Cast first so the float log values are not written into an integer column.
trainc['TotalBsmtSF'] = trainc['TotalBsmtSF'].astype(float)
# Take the log of the flagged rows only, instead of the whole column.
trainc.loc[has_bsmt, 'TotalBsmtSF'] = np.log(trainc.loc[has_bsmt, 'TotalBsmtSF'])

# Transformed histogram and normal probability plot, basement houses only.
# The mask computed before the transform is reused so the selection does not
# depend on the sign of the logged values. sns.distplot is deprecated, so
# histplot plus an explicit scipy fit is used.
bsmt_log = trainc.loc[has_bsmt, 'TotalBsmtSF']
hist_ax = sns.histplot(bsmt_log, stat='density', kde=True)
support = np.linspace(bsmt_log.min(), bsmt_log.max(), 200)
hist_ax.plot(support, norm.pdf(support, *norm.fit(bsmt_log)), color='black')
fig = plt.figure()
res = stats.probplot(bsmt_log, plot=plt)

In [None]:
# Living area against sale price, plotted before any outlier removal.
sns.scatterplot(data=train, x='GrLivArea', y='SalePrice', color='green')

# Processing

In [None]:
# Remove, in place, the rows that combine a very large living area (> 4000)
# with a low sale price (< 300000), then redraw the scatter plot.
is_outlier = (train['GrLivArea'] > 4000) & (train['SalePrice'] < 300000)
outlier_index = train.loc[is_outlier].index
train.drop(index=outlier_index, inplace=True)
sns.scatterplot(data=train, x='GrLivArea', y='SalePrice', color='green')

In [None]:
# Stack the training features (target removed) on top of the test rows so both
# receive identical preprocessing; the Id column is dropped and the index is
# renumbered from 0.
# `columns=` replaces the positional axis argument, which current pandas no
# longer accepts.
df = (
    pd.concat([train.drop(columns='SalePrice'), test])
    .drop(columns='Id')
    .reset_index(drop=True)
)

In [None]:
# Count missing values per column and list the affected columns, most
# incomplete first.
na_counts = df.isna().sum()
na_counts[na_counts > 0].sort_values(ascending=False)

In [None]:
# Fill missing values.

# LotFrontage: use the median frontage of the house's neighbourhood.
df["LotFrontage"] = df.groupby("Neighborhood")["LotFrontage"].transform(
    lambda x: x.fillna(x.median())
)

# Columns where a missing entry is recorded as the label 'None'.
none_cols = [
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC',
    'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'BsmtQual', 'BsmtCond',
    'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'MasVnrType',
]
df[none_cols] = df[none_cols].fillna('None')

# Measurement and count columns where a missing entry is recorded as 0.
zero_cols = [
    'GarageYrBlt', 'GarageArea', 'GarageCars', 'BsmtFinSF1', 'BsmtFinSF2',
    'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'MasVnrArea',
]
df[zero_cols] = df[zero_cols].fillna(0)

# Columns previously filled with the integer 0. Filling label columns with a
# number mixes types within the column and invents a category, so the most
# frequent value is used instead.
# NOTE(review): assumes these six columns hold category labels, as in the
# Kaggle House Prices data - confirm against the data description.
mode_cols = [
    'MSZoning', 'SaleType', 'Exterior1st', 'Exterior2nd', 'KitchenQual',
    'Electrical',
]
for col in mode_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

df["Functional"] = df["Functional"].fillna("Typ")
df = df.drop(columns=['Utilities'])

In [None]:
# Move the target to the natural-log scale.
train['SalePrice'] = np.log(train['SalePrice'])

# Apply the same log transformation to three area features of the combined frame.
for area_col in ('GrLivArea', '1stFlrSF', 'LotArea'):
    df[area_col] = np.log(df[area_col])

# Basement area: add a 0/1 indicator and log-transform only the positive
# values, leaving rows without a basement at 0.
has_basement = df['TotalBsmtSF'] > 0
df['HasBsmt'] = has_basement.astype('int64')
df.loc[has_basement, 'TotalBsmtSF'] = np.log(df.loc[has_basement, 'TotalBsmtSF'])

In [None]:
# Split the combined frame back into training and test parts.

# Target vector, re-indexed from 0 to line up with the rows of `df`.
y = train['SalePrice'].reset_index(drop=True)

# The first n rows of `df` are the training rows, the rest are the test rows.
n = train.shape[0]
# Explicit copies: `test` is assigned into later, and writing into a slice of
# `df` is a chained-assignment hazard.
train = df.iloc[:n].copy()
test = df.iloc[n:].copy()

X = train.copy()

In [None]:
# Target-encode every object-dtype column.
obj_col = [name for name, kind in X.dtypes.items() if kind == 'object']

# The encoder is fitted on the training features and target, then the fitted
# mapping is applied to the test features.
target_encoder = ce.TargetEncoder()
encoded_train = target_encoder.fit_transform(X[obj_col], y)
X[obj_col] = encoded_train

encoded_test = target_encoder.transform(test[obj_col])
test[obj_col] = encoded_test

# Train model

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 5% of the training rows for validation. The split is seeded so the
# validation score is reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.05, random_state=42
)

In [None]:
# LGBMRegressor model
from lightgbm import early_stopping

model = LGBMRegressor()
# Stop training once the hold-out metric has not improved for 5 rounds.
# Early stopping is requested through the callback API because current
# LightGBM releases no longer accept `early_stopping_rounds` in fit().
model.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    callbacks=[early_stopping(stopping_rounds=5)],
)

In [None]:
# evaluation
# Predictions for the hold-out rows (the variable name is kept for
# compatibility, although these are not training-set predictions).
train_pred = model.predict(X_test)
# The printed value is the square root of the MSE, so it is labelled RMSE.
# The target was log-transformed earlier, so it is on the log scale.
rmse = np.sqrt(mean_squared_error(y_test, train_pred))
print("RMSE:", rmse)

# Submission

In [None]:
# The model predicts log prices; exponentiate to get back to the price scale
# before writing the submission file.
log_pred = model.predict(test)
submission['SalePrice'] = np.exp(log_pred)
submission.to_csv('submission.csv')

# Read the file back to display what was written.
pd.read_csv('submission.csv', index_col=0)