# Imports

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file in it
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for name in names_in_folder:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Import additional libraries that I plan on using**

In [None]:
# more imports
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

**Load training and test datasets**

In [None]:
# load data: both competition CSVs are read the same way, training file first
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/house-prices-advanced-regression-techniques/train.csv',
        '/kaggle/input/house-prices-advanced-regression-techniques/test.csv',
    )
)

# Exploring the Data

In [None]:
# Preview the first five rows of the training frame
train.head(n=5)

81 columns!

Lots of missing data

In [None]:
# Print each column's dtype and non-null count; a non-null count below the
# number of rows means that column has missing values.
train.info()

**Take a look at SalePrice**

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the target column
train.loc[:, 'SalePrice'].describe()

Observations:
* The max is way higher than the 75th percentile
* Standard deviation seems pretty high
* There could be outliers that will mess with regression

**Visualize the distribution of SalePrice**

In [None]:
# sns.distplot is deprecated and scheduled for removal; histplot with a
# density-scaled histogram and a KDE overlay is the documented replacement.
ax = sns.histplot(train['SalePrice'], kde=True, stat='density', kde_kws=dict(cut=3))
ax.set_title('Distribution of SalePrice')
plt.xticks(rotation=0);

Observations:
* Asymmetric distribution
* Right tail is really long
* Most of the houses cost around $200,000 but there are some really expensive houses in the dataset

**Check the skewness value**
* Skewed to the right
* Linear regression works best when the residuals are roughly normally distributed, and a heavily skewed target makes that unlikely

In [None]:
# Sample skewness of the target; a positive value means a long right tail
saleprice_skew = train['SalePrice'].skew()
print('Skewness = ', saleprice_skew)

**Create a correlation map**

In [None]:
# Pairwise correlations between the numeric columns only.
# The frame also holds string columns; from pandas 2.0 onward DataFrame.corr()
# no longer drops them silently and raises instead, so select the numeric
# columns explicitly (this also works on older pandas versions).
corr = train.select_dtypes(include='number').corr()

plt.figure(figsize=(15,12))

sns.heatmap(corr)
plt.show();

Highly correlated variables (some of these are extremely obvious)
* TotRmsAbvGrd and GrLivArea
* GarageYrBlt and YearBuilt
* 1stFlrSF and TotalBsmtSF
* OverallQual and SalePrice
* GarageCars and GarageArea

## SalePrice Correlation

**List the variables that are most correlated with SalePrice**

In [None]:
# Rank the numeric columns by their correlation with SalePrice, strongest first,
# and keep the first ten entries
saleprice_corr = corr['SalePrice']
top10_corr = saleprice_corr.sort_values(ascending=False).head(10)

# The first entry is skipped (SalePrice correlates perfectly with itself)
top_features = top10_corr.iloc[1:].index  # store top_features for later

top10_corr

### Scatterplots

In [None]:
# Top features and SalePrice: one scatterplot per feature, stacked vertically.
# The number of rows follows len(top_features) instead of a hard-coded 9, so
# the cell keeps working if the feature selection changes.
# squeeze=False guarantees a 2-D axes array even when there is a single feature.
fig, axes = plt.subplots(nrows=len(top_features), ncols=1, figsize=(5, 25), squeeze=False)

for panel, feature in zip(axes[:, 0], top_features):
    panel.scatter(x=train[feature], y=train['SalePrice'])
    panel.set_xlabel('%s' % feature)
    panel.set_ylabel('SalePrice')

plt.tight_layout()
plt.show();

There are definitely some outliers
* A few of the scatterplots have two or three dots sitting in the bottom right
* Must be removed in order to create an accurate model

# Data Cleaning/Processing

**Identify and drop outliers**
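* There could be better ways to deal with outliers but dropping them is simple and effective
* A row is treated as an outlier when any of the top correlated features lies more than 1.5 × IQR below its 25th percentile or above its 75th percentile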

In [None]:
# IQR-based outlier removal, one feature at a time.
# Rows are dropped as soon as a feature flags them, so the quartiles of every
# later feature are computed on the rows that survived the earlier features.
IQR_MULTIPLIER = 1.5  # distance beyond the quartiles, in IQRs, that is still accepted

outlier_records = []  # one summary record per feature for the report table

for feature in top_features:
    values = train[feature]

    # 25th and 75th percentiles
    q1, q3 = np.percentile(values, [25, 75])
    # Outlier cutoff derived from the interquartile range
    cut_off = IQR_MULTIPLIER * (q3 - q1)
    # Lower and Upper bounds
    lower_bound = q1 - cut_off
    upper_bound = q3 + cut_off

    # Vectorized comparison replaces a Python loop of scalar .loc lookups
    is_outlier = (values < lower_bound) | (values > upper_bound)

    outlier_records.append({
        'Column': feature,
        'Q1': q1,
        'Q3': q3,
        'Lower bound': lower_bound,
        'Upper bound': upper_bound,
        'No. of outliers': int(is_outlier.sum()),
    })

    # drop outliers (returns a new frame instead of mutating in place)
    train = train.drop(index=train.index[is_outlier])

df_out = pd.DataFrame(
    outlier_records,
    columns=['Column', 'Q1', 'Q3', 'Lower bound', 'Upper bound', 'No. of outliers'],
)
df_out.sort_values(by='No. of outliers',ascending=False)

# Feature Transformation

**Combine and process the training and test datasets**
* Keep track of number of rows in the training dataframe
* Drop the Id column because it doesn't help predict SalePrice
* Log transform SalePrice to reduce skewness (store it in target)
* Drop SalePrice from the training data because it's not in the test data

In [None]:
# number of training rows, used later to split the combined frame again
nrows = train.shape[0]

# log transform SalePrice to reduce its right skew; this is the regression target
target = np.log(train['SalePrice'])

# visualize SalePrice again
# (sns.distplot is deprecated; histplot with a density histogram and KDE
# overlay is the documented replacement)
sns.histplot(target, kde=True, stat='density', kde_kws=dict(cut=3))
plt.xticks(rotation=0);

# drop Id and SalePrice from train dataframe (new frame, no in-place mutation)
train = train.drop(columns=['Id','SalePrice'])

# store test Id for the submission file
test_id = test['Id']

# drop test Id
test = test.drop(columns=['Id'])

# concatenate train and test dataframes; from here on `train` holds both,
# with the training rows first
train = pd.concat([train,test])

The distribution of SalePrice is roughly normal now after the log transformation

**Examine missing data**
* Null doesn't mean not important

In [None]:
# Null values: count the missing entries per column and show the 20 largest counts
missing_per_column = train.isna().sum()
missing_per_column.sort_values(ascending=False).head(20)

## Encoding
* Replace ordinal variables with appropriate numbers
* Replace null categorical variables with "other" or "typical"
* Replace null numerical variables with either 0, the median, or the mode, depending on data_description.txt

### Ordinal Encoding

In [None]:
def _encode_ordinal(frame, columns, codes, missing=None):
    """Replace ordered category labels with numbers in the given columns.

    The encoded column is assigned back to ``frame``. The previous
    ``frame[col].replace(..., inplace=True)`` pattern is chained in-place
    assignment, which silently does nothing under pandas Copy-on-Write.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame whose columns are overwritten with their encoded version.
    columns : list of str
        Columns to encode.
    codes : dict
        Maps each category label to its numeric code.
    missing : int, optional
        Code used for missing entries. When omitted, missing entries stay NaN.

    Raises
    ------
    ValueError
        If a column holds a non-missing label that is not a key of ``codes``.
    """
    for col in columns:
        column = frame[col]
        # A numeric column has already been encoded (e.g. the cell was re-run)
        if not pd.api.types.is_numeric_dtype(column):
            encoded = column.map(codes)
            unknown = column.notna() & encoded.isna()
            if unknown.any():
                raise ValueError(
                    'Unexpected categories in %r: %s' % (col, list(column[unknown].unique()))
                )
            column = encoded
        if missing is not None:
            column = column.fillna(missing)
        # Keep integer codes whenever no missing value forces a float column
        if not column.isna().any():
            column = column.astype(int)
        frame[col] = column


# PoolQC -- Ex: Excellent, Gd: Good, TA: Average/Typical, Fa: Fair, NA: No Pool
_encode_ordinal(train, ['PoolQC'], {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1}, missing=0)

# Fence -- GdPrv: Good Privacy, MnPrv: Minimum Privacy, GdWo: Good Wood,
# MnWw: Minimum Wood/Wire, NA: No Fence
_encode_ordinal(train, ['Fence'], {'GdPrv': 4, 'MnPrv': 3, 'GdWo': 2, 'MnWw': 1}, missing=0)

# Quality scales where NA means the feature does not exist (no fireplace,
# no garage, no basement) -- Ex: Excellent, Gd: Good, TA: Typical/Average,
# Fa: Fair, Po: Poor, NA: absent
quality_or_absent = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}
_encode_ordinal(
    train,
    ['FireplaceQu', 'GarageCond', 'GarageQual', 'BsmtCond', 'BsmtQual'],
    quality_or_absent,
    missing=0,
)

# BsmtExposure -- Gd: Good Exposure, Av: Average Exposure, Mn: Minimum Exposure,
# No: No Exposure, NA: No Basement
_encode_ordinal(train, ['BsmtExposure'], {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1}, missing=0)

# BsmtFinType1/2 -- GLQ: Good Living Quarters, ALQ: Average Living Quarters,
# BLQ: Below Average Living Quarters, Rec: Average Rec Room, LwQ: Low Quality,
# Unf: Unfinished, NA: No Basement
_encode_ordinal(
    train,
    ['BsmtFinType1', 'BsmtFinType2'],
    {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1},
    missing=0,
)

# CentralAir -- N: No, Y: Yes
_encode_ordinal(train, ['CentralAir'], {'N': 0, 'Y': 1})

# Quality scales without an "absent" level -- Ex: Excellent, Gd: Good,
# TA: Typical/Average, Fa: Fair, Po: Poor
_encode_ordinal(
    train,
    ['HeatingQC', 'ExterCond', 'ExterQual', 'KitchenQual'],
    {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'Po': 0},
)

# Replace missing KitchenQual with the mode (because NA doesn't mean no kitchen)
train['KitchenQual'] = train['KitchenQual'].fillna(train['KitchenQual'].mode()[0])


### Replace NA with "None" for categorical features
* Useful for One Hot Encoding later

In [None]:
# For these columns a missing value means the feature is absent:
#   MiscFeature  -> no miscellaneous feature
#   Alley        -> no alley access
#   GarageFinish -> no garage
#   GarageType   -> no garage
#   MasVnrType   -> no masonry work
# The filled column is assigned back to the frame: `train[col].fillna(...,
# inplace=True)` is chained in-place assignment and has no effect on the frame
# under pandas Copy-on-Write.
for col in ['MiscFeature', 'Alley', 'GarageFinish', 'GarageType', 'MasVnrType']:
    train[col] = train[col].fillna('None')

### More Encoding

In [None]:
# Replace null LotFrontage with the median LotFrontage of the same neighborhood
train['LotFrontage'] = train.groupby('Neighborhood')['LotFrontage'].transform(lambda x:x.fillna(x.median()))

# Every fill below assigns the result back to the frame: `train[col].fillna(...,
# inplace=True)` is chained in-place assignment and has no effect on the frame
# under pandas Copy-on-Write.

# Missing garage / basement / masonry measurements are filled with 0
# (if there is no masonry work, the area is 0)
zero_fill_columns = [
    'GarageYrBlt', 'GarageCars', 'GarageArea',
    'BsmtHalfBath', 'BsmtFullBath', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    'MasVnrArea',
]
for col in zero_fill_columns:
    train[col] = train[col].fillna(0)

# Replace with the most common value
for col in ['MSZoning', 'Utilities', 'Electrical']:
    train[col] = train[col].fillna(train[col].mode()[0])

# Fixed replacement labels:
#   Functional  -> 'Typ'   (data_description says "assume typical unless deductions are warranted")
#   SaleType    -> 'Oth'   (assume 'Other' if null)
#   Exterior1st -> 'Other'
#   Exterior2nd -> 'Other'
label_fill = {
    'Functional': 'Typ',
    'SaleType': 'Oth',
    'Exterior1st': 'Other',
    'Exterior2nd': 'Other',
}
for col, label in label_fill.items():
    train[col] = train[col].fillna(label)

## Create New Features

In [None]:
# Boolean features: 1 when the source column is greater than 0, otherwise 0
flag_sources = {
    'HasPool': 'PoolArea',
    'HasFirePlace': 'FireplaceQu',
    'HasFence': 'Fence',
    'HasMsonary': 'MasVnrArea',
    'HasGarage': 'GarageArea',
    'HasBsmt': 'TotalBsmtSF',
}
for flag_name, source_column in flag_sources.items():
    train[flag_name] = (train[source_column] > 0).astype(int)

In [None]:
# Column arithmetic replaces row-wise DataFrame.apply(axis=1), which builds a
# Python object per row; the resulting values are the same.

# Total surface area of house
train['TotalSF'] = train['1stFlrSF'] + train['2ndFlrSF'] + train['TotalBsmtSF']

# Total number of bathrooms in the house (half baths count as 0.5)
train['TotalBath'] = (
    train['FullBath']
    + 0.5 * train['HalfBath']
    + train['BsmtFullBath']
    + 0.5 * train['BsmtHalfBath']
)

# Total Porch area in the house
train['TotalPorch'] = (
    train['OpenPorchSF']
    + train['EnclosedPorch']
    + train['3SsnPorch']
    + train['ScreenPorch']
)

# New house or an old house: flagged as new when SaleCondition is 'Partial'
train['NewHouse'] = (train['SaleCondition'] == 'Partial').astype(int)

Convert categorical data into numbers to prepare for regression

In [None]:
# One-Hot/Dummy encoding of the remaining categorical columns; the first level
# of each is dropped
train = pd.get_dummies(data=train, drop_first=True)
train.head(n=5)

**Separate the training and test datasets**

In [None]:
# The combined frame holds the training rows first: the first `nrows` rows are
# the train dataset, everything after them is the test dataset
df, test = train.iloc[:nrows], train.iloc[nrows:]

**Split the training dataset up**
* So that I can train the model without using the test dataset

In [None]:
from sklearn.model_selection import train_test_split

# Features and log-transformed target
X, y = df, target

# training and validation set (80% / 20%, fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=27,
)

# Attempt #1: Linear Regression Model

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Ordinary least squares baseline fitted on the training split
lr = LinearRegression().fit(X_train, y_train)

# Score on the held-out split
held_out_pred = lr.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, held_out_pred))  # use RMSE because the competition scores using RMSE
print(rmse)

"Score is good, but it can be better"

# Attempt #2: Ridge Regression Model
*'This is another one of the types of regression in machine learning which is usually used when there is a high correlation between the independent variables'*

In [None]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# different alpha values
alphas = [0.01, 0.1, 0.3, 1, 3, 5, 10, 15, 20, 25, 30, 40, 50, 75, 100]  # alpha is the regularization strength

def calcRMSE(a):
    """Fit a Ridge model with regularization strength ``a`` and score it.

    The model is fitted on ``X_train`` / ``y_train`` and the returned value is
    its RMSE on ``X_test`` / ``y_test``; all four are read from the notebook
    namespace.
    """
    ridge = Ridge(alpha=a).fit(X_train, y_train)
    held_out_pred = ridge.predict(X_test)
    return np.sqrt(mean_squared_error(y_test, held_out_pred))

for alpha in alphas:
    print('For Alpha = ',alpha,', RMSE = ',calcRMSE(alpha))


# Find the alpha that produces the lowest RMSE among the integers 1..99
vals = {candidate: calcRMSE(candidate) for candidate in range(1, 100)}

optimalAlpha = min(vals, key=vals.get)
print("Optimal alpha value =", optimalAlpha)

**Fit the data using Ridge Regression with the alpha from the previous step**

In [None]:
# Fit the final Ridge model with the alpha selected by the search above.
# NOTE(review): this fits on X_train only (the 80% split), not on the full
# training frame -- confirm that is intended before submitting.
model = Ridge(alpha=optimalAlpha)
model.fit(X_train,y_train)

**Exponentiate the predicted values for SalePrice (because it was log transformed earlier)**

In [None]:
# The model was fitted on log(SalePrice), so its predictions are on the log scale
log_pred = model.predict(test)
# Undo the log transform to get prices in the original units
actual_pred = np.exp(log_pred)

**Submit the predictions**

In [None]:
# Pair every test Id with its predicted price and write the submission file
submit = pd.DataFrame({'Id': test_id, 'SalePrice': actual_pred})
submit.to_csv('submission.csv', index=False)

![Final score](https://i.ibb.co/ryGdff7/image.png)