In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# Any results you write to the current directory are saved as output.

The data set contains 14 features, including the target (the sale price), for houses in the Boston area over a period from the 1950s up to the present. Features such as the garage, the quality, the number of rooms and many others were documented each time a house was built and sold. The objective here is to develop a model that can predict the selling price of a house. For this, a training set, a test set and a submission set are provided: the model is trained on the training set, tested, and finally the submission set is used to see how accurate it is. The steps are: creating the dataframes, performing an EDA by constructing graphs to visualize any trends, cleaning and preparing the data, and finally constructing the model.

## The first part is to create two dataframes 1) for training data 2) for test data

In [None]:
# Load the competition's train, test and sample-submission CSV files from the Kaggle input directory.
train_df = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
test_df = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')
sub_df = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv')

# Import packages

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
# Plot styling: use seaborn's "dark" theme for every figure in the notebook.
sns.set(style='dark')
# Table display: never truncate columns, and show up to 200 rows, so wide summary tables stay readable.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier # Random Forest
from sklearn.ensemble import RandomForestRegressor # Random Forest for the continuous SalePrice target

from sklearn import model_selection
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

#feature selection modules
from sklearn.feature_selection import SelectKBest # Univariate Feature Selection
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import chi2 # To apply Univariate Feature Selection
from sklearn.feature_selection import f_regression # Univariate scores for a continuous target
from sklearn.feature_selection import RFE # Recursive Feature Selection
from sklearn.decomposition import PCA # To apply PCA


### The business goal is to develop a model that can predict housing prices. So the target is price of the house.


# 1) Understanding the data:

In [None]:
def detailed_analysis(df, pred=None):
    """Build a per-column summary table for ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise; it may mix numeric and non-numeric columns.
    pred : str, optional
        Name of a numeric column. When given, an extra ``'corr <pred>'`` column
        holds the correlation of every numeric column with it.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the columns ``types``, ``counts``,
        ``nulls``, ``distincts``, ``missing ratio``, ``uniques``, ``skewness``,
        ``kurtosis`` and, when ``pred`` is given, ``corr <pred>``. The three
        statistics are NaN for non-numeric columns.

    Raises
    ------
    KeyError
        If ``pred`` is not a numeric column of ``df``.
    """
    obs = df.shape[0]
    # skew/kurt/corr are only defined for numbers. Selecting the numeric columns
    # explicitly avoids the TypeError that recent pandas raises on mixed frames.
    numeric = df.select_dtypes(include=[np.number])

    types = df.dtypes
    counts = df.count()
    nulls = df.isnull().sum()
    # NaN counts as its own distinct value, like ``x.unique().shape[0]``.
    distincts = df.nunique(dropna=False)
    missing_ratio = (nulls / obs) * 100
    # Built explicitly as an object Series so each cell holds one array of unique
    # values; ``DataFrame.apply`` may return a one-row DataFrame here instead.
    uniques = pd.Series(
        [df.iloc[:, pos].unique() for pos in range(df.shape[1])],
        index=df.columns,
        dtype=object,
    )
    skewness = numeric.skew()
    kurtosis = numeric.kurt()
    print('Data shape:', df.shape)

    parts = [types, counts, nulls, distincts, missing_ratio, uniques, skewness, kurtosis]
    cols = ['types', 'counts', 'nulls', 'distincts', 'missing ratio', 'uniques', 'skewness', 'kurtosis']
    if pred is not None:
        parts.append(numeric.corr()[pred])
        cols.append('corr ' + pred)

    # sort=False keeps the rows in the column order of ``df``.
    details = pd.concat(parts, axis=1, sort=False)
    details.columns = cols

    dtypes = details['types'].value_counts()
    print('____________________________\nData types:\n', dtypes)
    print('____________________________')
    return details


In [None]:
# Summarise the training data with correlations against the target,
# then show the table with the most positively correlated features first.
details = detailed_analysis(train_df, pred='SalePrice')
details_by_corr = details.sort_values(by='corr SalePrice', ascending=False)
display(details_by_corr)

In [None]:

# Split the summary table by skewness threshold before printing each group.
right_skewed = details.query('skewness > 1.4')
left_skewed = details.query('skewness < -1')
print('Highly positive skewed (skewed to the right)----------\n:', right_skewed)
print('Skewed to the left-----------\n:', left_skewed)

In [None]:
# Flag the object-dtype columns once, then partition the column names with that mask.
object_mask = [dtype == 'O' for dtype in train_df.dtypes]
num_col = [name for name, is_object in zip(train_df.columns, object_mask) if not is_object]
cat_col = [name for name, is_object in zip(train_df.columns, object_mask) if is_object]
print(num_col)
print(cat_col)

There are several observations we can make from the detailed table above:
* Starting with the dependent variable, we can see that it has a high positive skewness (skewed to the right) value which would mean we need to consider a transformation. Some possible transformations include: square root, cube root and log. There is always a debate about whether it is necessary to obtain a normal distribution. I have read mixed opinions on this topic where some say it is not necessary, and others say of course it is dependent on the type of model being used. In this case, since we are trying to predict the price, then the type of models would be a regression, most likely a linear regression as the title says. There are other variables that are skewed to the left or right, but some maybe could be removed because there is no significance.
* Next, we look at the type of variables. There are both numerical and categorical features, so they need to be separately analyzed

* There are missing values for each feature type, so this also needs to be dealt with before we start constructing our models.

* Feature selection will also be necessary since there are way too many features, and some are not significant


# Numerical features analysis

In [None]:
import math

target = 'SalePrice'
nc = 4
# At least one row so plt.subplots never receives nrows=0.
nr = max(1, math.ceil(len(num_col)/nc))

# squeeze=False keeps `ax` two-dimensional even when there is only one row,
# so flattening it works for any number of numeric columns.
figure, ax = plt.subplots(nrows = nr, ncols = nc, figsize = (20,5*nr), squeeze = False)
panels = ax.ravel()

# One scatter plot of the target against each numeric column, filled row by row.
for panel, value in zip(panels, num_col):
    panel.scatter(x = train_df[value], y = train_df[target], alpha = 0.3)
    panel.set_xlabel(value)
    panel.set_ylabel(target)
    panel.set_title(value + ' vs ' + target)

# Hide the unused panels of the last row instead of showing empty axes.
for panel in panels[len(num_col):]:
    panel.set_visible(False)

# Lay the grid out once, after every panel has been drawn.
figure.tight_layout(pad=5.0)

What we can get out of the plots is a visual understanding of the correlation between our target and numerical features. However, some of the features are discrete values, some also seem to have high concentration of values in one area. This could be a sign of possible transformation of the features, or maybe even some anomalies/outliers. However, one thing is for sure, it can already be seen here that certain numerical features can be dropped.

## Categorical variables

In [None]:
def boxplot(x, y, **kwargs):
    """Draw a seaborn box plot of ``y`` grouped by ``x`` on the current axes.

    The x tick labels are rotated by 90 degrees. Extra keyword arguments are
    accepted and ignored, so the function can be used as a callback for
    ``FacetGrid.map``.
    """
    sns.boxplot(x=x, y=y)
    plt.xticks(rotation=90)

# Long format: one row per (house, categorical feature) pair, with missing
# entries shown as their own "MISSING" category.
melted_categoricals = pd.melt(
    train_df.fillna("MISSING"),
    id_vars=['SalePrice'],
    value_vars=cat_col,
)

# One facet per categorical feature, four facets per row, with independent axes.
g = sns.FacetGrid(
    melted_categoricals,
    col='variable',
    col_wrap=4,
    sharex=False,
    sharey=False,
    height=5,
)
g = g.map(boxplot, 'value', 'SalePrice')

* MSZoning plot: There are clearly a few outliers, and it also appears that MSZoning has no strong influence on the sale price of a house.
* Street plot: Again, clearly outliers are present, but only for pave, and there seems to be no strong influence on the house price.
* Alley plot: Same conclusion, outliers and no strong influence

* Without going into too much detail for each plot, it appears that several features do not have a strong influence on the price of the house. The Utilities feature can possibly be removed since essentially only one value is observed. Neighborhood does have a strong influence on the house price, which also makes sense. If it is a neighborhood with a high crime rate, then this would have a negative influence on the price. Or if the neighborhood is perfect for a family, or for young or old couples, this would also influence the price. The condition of the house also seems to have an influence on the price, which is realistic. Roof material has an influence. This is a big factor in purchasing a home since changing the roof is an expensive affair, so it seems reasonable that the type of material used will affect the price. Exterior1st appears to have some influence on the price, but the majority is around 200k-300k. ExterQual appears to have an influence. Basement quality and pool quality appear to have an influence on the price of the house. Finally, the sale type also seems to have an influence. In conclusion, there are also signs here that certain features could be removed.

## Filling in missing values

I will first look at the categorical features. There are many features that use ratings in the form of strings to
evaluate the quality of a given attribute. I will convert the ratings to the numbers 1-5, where 1 is the worst and 5 is the best (a missing rating becomes 0). This is done in a loop over both the test and training sets to make it easier.

In [None]:
# to categorical feature
cols = ["MSSubClass","BsmtFullBath","BsmtHalfBath","HalfBath","BedroomAbvGr",
        "KitchenAbvGr","MoSold","YrSold","YearBuilt","YearRemodAdd",
        "LowQualFinSF","GarageYrBlt"]

for c in cols:
    train_df[c] = train_df[c].astype(str)
    test_df[c] = test_df[c].astype(str)
    

# encode quality
# Ex(Excellent), Gd（Good）, TA（Typical/Average）, Fa（Fair）, Po（Poor）
cols = ['ExterQual','ExterCond','BsmtQual','BsmtCond','HeatingQC',
        'KitchenQual','FireplaceQu','GarageQual','GarageCond','PoolQC']
for c in cols:
    train_df[c].fillna(0, inplace=True)
    test_df[c].fillna(0, inplace =True)
    
    train_df[c].replace({'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}, inplace=True)
    test_df[c].replace({'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}, inplace=True)
    

In [None]:
def pair_features_to_dummies(df, col1, col2, prefix):
    """One-hot encode two columns that describe the same attribute into one indicator set.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both columns.
    col1, col2 : str
        Names of the paired columns (for example ``'Condition1'``/``'Condition2'``).
    prefix : str
        Prefix of the generated indicator columns (``'<prefix>_<value>'``).

    Returns
    -------
    pandas.DataFrame
        Integer 0/1 indicators with one column per value seen in either column.
        A cell is 1 when the row has that value in ``col1`` or in ``col2``.
        Values are converted with ``astype(str)`` first, so a missing value
        yields a ``'<prefix>_nan'`` column.
    """
    # dtype=int keeps the arithmetic below numeric on every pandas version
    # (newer releases default to boolean dummies).
    d_1 = pd.get_dummies(df[col1].astype(str), prefix=prefix, dtype=int)
    d_2 = pd.get_dummies(df[col2].astype(str), prefix=prefix, dtype=int)

    # Give both frames the same columns, filling absent indicators with 0.
    all_columns = d_1.columns.union(d_2.columns)
    d_1 = d_1.reindex(columns=all_columns, fill_value=0)
    d_2 = d_2.reindex(columns=all_columns, fill_value=0)

    # A value present in both columns sums to 2; clip it back to a 0/1 flag.
    return (d_1 + d_2).clip(0, 1)

# (first column, second column, dummy prefix) for every paired feature.
paired_features = [
    ('Condition1', 'Condition2', 'Condition'),
    ('Exterior1st', 'Exterior2nd', 'Exterior'),
    ('BsmtFinType1', 'BsmtFinType2', 'BsmtFinType'),
]
paired_cols = [c for col1, col2, _ in paired_features for c in (col1, col2)]

# Keep the train and test dummies in separate lists so that neither overwrites the other.
train_dummies, test_dummies = [], []
for col1, col2, prefix in paired_features:
    train_part = pair_features_to_dummies(train_df, col1, col2, prefix)
    test_part = pair_features_to_dummies(test_df, col1, col2, prefix)
    # The test frame must expose exactly the indicator columns seen in training.
    test_part = test_part.reindex(columns=train_part.columns, fill_value=0)
    train_dummies.append(train_part)
    test_dummies.append(test_part)

# Each frame is assembled from its own source data; the original columns are
# dropped once their indicator versions are attached.
train_new = pd.concat([train_df] + train_dummies, axis=1).drop(columns=paired_cols)
test_new = pd.concat([test_df] + test_dummies, axis=1).drop(columns=paired_cols)

# display() shows both previews; a bare expression only renders the last one.
display(train_new.head(10), test_new.head(10))

In [None]:
# Per-column missing-value statistics for the engineered training frame.
null_mask = train_new.isnull()
total_null = null_mask.sum()
percent = total_null / null_mask.count()
total_not_null = train_new.notnull().sum()

# Three columns side by side: null count, null fraction, non-null count.
missing_data = pd.concat(
    [total_null, percent, total_not_null],
    axis=1,
    keys=["Total_null", "Percent of missing values", "total_not_null"],
)

# Only columns that actually have missing values, worst first.
has_missing = missing_data["Total_null"] > 0
print(missing_data.loc[has_missing].sort_values(by='Percent of missing values', ascending=False))

After looking at how many values are missing, I think it is justifiable to drop the top three features. Of course,
it is never optimal to drop data. But in this case where over 88% is missing, then filling the values would maybe create some false conclusions later on. What we could do is fill in the values for now, and issue a feature selection later. Lets try that first.

In [None]:
# Apply the same imputation rules to both frames. Every result is assigned back
# to the frame, because `frame[c].fillna(..., inplace=True)` works on a temporary
# column object and is not guaranteed to update the frame.
for frame in (train_new, test_new):
    # A missing value in these columns is filled with the label 'None'.
    for c in ['MiscFeature', 'Alley', 'Fence', 'GarageType']:
        frame[c] = frame[c].fillna('None')

    # GarageFinish is filled with the same 'None' label instead of the integer 0,
    # so the column does not mix strings and numbers.
    frame['GarageFinish'] = frame['GarageFinish'].fillna('None')

    # Missing lot frontage is filled with the median of the same neighborhood.
    neighborhood_median = frame.groupby('Neighborhood')['LotFrontage'].transform('median')
    frame['LotFrontage'] = frame['LotFrontage'].fillna(neighborhood_median)

    # A missing garage year falls back to the year the house was built.
    no_garage_year = frame['GarageYrBlt'].isnull()
    frame.loc[no_garage_year, 'GarageYrBlt'] = frame.loc[no_garage_year, 'YearBuilt']

    for c in ['GarageArea', 'GarageCars', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath']:
        # Columns that were converted to string labels earlier get the label '0'
        # rather than the number 0, so the column keeps a single type.
        zero = 0 if pd.api.types.is_numeric_dtype(frame[c]) else '0'
        frame[c] = frame[c].fillna(zero)

In [None]:
from sklearn.preprocessing import LabelEncoder


# Fill the remaining gaps and label-encode the non-numeric features.
# - The fill values (mode / median) come from the raw training column, before it is
#   encoded, and are reused for the test frame.
# - One encoder per feature is fitted on the labels of both frames, so the same label
#   always gets the same code in train and test.
# - `Series.items()` replaces `iteritems()`, which was removed in pandas 2.0.
for col in [c for c in train_new.columns if c != 'SalePrice']:
    in_test = col in test_new.columns
    is_categorical = not pd.api.types.is_numeric_dtype(train_new[col]) or (
        in_test and not pd.api.types.is_numeric_dtype(test_new[col])
    )

    if is_categorical:
        fill_value = train_new[col].mode()[0]
        train_labels = train_new[col].astype(object).fillna(fill_value).astype(str)
        label_sets = [train_labels]
        if in_test:
            test_labels = test_new[col].astype(object).fillna(fill_value).astype(str)
            label_sets.append(test_labels)

        encoder = LabelEncoder().fit(pd.concat(label_sets))
        train_new[col] = encoder.transform(train_labels)
        if in_test:
            test_new[col] = encoder.transform(test_labels)
    else:
        fill_value = train_new[col].median()
        train_new[col] = train_new[col].fillna(fill_value)
        if in_test:
            test_new[col] = test_new[col].fillna(fill_value)

In [None]:
# A notebook cell only renders its last bare expression, so the train preview was
# silently dropped; display() shows both frames.
display(train_new.head(5), test_new.head(5))

So now all strings are converted and missing values are dealt with. Let's try some feature selection to reduce the number of features. First, we split the training set into train and test parts. We should keep in mind that both parts still come from the training set. We also have a separate test set that we will use to test our model before using the submission data set.

In [None]:
# Features are every column except the target.
X = train_new.drop(columns=['SalePrice'])
y = train_new['SalePrice']

# Hold out 30% of the labelled data. The fixed random_state makes the split, and
# therefore every score computed below, reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 42)

In [None]:
# SalePrice is continuous, so the ranking model must be a regressor. A classifier
# would treat every distinct price as a separate class.
RandForest_RFE = RandomForestRegressor(random_state = 42)
# Recursively drop the least important feature (step=1) until five remain.
rfe = RFE(estimator = RandForest_RFE, n_features_to_select = 5, step = 1)
rfe = rfe.fit(X_train, y_train)

print('Best features chosen by RFE: \n')
for i in X_train.columns[rfe.support_]:
    print(i)

That is one way to do it. Another way is the following:

In [None]:
# Regressor + R^2 scoring: the target is a continuous price, so a classifier
# scored by accuracy is not meaningful here.
RandForest_RFECV = RandomForestRegressor(random_state = 42)
rfecv = RFECV(estimator = RandForest_RFECV, step = 1, cv = 3, scoring = 'r2')
rfecv = rfecv.fit(X_train, y_train)
print('Best number of features:', rfecv.n_features_)
print('Features :\n')
for i in X_train.columns[rfecv.support_]:
    print(i)

# `grid_scores_` was removed from newer scikit-learn releases; the mean
# cross-validated score per feature count now lives in `cv_results_`.
if hasattr(rfecv, 'cv_results_'):
    cv_scores = rfecv.cv_results_['mean_test_score']
else:
    cv_scores = rfecv.grid_scores_

plt.figure()
plt.xlabel("Number of Features")
plt.ylabel("Score of Selected Features")
# With step=1 the k-th score belongs to the model that uses k features.
plt.plot(range(1, len(cv_scores) + 1), cv_scores)
plt.show()

In [None]:
#apply SelectKBest class to score every feature and keep (at most) the 80 best
# f_regression is the univariate score for a continuous target; chi2 is a
# classification score and needs class labels.
n_best = min(80, X_train.shape[1])
bestfeatures = SelectKBest(score_func=f_regression, k=n_best)
fit = bestfeatures.fit(X_train,y_train)
#one table with the feature name next to its score
featureScores = pd.DataFrame({'Specs': X_train.columns, 'Score': fit.scores_})
print(featureScores.nlargest(n_best,'Score'))  #print the best-scoring features

In [None]:

from sklearn.ensemble import ExtraTreesRegressor
import matplotlib.pyplot as plt
# The target is a continuous price, so the importances come from a regressor.
model = ExtraTreesRegressor(random_state=42)
model.fit(X_train,y_train)
print(model.feature_importances_) #use inbuilt feature_importances_ of tree based models
#plot graph of feature importances for better visualization
feat_importances = pd.Series(model.feature_importances_, index=X_train.columns)
feat_importances.nlargest(20).plot(kind='barh')
plt.show()

In [None]:
#drop = ['PoolQC','MiscFeature','Alley','Fence','Utilities']
#for i in drop:
#    X_train = X_train.drop(columns = i)
#    X_test = X_test.drop(columns = i)
#    test_df = test_df.drop(columns = i)
#    print(i + "removed")

In [None]:
# Per-column missing-value statistics for the training frame.
null_mask = train_df.isnull()
total_null = null_mask.sum()
percent = total_null / null_mask.count()
total_not_null = train_df.notnull().sum()

# Three columns side by side: null count, null fraction, non-null count.
missing_data = pd.concat(
    [total_null, percent, total_not_null],
    axis=1,
    keys=["Total_null", "Percent of missing values", "total_not_null"],
)

# Only columns that actually have missing values, worst first.
has_missing = missing_data["Total_null"] > 0
print(missing_data.loc[has_missing].sort_values(by='Percent of missing values', ascending=False))

We can see that PoolQC, MiscFeature, Alley, and Fence have an extremely high percentage of missing values. This could mean that the house does not have this feature, or that the value was not recorded. However, it could be justified to exclude these features from our model, since filling in the missing values would give a biased result.

In [None]:
#drop = ['PoolQC','MiscFeature','Alley','Fence','Utilities']
#for i in drop:
#    X_train = X_train.drop(columns = i)
#    X_test = X_test.drop(columns = i)
#    test_df = test_df.drop(columns = i)
#    print(i + "removed")
    


In [None]:
# Partition the training columns by dtype: object columns are categorical,
# everything else is numerical.
numerical_features, categorical_features = [], []
for _name, _dtype in zip(X_train.columns, X_train.dtypes):
    if _dtype == 'O':
        categorical_features.append(_name)
    else:
        numerical_features.append(_name)



In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder,LabelBinarizer

In [None]:
#most_common = X_train['Electrical'].mode()
#X_train['Electrical'] = X_train['Electrical'].fillna(most_common)

#X_train["MasVnrArea"].fillna(0, inplace = True)
#X_train["MasVnrType"].fillna("None", inplace = True)

#X_test["MasVnrArea"].fillna(0, inplace = True)
#X_test["MasVnrType"].fillna("None", inplace = True)

#test_df["MasVnrArea"].fillna(0, inplace = True)
#test_df["MasVnrType"].fillna("None", inplace = True)

In [None]:

#X_train["FireplaceQu"].fillna("None", inplace = True)
#X_test["FireplaceQu"].fillna("None", inplace = True)
#test_df["FireplaceQu"].fillna("None", inplace = True)

In [None]:
class new_LabelBina(BaseEstimator, TransformerMixin):
    """Pipeline-compatible wrapper around a ``OneHotEncoder``.

    The wrapped encoder ignores categories that were not seen during ``fit``
    (they encode as all zeros), so transforming held-out data does not raise
    when it contains a new category.
    """

    def __init__(self):
        self.encoder = OneHotEncoder(handle_unknown='ignore')

    def fit(self, X, y=None):
        """Learn the categories of every column of ``X``; ``y`` is ignored."""
        self.encoder.fit(X)
        return self

    def transform(self, X):
        """Return the one-hot encoding of ``X`` produced by the fitted encoder."""
        return self.encoder.transform(X)

In [None]:
# Numerical columns: fill gaps with the column median, then standardise.
numerical_pipeline = Pipeline(steps = [('imputer', SimpleImputer(strategy = 'median')),
                                     ('scaler', StandardScaler())])

target_pipeline = Pipeline(steps = [('target_scale', StandardScaler())])

# Categorical columns: impute first, then one-hot encode.
# - SimpleImputer has no `missing_data` argument (it raised a TypeError); its default
#   `missing_values` is already NaN.
# - The imputer must run before the encoder, and its fill value is a string so the
#   column does not mix strings and numbers.
categorical_pipeline = Pipeline(steps = [('imputer_cat', SimpleImputer(strategy = 'constant',
                                                                      fill_value = 'None')),
                                        ('label_encoder', new_LabelBina())])

# Route every column to the pipeline that matches its type.
full_pipeline = ColumnTransformer(transformers=[('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features),
                                              ])

We will start with feature selection to see if there are any features that can be disregarded

In [None]:
# Fit the column transformer on the training split and transform that split in one step.
house_data_prep=full_pipeline.fit_transform(X_train)

We also need to scale the output

In [None]:
# Standardise the training target; the scaler expects a 2-D column, hence the reshape.
y_train_column = np.asarray(y_train).reshape(-1, 1)
scale_output = StandardScaler()
new_target = scale_output.fit(y_train_column).transform(y_train_column)

In [None]:
# The scaled target is continuous, so a classifier cannot be fitted on it; use a
# regressor scored by R^2. ravel() turns the (n, 1) column into the 1-D target
# that scikit-learn estimators expect.
RandForest_RFECV = RandomForestRegressor(random_state = 42)
rfecv = RFECV(estimator = RandForest_RFECV, step = 1, cv = 3, scoring = 'r2')
rfecv = rfecv.fit(house_data_prep, new_target.ravel())
print('Best number of features:', rfecv.n_features_)
print('Features :\n')

# The column transformer outputs the numerical columns first, then the categorical
# ones. One-hot encoding can add columns, so fall back to positional names when the
# counts do not line up.
feature_names = np.array(list(numerical_features) + list(categorical_features), dtype=object)
if len(feature_names) != house_data_prep.shape[1]:
    feature_names = np.array(['feature_%d' % k for k in range(house_data_prep.shape[1])],
                             dtype=object)
for i in feature_names[rfecv.support_]:
    print(i)

# `grid_scores_` was removed from newer scikit-learn releases; the mean
# cross-validated score per feature count now lives in `cv_results_`.
if hasattr(rfecv, 'cv_results_'):
    cv_scores = rfecv.cv_results_['mean_test_score']
else:
    cv_scores = rfecv.grid_scores_

plt.figure()
plt.xlabel("Number of Features")
plt.ylabel("Score of Selected Features")
plt.plot(range(1, len(cv_scores) + 1), cv_scores)
plt.show()

So now the full pipeline is constructed and fitted on the training set, so we can use house_data_prep to train our model.

In [None]:
from sklearn.metrics import mean_squared_error

# Fit a linear regression on the prepared training features and the standardised target.
linear=LinearRegression()
scale_output=StandardScaler()
new_y=scale_output.fit_transform(np.array(y_train).reshape(-1,1))
# Train on `house_data_prep`, the matrix produced by the pipeline
# (`spotify_preparared` was never defined in this notebook).
linear.fit(house_data_prep,new_y)

# Transform the held-out split with the already-fitted pipeline and scaler (no re-fitting).
test_data=full_pipeline.transform(X_test)
predictions=linear.predict(test_data)
new_y=scale_output.transform(np.array(y_test).reshape(-1,1))

# The square root of the MSE is the RMSE, here in standardised target units.
lin_rmse = np.sqrt(mean_squared_error(new_y,predictions))
lin_mse = lin_rmse  # kept under the original name for any later cell that reads it
print(lin_rmse)

# The same error expressed in the original SalePrice units.
price_predictions = scale_output.inverse_transform(np.asarray(predictions).reshape(-1,1))
lin_rmse_price = np.sqrt(mean_squared_error(np.array(y_test).reshape(-1,1), price_predictions))
print('RMSE in SalePrice units:', lin_rmse_price)