In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

%pylab inline
%config InlineBackend.figure_formats = ['retina']

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
%matplotlib inline

# Exploratory Data Analysis

In [2]:
# Importing the data
# `data` is the training frame and `test` the held-out frame; both are read
# from the competition's read-only input directory.

data = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

# Preview the first rows of the training frame.
data.head()

In [4]:
# Preview the first rows of the test frame.
test.head()

In [3]:
# Print the training frame's column summary (dtypes and non-null counts).
data.info()

In [5]:
# Dtypes treated as numeric when selecting feature columns.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

# Numeric columns of the training frame, with the target excluded by name.
# The original dropped it with a positional pop(-1), which silently removes
# the wrong column if the column order ever changes.
num_cols = [col for col in data.select_dtypes(include=numerics).columns
            if col != 'SalePrice']

# For test data
num_cols_test = test.select_dtypes(include=numerics).columns.to_list()

num_cols

In [6]:
# Preview the numeric columns of the training frame.
data[num_cols].head()

In [7]:
# Preview the numeric columns of the test frame.
test[num_cols_test].head()

In [8]:
# Store the month-sold column as a categorical in both frames.
for frame in (data, test):
    frame['MoSold'] = frame['MoSold'].astype('category')

data['MoSold'].dtype

In [9]:
import datetime


def add_age_features(frame, current_year):
    """Replace the year columns of `frame` with age-style features, in place.

    Adds 'Age', 'RemodelAge', 'SoldAge' and 'GarageAge' (each `current_year`
    minus the matching year column) plus 'BuiltRemodelAge'
    ('YearRemodAdd' - 'YearBuilt'), then drops 'Id' and the four source year
    columns. Returns None; the frame is modified in place.
    """
    frame['Age'] = current_year - frame['YearBuilt']
    frame['RemodelAge'] = current_year - frame['YearRemodAdd']
    frame['SoldAge'] = current_year - frame['YrSold']
    frame['BuiltRemodelAge'] = frame['YearRemodAdd'] - frame['YearBuilt']
    frame['GarageAge'] = current_year - frame['GarageYrBlt']
    frame.drop(['Id', 'YearRemodAdd', 'YearBuilt', 'YrSold', 'GarageYrBlt'],
               axis=1, inplace=True)


# Read the clock once so train and test share the same reference year.
# NOTE: the age features depend on the year the notebook is run; replace this
# with a fixed year if exact reproducibility across years is required.
current_year = datetime.date.today().year

add_age_features(data, current_year)

# new numerical columns
num_cols = data.select_dtypes(include=numerics).columns.to_list()

# For test
add_age_features(test, current_year)

# new numerical columns
num_cols_test = test.select_dtypes(include=numerics).columns.to_list()

data.head()

In [None]:
# Display the current list of numeric column names.
num_cols

In [10]:
# Names of the object-dtype columns in the training and test frames.
cat_cols = list(data.select_dtypes(include='object').columns)
cat_cols_test = list(test.select_dtypes(include='object').columns)

cat_cols

In [11]:
# Keep 'SaleCondition' out of the categorical feature lists: later cells use
# it un-encoded as the reference label for clustering. It is removed by name
# (the original popped the last entry), so re-running this cell cannot strip
# a further, unrelated column.
for cols in (cat_cols, cat_cols_test):
    if 'SaleCondition' in cols:
        cols.remove('SaleCondition')

In [12]:
# Preview the categorical columns of the training frame.
data[cat_cols].head()

In [13]:
# Preview the categorical columns of the test frame.
test[cat_cols_test].head()

In [14]:
# Fill the empty categorical values with 'None' and rename 'Othr' to 'Other'.
# The result is assigned back to the frame: `data[col].fillna(..., inplace=True)`
# operates on a temporary under pandas Copy-on-Write and leaves the frame unchanged.
data[cat_cols] = data[cat_cols].fillna('None').replace('Othr', 'Other')

# For test
test[cat_cols_test] = test[cat_cols_test].fillna('None').replace('Othr', 'Other')

In [16]:
# Count the missing values in each numerical column

data[num_cols].isna().sum()

We use histograms to check the skewness of the columns with missing data. This determines which statistic (mean or median) we will use to fill the empty fields.

In [17]:
# Histogram of LotFrontage in the training frame.
data['LotFrontage'].hist()

In [18]:
# Histogram of GarageAge in the training frame.
data['GarageAge'].hist()

In [19]:
# Histogram of MasVnrArea in the training frame.
data['MasVnrArea'].hist()

We can see all 3 columns are very skewed, hence we will be using the median to fill empty data.

In [20]:
# Calculate the skew of every numeric column.
# numeric_only=True is required on pandas >= 2.0, where skew() no longer
# silently drops the object/category columns and raises instead.

data.skew(numeric_only=True)

In [21]:
# Filling up the missing data with each column's median.
# The result is assigned back to the frame: `data[col].fillna(..., inplace=True)`
# operates on a temporary under pandas Copy-on-Write and leaves the frame unchanged.
skewed_missing = ['LotFrontage', 'GarageAge', 'MasVnrArea']
data[skewed_missing] = data[skewed_missing].fillna(data[skewed_missing].median())

# For test: every numeric column is filled with its own (test-set) median
test[num_cols_test] = test[num_cols_test].fillna(test[num_cols_test].median())

In [23]:
# Check the non-null counts of the test frame to confirm the numeric columns
# no longer have missing data
test.info()

**Analyse Our Features**

In [24]:
# Checking the correlation between features.
# Only numeric columns are passed in: on pandas >= 2.0, DataFrame.corr()
# raises on object/category columns instead of silently skipping them.

corr_matrix = data.select_dtypes(include='number').corr()
corr_matrix

In [25]:
# Zero out the self-correlations on the diagonal so they do not dominate
# the "most correlated" lookups that follow

for pos in range(len(corr_matrix)):
    corr_matrix.iat[pos, pos] = 0.0

corr_matrix

In [26]:
# For each column, find the row label with the largest absolute correlation

corr_matrix.abs().idxmax()

In [27]:
# Heatmap of the correlation matrix, with the colour scale centred at 0.
sns.heatmap(corr_matrix, center=0)

The output above shows, for each column, the column it is most highly correlated with.

Now we will take the log transformation of our skewed numerical variables

In [28]:
# Skew of every numeric training column, largest first; columns with a skew
# above 0.5 are selected for the log transform. numeric_only=True is required
# on pandas >= 2.0, where skew() raises on object/category columns.
log_cols = data.skew(numeric_only=True).sort_values(ascending=False)
log_cols = log_cols.loc[log_cols > 0.5]

# For test: reuse the columns chosen on the training data (restricted to those
# present in test) so both frames get the same transformation. Selecting by
# test's own skew could log-transform a different set of features.
log_cols_test = log_cols.loc[log_cols.index.isin(test.columns)]

log_cols

In [29]:
def _log1p_inplace(frame, columns):
    """Overwrite each listed column of `frame` with log(1 + x) of its values."""
    for name in columns:
        frame[name] = np.log1p(frame[name])


# NOTE: re-running this cell applies the transform a second time.
_log1p_inplace(data, log_cols.index)

# For test
_log1p_inplace(test, log_cols_test.index)

In [30]:
# Preview the numeric test columns at this point in the notebook.
test[num_cols_test].head()

In [31]:
# Histogram of MasVnrArea in the training frame at this point in the notebook.
data['MasVnrArea'].hist()

**Now let us scale our numerical data**

In [34]:
# Smallest GarageAge value in the test frame.
test['GarageAge'].min()

In [35]:
from sklearn.preprocessing import MinMaxScaler

# Scale every numeric training column to [0, 1]. MinMaxScaler works per
# column, so fitting once on all columns gives the same values as the original
# column-by-column loop. Fitting once also keeps every column's min/max on
# `mm`; the loop refit the scaler each time and kept only the last column's.
mm = MinMaxScaler()
data[num_cols] = mm.fit_transform(data[num_cols])


In [36]:
# Preview the numeric training columns after scaling.
data[num_cols].head()

**One hot encoding the categorical data**

In [37]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Shared encoder instances; the encoding loop below refits both for every column.
le = LabelEncoder()
ohe = OneHotEncoder()

In [38]:
# MoSold was cast to category earlier, so it is encoded with the other
# categorical columns. The membership check keeps this cell idempotent:
# a second run would otherwise add a duplicate entry.
if 'MoSold' not in cat_cols:
    cat_cols.append('MoSold')

In [39]:
# One Hot Encoding the dataset.
# Each categorical column is label-encoded, expanded into indicator columns
# named '<column>_<code>', and swapped in for the original column.
# The builtin `int` replaces `np.int`, which was removed in NumPy 1.24.
encoded_frames = []
for col in cat_cols:
    dt_le = le.fit_transform(data[col]).astype(int)
    dt_ohe = ohe.fit_transform(dt_le.reshape(-1, 1)).astype(int)

    n_cols = dt_ohe.shape[1]
    col_names = ['_'.join([col, str(x)]) for x in range(n_cols)]

    encoded_frames.append(pd.DataFrame(dt_ohe.toarray(),
                                       index=data.index,
                                       columns=col_names))

# Concatenate once rather than per column: same column order as before,
# without repeatedly copying the whole frame.
data = pd.concat([data.drop(cat_cols, axis=1)] + encoded_frames, axis=1)

In [40]:
# Preview the training frame after encoding.
data.head()

In [41]:
# Subset of columns to show in the pairwise scatter-plot grid.
smaller_cols = [
    'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
    'FullBath', 'HalfBath', 'BedroomAbvGr', 'PoolArea', 'MiscVal',
    'SalePrice', 'Age', 'RemodelAge', 'SoldAge', 'BuiltRemodelAge',
    'GarageAge',
]
sns.set_context('notebook')
sns.set_style('white')

# Faint, edge-less markers so overlapping points stay readable.
scatter_style = dict(alpha=0.1, edgecolor=None)
sns.pairplot(data[smaller_cols], plot_kws=scatter_style)

# PCA

Using PCA, let us see how many of the 316 columns will remain relevant for our prediction

In [42]:
from sklearn.decomposition import PCA

We will vary the number of PCA components (n_components) from 10 to 100.

In [47]:
# Feature matrix for PCA: everything except the target and the held-out label.
new_df = data.drop(['SalePrice', 'SaleCondition'], axis=1)

pca_list = list()
feature_weight_list = list()

# Fit a range of PCA models

for n in range(10, 101):

    # Create and fit the model. random_state pins any randomized SVD solver
    # scikit-learn may select, so the sweep is reproducible between runs.
    PCAmod = PCA(n_components=n, random_state=42)
    PCAmod.fit(new_df)

    explained = PCAmod.explained_variance_ratio_
    total_explained = explained.sum()

    # Store the model and variance
    pca_list.append({'n': n, 'model': PCAmod, 'variance': total_explained})

    # Calculate and store feature importances: each component's absolute
    # loadings are weighted by its share of the explained variance, summed per
    # feature, then normalised to sum to 1.
    weights = explained.reshape(-1, 1) / total_explained
    overall_contribution = np.abs(PCAmod.components_) * weights
    abs_feature_values = overall_contribution.sum(axis=0)
    feature_weight_list.append(pd.DataFrame({'n': n,
                                             'features': new_df.columns,
                                             'values': abs_feature_values / abs_feature_values.sum()}))

# One row per n; built from records so 'variance' is a numeric column.
pca_df = pd.DataFrame(pca_list).set_index('n')
pca_df

In [48]:
# Reshape the stacked per-model weights into one row per n and one column per feature.
features_df = pd.pivot(pd.concat(feature_weight_list),
                       index='n', columns='features', values='values')

features_df

We can see for most of our features, their weights are very low, hence not important.

Let us plot the explained variance

In [54]:
sns.set_context('talk')

# Bar chart of total explained variance for each number of components.
fig, ax = plt.subplots(figsize=(20, 5))
ax = pca_df['variance'].plot(kind='bar', ax=ax)

ax.set_xlabel('Number of dimensions')
ax.set_ylabel('Percent explained variance')
ax.set_title('Explained Variance vs Dimensions');

From the above, the explained variance does not increase much beyond 95 dimensions. Therefore, we will settle on 95 components.

Now, using a regularized linear regression (ElasticNet), let us predict the house prices with our PCA-reduced dataset.

In [71]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from sklearn.metrics import r2_score, mean_squared_error

# Modelling frame: the un-encoded SaleCondition label is left out.
data1 = data.drop(columns='SaleCondition')

# Features and target for the regression.
X = data1.drop(columns='SalePrice')
y = data1['SalePrice']

def get_avg_score(ns):
    """Score a PCA + ElasticNet pipeline for each component count in `ns`.

    Uses the notebook-level `X` and `y`. A single 67/33 train/test split
    (random_state=42) is shared by all pipelines; each pipeline is fitted on
    the training part and scored on the held-out part.

    Despite the name, no averaging is done: each n gets one score per metric.

    Parameters
    ----------
    ns : iterable of int
        Values passed to PCA(n_components=...).

    Returns
    -------
    pandas.DataFrame
        One row per entry of `ns` with columns 'n', 'R2' and 'MSE'.
    """
    ns = list(ns)

    # The split does not depend on n, so it is done once outside the loop.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

    r2_scores = []
    mse_scores = []
    for n in ns:
        pipe = Pipeline([
            # random_state pins any randomized SVD solver PCA may select.
            ('pca', PCA(n_components=n, random_state=42)),
            ('estimator', ElasticNet(warm_start=True, max_iter=100000, random_state=42)),
        ])
        pipe.fit(X_train, y_train)

        # Predict once and reuse the result for both metrics.
        y_pred = pipe.predict(X_test)
        r2_scores.append(r2_score(y_test, y_pred))
        mse_scores.append(mean_squared_error(y_test, y_pred))

    return pd.DataFrame({'n': ns,
                         'R2': r2_scores,
                         'MSE': mse_scores
                        })


# Component counts to evaluate; the result has one row of R2/MSE per count.
ns = [5, 50, 90, 95, 100, 105, 110, 300]
score_list = get_avg_score(ns)

In [72]:
# Display the score table.
score_list

# KMEANS

Using KMeans clustering to cluster our data into its different SaleCondition categories

In [77]:
from sklearn.cluster import KMeans

# Clustering input: all columns except the target and the SaleCondition label.
data2 = data.drop(columns=['SalePrice', 'SaleCondition'])

# Fit six clusters, then store each row's assigned cluster.
km = KMeans(n_clusters=6, random_state=42).fit(data2)

data2['kmeans'] = km.predict(data2)

In [78]:
# Number of rows in each SaleCondition category.
data['SaleCondition'].value_counts()

In [79]:
# Number of rows assigned to each KMeans cluster.
data2['kmeans'].value_counts()

# Agglomerative Clustering

In [82]:
from sklearn.cluster import AgglomerativeClustering

ag = AgglomerativeClustering(n_clusters=6, linkage='ward', compute_full_tree=True)

# Cluster on the features only. By this point data2 also holds the 'kmeans'
# labels (and 'agglom' on a re-run); feeding those in would let one
# algorithm's output influence the other's clusters.
cluster_features = data2.drop(columns=['kmeans', 'agglom'], errors='ignore')

# fit_predict fits the model and returns the labels in one pass, so the
# separate fit() call the original made beforehand is not needed.
data2['agglom'] = ag.fit_predict(cluster_features)

In [83]:
# Number of rows assigned to each agglomerative cluster.
data2['agglom'].value_counts()