# Kaggle Competition:
Predict the House Prices using advanced regression techniques





## Import Required Packages

---



In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

Import the Google Drive helper and mount the drive so the CSV files can be loaded:

In [None]:
# Mount Google Drive at /content/drive; later cells change into a folder under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Print the current working directory (shell command run through the `!` escape).
!pwd

Change the working directory to the project folder that contains the `data` directory:

In [None]:
# Make the project folder on the mounted drive the working directory,
# then list the contents of its data/ subfolder.
%cd /content/drive/MyDrive/Colab_Notebooks/Kaggle/Price_Prediction
!ls data

## Read datasets - train.csv and test.csv
Datasets are read from **Google Drive**.
The files are loaded from the `data/` folder relative to the working directory set above (`/content/drive/MyDrive/Colab_Notebooks/Kaggle/Price_Prediction`).

---



In [None]:
# NOTE(review): unused leftover — the CSV reads use paths relative to the working directory instead.
#data_path = '/MyDrive/Colab_Notebooks/Kaggle/Price_Prediction/data/'

In [None]:
# Load the training and test splits from the data/ folder of the working directory.
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

In [None]:
# Summary statistics of both splits, shown as rich tables rather than plain-text dumps.
# `display` is provided by IPython in every notebook cell, so no import is needed.
display(train_df.describe(), test_df.describe())

Dropping the `Id` column from both datasets:

In [None]:
# Remove the 'Id' column from both splits.
train_df = train_df.drop(columns='Id')
test_df = test_df.drop(columns='Id')

## Correlation and Heatmap

---



In [None]:
# Pairwise Pearson correlation between the numeric columns of the training data.
# The frame still contains string columns at this point; pandas >= 2.0 no longer
# drops them silently in corr() and raises instead, so select numeric columns explicitly.
corrMat = train_df.select_dtypes(include='number').corr()
corrMat

In [None]:
# Draw the correlation matrix as a square-celled heatmap on a 15x15 inch figure.
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(corrMat, vmax=1, square=True, cmap="YlGnBu", ax=ax)
ax.set_title("Correlation Matrix")

## Dividing training dataset into x and y

---



In [None]:
# Separate the training data into the feature matrix (x) and the target column (y).
train_df_x = train_df.drop(columns='SalePrice')
train_df_y = train_df['SalePrice']

## Preprocess train.csv

---



### Imputing Missing Values in train.csv


In [None]:
# Impute missing values in the training features.
# Column-level fillna results are assigned back instead of using inplace=True on a
# column selection: that chained form is deprecated and modifies nothing under
# pandas copy-on-write.

# LotFrontage: fill gaps with the column mean (rounded to 2 decimals).
meanValueLF = np.round(train_df_x['LotFrontage'].mean(), 2)
train_df_x['LotFrontage'] = train_df_x['LotFrontage'].fillna(meanValueLF)

# Alley: treat a missing value as its own category, 'Unknown'.
train_df_x['Alley'] = train_df_x['Alley'].fillna('Unknown')

# MasVnrArea: fill gaps with the most frequent value.
# Series.mode() returns a Series; passing it straight to fillna aligns on the index
# and leaves almost every gap unfilled, so take the first mode as a scalar.
# Fall back to 0 if the column has no non-missing values at all.
masVnrAreaModes = train_df_x['MasVnrArea'].mode()
masVnrAreaFill = masVnrAreaModes.iloc[0] if not masVnrAreaModes.empty else 0
train_df_x['MasVnrArea'] = train_df_x['MasVnrArea'].fillna(masVnrAreaFill)

# GarageYrBlt: drop the rows where the value is missing.
# NOTE(review): train_df_y is not filtered here, so after this step it has more rows
# than train_df_x. Align it (e.g. train_df_y.loc[train_df_x.index]) before fitting a model.
train_df_x.dropna(subset=['GarageYrBlt'], inplace=True)

### Change "object" categorical columns into numerical categorical columns in train.csv


In [None]:
# Integer-encode every object-dtype column of the training features.
# Each column is converted to a pandas categorical and replaced by its category codes
# (pandas uses the code -1 for missing values).
# NOTE(review): the test set is encoded independently in its own cell, so the same
# string is not guaranteed to receive the same code in both splits.
trainObjCol = train_df_x.select_dtypes(include=['object']).columns.tolist()
trainCodes = {name: train_df_x[name].astype('category').cat.codes for name in trainObjCol}
train_df_x = train_df_x.assign(**trainCodes)

### Applying standard scaler on train.csv

In [None]:
# Standardise the training features: learn the per-column mean and standard deviation,
# then apply them. The result is stored in train_df_X (capital X), which is distinct
# from the DataFrame train_df_x.
scalar = StandardScaler().fit(train_df_x)
train_df_X = scalar.transform(train_df_x)

## Preprocess test.csv

---



### Imputing Missing Values in Test dataset

In [None]:
# Replacing NA values in LotFrontage column with the mean value of the column data
meanValueLFtest = np.round(test_df['LotFrontage'].mean(), 2)
test_df['LotFrontage'].fillna(meanValueLFtest, inplace = True);

# Replace Alley column value NA with Unknown
test_df['Alley'] = test_df['Alley'].replace(np.nan, 'Unknown')

# Imputing NA values in MasVnrArea column with mode value
test_df['MasVnrArea'].fillna(test_df['MasVnrArea'].mode(), inplace = True)
train_df['MasVnrArea'] = train_df['MasVnrArea'].replace(np.nan, 0)

# Dropping rows with NA values in GarageYrBlt
test_df.dropna(subset=['GarageYrBlt'], inplace = True)

### Change "object" categorical columns into numerical categorical columns in test.csv

In [None]:
# Integer-encode every object-dtype column of the test features.
# Each column is converted to a pandas categorical and replaced by its category codes
# (pandas uses the code -1 for missing values).
# NOTE(review): the codes are derived from the test data alone, so they are not
# guaranteed to match the codes assigned to the same strings in the training set.
testObjCol = test_df.select_dtypes(include=['object']).columns.tolist()
testCodes = {name: test_df[name].astype('category').cat.codes for name in testObjCol}
test_df = test_df.assign(**testCodes)

### Applying standard scaler on test.csv

In [None]:
# Standardise the test features with the scaler that was fitted on the training data
# (`scalar`), so both splits are expressed on the same scale.
# Fitting a fresh scaler here would standardise the test set with its own mean and
# standard deviation and would also overwrite the training-fitted `scalar`.
test_df_X = scalar.transform(test_df)

## Principal Component Analysis

---



In [None]:
# Fit a PCA with default settings (no n_components limit) on the scaled training
# features and project the data onto the resulting components.
pca=PCA()
pca_res = pca.fit_transform(train_df_X)
pca_res

In [None]:
# Show the covariance matrix computed by the fitted PCA model.
pca.get_covariance()

In [None]:
# Share of the total variance explained by each principal component of the fitted PCA.
explained_variance=pca.explained_variance_ratio_
explained_variance

Plotting explained variance from PCA:

In [None]:
# Bar chart of the variance ratio explained by each principal component.
with plt.style.context('dark_background'):
    plt.figure(figsize=(10, 10))
    # One bar per component. The count is taken from the data rather than hard-coded
    # as the number of features, so the plot keeps working if the feature set changes.
    plt.bar(range(len(explained_variance)), explained_variance, alpha=0.5,
            align='center', label='individual explained variance')
    plt.xlabel('Principal components')
    plt.ylabel('Explained variance ratio')
    plt.legend()
    plt.tight_layout()

The largest share of the variance explained by a single principal component is approximately 14%. We will keep only those components that each explain more than 2% of the variance; from the bar chart above, that gives the first 10 components.

In [None]:
# Refit PCA keeping only 10 components and project the scaled training features onto them.
# This rebinds `pca`, replacing the model fitted without a component limit.
pca=PCA(n_components=10)
pca_res_new = pca.fit_transform(train_df_X)
pca_res_new

In [None]:
# Show the covariance matrix computed by the 10-component PCA model.
pca.get_covariance()

In [None]:
# Share of the total variance explained by each of the retained components.
explained_variance_new=pca.explained_variance_ratio_
explained_variance_new

In [None]:
# Bar chart of the variance ratio explained by each retained principal component.
with plt.style.context('dark_background'):
    plt.figure(figsize=(10, 10))
    # One bar per retained component. The count is taken from the data so the plot
    # stays correct if n_components is changed in the PCA cell.
    plt.bar(range(len(explained_variance_new)), explained_variance_new, alpha=0.5,
            align='center', label='individual explained variance')
    plt.xlabel('Principal components')
    plt.ylabel('Explained variance ratio')
    plt.legend()
    plt.tight_layout()