# Import all the required libraries 

In [None]:
import numpy as np
import pandas as pd
import datetime as dt

**Read the house prices table `train.csv` into the DataFrame `house_prices`**

In [None]:
# Load the training data from the CSV file in the working directory
house_prices = pd.read_csv(filepath_or_buffer="train.csv")

In [None]:
# Preview the first five rows of the DataFrame
house_prices.head(n=5)

`h_p` is a copy of the DataFrame `house_prices`.

In [None]:
# Work on an independent (deep) copy so the loaded frame stays untouched
h_p = house_prices.copy(deep=True)

In [None]:
# (number of rows, number of columns) of the working copy
h_p.shape

In [None]:
# info() prints every column name with its non-null count and data type,
# which makes columns containing null values easy to spot
h_p.info()

In [None]:
h_p.describe()      # summary statistics (count, mean, std, min, quartiles, max) of the numeric columns

In [None]:
# Keep only the columns stored as 64-bit floats or 64-bit integers
numeric_dtypes = ['float64', 'int64']
house_numeric = h_p.select_dtypes(include=numeric_dtypes)
house_numeric.head()

In [None]:
# Column names, non-null counts and dtypes of the numeric-only subset
house_numeric.info()

###### Some of these numerical data columns can be considered as ordinal categorical data so we can drop them from the numerical dataset

* For example Fireplaces, HalfBath/FullBath etc.

In [None]:
# NOTE: the drop below is commented out, so it does NOT run and house_numeric
# still contains every numeric column. Un-comment it to remove the columns that
# are better treated as categorical variables.
# house_numeric = house_numeric.drop(['MSSubClass', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
#                                     'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',
#                                     'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars',
#                                     'MoSold', 'YrSold'], axis=1)
house_numeric.shape

# DATA MUNGING & DATA CLEANING


A couple of columns in the housing prices data contain null values. Instead of using backward/forward filling, I introduced a category 'missing' for all the NaNs in the categorical variables. This way we retain the original information instead of guessing values.

In [None]:
# Give genuinely missing categorical values their own explicit 'missing' category.
# fillna() detects NaN by value; an identity test such as `x is np.nan` only matches
# the np.nan singleton and would silently skip any other NaN object.
h_p = h_p.fillna({"MasVnrType": "missing", "Electrical": "missing"})

To test whether the NaNs were replaced with 'missing', we will use the `unique` function to list the distinct categories in each column.

In [None]:
# Print the distinct values of each column that received the 'missing' category
for column_name in ("MasVnrType", "Electrical"):
    print(h_p[column_name].unique())

**We have successfully introduced a new category 'missing' to our data.**

In our data set, several columns use the code 'NA' to indicate that a certain feature is not present in the house. All of these 'NA' codes show up as NaN (null values) in the DataFrame, so they look like missing data. I recoded them as _'No (feature name)'_ so that an absent feature is not confused with missing data.

In [None]:
# In these columns a NaN stands for "the house does not have this feature",
# so each column gets an explicit label instead of a null value.
ABSENT_FEATURE_LABELS = {
    "FireplaceQu": "No Fireplace",
    "Alley": "No alley access",
    "PoolQC": "No Pool",
    "Fence": "No Fence",
    "MiscFeature": "None",
    "BsmtQual": "No Bsmnt",
    "BsmtCond": "No Bsmnt",
    "BsmtExposure": "No Bsmnt",
    "BsmtFinType1": "No Bsmnt",
    "BsmtFinType2": "No Bsmnt",
    "GarageType": "No Grg",
    "GarageFinish": "No Grg",
    "GarageCond": "No Grg",
    "GarageQual": "No Grg",
}

# fillna() with a {column: value} mapping fills each listed column with its own
# label and detects NaN by value; an identity test such as `x is np.nan` only
# matches the np.nan singleton and would silently skip any other NaN object.
h_p = h_p.fillna(ABSENT_FEATURE_LABELS)

Let's check if all the 'NA' values were coded correctly.

In [None]:
# Print the distinct values of every recoded column to confirm no NaN is left
recoded_columns = [
    "FireplaceQu", "Alley", "PoolQC", "Fence", "MiscFeature",
    "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2",
    "GarageType", "GarageFinish", "GarageCond", "GarageQual",
]
for column_name in recoded_columns:
    print(h_p[column_name].unique())

**All the 'NA' values are gone and have been replaced by the labels mentioned above.**

Numerical variables like LotFrontage, GarageYrBlt, MasVnrArea have null values. I used mean imputation to deal with missing values.

Mean imputation is a method in which the missing value on a certain variable is replaced by the mean of the available cases. 

In [None]:
# Mean imputation: replace each missing value with the mean of the observed
# values in the same column.
# A linear interpolate() would instead fill gaps from the neighbouring rows
# (unrelated houses), and calling it with inplace=True on a selected column is a
# chained assignment that does not update h_p under pandas Copy-on-Write, so the
# result is assigned back to the frame explicitly.
MEAN_IMPUTED_COLUMNS = ["LotFrontage", "GarageYrBlt", "MasVnrArea"]
h_p[MEAN_IMPUTED_COLUMNS] = h_p[MEAN_IMPUTED_COLUMNS].fillna(
    h_p[MEAN_IMPUTED_COLUMNS].mean()  # per-column means, aligned by column name
)

**Check if h_p has any null values**

In [None]:
# True if at least one cell anywhere in h_p is still null
h_p.isna().to_numpy().any()

**The data is now clean, without any missing values, and ready for further analysis. Let us export the cleaned DataFrame to a CSV file.**

In [None]:
# Write the cleaned frame to CSV without the index column.
# The raw string keeps the Windows backslashes literal; in a normal string,
# sequences such as "\D", "\C" and "\h" are invalid escape sequences and
# trigger a warning on recent Python versions.
# NOTE(review): absolute, machine-specific path; consider a configurable output directory.
h_p.to_csv(r'E:\Data Science\Capstone Project - House prices\Data Story\house_prices_cleaned.csv', index=False)