# Cleansing and Integrating Raw Data Task 4


Date: Sunday, 13 May 2018 

Environment: Python 3.6.4 and Anaconda3-5.1.0 (64-bit)

Libraries used:
* pandas 0.22.0 (for data frame, included in Anaconda Python 3.6) 
* math
* seaborn
* matplotlib.pyplot
* statsmodels.formula.api

# Introduction
This assignment consists of the following tasks:

1. Auditing and Cleansing the Job dataset,
2. Integrating the Job datasets,
3. Finding missing values and filling in reasonable values,
4. Finding the outliers.

# Task 4. Finding the outliers

## 1. Import Libraries

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import seaborn as sns
from pylab import rcParams
import math
import matplotlib.pyplot as plt
from sklearn import preprocessing
%matplotlib inline
# Use matplotlib's 'classic' style sheet for the figures drawn below.
plt.style.use('classic')
# Show floats with three decimal places whenever pandas renders them.
pd.set_option('display.float_format', lambda x: '%.3f'%x)
import statsmodels.formula.api as sm
from statsmodels.formula.api import ols

In [None]:
# Global plotting defaults.
# sns.set() re-applies seaborn's default theme (style 'darkgrid'), so calling it
# after sns.set_style('whitegrid') silently discards the requested style. Ask for
# the style and the colour codes in the same call instead.
sns.set(style='whitegrid', color_codes=True)
# Set the figure size last: older seaborn releases appear to carry figure.figsize
# in the plotting context, so a later sns.set() would overwrite it
# (NOTE(review): confirm against the installed seaborn version).
rcParams['figure.figsize'] = 20, 10

## 2. Read a file

In [None]:
# Load the raw CSV (path is relative to the working directory) and preview
# the first rows.
source_csv = "dataset4_with_outliers.csv"
data4 = pd.read_csv(source_csv)
data4.head()

## 3. Correlation using a heat map

In [None]:
# Annotated heat map of the pairwise correlations.
# Restrict to numeric/boolean columns explicitly: recent pandas releases raise
# when corr() meets a non-numeric column, whereas older releases dropped such
# columns silently. Selecting them up front gives the same matrix on both.
numeric_cols = data4.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_cols.corr(), annot=True)

## 4. Log Transformation

###### A log transformation is applied to make highly skewed distributions less skewed

In [None]:
# Add a natural-log version of each skewed column.
# Each (new column, source column) pair is processed in this order so the new
# columns are appended to data4 in the same order as before.
log_sources = [
    ('logPrice', 'price'),
    ('logLiving', 'sqft_living'),
    ('logLot', 'sqft_lot'),
    ('logAbove', 'sqft_above'),
    ('logBathrooms', 'bathrooms'),
    ('logBedrooms', 'bedrooms'),
]
for log_name, raw_name in log_sources:
    # Series.map applies math.log element-wise and assigns a whole column at
    # once. This avoids chained assignment (data4[col].at[i] = ...), which is
    # not guaranteed to write through to data4, does not depend on the index
    # being 0..n-1, and yields float columns instead of object columns.
    # math.log still raises ValueError for zero or negative inputs.
    data4[log_name] = data4[raw_name].map(math.log)

data4.head()

## 5. Z-Score Transformation

###### For checking standard deviations and mean

In [None]:
# Standardise the six log columns with scikit-learn's StandardScaler.
log_features = ['logPrice', 'logLiving', 'logLot', 'logAbove', 'logBathrooms', 'logBedrooms']
std_scale = preprocessing.StandardScaler()
std_scale.fit(data4[log_features])
# transform() returns an array, not a DataFrame; columns follow log_features.
df_std = std_scale.transform(data4[log_features])
df_std[:5]

In [None]:
# Report the per-column mean and standard deviation of the standardised array.
mean_template = 'Mean after standardisation:\nPrice = {:.2f}, Livings = {:.2f}, Lot = {:.2f}, Above = {:.2f}, Bathrooms = {:.2f}, Bedrooms = {:.2f}'
std_template = '\nStandard deviation after standardisation:\nPrice = {:.2f}, Livings = {:.2f}, Lot = {:.2f}, Above = {:.2f}, Bathrooms = {:.2f}, Bedrooms = {:.2f} '
# Columns 0-5 of df_std, taken one at a time.
column_means = [df_std[:, col].mean() for col in range(6)]
column_stds = [df_std[:, col].std() for col in range(6)]
print(mean_template.format(*column_means))
print(std_template.format(*column_stds))

###### Curve for checking if data is normalised or not

In [None]:
# Draw one distribution plot per log column on a 3x2 grid.
plt.close('all')
fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(nrows=3, ncols=2)
panels = [
    (ax1, 'logPrice', 'Price'),
    (ax2, 'logLiving', 'Living'),
    (ax3, 'logLot', 'Lot'),
    (ax4, 'logAbove', 'Above'),
    (ax5, 'logBathrooms', 'Bathrooms'),
    (ax6, 'logBedrooms', 'Bedrooms'),
]
for panel_ax, column_name, panel_title in panels:
    # pd.to_numeric makes sure the column is numeric before plotting.
    sns.distplot(pd.to_numeric(data4[column_name]), ax=panel_ax).set_title(panel_title)
plt.tight_layout()
plt.show()

## 6. Converting into decimal form

In [None]:
# Cast every log column to float.
for log_col in ('logPrice', 'logBedrooms', 'logLiving', 'logLot', 'logAbove', 'logBathrooms'):
    data4[log_col] = data4[log_col].astype(float)

## 7. Checking Outliers
- Model is fitted for checking the outliers.
- Residual values are then extracted from the model.
- Residual values are the differences between the predicted and the actual values


# Price

#### First considering Price and taking variables which have high correlation with it:
* Living (Corr : 0.72)
* Grade (Corr : 0.71)
* Bathrooms (Corr : 0.54)

### 1. Living

In [None]:
# Fitting a model for logPrice and logLiving
model = ols(formula = 'logPrice ~ logLiving', data = data4).fit()
# Work on an independent copy: a plain `data_res = data4` only creates a second
# name for the same frame, so the residual column below would also be written
# into data4 and end up in the exported CSV.
data_res = data4.copy()
# Extracting residual values
data_res['resid_pl'] = model.resid
# Summary of the model
model.summary()

###### Observations:
- R-squared: 0.457

###### Boxplot on residual values for extracting the outliers

In [None]:
# Horizontal boxplot of the resid_pl column, titled 'Outliers'.
fig, ax1 = plt.subplots(figsize=(10, 5), ncols=1, sharey=True)
resid_box = sns.boxplot(data_res['resid_pl'], orient='h', ax=ax1)
ax2 = resid_box.set_title('Outliers')  # ax2 receives set_title's return value
plt.show()

###### Query applied according to the boxplot

In [None]:
# Rows whose resid_pl is above 1.0, ordered by that residual.
above_cutoff = data_res['resid_pl'] > 1.0
data_res[above_cutoff].sort_values(by='resid_pl')

###### Observations:
- Potential Outliers : 8, 1060, 5072, 3777, 8869
- Confirmed Outliers : 8, 1060, 5072, 3777, 8869

###### Reason for considering them as Outliers:
For indices (8, 1060, 5072, 3777, 8869):
- Price for sqft_living does not seem valid compared with the rest of the records.
- For index(8): sqft_living is less for high price
- For index(1060): sqft_living is less for high price
- For index(5072): sqft_living is less for high price
- For index(3777): sqft_living is less for high price
- For index(8869): sqft_living is less for high price

###### Dropping indices containing outliers

In [None]:
# Remove the rows with these index labels from the working frame.
price_living_outliers = [8, 1060, 5072, 3777, 8869]
data_res = data_res.drop(price_living_outliers)

### 2. Grade

In [None]:
# Ordinary least squares fit of logPrice against grade.
price_grade_formula = 'logPrice ~ grade'
model = ols(formula=price_grade_formula, data=data_res).fit()
# Store each row's residual in a new column.
data_res['resid_pg'] = model.resid
# Last expression of the cell: the regression summary.
model.summary()

###### Observations:
- R-squared: 0.488

###### Boxplot on residual values for extracting the outliers

In [None]:
# Horizontal boxplot of the resid_pg column, titled 'Outliers'.
fig, ax1 = plt.subplots(figsize=(10, 5), ncols=1, sharey=True)
resid_box = sns.boxplot(data_res['resid_pg'], orient='h', ax=ax1)
ax2 = resid_box.set_title('Outliers')  # ax2 receives set_title's return value
plt.show()

In [None]:
# Rows whose resid_pg is above 1.0, ordered by that residual; only the columns
# at positions 2 through 18 are shown.
above_cutoff = data_res['resid_pg'] > 1.0
data_res[above_cutoff].sort_values(by='resid_pg').iloc[:, 2:19]

###### Observations:
- Potential Outlier: 1518
- Confirmed Outlier: None

###### Reason for not considering them as Outliers:
- For index (1518): Compared with the other records, it does not seem to be an outlier.

### 3. Bathrooms

In [None]:
# Ordinary least squares fit of logPrice against logBathrooms; the residuals
# are stored in a new column and the summary is the cell's displayed value.
price_bath_formula = 'logPrice ~ logBathrooms'
model = ols(formula=price_bath_formula, data=data_res).fit()
data_res['resid_pb'] = model.resid
model.summary()

###### Boxplot on residual values for extracting the outliers

In [None]:
# Horizontal boxplot of the resid_pb column, titled 'Outliers'.
fig, ax1 = plt.subplots(figsize=(10, 5), ncols=1, sharey=True)
resid_box = sns.boxplot(data_res['resid_pb'], orient='h', ax=ax1)
ax2 = resid_box.set_title('Outliers')  # ax2 receives set_title's return value
plt.show()

In [None]:
# Rows whose resid_pb lies outside [-1.2, 1.2], ordered by that residual.
too_high = data_res['resid_pb'] > 1.2
too_low = data_res['resid_pb'] < -1.2
data_res[too_high | too_low].sort_values(by='resid_pb')

###### Observations:
- Potential Outliers:  4691, 5752, 4200, 1881, 1174
- Confirmed Outlier: None

###### Reason for not considering them as Outliers:
- For indices (4691, 5752, 4200): Compared with the other records, they do not seem to be outliers.

### 4. Above

In [None]:
# Ordinary least squares fit of logPrice against logAbove; the residuals are
# stored in a new column and the summary is the cell's displayed value.
price_above_formula = 'logPrice ~ logAbove'
model = ols(formula=price_above_formula, data=data_res).fit()
data_res['resid_pa'] = model.resid
model.summary()

###### Boxplot on residual values for extracting the outliers

In [None]:
# Horizontal boxplot of the resid_pa column, titled 'Outliers'.
fig, ax1 = plt.subplots(figsize=(10, 5), ncols=1, sharey=True)
resid_box = sns.boxplot(data_res['resid_pa'], orient='h', ax=ax1)
ax2 = resid_box.set_title('Outliers')  # ax2 receives set_title's return value
plt.show()

In [None]:
# Rows whose resid_pa is above 1.2, ordered by that residual.
above_cutoff = data_res['resid_pa'] > 1.2
data_res[above_cutoff].sort_values(by='resid_pa')

###### Observations:
- No outliers could be seen

# Bedrooms

### 1. Living

In [None]:
# Ordinary least squares fit of logBedrooms against logLiving; the residuals
# are stored in a new column and the summary is the cell's displayed value.
bed_living_formula = 'logBedrooms ~ logLiving'
model = ols(formula=bed_living_formula, data=data_res).fit()
data_res['resid_bl'] = model.resid
model.summary()

###### Boxplot on residual values for extracting the outliers

In [None]:
# Horizontal boxplot of the resid_bl column, titled 'Outliers'.
fig, ax1 = plt.subplots(figsize=(10, 5), ncols=1, sharey=True)
resid_box = sns.boxplot(data_res['resid_bl'], orient='h', ax=ax1)
ax2 = resid_box.set_title('Outliers')  # ax2 receives set_title's return value
plt.show()

In [None]:
# Rows whose resid_bl lies outside [-1.0, 1.0], ordered by that residual.
too_high = data_res['resid_bl'] > 1.0
too_low = data_res['resid_bl'] < -1.0
data_res[too_high | too_low].sort_values(by='resid_bl')

###### Observations:
- Potential Outliers : 2737,9774,2880,7290,161,4846,8004,9655,2610,2019,8619,356,9628,736,9322,3329,3072
- Confirmed Outlier: 2737,9774,2880,7290,161,4846,8004,9655,2610,2019,8619,356


###### Reason for considering them as Outliers:
For indices ( 2737,9774,2880,7290,161,4846,8004,9655,2610,2019,8619,356):
- Bedrooms for sqft_living does not seem valid compared with the rest of the records.
- For index(2737): sqft_living is more for less bedrooms
- For index(9774): sqft_living is less for more bedrooms
- For index(2880): sqft_living is less for more bedrooms
- For index(7290): sqft_living is less for more bedrooms
- For index(161): sqft_living is less for more bedrooms
- For index(4846): sqft_living is less for more bedrooms
- For index(8004): sqft_living is less for more bedrooms
- For index(9655): sqft_living is less for more bedrooms
- For index(2610): sqft_living is less for more bedrooms
- For index(2019): sqft_living is less for more bedrooms
- For index(8619): price is less for such sqft_living
- For index(356): price is less for sqft_living

###### Dropping indices containing outliers

In [None]:
# Remove the rows with these index labels from the working frame.
bedroom_living_outliers = [2737, 9774, 2880, 7290, 161, 4846, 8004, 9655, 2610, 2019, 8619, 356]
data_res = data_res.drop(bedroom_living_outliers)

# Bathrooms

## 1. Living

In [None]:
# Ordinary least squares fit of logBathrooms against logLiving; the residuals
# are stored in a new column and the summary is the cell's displayed value.
bath_living_formula = 'logBathrooms ~ logLiving'
model = ols(formula=bath_living_formula, data=data_res).fit()
data_res['resid_bal'] = model.resid
model.summary()

###### Boxplot on residual values for extracting the outliers

In [None]:
# Horizontal boxplot of the resid_bal column, titled 'Outliers'.
fig, ax1 = plt.subplots(figsize=(10, 5), ncols=1, sharey=True)
resid_box = sns.boxplot(data_res['resid_bal'], orient='h', ax=ax1)
ax2 = resid_box.set_title('Outliers')  # ax2 receives set_title's return value
plt.show()

In [None]:
# Rows whose resid_bal is above 0.9 or below -1.0, ordered by that residual.
too_high = data_res['resid_bal'] > 0.9
too_low = data_res['resid_bal'] < -1.0
data_res[too_high | too_low].sort_values(by='resid_bal')

###### Observations:
- Potential Outliers: 2907,6358,4827,9280,2574,3166,2513,6074
- Confirmed Outliers: 2907,2574

###### Reason for considering them as Outliers:
For indices (2907,2574):
- Bathrooms for sqft_living does not seem valid compared with the rest of the records.
- For index(2907): sqft_living is more for less Bathrooms
- For index(2574): sqft_living is less for more Bathrooms

###### Dropping indices containing outliers

In [None]:
# Remove the rows with these index labels from the working frame.
bathroom_living_outliers = [2907, 2574]
data_res = data_res.drop(bathroom_living_outliers)

## 2. Above

In [None]:
# Ordinary least squares fit of logBathrooms against logAbove; the residuals
# are stored in a new column and the summary is the cell's displayed value.
bath_above_formula = 'logBathrooms ~ logAbove'
model = ols(formula=bath_above_formula, data=data_res).fit()
data_res['resid_ba'] = model.resid
model.summary()

###### Boxplot on residual values for extracting the outliers

In [None]:
# Horizontal boxplot of the resid_ba column, titled 'Outliers'.
fig, ax1 = plt.subplots(figsize=(10, 5), ncols=1, sharey=True)
resid_box = sns.boxplot(data_res['resid_ba'], orient='h', ax=ax1)
ax2 = resid_box.set_title('Outliers')  # ax2 receives set_title's return value
plt.show()

In [None]:
# Rows whose resid_ba is above 0.9 or below -1.0, ordered by that residual.
too_high = data_res['resid_ba'] > 0.9
too_low = data_res['resid_ba'] < -1.0
data_res[too_high | too_low].sort_values(by='resid_ba')

###### Observations:
- Potential Outliers: 2513,6358,3166,9280
- Confirmed Outliers: None

###### Reason for not considering them as Outliers:
- For indices (2513,6358,3166,9280): Compared with the other records, they do not seem to be outliers.

## 3. Grade

In [None]:
# Ordinary least squares fit of logBathrooms against grade; the residuals are
# stored in a new column and the summary is the cell's displayed value.
bath_grade_formula = 'logBathrooms ~ grade'
model = ols(formula=bath_grade_formula, data=data_res).fit()
data_res['resid_bag'] = model.resid
model.summary()

###### Boxplot on residual values for extracting the outliers

In [None]:
# Horizontal boxplot of the resid_bag column, titled 'Outliers'.
fig, ax1 = plt.subplots(figsize=(10, 5), ncols=1, sharey=True)
resid_box = sns.boxplot(data_res['resid_bag'], orient='h', ax=ax1)
ax2 = resid_box.set_title('Outliers')  # ax2 receives set_title's return value
plt.show()

In [None]:
# Rows whose resid_bag lies outside [-0.9, 0.9], ordered by that residual.
too_high = data_res['resid_bag'] > 0.9
too_low = data_res['resid_bag'] < -0.9
data_res[too_high | too_low].sort_values(by='resid_bag')

###### Observations:
- Potential Outliers: 717, 3071, 7481, 4193, 2116, 4125, 4504, 9705, 3830, 8696, 1719
- Confirmed Outliers: None

###### Reason for not considering them as Outliers:
- For indices (717, 3071, 7481, 4193, 2116, 4125, 4504, 9705, 3830, 8696, 1719): Compared with the other records, they do not seem to be outliers.

## 4. Yr_built

In [None]:
# Ordinary least squares fit of logBathrooms against yr_built; the residuals
# are stored in a new column and the summary is the cell's displayed value.
bath_year_formula = 'logBathrooms ~ yr_built'
model = ols(formula=bath_year_formula, data=data_res).fit()
data_res['resid_bay'] = model.resid
model.summary()

###### Boxplot on residual values for extracting the outliers

In [None]:
# Horizontal boxplot of the resid_bay column, titled 'Outliers'.
fig, ax1 = plt.subplots(figsize=(10, 5), ncols=1, sharey=True)
resid_box = sns.boxplot(data_res['resid_bay'], orient='h', ax=ax1)
ax2 = resid_box.set_title('Outliers')  # ax2 receives set_title's return value
plt.show()

In [None]:
# Rows whose resid_bay is at most -1.0 or above 1.18, ordered by that residual;
# only the columns at positions 2 through 16 are shown.
too_low = data_res['resid_bay'] <= -1.0
too_high = data_res['resid_bay'] > 1.18
data_res[too_low | too_high].sort_values(by='resid_bay').iloc[:, 2:17]

###### Observations:
- Potential Outliers: 3933, 4131, 3657, 1219, 4576
- Confirmed Outliers: None

# Living

### 1. Grade

In [None]:
# Ordinary least squares fit of logLiving against grade on the working frame.
model = ols(formula = 'logLiving ~ grade', data = data_res).fit()
# Keep using data_res here instead of re-pointing it at data4: re-pointing would
# bring back every row dropped so far, pair these residuals with a frame the
# model was not fitted on (NaN for the dropped rows), and write the following
# residual columns into data4 itself.
data_res['resid_lg'] = model.resid
model.summary()

###### Boxplot on residual values for extracting the outliers

In [None]:
# Horizontal boxplot of the resid_lg column, titled 'Outliers'.
fig, ax1 = plt.subplots(figsize=(10, 5), ncols=1, sharey=True)
resid_box = sns.boxplot(data_res['resid_lg'], orient='h', ax=ax1)
ax2 = resid_box.set_title('Outliers')  # ax2 receives set_title's return value
plt.show()

In [None]:
# Rows whose resid_lg lies outside [-0.9, 0.9], ordered by that residual; only
# the columns at positions 2 through 16 are shown.
too_high = data_res['resid_lg'] > 0.9
too_low = data_res['resid_lg'] < -0.9
data_res[too_high | too_low].sort_values(by='resid_lg').iloc[:, 2:17]

###### Observations:
- Confirmed Outliers: None

### 2. Above

In [None]:
# Ordinary least squares fit of logLiving against logAbove; the residuals are
# stored in a new column and the summary is the cell's displayed value.
living_above_formula = 'logLiving ~ logAbove'
model = ols(formula=living_above_formula, data=data_res).fit()
data_res['resid_la'] = model.resid
model.summary()

###### Boxplot on residual values for extracting the outliers

In [None]:
# Horizontal boxplot of the resid_la column, titled 'Outliers'.
fig, ax1 = plt.subplots(figsize=(10, 5), ncols=1, sharey=True)
resid_box = sns.boxplot(data_res['resid_la'], orient='h', ax=ax1)
ax2 = resid_box.set_title('Outliers')  # ax2 receives set_title's return value
plt.show()

In [None]:
# Rows whose resid_la is above 0.6, ordered by sqft_living (not by the
# residual); only the columns at positions 2 through 17 are shown.
above_cutoff = data_res['resid_la'] > 0.6
data_res[above_cutoff].sort_values(by='sqft_living').iloc[:, 2:18]

###### Observations:
- Potential Outliers: 6498,4900,7669,5642,3755,5970,4827,2268,4828
- Confirmed Outliers: 6498,4827

###### Reason for considering them as Outliers:
For indices (6498,4827):
- Living for Above does not seem valid compared with the rest of the records.

###### Dropping indices containing outliers

In [None]:
# Remove the rows with these index labels from the working frame.
living_above_outliers = [6498, 4827]
data_res = data_res.drop(living_above_outliers)

# Floors

## 1. Above

In [None]:
# Ordinary least squares fit of floors against logAbove; the residuals are
# stored in a new column and the summary is the cell's displayed value.
floors_above_formula = 'floors ~ logAbove'
model = ols(formula=floors_above_formula, data=data_res).fit()
data_res['resid_fa'] = model.resid
model.summary()

###### Boxplot on residual values for extracting the outliers

In [None]:
# Horizontal boxplot of the resid_fa column, titled 'Outliers'.
fig, ax1 = plt.subplots(figsize=(10, 5), ncols=1, sharey=True)
resid_box = sns.boxplot(data_res['resid_fa'], orient='h', ax=ax1)
ax2 = resid_box.set_title('Outliers')  # ax2 receives set_title's return value
plt.show()

In [None]:
# Rows whose resid_fa is above 1.9, ordered by that residual; only the columns
# at positions 2 through 16 are shown.
above_cutoff = data_res['resid_fa'] > 1.9
data_res[above_cutoff].sort_values(by='resid_fa').iloc[:, 2:17]

###### Observations:
- Potential Outliers: 3840, 7276, 8369, 7502, 4460
- Confirmed Outliers: None

## 2. Yr_built

In [None]:
# Ordinary least squares fit of floors against yr_built; the residuals are
# stored in a new column and the summary is the cell's displayed value.
floors_year_formula = 'floors ~ yr_built'
model = ols(formula=floors_year_formula, data=data_res).fit()
data_res['resid_fy'] = model.resid
model.summary()

###### Boxplot on residual values for extracting the outliers

In [None]:
# Horizontal boxplot of the resid_fy column, titled 'Outliers'.
fig, ax1 = plt.subplots(figsize=(10, 5), ncols=1, sharey=True)
resid_box = sns.boxplot(data_res['resid_fy'], orient='h', ax=ax1)
ax2 = resid_box.set_title('Outliers')  # ax2 receives set_title's return value
plt.show()

In [None]:
# Rows whose resid_fy is above 1.7, ordered by that residual; only the columns
# at positions 2 through 18 are shown.
above_cutoff = data_res['resid_fy'] > 1.7
data_res[above_cutoff].sort_values(by='resid_fy').iloc[:, 2:19]

###### Observations:
- Potential Outliers: 6856, 4460, 788, 1370
- Confirmed Outliers: None

# Grade

## 1. Above

In [None]:
# Ordinary least squares fit of grade against logAbove; the residuals are
# stored in a new column and the summary is the cell's displayed value.
grade_above_formula = 'grade ~ logAbove'
model = ols(formula=grade_above_formula, data=data_res).fit()
data_res['resid_ga'] = model.resid
model.summary()

###### Boxplot on residual values for extracting the outliers

In [None]:
# Horizontal boxplot of the resid_ga column, titled 'Outliers'.
fig, ax1 = plt.subplots(figsize=(10, 5), ncols=1, sharey=True)
resid_box = sns.boxplot(data_res['resid_ga'], orient='h', ax=ax1)
ax2 = resid_box.set_title('Outliers')  # ax2 receives set_title's return value
plt.show()

In [None]:
# Rows whose resid_ga lies outside [-3, 3], ordered by that residual; only the
# columns at positions 2 through 19 are shown.
too_high = data_res['resid_ga'] > 3
too_low = data_res['resid_ga'] < -3
data_res[too_high | too_low].sort_values(by='resid_ga').iloc[:, 2:20]

###### Observations:
- Potential Outliers: 1158, 3077, 3482, 5528, 1881, 793
- Confirmed Outliers: None

## 8. Dropping indices from the main data

In [None]:
# Remove every confirmed outlier row (by index label) from the main frame.
confirmed_outliers = [
    8, 1060, 5072, 3777, 8869,
    2737, 9774, 2880, 7290, 161, 4846, 8004, 9655, 2610, 2019, 8619, 356,
    2907, 2574,
    6498, 4827,
]
data4 = data4.drop(confirmed_outliers)

## 9. Converting dataframe into CSV file format 

In [None]:
# Write the cleaned frame to a UTF-8 encoded CSV in the working directory.
solution_path = './dataset4_solution.csv'
data4.to_csv(solution_path, encoding='utf-8')

## References
- Tutorials