In [None]:
import os

def scale_input_data(scale_factor, file_bases=None):
  """Write a row-scaled copy of each input CSV next to the original file.

  For every ``file_base`` the file ``<file_base>.csv`` is read and
  ``<file_base>.scaled.csv`` is written with ``int(scale_factor * n_rows)``
  rows:

  * ``scale_factor == 1.0`` -- the source file is copied unchanged.
  * ``scale_factor < 1.0``  -- only the leading rows are kept.
  * ``scale_factor > 1.0``  -- the rows are repeated cyclically until the
    target row count is reached.

  Args:
    scale_factor: Non-negative multiplier applied to the row count.
    file_bases: Optional iterable of path prefixes (without the ``.csv``
      suffix). Defaults to ``['./input/train']``.

  Raises:
    ValueError: If ``scale_factor`` is negative or NaN.
  """
  # Imported once per call instead of once per loop iteration.
  import shutil

  import pandas as pd

  # `not >=` also rejects NaN. A negative factor used to reach
  # `iloc[:negative]`, which silently drops rows from the END of the frame
  # instead of failing.
  if not scale_factor >= 0:
    raise ValueError(
        'scale_factor must be a non-negative number, got %r' % (scale_factor,))

  if file_bases is None:
    file_bases = ['./input/train']

  for file_base in file_bases:
    source_path = file_base + '.csv'
    target_path = file_base + '.scaled.csv'

    if scale_factor == 1.0:
      # Nothing to rescale: a plain file copy avoids a parse/serialise round trip.
      shutil.copyfile(source_path, target_path)
      continue

    scaled = pd.read_csv(source_path)
    new_num_rows = int(scale_factor * len(scaled))

    if scale_factor < 1.0:
      scaled = scaled.iloc[:new_num_rows]
    else:
      # Grow by appending a prefix of the current frame. Its length is always a
      # whole number of source copies before the final step, so the result is
      # the source rows repeated in order.
      while len(scaled) < new_num_rows:
        missing = new_num_rows - len(scaled)
        # iloc clamps the slice, so at most the whole current frame is appended.
        scaled = pd.concat([scaled, scaled.iloc[:missing]])

    scaled.to_csv(target_path, index=False)

# Rescale the training CSV only when INPUT_SCALE_FACTOR is set in the environment.
# NOTE(review): when the variable is unset no '.scaled.csv' file is written here,
# yet a later cell reads './input/train.scaled.csv' -- that file must then
# already exist on disk.
if os.environ.get('INPUT_SCALE_FACTOR') is not None:
  scale_input_data(float(os.environ['INPUT_SCALE_FACTOR']))

# **This notebook showcases the knowledge that I have gained after taking the Python4DataScience course by UCSD**

*I have used the data about residential homes in  Ames, Iowa*

## I have the following approach to present my work
 - Explore the dataset
 - Categorise the features in 3 types
    - building(features related to the building)
    - the space(the features related to the space around the house)
    - neighbourhood(the features about the surrounding of the house)
 - Then I will find the correlation between the features and the sale price of the house
 - For features in the "building" category, I will consider them only if their correlation is more than 66%
 - For features in the "space" and "neighbourhood" categories, I will consider them only if their correlation is more than 33%
     - if two features in the same category have almost the same correlation, I will drop one of them
 - I will scale the features if required

### started with importing all the necessary libraries

In [1]:
import numpy as np
# import pandas as pd
# NOTE(review): exec() runs whatever source text the IREWR_IMPORTS environment
# variable holds, and raises KeyError when the variable is unset. Presumably it
# supplies the `pd` binding (the plain pandas import above is commented out and
# later cells call `pd.read_csv` / `pd.concat`) -- confirm with the harness.
exec(os.environ['IREWR_IMPORTS'])
# FIRST-AUTHOR: remove plotting, ML code
# import matplotlib.pyplot as plt
# import seaborn as sns
# from sklearn.tree import DecisionTreeRegressor
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LinearRegression
# from sklearn.metrics import mean_squared_error
# from sklearn.preprocessing import scale
# from math import sqrt

In [2]:
train_data = pd.read_csv("./input/train.scaled.csv")

In [3]:
train_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
train_data.head().transpose()

Unnamed: 0,0,1,2,3,4
Id,1,2,3,4,5
MSSubClass,60,20,60,70,60
MSZoning,RL,RL,RL,RL,RL
LotFrontage,65.0,80.0,68.0,60.0,84.0
LotArea,8450,9600,11250,9550,14260
...,...,...,...,...,...
MoSold,2,5,9,2,12
YrSold,2008,2007,2008,2006,2008
SaleType,WD,WD,WD,WD,WD
SaleCondition,Normal,Normal,Normal,Abnorml,Normal


In [5]:
train_data.shape

(1460, 81)

In [6]:
train_data.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

### The data has 81 features and 1460 samples; some of the 81 features are numerical, whereas others are categorical
- I will drop the categorical features, as I am not yet confident about how to handle them

In [7]:
# Numerical columns describing the space in and around the building, plus the
# SalePrice target.
space_feat_num = [
  'LotArea',
  'MasVnrArea',
  'BsmtFinSF1',
  'BsmtFinSF2',
  'BsmtUnfSF',
  'GarageYrBlt',
  'GarageCars',
  'GarageArea',
  'WoodDeckSF',
  'OpenPorchSF',
  'EnclosedPorch',
  '3SsnPorch',
  'ScreenPorch',
  'PoolArea',
  'SalePrice',
]

In [8]:
#the data about space of the building
train_data_num_space = train_data[space_feat_num]

In [9]:
#the numerical feature about the neighbourhood of the building
neighbourhood_feat_num = ["LotFrontage",'SalePrice']

In [10]:
#the data about neighbourhood of the building
train_data_num_neighbourhood = train_data[neighbourhood_feat_num]

In [11]:
train_data.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [12]:
# Numerical columns describing the building itself, plus the SalePrice target.
building_feat_num = [
  'OverallQual',
  'OverallCond',
  'YearBuilt',
  'YearRemodAdd',
  'TotalBsmtSF',
  '1stFlrSF',
  '2ndFlrSF',
  'LowQualFinSF',
  'GrLivArea',
  'BsmtFullBath',
  'BsmtHalfBath',
  'FullBath',
  'HalfBath',
  'BedroomAbvGr',
  'KitchenAbvGr',
  'TotRmsAbvGrd',
  'Fireplaces',
  'MiscVal',
  'MoSold',
  'YrSold',
  'SalePrice',
]

In [13]:
#data about the building
train_data_num_building = train_data[building_feat_num]

### I count the null values in each column and compute the fraction of each column that is null
- if a significant fraction is null, I will drop the feature **after looking at its correlation**; otherwise I will try to fill in the null values

In [14]:
# Number of missing values per "space" feature, largest first.
total = train_data_num_space.isna().sum().sort_values(ascending=False)
# Fraction (0-1, not a percentage) of rows missing per feature, largest first.
percent = (
  train_data_num_space.isna().sum() / train_data_num_space.isna().count()
).sort_values(ascending=False)
# Side-by-side table of the missing-value counts and fractions.
missing_data = pd.concat([total, percent], keys=['Total', 'Percent'], axis=1)
# Show the (up to) 20 features with the most missing values.
missing_data.head(20)

Unnamed: 0,Total,Percent
GarageYrBlt,81,0.055479
MasVnrArea,8,0.005479
LotArea,0,0.0
BsmtFinSF1,0,0.0
BsmtFinSF2,0,0.0
BsmtUnfSF,0,0.0
GarageCars,0,0.0
GarageArea,0,0.0
WoodDeckSF,0,0.0
OpenPorchSF,0,0.0


### visualising the correlation between different space features and the sale price of the house

In [15]:
# SalePrice correlation matrix for the "space" features.
corrmat = train_data_num_space.corr()
k = 15 #number of variables for heatmap
# FIRST-AUTHOR: remove plotting
# f, ax = plt.subplots(figsize=(12, 9))
# The k features with the largest correlation to SalePrice.
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index
# Reuse the correlations pandas already computed: DataFrame.corr() excludes
# missing values, whereas np.corrcoef on the raw values returned NaN for every
# column containing a null (GarageYrBlt and MasVnrArea in the summary above).
cm = corrmat.loc[cols, cols].values
# FIRST-AUTHOR: remove plotting
# sns.set(font_scale=1.25)
# hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
# plt.show()
# Stand-in for the removed heatmap, which used the tick labels.
_ = cols.values


- we can drop GarageYrBlt and MasVnrArea, as we cannot get any correlation for them (both columns contain missing values)
- we will consider the features that have a correlation greater than 33%, and we will choose only one of several similarly correlated features
    - we will drop WoodDeckSF and select OpenPorchSF instead
    - we will choose BsmtFinSF1
    - we will drop GarageCars and choose GarageArea

### we repeat the feature selection process and the data cleaning process for the building features of the house

In [16]:
# Number of missing values per "building" feature, largest first.
total = train_data_num_building.isna().sum().sort_values(ascending=False)
# Fraction (0-1, not a percentage) of rows missing per feature, largest first.
percent = (
  train_data_num_building.isna().sum() / train_data_num_building.isna().count()
).sort_values(ascending=False)
# Side-by-side table of the missing-value counts and fractions.
missing_data = pd.concat([total, percent], keys=['Total', 'Percent'], axis=1)
# Show the (up to) 20 features with the most missing values.
missing_data.head(20)

Unnamed: 0,Total,Percent
OverallQual,0,0.0
FullBath,0,0.0
YrSold,0,0.0
MoSold,0,0.0
MiscVal,0,0.0
Fireplaces,0,0.0
TotRmsAbvGrd,0,0.0
KitchenAbvGr,0,0.0
BedroomAbvGr,0,0.0
HalfBath,0,0.0


In [17]:
# The summary above lists no missing values for the building features.
# Now we look at their correlation with SalePrice.
corrmat = train_data_num_building.corr()
k = 15 #number of variables for heatmap
# FIRST-AUTHOR: remove plotting
# f, ax = plt.subplots(figsize=(12, 9))
# The k features with the largest correlation to SalePrice.
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index
# Reuse the correlations pandas already computed instead of recomputing them
# with np.corrcoef, which would turn into NaN if any selected column ever
# contained a missing value.
cm = corrmat.loc[cols, cols].values
# FIRST-AUTHOR: remove plotting
# sns.set(font_scale=1.25)
# hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
# plt.show()
# Stand-in for the removed heatmap, which used the tick labels.
_ = cols.values


-  we consider the features with correlations greater than 56%
-  we consider the feature FullBath
- drop 1stFlrSF as it is similar to TotalBsmtSF; keep TotalBsmtSF
- we keep GrLivArea and OverallQual

### we try to clean the data for the features that describe the neighbourhood of the building

In [18]:
# Data cleaning for the "neighbourhood" features.
# Number of missing values per feature, largest first.
total = train_data_num_neighbourhood.isna().sum().sort_values(ascending=False)
# Fraction (0-1, not a percentage) of rows missing per feature, largest first.
percent = (
  train_data_num_neighbourhood.isna().sum() / train_data_num_neighbourhood.isna().count()
).sort_values(ascending=False)
# Side-by-side table of the missing-value counts and fractions.
missing_data = pd.concat([total, percent], keys=['Total', 'Percent'], axis=1)
# Show the (up to) 20 features with the most missing values.
missing_data.head(20)

Unnamed: 0,Total,Percent
LotFrontage,259,0.177397
SalePrice,0,0.0


In [19]:
# LotFrontage does contain missing values (about 18% of rows in the summary
# above), so the correlation must be computed in a NaN-aware way.
corrmat = train_data_num_neighbourhood.corr()
k = 15 #number of variables for heatmap
# FIRST-AUTHOR: remove plotting
# f, ax = plt.subplots(figsize=(12, 9))
# The (up to) k features with the largest correlation to SalePrice.
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index
# Reuse the correlations pandas already computed: DataFrame.corr() excludes
# missing values, whereas np.corrcoef on the raw values returned NaN for the
# LotFrontage row and column.
cm = corrmat.loc[cols, cols].values
# FIRST-AUTHOR: remove plotting
# sns.set(font_scale=1.25)
# hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
# plt.show()
# Stand-in for the removed heatmap, which used the tick labels.
_ = cols.values


### from the above correlation matrix we can conclude that there is no special feature that we have to consider for the neighbourhood, so we drop the features describing the neighbourhood of the buildings

## hence finally, we have the following features under consideration

In [20]:
selected_features = ['OpenPorchSF','BsmtFinSF1','GarageArea','GrLivArea','OverallQual','TotalBsmtSF','FullBath','SalePrice']

### the data for further exploration is reduced to **selected_train_data**

In [21]:
selected_train_data = train_data[selected_features]

### i have tried to find the correlation between the selected features and the "sale price"

In [22]:
# Scatterplot (pairplot) inputs: the SalePrice target followed by the selected features.
# FIRST-AUTHOR: remove plotting
# sns.set()
cols = [
  'SalePrice',
  'OverallQual',
  'GrLivArea',
  'GarageArea',
  'TotalBsmtSF',
  'FullBath',
  'OpenPorchSF',
  'BsmtFinSF1',
]
# FIRST-AUTHOR: remove plotting
# sns.pairplot(selected_train_data[cols], size = 2.5)
# plt.show();
# Stand-in for the removed pairplot: still performs the column selection.
_ = selected_train_data.loc[:, cols]

## we can see that some of the features are linearly dependent while the others are not
- with more advanced knowledge of data presentation and scale conversion, we could get a lot more insight from the above plots
- **as I have limited knowledge of data interpretation and statistics as of now, I could not get more insight from this visualisation**

In [23]:
selected_train_data.head()

Unnamed: 0,OpenPorchSF,BsmtFinSF1,GarageArea,GrLivArea,OverallQual,TotalBsmtSF,FullBath,SalePrice
0,61,706,548,1710,7,856,2,208500
1,0,978,460,1262,6,1262,2,181500
2,42,486,608,1786,7,920,2,223500
3,35,216,642,1717,7,756,1,140000
4,84,655,836,2198,8,1145,2,250000


### i have scaled the data so that the magnitude of one feature should not dominate the others while training our model

In [24]:
# FIRST-AUTHOR: remove ML code
# Scaled_selected_train_data  = scale(selected_train_data)

In [25]:
# FIRST-AUTHOR: remove ML code
# Scaled_selected_train_data[:5,:]

### to train our model we have to separate the sale-price and the other features

In [26]:
X = selected_train_data[['OpenPorchSF','BsmtFinSF1','GarageArea','GrLivArea','OverallQual','TotalBsmtSF','FullBath']]
# FIRST-AUTHOR: remove ML code
# X=scale(X)

In [27]:
Y=selected_train_data['SalePrice']
Y.head()

0    208500
1    181500
2    223500
3    140000
4    250000
Name: SalePrice, dtype: int64

### we split our data into **training and test data**

In [28]:
# FIRST-AUTHOR: remove ML code
# X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=324)

### we choose the linear regression model without normalization to make our model

In [29]:
# FIRST-AUTHOR: remove ML code
# regressor = LinearRegression(normalize=False)
# regressor.fit(X_train, y_train)

### we make the prediction on the test set so that it can be validated afterwards

In [30]:
# FIRST-AUTHOR: remove ML code
# y_prediction = regressor.predict(X_test)
# y_prediction

### we calculate the RMSE value of our prediction and the test data

In [31]:
# FIRST-AUTHOR: remove ML code
# RMSE = sqrt(mean_squared_error(y_true = y_test, y_pred = y_prediction))

In [32]:
# FIRST-AUTHOR: remove ML code
# print(RMSE)
# y_test.shape

### **the predictions have a very large error because of my limited knowledge of Machine Learning algorithms; I will improve upon them as I progress through the upcoming courses of this MicroMasters**

### i try the next learning algorithm the **decision Tree**

In [33]:

# FIRST-AUTHOR: remove ML code
# regressor = DecisionTreeRegressor()
# regressor.fit(X_train, y_train)

In [34]:
# FIRST-AUTHOR: remove ML code
# y_prediction = regressor.predict(X_test)
# y_prediction

In [35]:
# FIRST-AUTHOR: remove ML code
# RMSE = sqrt(mean_squared_error(y_true = y_test, y_pred = y_prediction))

In [36]:
# FIRST-AUTHOR: remove ML code
# RMSE

### **the output of the prediction is limited by my knowledge of Machine Learning algorithms; the results can be further improved**

## finally we can say that the features that have greatest influence on the selling price of the house are

In [37]:
selected_train_data.columns

Index(['OpenPorchSF', 'BsmtFinSF1', 'GarageArea', 'GrLivArea', 'OverallQual',
       'TotalBsmtSF', 'FullBath', 'SalePrice'],
      dtype='object')

- OpenPorchSF
- BsmtFinSF1
- GarageArea
- GrLivArea
- OverallQual
- TotalBsmtSF
- FullBath

## my conclusions do not include the categorical features

## I have successfully predicted the sale price of the house from the given features **though with a lot of errors**

# I thank the lecturers of UCSD and the Kaggle community for helping me to learn all that i could 