# Lab 03 - Feature Engineering

The ```train.csv``` consits of a House Price dataset collected in USA. In this lab you are expected to use your knowledge in Feature Engineering and train a Machine Learning Algorithm to predict the House Price using this dataset. Each column of the dataset is clearly explained ```data_description.txt```

[Orginal Dataset in Kaggle](https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/data)

## Part 01

- Check for any missing data in the dataset and get rid of them using an approprite imputation method.
- Use label encoding and one hot encoding methods to covert all the categorical features available in the dataset.
- Train a Linear Regression model using 80% data from the dataset seperately for each categorical encoding method.
- Validate the Linear Regression model using R2 Score for each categorical encoding  method.



In [56]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Loading the Dataset

In [57]:
initial_dataset =pd.read_csv('train.csv')
initial_dataset.head(10)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
5,6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
6,7,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,307000
7,8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,...,0,,,Shed,350,11,2009,WD,Normal,200000
8,9,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2008,WD,Abnorml,129900
9,10,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,...,0,,,,0,1,2008,WD,Normal,118000


In [58]:
del initial_dataset['Id']
columns = initial_dataset.columns
initial_dataset.head()


Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


### Checking Dataset for Missing Values

In [59]:
pd.isnull(initial_dataset).sum()

MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
Street             0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 80, dtype: int64

In [60]:
#initial_dataset.isnull().sum()

### Imputing Missing Data

In [61]:
from sklearn.impute import SimpleImputer
imp_mode = SimpleImputer(missing_values= np.nan, strategy='most_frequent')
imputed_dataset = imp_mode.fit_transform(initial_dataset.drop('SalePrice', axis=1))
pd.isnull(imputed_dataset).sum()


0

### Converting the Numpy Array Back to Pandas Dataframe

In [62]:
df = pd.DataFrame(imputed_dataset, columns= columns.drop('SalePrice'))
df_copy = df.copy(deep=True)
pd.isnull(df).sum()

MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
Street           0
                ..
MiscVal          0
MoSold           0
YrSold           0
SaleType         0
SaleCondition    0
Length: 79, dtype: int64

### Label Encoding the Dataset

Immport Label Encoder

In [63]:
df.columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'Wo

In [64]:
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()

#THese are the columns that can be catogorised
catColumns = ['MSZoning', 'Street', 'Alley', 'LotShape',
'LandContour', 'Utilities',
'LotConfig', 'LandSlope', 'Neighborhood',
'Condition1', 'Condition2',
'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
'Exterior1st',
'Exterior2nd', 'MasVnrType', 'ExterQual',
'ExterCond', 'Foundation',
'BsmtQual', 'BsmtCond', 'BsmtExposure',
'BsmtFinType1', 'BsmtFinType2',
'Heating', 'HeatingQC', 'CentralAir',
'Electrical', 'KitchenQual',
'Functional', 'FireplaceQu', 'GarageType',
'GarageFinish', 'GarageQual',
'GarageCond', 'PavedDrive', 'PoolQC', 'Fence',
'MiscFeature', 'SaleType',
'SaleCondition','YrSold','MoSold']

#Encode labels in Columns
for col in catColumns:
    df[col] = label_encoder.fit_transform(df[col])
df.head(10)


Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,3,65.0,8450,1,0,3,3,0,4,...,0,0,2,2,2,0,1,2,8,4
1,20,3,80.0,9600,1,0,3,3,0,2,...,0,0,2,2,2,0,4,1,8,4
2,60,3,68.0,11250,1,0,0,3,0,4,...,0,0,2,2,2,0,8,2,8,4
3,70,3,60.0,9550,1,0,0,3,0,0,...,0,0,2,2,2,0,1,0,8,0
4,60,3,84.0,14260,1,0,0,3,0,2,...,0,0,2,2,2,0,11,2,8,4
5,50,3,85.0,14115,1,0,0,3,0,4,...,0,0,2,2,2,700,9,3,8,4
6,20,3,75.0,10084,1,0,3,3,0,4,...,0,0,2,2,2,0,7,1,8,4
7,60,3,60.0,10382,1,0,0,3,0,0,...,0,0,2,2,2,350,10,3,8,4
8,50,4,51.0,6120,1,0,3,3,0,4,...,0,0,2,2,2,0,3,2,8,0
9,190,3,50.0,7420,1,0,3,3,0,0,...,0,0,2,2,2,0,0,2,8,4


## One Hot Encoding the Dataset

one-hot encoding is a fundamental technique in machine learning for transforming categorical data 

In [65]:
#we seperate the different type of set into different 
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

transformer = make_column_transformer(
    (OneHotEncoder(categories='auto', handle_unknown= 'ignore',sparse_output= False), catColumns),
    remainder = 'passthrough',verbose_feature_names_out= False)

transformed = transformer.fit_transform(df_copy)
ohe_df = pd.DataFrame(transformed, columns= transformer.get_feature_names_out())
ohe_df.head()

Unnamed: 0,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Street_Grvl,Street_Pave,Alley_Grvl,Alley_Pave,LotShape_IR1,...,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal
0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,...,2003.0,2,548,0,61,0,0,0,0,0
1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,...,1976.0,2,460,298,0,0,0,0,0,0
2,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,...,2001.0,2,608,0,42,0,0,0,0,0
3,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,...,1998.0,3,642,0,35,272,0,0,0,0
4,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,...,2000.0,3,836,192,84,0,0,0,0,0


### Slicing the Dataset

In [66]:
dataset=ohe_df.values #numpy

data=dataset[:,:301]
target=dataset[:,301]

print(data)

[[0.0 0.0 0.0 ... 0 0 0]
 [0.0 0.0 0.0 ... 0 0 0]
 [0.0 0.0 0.0 ... 0 0 0]
 ...
 [0.0 0.0 0.0 ... 0 0 0]
 [0.0 0.0 0.0 ... 0 0 0]
 [0.0 0.0 0.0 ... 0 0 0]]


**_______________________________________________________________________________________________________________**
**_______________________________________________________________________________________________________________**

### Splitting the Dataset for Training and Validation

In [67]:
y_actual = initial_dataset.SalePrice
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(ohe_df,y_actual, test_size=0.2, random_state=0)

### Training the Linear Regression Model

In [68]:
from sklearn.linear_model import LinearRegression
model=LinearRegression()

model.fit(x_train, y_train)
y_predict = model.predict(x_test)


### Calculating the R2 Score

In [69]:
from sklearn.metrics import r2_score


r_sq = r2_score(y_test, y_predict)
print("R2 score:", r_sq)


R2 score: -16121909.282752097


**_______________________________________________________________________________________________________________**
**_______________________________________________________________________________________________________________**

In [79]:
from sklearn.preprocessing import QuantileTransformer

model_qntl_data = QuantileTransformer(output_distribution='normal', random_state=0)
data_scaled = model_qntl_data.fit_transform(data)


model_qntl_target = QuantileTransformer(output_distribution='normal', random_state=0)
target_scaled = model_qntl_target.fit_transform(target.reshape(-1,1))
print(np.max(target_scaled),np.min(target_scaled))

5.19933758270342 -5.199337582605575


In [80]:
from sklearn.preprocessing import PolynomialFeatures

model_poly=PolynomialFeatures(degree=2,include_bias=False)
data_high=model_poly.fit_transform(data_scaled)
print(data_high.shape)

(1460, 45752)


In [83]:
from sklearn.model_selection import train_test_split
train_data,test_data,train_target,test_target=train_test_split(data_high,target_scaled,test_size=0.2)

In [84]:
from sklearn.linear_model import LinearRegression

model=LinearRegression()
model.fit(train_data,train_target)
predicted_target=model.predict(test_data)

### Calculate R2

In [86]:
from sklearn.metrics import r2_score

r2=r2_score(test_target,predicted_target)
print("r2 score:",r2)

r2 score: -1.6210369822146293


## Label Encoding

In [75]:
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()

#THese are the columns that can be catogorised
catColumns = ['MSZoning', 'Street', 'Alley', 'LotShape',
'LandContour', 'Utilities',
'LotConfig', 'LandSlope', 'Neighborhood',
'Condition1', 'Condition2',
'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
'Exterior1st',
'Exterior2nd', 'MasVnrType', 'ExterQual',
'ExterCond', 'Foundation',
'BsmtQual', 'BsmtCond', 'BsmtExposure',
'BsmtFinType1', 'BsmtFinType2',
'Heating', 'HeatingQC', 'CentralAir',
'Electrical', 'KitchenQual',
'Functional', 'FireplaceQu', 'GarageType',
'GarageFinish', 'GarageQual',
'GarageCond', 'PavedDrive', 'PoolQC', 'Fence',
'MiscFeature', 'SaleType',
'SaleCondition','YrSold','MoSold']

#Encode labels in Columns
for col in catColumns:
    df[col] = label_encoder.fit_transform(df[col])
df.head(10)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,3,65.0,8450,1,0,3,3,0,4,...,0,0,2,2,2,0,1,2,8,4
1,20,3,80.0,9600,1,0,3,3,0,2,...,0,0,2,2,2,0,4,1,8,4
2,60,3,68.0,11250,1,0,0,3,0,4,...,0,0,2,2,2,0,8,2,8,4
3,70,3,60.0,9550,1,0,0,3,0,0,...,0,0,2,2,2,0,1,0,8,0
4,60,3,84.0,14260,1,0,0,3,0,2,...,0,0,2,2,2,0,11,2,8,4
5,50,3,85.0,14115,1,0,0,3,0,4,...,0,0,2,2,2,700,9,3,8,4
6,20,3,75.0,10084,1,0,3,3,0,4,...,0,0,2,2,2,0,7,1,8,4
7,60,3,60.0,10382,1,0,0,3,0,0,...,0,0,2,2,2,350,10,3,8,4
8,50,4,51.0,6120,1,0,3,3,0,4,...,0,0,2,2,2,0,3,2,8,0
9,190,3,50.0,7420,1,0,3,3,0,0,...,0,0,2,2,2,0,0,2,8,4


In [76]:
y_actual = initial_dataset.SalePrice
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df,y_actual, test_size= 0.2, random_state=0)

from sklearn.linear_model import LinearRegression
model=LinearRegression()
model.fit(x_train, y_train)


In [77]:
from sklearn.metrics import r2_score
y_predict = model.predict(x_test)

r_sq = r2_score(y_test, y_predict)
print("R2 Value:", r_sq)

R2 Value: 0.5367944591197663


Slicing the data set

In [89]:
#dataset into data and target
target = initial_dataset['SalePrice']

data = df # As a Label encoded data
data_ohe = ohe_df # As a One hot encoded data

In [90]:
data = data.values
data_ohe = data_ohe.values
target = target.values
data.shape, target.shape, data_ohe.shape

((1460, 79), (1460,), (1460, 302))

In [92]:
#Label encoded data
train_data, test_data, train_target, test_target = train_test_split(data, target, test_size = 0.2, random_state=48)
train_data.shape, test_data.shape

#one hot
train_data1, test_data1, train_target1, test_target1 = train_test_split(data_ohe, target, test_size = 0.2, random_state=48)
train_data1.shape, test_data1.shape

((1168, 302), (292, 302))

In [93]:
r = LinearRegression().fit(train_data, train_target)
r1 = LinearRegression().fit(train_data1, train_target1)