In [230]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [231]:
train = pd.read_csv("Clean_data")

In [232]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,LotFrontage_na,MasVnrArea_na,GarageYrBlt_na
0,1,60,RL,65.0,8450,Pave,Missing,Reg,Lvl,AllPub,...,Missing,0,2,2008,WD,Normal,208500,0,0,0
1,2,20,RL,80.0,9600,Pave,Missing,Reg,Lvl,AllPub,...,Missing,0,5,2007,WD,Normal,181500,0,0,0
2,3,60,RL,68.0,11250,Pave,Missing,IR1,Lvl,AllPub,...,Missing,0,9,2008,WD,Normal,223500,0,0,0
3,4,70,RL,60.0,9550,Pave,Missing,IR1,Lvl,AllPub,...,Missing,0,2,2006,WD,Abnorml,140000,0,0,0
4,5,60,RL,84.0,14260,Pave,Missing,IR1,Lvl,AllPub,...,Missing,0,12,2008,WD,Normal,250000,0,0,0


In [233]:
#split the data to avoid over fitting
X_train, X_test, y_train, y_test = train_test_split(train, train["SalePrice"], test_size=0.3, random_state=42)
X_train.shape , X_test.shape

((1022, 84), (438, 84))

In [234]:
#creating a small subset of the data to make it more understandable
X_train = X_train[["Alley","LotShape","GarageCond","MasVnrArea","SalePrice"]]

# One- Hot Encoding Using Pandas

In [235]:
#converting into dummy variables
dummy = pd.get_dummies(X_train,drop_first=True)

### Before One hot Encoding

In [236]:
#data before the one hot encoding 
X_train.head()

Unnamed: 0,Alley,LotShape,GarageCond,MasVnrArea,SalePrice
135,Missing,Reg,TA,288.0,174000
1452,Missing,Reg,TA,80.0,145000
762,Missing,Reg,TA,0.0,215200
932,Missing,IR1,TA,302.0,320000
435,Missing,IR2,TA,0.0,212000


In [237]:
data.columns

Index(['Alley', 'LotShape', 'GarageCond', 'MasVnrArea', 'SalePrice'], dtype='object')

### After One-Hot encoding

In [238]:
#data after encoding
dummy.head()

Unnamed: 0,MasVnrArea,SalePrice,Alley_Missing,Alley_Pave,LotShape_IR2,LotShape_IR3,LotShape_Reg,GarageCond_Fa,GarageCond_Gd,GarageCond_Missing,GarageCond_Po,GarageCond_TA
135,288.0,174000,1,0,0,0,1,0,0,0,0,1
1452,80.0,145000,1,0,0,0,1,0,0,0,0,1
762,0.0,215200,1,0,0,0,1,0,0,0,0,1
932,302.0,320000,1,0,0,0,0,0,0,0,0,1
435,0.0,212000,1,0,1,0,0,0,0,0,0,1


In [239]:
dummy.columns

Index(['MasVnrArea', 'SalePrice', 'Alley_Missing', 'Alley_Pave',
       'LotShape_IR2', 'LotShape_IR3', 'LotShape_Reg', 'GarageCond_Fa',
       'GarageCond_Gd', 'GarageCond_Missing', 'GarageCond_Po',
       'GarageCond_TA'],
      dtype='object')

# One Hot Encoing Using Sci-kit Learn

In [240]:
from sklearn.preprocessing import OneHotEncoder

In [241]:
encoder = OneHotEncoder(categories="auto" #learning the category automatically
                        ,drop="first", #creating k-1 category
                       sparse=False # this will return numpy array else it will return sparse matix
                        ,handle_unknown="error")  #helps to deal with unknown values


#here we have to remove continious variable and then feed it to the encoder
encoder.fit(X_train.drop(["MasVnrArea","SalePrice"],axis=1))

OneHotEncoder(categories='auto', drop='first', dtype=<class 'numpy.float64'>,
              handle_unknown='error', sparse=False)

In [242]:
encoder.categories_

[array(['Grvl', 'Missing', 'Pave'], dtype=object),
 array(['IR1', 'IR2', 'IR3', 'Reg'], dtype=object),
 array(['Ex', 'Fa', 'Gd', 'Missing', 'Po', 'TA'], dtype=object)]

In [243]:
#transforming the data
variables = encoder.transform(X_train.drop(["MasVnrArea","SalePrice"],axis=1))

In [244]:
#converting into dataframe
variables = pd.DataFrame(variables,columns= encoder.get_feature_names(["Alley","LotShape","GarageCond"]))

In [245]:
#concatinating the two data
df= pd.concat([variables, X_train[["MasVnrArea","SalePrice"]]],axis=1)

In [246]:
df.head()

Unnamed: 0,Alley_Missing,Alley_Pave,LotShape_IR2,LotShape_IR3,LotShape_Reg,GarageCond_Fa,GarageCond_Gd,GarageCond_Missing,GarageCond_Po,GarageCond_TA,MasVnrArea,SalePrice
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,196.0,208500.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,181500.0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,162.0,223500.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,140000.0
4,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,350.0,250000.0
