In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
# Read the housing dataset from a path relative to the working directory.
X = pd.read_csv(filepath_or_buffer='housing-classification-iter3.csv')
# DataFrame.pop removes the 'Expensive' column from X in place and returns it,
# so X keeps the remaining columns and y holds the popped one
# (presumably the classification target, since it is passed as y to the split).
y = X.pop(item='Expensive')
# Preview the first three rows of what is left in X.
X.head(n=3)

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,MSZoning,Condition1,Heating,Street,CentralAir,Foundation
0,8450,65.0,856,3,0,0,2,0,0,RL,Norm,GasA,Pave,Y,PConc
1,9600,80.0,1262,3,1,0,2,298,0,RL,Feedr,GasA,Pave,Y,CBlock
2,11250,68.0,920,3,1,0,2,0,0,RL,Norm,GasA,Pave,Y,PConc


In [9]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffle, and therefore the split, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1245,
)

#### Categorical encoding (manual approach, without using pipelines)

##### Replacing NaNs in categorical features

In [15]:
from sklearn.impute import SimpleImputer
# Keep only the non-numeric (categorical) columns of the training set.
X_train_cat = X_train.select_dtypes(exclude='number')

# Constant-strategy imputer: every missing entry is replaced by the
# placeholder string "N_A". fit() returns the imputer itself, so building and
# fitting are chained into a single statement.
cat_imputer = SimpleImputer(strategy='constant', fill_value='N_A').fit(X_train_cat)

# transform() returns an array by default, so wrap the result in a DataFrame
# again and restore the column names.
# NOTE(review): no index is passed, so the new frame gets a fresh 0..n-1 index
# instead of X_train's original row labels; keep that in mind before aligning
# it with y_train or any other frame that still carries the original index.
X_cat_imputed = pd.DataFrame(
    data=cat_imputer.transform(X_train_cat),
    columns=X_train_cat.columns,
)
X_cat_imputed.head(n=3)

Unnamed: 0,MSZoning,Condition1,Heating,Street,CentralAir,Foundation
0,RM,Norm,GasA,Pave,Y,PConc
1,RL,Norm,GasA,Pave,Y,PConc
2,RL,RRAe,GasA,Pave,Y,CBlock


##### Replacing NaNs in numerical features

In [18]:
# Keep only the numeric columns of the training set.
X_train_num = X_train.select_dtypes(include='number')

# Mean-strategy imputer: each missing value is replaced by the mean of its
# column, learned from the training data only. fit() returns the imputer
# itself, so building and fitting are chained into a single statement.
num_imputer = SimpleImputer(strategy='mean').fit(X_train_num)

# transform() returns an array by default, so wrap the result in a DataFrame
# again and restore the column names.
# NOTE(review): no index is passed, so the new frame gets a fresh 0..n-1 index
# instead of X_train's original row labels.
X_num_imputed = pd.DataFrame(
    data=num_imputer.transform(X_train_num),
    columns=X_train_num.columns,
)
X_num_imputed.head(n=3)

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch
0,8546.0,80.0,1121.0,2.0,0.0,0.0,2.0,132.0,0.0
1,4920.0,41.0,1338.0,2.0,0.0,0.0,2.0,0.0,0.0
2,13517.0,69.850467,725.0,3.0,0.0,0.0,2.0,0.0,0.0


In [21]:
# Place the imputed categorical and numerical frames side by side.
# Both were rebuilt from the same training rows with a default 0..n-1 index,
# so the column-wise concat lines the rows up one-to-one.
X_imputed = pd.concat(objs=[X_cat_imputed, X_num_imputed], axis='columns')
X_imputed.head(n=3)

Unnamed: 0,MSZoning,Condition1,Heating,Street,CentralAir,Foundation,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch
0,RM,Norm,GasA,Pave,Y,PConc,8546.0,80.0,1121.0,2.0,0.0,0.0,2.0,132.0,0.0
1,RL,Norm,GasA,Pave,Y,PConc,4920.0,41.0,1338.0,2.0,0.0,0.0,2.0,0.0,0.0
2,RL,RRAe,GasA,Pave,Y,CBlock,13517.0,69.850467,725.0,3.0,0.0,0.0,2.0,0.0,0.0


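#### One-hot encoding
This means replacing every categorical column with new binary (0/1) columns, one per category. Because the encoder below uses `drop='first'`, the first category of each column is left out and is represented by all of that column's indicators being 0.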
As with any transformer, we have to:

1. Import it
2. Initialize it
3. Fit it to the data
4. Use it to transform the data

In [23]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the imputed categorical columns.
# - drop='first' leaves out the first category of every feature, so that
#   category is represented by all of the feature's indicator columns being 0.
# - handle_unknown='ignore' keeps transform() from raising when data seen
#   later (e.g. the held-out split) contains a category that never appeared
#   in the training data. Such values are encoded as all zeros for that
#   feature, which is the same encoding as the dropped category; scikit-learn
#   emits a warning when that happens.
# The encoding of the training data itself is unchanged by this option.
my_onehot = OneHotEncoder(drop='first', handle_unknown='ignore')

# Learn the categories of each column from the training data only.
my_onehot.fit(X_cat_imputed)

# Encode the training data; the result is a SciPy sparse matrix.
X_cat_imputed_onehot = my_onehot.transform(X_cat_imputed)

In [24]:
# Show the matrix repr: shape, dtype, number of stored values and storage format.
X_cat_imputed_onehot

<1168x23 sparse matrix of type '<class 'numpy.float64'>'
	with 6773 stored elements in Compressed Sparse Row format>

##### The result is a sparse matrix: a storage format that keeps only the non-zero entries, which is efficient here because one-hot encoded data consists mostly of zeros.

In [26]:
# Wrap the SciPy sparse matrix in a DataFrame backed by sparse columns.
# No column labels are supplied here, so the columns are numbered 0, 1, 2, ...
df = pd.DataFrame.sparse.from_spmatrix(data=X_cat_imputed_onehot)
df.head(n=3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0


In [29]:
# Ask the fitted encoder for the names of the columns it produced; each name
# is built as "<original column>_<category>".
colnames = my_onehot.get_feature_names_out(
    input_features=X_cat_imputed.columns,
)
# Replace the default integer column labels with the descriptive names.
df.columns = colnames
df.head(n=3)

Unnamed: 0,MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Condition1_Feedr,Condition1_Norm,Condition1_PosA,Condition1_PosN,Condition1_RRAe,Condition1_RRAn,...,Heating_Grav,Heating_OthW,Heating_Wall,Street_Pave,CentralAir_Y,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood
0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0


In [30]:
# Place the one-hot encoded categorical columns next to the imputed numerical
# columns. Both frames use a default 0..n-1 index, so rows line up one-to-one.
# NOTE: this rebinds X_imputed, replacing the earlier frame that still held
# the raw (un-encoded) categorical columns.
X_imputed = pd.concat(objs=[df, X_num_imputed], axis='columns')
X_imputed.head(n=3)

Unnamed: 0,MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Condition1_Feedr,Condition1_Norm,Condition1_PosA,Condition1_PosN,Condition1_RRAe,Condition1_RRAn,...,Foundation_Wood,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch
0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,8546.0,80.0,1121.0,2.0,0.0,0.0,2.0,132.0,0.0
1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,4920.0,41.0,1338.0,2.0,0.0,0.0,2.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,13517.0,69.850467,725.0,3.0,0.0,0.0,2.0,0.0,0.0
