In [1]:
# Basic Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder

import warnings
warnings.filterwarnings('ignore')


In [3]:
# Load the housing dataset from Google Drive.
# NOTE(review): the path lives under /content/drive, so this presumably relies on
# Drive already being mounted in Colab; the mount step is not visible here.
data = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Colab Notebooks/Housing.csv'
)
# Preview the first five rows.
data.head(n=5)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [4]:
# Count the missing values in each column of the housing frame.
data.isna().sum()

Unnamed: 0,0
price,0
area,0
bedrooms,0
bathrooms,0
stories,0
mainroad,0
guestroom,0
basement,0
hotwaterheating,0
airconditioning,0


In [5]:
# Inspect each column's dtype; object-typed columns are the categorical ones.
data.dtypes


Unnamed: 0,0
price,int64
area,int64
bedrooms,int64
bathrooms,int64
stories,int64
mainroad,object
guestroom,object
basement,object
hotwaterheating,object
airconditioning,object


In [6]:
# Collect the names of all object-dtype (categorical) columns in the housing frame.
cat_cols = [name for name, kind in data.dtypes.items() if kind == 'O']

# Show the distinct classes found in each categorical column.
for name in cat_cols:
  print(name, ':', data[name].unique())

mainroad : ['yes' 'no']
guestroom : ['no' 'yes']
basement : ['no' 'yes']
hotwaterheating : ['no' 'yes']
airconditioning : ['yes' 'no']
prefarea : ['yes' 'no']
furnishingstatus : ['furnished' 'semi-furnished' 'unfurnished']


In [7]:
# One-hot encode the categorical columns. `drop='first'` omits the first class of
# each feature, so every yes/no column collapses to a single `<col>_yes` indicator.
sk_encoder = OneHotEncoder(drop = 'first', dtype = 'int', sparse_output = False)

# Fit and transform in one step, then label the dense array with the encoder's
# generated feature names.
sk_encoder_data = pd.DataFrame(
    sk_encoder.fit_transform(data[cat_cols]),
    columns = sk_encoder.get_feature_names_out(),
)

# Swap the original categorical columns for their encoded counterparts.
sk_encoder_df = pd.concat(
    [data.drop(columns = cat_cols), sk_encoder_data],
    axis = 'columns',
)
sk_encoder_df.head(n=5)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking,mainroad_yes,guestroom_yes,basement_yes,hotwaterheating_yes,airconditioning_yes,prefarea_yes,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
0,13300000,7420,4,2,3,2,1,0,0,0,1,1,0,0
1,12250000,8960,4,4,4,3,1,0,0,0,1,0,0,0
2,12250000,9960,3,2,2,2,1,0,1,0,0,1,1,0
3,12215000,7500,4,2,2,3,1,0,1,0,1,1,0,0
4,11410000,7420,4,1,2,2,1,1,1,0,1,0,0,0


In [11]:
# Load seaborn's bundled 'penguins' example dataset.
data2 = sns.load_dataset(name='penguins')
# Preview the first five rows.
data2.head(n=5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [12]:
# Count the missing values in each column of the penguins frame.
data2.isna().sum()

Unnamed: 0,0
species,0
island,0
bill_length_mm,2
bill_depth_mm,2
flipper_length_mm,2
body_mass_g,2
sex,11


In [13]:
# Inspect each column's dtype in the penguins frame.
data2.dtypes

Unnamed: 0,0
species,object
island,object
bill_length_mm,float64
bill_depth_mm,float64
flipper_length_mm,float64
body_mass_g,float64
sex,object


In [14]:
# Collect the object-dtype (categorical) columns of the penguins frame.
# Note: this rebinds `cat_cols`, which previously held the housing columns.
cat_cols = [name for name, kind in data2.dtypes.items() if kind == 'O']

# Show the distinct classes found in each categorical column.
for name in cat_cols:
  print(name, ':', data2[name].unique())

species : ['Adelie' 'Chinstrap' 'Gentoo']
island : ['Torgersen' 'Biscoe' 'Dream']
sex : ['Male' 'Female' nan]


In [16]:
# One-hot encode the penguin categoricals, keeping every class (no `drop`).
# `sex` contains NaN, which the encoder emits as its own `sex_nan` indicator column.
encoder = OneHotEncoder(dtype = 'int', sparse_output = False)

# Fit and transform in one step, then label the dense array with the encoder's
# generated feature names.
encoded_data = pd.DataFrame(
    encoder.fit_transform(data2[cat_cols]),
    columns = encoder.get_feature_names_out(),
)

# Swap the original categorical columns for their encoded counterparts.
encoded_df = pd.concat(
    [data2.drop(columns = cat_cols), encoded_data],
    axis = 'columns',
)
encoded_df.head(n=5)

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,species_Adelie,species_Chinstrap,species_Gentoo,island_Biscoe,island_Dream,island_Torgersen,sex_Female,sex_Male,sex_nan
0,39.1,18.7,181.0,3750.0,1,0,0,0,0,1,0,1,0
1,39.5,17.4,186.0,3800.0,1,0,0,0,0,1,1,0,0
2,40.3,18.0,195.0,3250.0,1,0,0,0,0,1,1,0,0
3,,,,,1,0,0,0,0,1,0,0,1
4,36.7,19.3,193.0,3450.0,1,0,0,0,0,1,1,0,0


In [19]:
# Remove every penguin that has missing data, then discard the `sex_nan` indicator.
#
# `dropna()` alone is not enough: the one-hot columns are integers and never NaN,
# so it only removes rows whose numeric measurements are missing. Rows where only
# `sex` was missing are flagged by `sex_nan == 1`; if the indicator column were
# simply dropped they would remain as all-zero `sex_Female`/`sex_Male` rows, i.e.
# silently mis-encoded observations. Filter them out before dropping the column.
encoded_df = encoded_df.dropna()

# Guard on the column so that re-running this cell is a no-op instead of a KeyError.
if 'sex_nan' in encoded_df.columns:
    encoded_df = (
        encoded_df
        .loc[encoded_df['sex_nan'] == 0]
        .drop(columns = 'sex_nan')
    )
encoded_df.head()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,species_Adelie,species_Chinstrap,species_Gentoo,island_Biscoe,island_Dream,island_Torgersen,sex_Female,sex_Male
0,39.1,18.7,181.0,3750.0,1,0,0,0,0,1,0,1
1,39.5,17.4,186.0,3800.0,1,0,0,0,0,1,1,0
2,40.3,18.0,195.0,3250.0,1,0,0,0,0,1,1,0
4,36.7,19.3,193.0,3450.0,1,0,0,0,0,1,1,0
5,39.3,20.6,190.0,3650.0,1,0,0,0,0,1,0,1


In [20]:
# Count the missing values in each column of the cleaned, encoded frame.
encoded_df.isna().sum()

Unnamed: 0,0
bill_length_mm,0
bill_depth_mm,0
flipper_length_mm,0
body_mass_g,0
species_Adelie,0
species_Chinstrap,0
species_Gentoo,0
island_Biscoe,0
island_Dream,0
island_Torgersen,0


In [21]:
# Inspect each column's dtype in the cleaned, encoded frame.
encoded_df.dtypes

Unnamed: 0,0
bill_length_mm,float64
bill_depth_mm,float64
flipper_length_mm,float64
body_mass_g,float64
species_Adelie,int64
species_Chinstrap,int64
species_Gentoo,int64
island_Biscoe,int64
island_Dream,int64
island_Torgersen,int64
