# One-Hot Encoding

This notebook demonstrates label encoding followed by one-hot encoding, two ways of encoding categorical features.

One-hot encoding is a way to encode categorical features in a dataset.

It converts categorical variables into binary vectors to represent the presence or absence of a category.

This is useful when working with machine learning algorithms that cannot directly handle categorical data.

In [12]:
#import required libraries
import pandas as pd
import numpy as np

In [13]:
# Load the house-price data from CSV into a DataFrame and preview the first five rows
df = pd.read_csv('houseprice.csv')
df.head()

Unnamed: 0,Town,Area,Price
0,A,1000,1000000
1,A,2000,2500000
2,A,2200,3000000
3,A,2500,3400000
4,A,3000,4000000


In [14]:
#import labelencoder
from sklearn.preprocessing import LabelEncoder

In [15]:
# Instantiate the label encoder used to turn the 'Town' values into integer codes
le = LabelEncoder()

In [16]:
# Preview the integer codes assigned to 'Town' (the result is displayed, not stored)
le.fit_transform(df['Town'])

array([0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2])

In [17]:
# Overwrite the 'Town' column with its integer codes
# (this fits the encoder on the column again before transforming it)
df['Town'] = le.fit_transform(df['Town'])

In [18]:
# Display the DataFrame to confirm that 'Town' now holds integer codes
df

Unnamed: 0,Town,Area,Price
0,0,1000,1000000
1,0,2000,2500000
2,0,2200,3000000
3,0,2500,3400000
4,0,3000,4000000
5,1,3500,5000000
6,1,2000,3000000
7,1,2100,3200000
8,1,2200,3300000
9,2,2700,4000000


In [19]:
# Build the feature matrix 'x' from the label-encoded 'Town' column and 'Area'
x = df.loc[:, ['Town', 'Area']].values
x

array([[   0, 1000],
       [   0, 2000],
       [   0, 2200],
       [   0, 2500],
       [   0, 3000],
       [   1, 3500],
       [   1, 2000],
       [   1, 2100],
       [   1, 2200],
       [   2, 2700],
       [   2, 2200],
       [   2, 2500],
       [   2, 3000],
       [   2, 3500],
       [   2, 4000],
       [   2, 4500]], dtype=int64)

In [20]:
# Target 'y': the 'Price' column
y = df['Price']
y

0     1000000
1     2500000
2     3000000
3     3400000
4     4000000
5     5000000
6     3000000
7     3200000
8     3300000
9     4000000
10    3500000
11    3700000
12    4000000
13    4400000
14    4800000
15    5000000
Name: Price, dtype: int64

In [21]:
#Import OneHotEncoder and ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Instantiate a standalone OneHotEncoder
# (the ColumnTransformer cell below constructs its own encoder instance)
ohe = OneHotEncoder()

In [22]:
# One-hot encode column 0 ('Town') and pass the remaining column ('Area') through unchanged.
# sparse_threshold=0 forces a dense ndarray result: the next cell wraps the output in
# np.array(), which would not densify a SciPy sparse matrix.
columnTransformer = ColumnTransformer(
    [('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)

In [23]:
# One-hot encode the 'Town' column of the feature matrix.
# The input is rebuilt from df rather than read from 'x', so re-running this cell
# always encodes the label-encoded features instead of an already-encoded 'x'.
x = np.array(columnTransformer.fit_transform(df[['Town', 'Area']].values))
x

array([[1.0e+00, 0.0e+00, 0.0e+00, 1.0e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 2.0e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 2.2e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 2.5e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.0e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.5e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.0e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.1e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.2e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.7e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.2e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.5e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.0e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.5e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 4.0e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 4.5e+03]])

In [24]:
# Drop the first dummy column to avoid multicollinearity among the town indicators.
# The matrix is re-derived from df with the already-fitted transformer, so re-running
# this cell cannot strip a further column from 'x'.
x = np.array(columnTransformer.transform(df[['Town', 'Area']].values))[:, 1:]
x

array([[0.0e+00, 0.0e+00, 1.0e+03],
       [0.0e+00, 0.0e+00, 2.0e+03],
       [0.0e+00, 0.0e+00, 2.2e+03],
       [0.0e+00, 0.0e+00, 2.5e+03],
       [0.0e+00, 0.0e+00, 3.0e+03],
       [1.0e+00, 0.0e+00, 3.5e+03],
       [1.0e+00, 0.0e+00, 2.0e+03],
       [1.0e+00, 0.0e+00, 2.1e+03],
       [1.0e+00, 0.0e+00, 2.2e+03],
       [0.0e+00, 1.0e+00, 2.7e+03],
       [0.0e+00, 1.0e+00, 2.2e+03],
       [0.0e+00, 1.0e+00, 2.5e+03],
       [0.0e+00, 1.0e+00, 3.0e+03],
       [0.0e+00, 1.0e+00, 3.5e+03],
       [0.0e+00, 1.0e+00, 4.0e+03],
       [0.0e+00, 1.0e+00, 4.5e+03]])

In [25]:
#Import LinearRegression model
from sklearn.linear_model import LinearRegression

In [26]:
# Instantiate the linear regression model (fitting happens in the next cell)
model = LinearRegression()

In [27]:
# Train the model on the encoded features 'x' and the prices 'y'
model.fit(X=x, y=y)

In [28]:
# Predict the price for one new house. Feature order after dropping the first dummy:
# [town-code-1 dummy, town-code-2 dummy, area] -> here: town code 2, area 3000
model.predict([[0, 1, 3000]])

array([3995001.26871353])