In [23]:
!pip install numpy



In [24]:
import numpy as np


In [25]:
# Tuple of four single-item rows; np.array stacks them into a (4, 1) column.
a = (["A"], ["B"], ["C"], ["D"])
X = np.array(a)

# Show the array followed by its type, one per line.
print(X, type(X), sep="\n")

[['A']
 ['B']
 ['C']
 ['D']]
<class 'numpy.ndarray'>


In [26]:
# Same (4, 1) label column, this time built straight from a nested list literal.
X = np.array([
    ["A"],
    ["B"],
    ["C"],
    ["D"],
])

# Show the array followed by its type, one per line.
print(X, type(X), sep="\n")

[['A']
 ['B']
 ['C']
 ['D']]
<class 'numpy.ndarray'>


# Transforming categorical data (A, B, C, D) into one-hot binary vectors (1000, 0100, 0010, 0001)

One-Hot Encoding (OHE)
One-hot encoding converts categorical values into binary columns (0s and 1s).
By default, it creates one column per category, but this can lead to redundancy (the "dummy variable trap").

In [27]:
from sklearn.preprocessing import OneHotEncoder
# With default settings the encoder's output is a sparse matrix.
ohe = OneHotEncoder()

# Each label gets its own column:
#   A -> 1 0 0 0
#   B -> 0 1 0 0
#   C -> 0 0 1 0
#   D -> 0 0 0 1
# .todense() explicitly converts the sparse result to a dense matrix for display.
ohe.fit_transform(X).todense()


matrix([[1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 0., 1.]])

In [28]:

# sparse_output=False makes the encoder hand back a dense array directly,
# so no explicit densifying step is needed.
ohe_1 = OneHotEncoder(sparse_output=False)
ohe_1.fit_transform(X)

array([[1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]])

# Removing one variable/column (dummy encoding via drop="first")

In [29]:
# Column with three distinct labels; "A" appears twice.
X = np.array([["A"], ["A"], ["C"], ["D"]])

# Dense output, and drop="first" so the first category loses its column.
ohe_1 = OneHotEncoder(sparse_output=False, drop="first")
ohe_1.fit_transform(X)

# NOTE: the bare strings below are expression statements used as notes; the
# last one is the final expression of the cell, so it is what gets displayed.

'''
[[0., 0.],  # A (baseline)
 [0., 0.],  # A
 [1., 0.],  # C (means "C=1, D=0")
 [0., 1.]]  # D (means "C=0, D=1")'''



"""
Columns: C=1,0, D=0,1

"A" is dropped: It becomes the reference category (all zeros).

Now, the encoded features are not multicollinear."""



'''

Why Use drop="first"?
- Avoids Multicollinearity: Critical for linear models (e.g., regression) where correlated features distort coefficients.

- Reduces Dimensionality: For k categories, you get k-1 columns instead of k.

Example: If you have 100 categories, you save 1 column (useful for high-cardinality data).

- Interpretability: In regression, coefficients are relative to the dropped category (baseline).'''

'\n\nWhy Use drop="first"?\n- Avoids Multicollinearity: Critical for linear models (e.g., regression) where correlated features distort coefficients.\n\n- Reduces Dimensionality: For k categories, you get k-1 columns instead of k.\n\nExample: If you have 100 categories, you save 1 column (useful for high-cardinality data).\n\n- Interpretability: In regression, coefficients are relative to the dropped category (baseline).'

# Used in Binary classification

In [30]:
# Only two distinct labels this time: A (three rows) and D (one row).
X = np.array([["A"], ["A"], ["A"], ["D"]])

# Dense output with the first category's column dropped.
ohe_1 = OneHotEncoder(sparse_output=False, drop="first")
ohe_1.fit_transform(X)

# NOTE: the bare string below is the final expression of the cell, so it is
# what gets displayed.

"""
This cretes 2 labels one with 0 and another with 1. If we don't use drop, it would assign A as 1 0 and D as 0 1.

So,
[[1 0],
[1 0],
[1 0],
[0 1]]

But if used drop , we get this. So,  it's simple now.
array([[0.],
       [0.],
       [0.],
       [1.]])"""

"\nThis cretes 2 labels one with 0 and another with 1. If we don't use drop, it would assign A as 1 0 and D as 0 1.\n\nSo,\n[[1 0],\n[1 0],\n[1 0],\n[0 1]]\n\nBut if used drop , we get this. So,  it's simple now.\narray([[0.],\n       [0.],\n       [0.],\n       [1.]])"

# Error Handling for categorical data

In [31]:
# Training column that contains exactly two labels, A and B.
X = np.array([["A"], ["A"], ["A"], ["B"]])

# Fit a dense encoder on those two labels only.
ohe_1 = OneHotEncoder(sparse_output=False)
ohe_1.fit_transform(X)


array([[1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.]])

In [32]:
# Y contains "C", a label that was not present when ohe_1 was fitted (A and B only).
Y = np.array([["A"], ["A"], ["B"], ["C"]])

# ohe_1 was trained on 2 labels, so transforming data with a third label raises
# a ValueError. Catch and print it: the error is the demonstration, and an
# uncaught exception would stop a Restart & Run All at this cell.
try:
    ohe_1.transform(Y)
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: Found unknown categories [np.str_('C')] in column 0 during transform

In [None]:
# We can solve this by telling the encoder to ignore unknown labels.
# Fit on the two-label data X (A, B) only: re-fitting on Y would just learn C
# as a third category and never exercise handle_unknown at all.
ohe_1 = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
ohe_1.fit(X)

# Transforming Y now succeeds: A = 1 0, B = 0 1, and the unseen label C is
# encoded as all zeros (0 0) instead of raising.
ohe_1.transform(Y)

array([[1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 0.]])

# With multiple labels

In [None]:

# Declare the full set of labels expected per column up front: four for the
# first feature and three for the second, i.e. seven output columns in total.
enc = OneHotEncoder(
    sparse_output=False,
    categories=[["A", "B", "C", "D"], ["X", "Y", "Z"]],
)
X = [["A", "X"], ["B", "Y"], ["C", "Z"]]  # the rows we actually have

# Output columns are ordered A, B, C, D, X, Y, Z. Reading down the three rows:
#   column A is 1, 0, 0
#   column B is 0, 1, 0
#   column D is 0, 0, 0 -- D never occurs in X, but it still gets a column
#                          because it was listed in `categories`.
# These notes are comments rather than a trailing string literal so that the
# encoded array is the cell's displayed result.
enc.fit_transform(X)

array([[1., 0., 0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0., 0., 1.]])

In [None]:
Y = [["D", "Z"]]

# Reuse the already-fitted encoder (transform, not fit_transform). Its declared
# categories A, B, C, D and X, Y, Z cover this row, so it is encoded as
# A=0, B=0, C=0, D=1, X=0, Y=0, Z=1.
enc.transform(Y)

array([[0., 0., 0., 1., 0., 0., 1.]])

# Ordinal Encoding: categories are converted to integer codes in a single column, whereas one-hot encoding creates one binary column per category

Purpose: Designed for encoding feature variables (X) with ordinal categories (categories with an inherent order).

Input: Works on 2D arrays (multiple columns).

Output: Returns a 2D array where each category is replaced by an integer.

In [None]:
from sklearn.preprocessing import OrdinalEncoder
# One feature column with three levels.
X = [["High"], ["Low"], ["Low"], ["Medium"]]

# No categories given, so the codes come out as High=0, Low=1, Medium=2.
enc = OrdinalEncoder()
enc.fit_transform(X)


array([[0.],
       [1.],
       [1.],
       [2.]])

# What if we want to customize , Low as 0, Medium as 1, High as 2

In [None]:
# Pass the desired order explicitly so that Low=0, Medium=1, High=2.
enc_1 = OrdinalEncoder(
    categories=[["Low", "Medium", "High"]],
)
enc_1.fit_transform(X)

array([[2.],
       [0.],
       [0.],
       [1.]])

# Multiple labels

In [None]:
# Two feature columns: a level (Low/Medium/High) and a letter (A/B/C).
X = [["High", "A"], ["Low", "C"], ["Low", "B"], ["Medium", "C"]]

# One category list per column fixes the codes:
#   column 0: Low=0, Medium=1, High=2
#   column 1: A=0, B=1, C=2
enc = OrdinalEncoder(
    categories=[["Low", "Medium", "High"], ["A", "B", "C"]],
)

# In the output, a row of 2 0 means (High, A).
enc.fit_transform(X)


array([[2., 0.],
       [0., 2.],
       [0., 1.],
       [1., 2.]])

In [None]:
X = [["cat", "small"], ["dog", "medium"], ["cat", "large"]]
oe = OrdinalEncoder()

# No categories are given, so each column is coded in sorted label order:
#   column 0: cat=0, dog=1
#   column 1: large=0, medium=1, small=2
# Expected result: [[0., 2.], [1., 1.], [0., 0.]]
X_encoded = oe.fit_transform(X)
print(X_encoded)

[[0. 2.]
 [1. 1.]
 [0. 0.]]


# Label Encoding
Purpose: Primarily used for encoding target labels (i.e., the dependent variable y in supervised learning).

Input: Works on 1D arrays (single column).

Output: Returns a 1D array of integers.

In [None]:
from sklearn.preprocessing import LabelEncoder
# Target labels as a flat (1D) list.
y = ["A", "B", "B", "C", "D"]

# Each distinct label is mapped to an integer code.
enc = LabelEncoder()
enc.fit_transform(y)


array([0, 1, 1, 2, 3])