In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
print("="*60)
print(" WHY WE NEED CATEGORICAL ENCODING")
print("="*60)

# Create sample data with categories
data = pd.DataFrame({
    'Customer': ['John', 'Jane', 'Bob', 'Alice', 'Charlie'],
    'Product': ['Laptop', 'Phone', 'Tablet', 'Laptop', 'Phone'],
    'Size': ['Small', 'Medium', 'Large', 'Medium', 'Small'],
    'Color': ['Red', 'Blue', 'Green', 'Red', 'Blue'],
    'Price': [1000, 800, 600, 1100, 850]
})

print("üìä Original Data:")
print(data)

# Try to use in ML (will fail!)
print("\n‚ùå Problem:")
print("   Machine learning algorithms need NUMBERS, not text")
print("   We need to convert 'Laptop' ‚Üí 1, 'Phone' ‚Üí 2, etc.")
print("\n   But there are TWO ways to do this...")

 WHY WE NEED CATEGORICAL ENCODING
üìä Original Data:
  Customer Product    Size  Color  Price
0     John  Laptop   Small    Red   1000
1     Jane   Phone  Medium   Blue    800
2      Bob  Tablet   Large  Green    600
3    Alice  Laptop  Medium    Red   1100
4  Charlie   Phone   Small   Blue    850

‚ùå Problem:
   Machine learning algorithms need NUMBERS, not text
   We need to convert 'Laptop' ‚Üí 1, 'Phone' ‚Üí 2, etc.

   But there are TWO ways to do this...


In [3]:
from sklearn.preprocessing import LabelEncoder

print("\n" + "="*60)
print(" LABEL ENCODING (For Ordinal Data)")
print("="*60)

# Example: T-shirt sizes have natural order
sizes = pd.DataFrame({
    'Size': ['Small', 'Medium', 'Large', 'Medium', 'Small', 'XL', 'Small', 'Large']
})

print("üìä Original Sizes:")
print(sizes['Size'].value_counts())

# Apply Label Encoding
le = LabelEncoder()
sizes['Size_Encoded'] = le.fit_transform(sizes['Size'])

print("\nüìä After Label Encoding:")
print(sizes)
print(f"\nMapping: {dict(zip(le.classes_, le.transform(le.classes_)))}")

print("\n‚úì WHEN TO USE:")
print("   ‚Ä¢ Education: High School < Bachelor < Master < PhD")
print("   ‚Ä¢ T-Shirt Size: XS < S < M < L < XL")
print("   ‚Ä¢ Rating: Poor < Fair < Good < Excellent")
print("   ‚Ä¢ Priority: Low < Medium < High")

print("\n‚ùå WHEN NOT TO USE:")
print("   ‚Ä¢ Colors: Red, Blue, Green (no order)")
print("   ‚Ä¢ Cities: NYC, LA, Chicago (no order)")
print("   ‚Ä¢ Product Categories: Laptop, Phone, Tablet (no order)")

# Create proper ordinal encoding with order
from sklearn.preprocessing import OrdinalEncoder

size_order = [['Small', 'Medium', 'Large', 'XL']]
ordinal_encoder = OrdinalEncoder(categories=size_order)
sizes['Size_Ordinal'] = ordinal_encoder.fit_transform(sizes[['Size']])

print("\nüìä Proper Ordinal Encoding:")
print(sizes)
print("   Small=0, Medium=1, Large=2, XL=3 ‚úì (Maintains order)")


 LABEL ENCODING (For Ordinal Data)
üìä Original Sizes:
Size
Small     3
Medium    2
Large     2
XL        1
Name: count, dtype: int64

üìä After Label Encoding:
     Size  Size_Encoded
0   Small             2
1  Medium             1
2   Large             0
3  Medium             1
4   Small             2
5      XL             3
6   Small             2
7   Large             0

Mapping: {'Large': np.int64(0), 'Medium': np.int64(1), 'Small': np.int64(2), 'XL': np.int64(3)}

‚úì WHEN TO USE:
   ‚Ä¢ Education: High School < Bachelor < Master < PhD
   ‚Ä¢ T-Shirt Size: XS < S < M < L < XL
   ‚Ä¢ Rating: Poor < Fair < Good < Excellent
   ‚Ä¢ Priority: Low < Medium < High

‚ùå WHEN NOT TO USE:
   ‚Ä¢ Colors: Red, Blue, Green (no order)
   ‚Ä¢ Cities: NYC, LA, Chicago (no order)
   ‚Ä¢ Product Categories: Laptop, Phone, Tablet (no order)

üìä Proper Ordinal Encoding:
     Size  Size_Encoded  Size_Ordinal
0   Small             2           0.0
1  Medium             1           1.0
2   Large   

In [4]:
print("\n" + "="*60)
print(" ONE-HOT ENCODING (For Nominal Data)")
print("="*60)

# Example: Product categories (no natural order)
products = pd.DataFrame({
    'Product': ['Laptop', 'Phone', 'Tablet', 'Laptop', 'Phone', 'Watch', 'Tablet'],
    'Price': [1000, 800, 600, 1100, 850, 300, 650]
})

print("üìä Original Data:")
print(products)

# Apply One-Hot Encoding
products_encoded = pd.get_dummies(products, columns=['Product'], prefix='Product')

print("\nüìä After One-Hot Encoding:")
print(products_encoded)

print("\nüí° HOW IT WORKS:")
print("   ‚Ä¢ Creates a NEW COLUMN for each category")
print("   ‚Ä¢ Uses 1 if that's the category, 0 otherwise")
print("   ‚Ä¢ 'Laptop' becomes: Product_Laptop=1, Product_Phone=0, Product_Tablet=0")

print("\n‚úì ADVANTAGES:")
print("   ‚Ä¢ No false ordering (Laptop ‚â† 'better' than Phone)")
print("   ‚Ä¢ ML algorithms treat each category independently")

print("\n‚ö†Ô∏è  DISADVANTAGES:")
print("   ‚Ä¢ Creates many columns if many categories")
print("   ‚Ä¢ Can lead to 'curse of dimensionality'")

# Alternative: Drop first column to avoid multicollinearity
products_encoded_drop = pd.get_dummies(products, columns=['Product'], prefix='Product', drop_first=True)

print("\nüìä One-Hot with drop_first=True:")
print(products_encoded_drop)
print("\nüí° Why drop_first?")
print("   ‚Ä¢ Avoids multicollinearity (columns perfectly predict each other)")
print("   ‚Ä¢ If Product_Phone=0 AND Product_Tablet=0, we KNOW it's Laptop")
print("   ‚Ä¢ Reduces dimensions without losing information")


 ONE-HOT ENCODING (For Nominal Data)
üìä Original Data:
  Product  Price
0  Laptop   1000
1   Phone    800
2  Tablet    600
3  Laptop   1100
4   Phone    850
5   Watch    300
6  Tablet    650

üìä After One-Hot Encoding:
   Price  Product_Laptop  Product_Phone  Product_Tablet  Product_Watch
0   1000            True          False           False          False
1    800           False           True           False          False
2    600           False          False            True          False
3   1100            True          False           False          False
4    850           False           True           False          False
5    300           False          False           False           True
6    650           False          False            True          False

üí° HOW IT WORKS:
   ‚Ä¢ Creates a NEW COLUMN for each category
   ‚Ä¢ Uses 1 if that's the category, 0 otherwise
   ‚Ä¢ 'Laptop' becomes: Product_Laptop=1, Product_Phone=0, Product_Tablet=0

‚úì ADVANTAGES