In [70]:
import pandas as pd
import numpy as np

# Set random seed for reproducibility
np.random.seed(42)

# Define sample data
num_records = 1000

# Generate random categorical data
colors = np.random.choice(['red', 'blue', 'green', 'yellow'], num_records)
sizes = np.random.choice(['small', 'medium', 'large'], num_records)
categories = np.random.choice(['A', 'B', 'C'], num_records)

# Generate random numerical data
values = np.random.randn(num_records) * 10

# Create DataFrame
df = pd.DataFrame({
    'color': colors,
    'size': sizes,
    'category': categories,
    'value': values
})

print(df.head())


    color    size category      value
0   green  medium        A  -6.108899
1  yellow   large        A  11.945319
2     red   small        A  12.440339
3   green   small        B -22.031605
4   green   small        A  -7.979595


## 1.Using LabelEncoder

### fit


In [71]:
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
le_encoder = LabelEncoder()

# Fit the encoder on the 'color' column
le_encoder.fit(df['color'])

# Print the classes learned by the encoder
print("Classes:", le_encoder.classes_)


Classes: ['blue' 'green' 'red' 'yellow']


### fit_transform

In [72]:
# Initialize separate LabelEncoder instances
le_color = LabelEncoder()
le_size = LabelEncoder()
le_category = LabelEncoder()

# Fit and transform the 'color', 'size', and 'category' columns
df['color_encoded'] = le_color.fit_transform(df['color'])
df['size_encoded'] = le_size.fit_transform(df['size'])
df['category_encoded'] = le_category.fit_transform(df['category'])

print(df.head())


    color    size category      value  color_encoded  size_encoded  \
0   green  medium        A  -6.108899              1             1   
1  yellow   large        A  11.945319              3             0   
2     red   small        A  12.440339              2             2   
3   green   small        B -22.031605              1             2   
4   green   small        A  -7.979595              1             2   

   category_encoded  
0                 0  
1                 0  
2                 0  
3                 1  
4                 0  


### inverse_transform

In [73]:
# Convert the encoded 'color' values back to the original labels
original_colors = le_color.inverse_transform(df['color_encoded'])
original_sizes = le_size.inverse_transform(df['size_encoded'])
original_categories = le_category.inverse_transform(df['category_encoded'])

print(original_colors[:5])  # Display first 5 original labels
print(original_sizes[:5])
print(original_categories[:5])


['green' 'yellow' 'red' 'green' 'green']
['medium' 'large' 'small' 'small' 'small']
['A' 'A' 'A' 'B' 'A']


## Using OneHotEncoder

### fit

In [74]:

# Create DataFrame
df = pd.DataFrame({
    'color': colors,
    'size': sizes,
    'category': categories,
    'value': values
})

from sklearn.preprocessing import OneHotEncoder

# Initialize OneHotEncoder
ohe = OneHotEncoder(sparse_output=False,drop='first')  # Set sparse_output=False for a dense array

# Fit the encoder on the 'color' column
ohe.fit(df[['color']])

# Print the categories learned by the encoder
print("Categories:", ohe.categories_)


Categories: [array(['blue', 'green', 'red', 'yellow'], dtype=object)]


### fit_transform

In [75]:
# Initialize OneHotEncoder
ohe_color = OneHotEncoder(sparse_output=False,drop='first')
ohe_size = OneHotEncoder(sparse_output=False,drop='first')
ohe_category = OneHotEncoder(sparse_output=False,drop='first')

# Fit and transform the 'color' column
color_encoded = ohe_color.fit_transform(df[['color']])
color_encoded_df = pd.DataFrame(color_encoded, columns=ohe_color.get_feature_names_out(['color']))

# Fit and transform the 'size' column
size_encoded = ohe_size.fit_transform(df[['size']])
size_encoded_df = pd.DataFrame(size_encoded, columns=ohe_size.get_feature_names_out(['size']))

# Fit and transform the 'category' column
category_encoded = ohe_category.fit_transform(df[['category']])
category_encoded_df = pd.DataFrame(category_encoded, columns=ohe_category.get_feature_names_out(['category']))

# Combine the encoded columns with the original DataFrame (excluding original categorical columns)
encoded_df = pd.concat([color_encoded_df, size_encoded_df, category_encoded_df, df[['value']].reset_index(drop=True)], axis=1)

print(encoded_df.head())


   color_green  color_red  color_yellow  size_medium  size_small  category_B  \
0          1.0        0.0           0.0          1.0         0.0         0.0   
1          0.0        0.0           1.0          0.0         0.0         0.0   
2          0.0        1.0           0.0          0.0         1.0         0.0   
3          1.0        0.0           0.0          0.0         1.0         1.0   
4          1.0        0.0           0.0          0.0         1.0         0.0   

   category_C      value  
0         0.0  -6.108899  
1         0.0  11.945319  
2         0.0  12.440339  
3         0.0 -22.031605  
4         0.0  -7.979595  


In [76]:
# First fit. Then transform the 'size' column

# ohe_size.fit(df[['size']])
# size_encoded = ohe_size.transform(df[['size']])
# size_encoded_df = pd.DataFrame(size_encoded, columns=ohe.get_feature_names_out(['size']))


### inverse_transform

In [77]:
# Convert the one-hot encoded  values back to the original labels
original_colors = ohe_color.inverse_transform(color_encoded)
original_sizes = ohe_size.inverse_transform(size_encoded)
original_categories = ohe_category.inverse_transform(category_encoded)

# Display first 5 original labels
print(original_colors[:5])
print('')
print(original_sizes[:5])
print('')
print(original_categories[:5])


[['green']
 ['yellow']
 ['red']
 ['green']
 ['green']]

[['medium']
 ['large']
 ['small']
 ['small']
 ['small']]

[['A']
 ['A']
 ['A']
 ['B']
 ['A']]


### Using pandas `get_dummies`

In [78]:
# Create DataFrame
df = pd.DataFrame({
    'color': colors,
    'size': sizes,
    'category': categories,
    'value': values
})

# Convert categorical columns to dummy/indicator variables
dummies_df = pd.get_dummies(df, drop_first=True)

dummies_df = dummies_df.astype(int)

print(dummies_df.head())

   value  color_green  color_red  color_yellow  size_medium  size_small  \
0     -6            1          0             0            1           0   
1     11            0          0             1            0           0   
2     12            0          1             0            0           1   
3    -22            1          0             0            0           1   
4     -7            1          0             0            0           1   

   category_B  category_C  
0           0           0  
1           0           0  
2           0           0  
3           1           0  
4           0           0  


## OneHotEncoder with ColumnTransfer

In [79]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer



# Create DataFrame
df = pd.DataFrame({
    'color': colors,
    'size': sizes,
    'category': categories,
    'value': values
})

print(df.head())


    color    size category      value
0   green  medium        A  -6.108899
1  yellow   large        A  11.945319
2     red   small        A  12.440339
3   green   small        B -22.031605
4   green   small        A  -7.979595


In [80]:
# Initialize OneHotEncoder and StandardScaler
ohe = OneHotEncoder(sparse_output=False, drop='first')
scaler = StandardScaler()

# Create a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', ohe, ['color', 'size', 'category']),  # Apply OneHotEncoder to categorical columns
        ('num', scaler, ['value'])                    # Apply StandardScaler to numerical column
    ],
    remainder='passthrough'  # Pass through other columns without transformation
)

# Fit and transform the data
preprocessed_data = preprocessor.fit_transform(df)

# Retrieve feature names from the fitted ColumnTransformer
# Access OneHotEncoder's feature names directly
ohe_feature_names = preprocessor.named_transformers_['cat'].get_feature_names_out(['color', 'size', 'category'])
feature_names = list(ohe_feature_names) + ['value']

# Create DataFrame with preprocessed data
preprocessed_df = pd.DataFrame(preprocessed_data, columns=feature_names)

print(preprocessed_df.head())

   color_green  color_red  color_yellow  size_medium  size_small  category_B  \
0          1.0        0.0           0.0          1.0         0.0         0.0   
1          0.0        0.0           1.0          0.0         0.0         0.0   
2          0.0        1.0           0.0          0.0         1.0         0.0   
3          1.0        0.0           0.0          0.0         1.0         1.0   
4          1.0        0.0           0.0          0.0         1.0         0.0   

   category_C     value  
0         0.0 -0.660155  
1         0.0  1.191936  
2         0.0  1.242717  
3         0.0 -2.293584  
4         0.0 -0.852060  
