<a href="https://colab.research.google.com/github/Gabrielsandbox/AI-ML-Codebase/blob/main/FeatureEngineering_Categorical_Transform.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Ordinal Encoding** (Ordinal Data)

In [None]:
# Inspect how often each condition label occurs.
print(cars['condition'].value_counts())
# OUTPUT
# New          2881
# Like New     2860
# Good         2027
# Fair          753
# Excellent     186

# --- Option 1: plain pandas mapping ---
# Map every label to its rank; a higher number means a better condition.
rating_dict = {'Excellent': 5, 'New': 4, 'Like New': 3, 'Good': 2, 'Fair': 1}

# Create a new column holding the numeric rating.
# Labels that are not keys of rating_dict end up as NaN.
cars['condition_rating'] = cars['condition'].map(rating_dict)

# --- Option 2: scikit-learn (alternative to option 1) ---
# NOTE: this overwrites the 'condition_rating' column written by option 1.
from sklearn.preprocessing import OrdinalEncoder

# OrdinalEncoder numbers the categories by their position in this list,
# starting at 0. List them from worst to best so the direction agrees with
# rating_dict (Fair=0 ... Excellent=4, i.e. rating_dict's value minus one).
# The previous best-to-worst order gave Excellent the LOWEST code.
encoder = OrdinalEncoder(
    categories=[['Fair', 'Good', 'Like New', 'New', 'Excellent']]
)

# The encoder expects a 2-D input: one row per sample, one column per feature.
condition_reshaped = cars['condition'].values.reshape(-1, 1)

# fit_transform returns a 2-D array with a single column; flatten it so a
# plain 1-D vector is stored in the dataframe column.
cars['condition_rating'] = encoder.fit_transform(condition_reshaped).ravel()

# **Label Encoding** (Nominal Data)

In [None]:
# How many distinct colors does the dataset contain?
print(cars['color'].nunique())
# OUTPUT
# 19

# Show the five most frequent colors.
print(cars['color'].value_counts().head(5))
# OUTPUT
# black     2015
# white     1931
# gray      1506
# silver    1503
# blue       869

# Cast the feature to the pandas 'category' dtype and replace it, in one
# step, with the integer code pandas assigned to each category.
cars['color'] = cars['color'].astype('category').cat.codes

# Same frequency table as above, now keyed by the integer codes.
print(cars['color'].value_counts().head(5))
# OUTPUT
# 2     2015
# 18    1931
# 8     1506
# 15    1503
# 3      869


'''
OR
'''

from sklearn.preprocessing import LabelEncoder

# Build the scikit-learn encoder ...
encoder = LabelEncoder()

# ... and overwrite the column with the integer labels it assigns.
cars['color'] = encoder.fit_transform(cars['color'])

# **One-hot Encoding**

In [None]:
import pandas as pd

# Use pandas' get_dummies to create one indicator column per distinct value
# of the 'color' feature.
ohe = pd.get_dummies(cars['color'])

# Join the indicator columns back onto the cars dataframe.
cars = cars.join(ohe)

# Print the column names, which now include the new indicator columns.
print(cars.columns)

# Check out one of the new columns.
# The column name is taken from `ohe` instead of being hard-coded: the
# previously hard-coded 'suv' is not one of the color values shown in this
# notebook (it looks like a body-type value), so cars['suv'] would raise a
# KeyError unless such a column already existed in `cars`.
print(cars[ohe.columns[0]].head())

# **Binary Encoding**

In [None]:
# One-hot encoding many categorical features produces a sparse matrix,
# which may cause problems for a model. Binary encoding is a strong
# alternative: the encoder finds the number of unique categories and then
# converts each category to its binary representation.

from category_encoders import BinaryEncoder

# Configure the encoder for the 'color' column only.
binary_encoder = BinaryEncoder(cols=['color'], drop_invariant=True)

# The result is a new data frame in which the color column has been removed
# and replaced with the binary feature columns (5 of them, per the original
# note for this dataset).
colors = binary_encoder.fit_transform(cars)

# **Hashing**

In [None]:
from category_encoders import HashingEncoder

# Hashing resembles one-hot encoding in that it creates new binary columns,
# but the number of output features is chosen through a parameter.
# Advantage: reduced dimensionality.
# Disadvantage: some categories will be mapped to the same values; this is
# called a collision.
# It can suit a project where the impact of any particular categorical
# value does not need to be assessed.

# Build the encoder: hash the 'color' column into 5 output features.
encoder = HashingEncoder(
    cols='color',
    n_components=5,
)

# Fit and transform the color column, keeping the result in a new variable.
hash_results = encoder.fit_transform(cars['color'])

# **Target Encoding**

In [None]:
# Target encoding is a Bayesian encoder that transforms categorical features
# into numerical values derived from the target; it is sometimes called the
# mean encoder.
# It can be used for data sets that are being prepared for regression-based
# supervised learning, because it takes into account the mean of the target
# variable for each individual category of the feature.

# Each category is replaced with a blend of the posterior probability of the
# target given that particular categorical value and the prior probability
# of the target over all the training data.

# Some drawbacks of this approach are overfitting and unevenly distributed
# values that could lead to extremes.

# Example:
# Say we are preparing our dataset for a regression-based supervised learning
# algorithm that is trying to predict the selling price.

# numpy is needed for np.sort below; it was used without being imported.
import numpy as np
from category_encoders import TargetEncoder

# instantiate our encoder
encoder = TargetEncoder(cols = 'color')

# set the results of our fit_transform to a variable
# the output is a data frame with a single 'color' column (it is indexed by
# column name below)
encoder_results = encoder.fit_transform(cars['color'], cars['sellingprice'])

print(encoder_results.head())
#   color
# 0 11761.881473
# 1 18007.276995
# 2 8458.251232
# 3 14769.292595
# 4 12691.099747

# print the sorted unique encoded values
# NOTE(review): the recorded output below lists 20 values, one more than the
# 19 colors reported by nunique() -- possibly an extra value for missing
# colors; confirm against the data.
print(np.sort(encoder_results['color'].unique()))
# OUTPUT
# [ 3054.12209927  8088.87434555  8458.25123153  9276.78571429
#   9867.50002121  9885.8093167  11043.90243902 11247.82608763
#  11761.88147296 11805.06187625 12124.83443709 12376.19047882
#  12691.09974747 13912.83399734 14769.29259451 15496.72704715
#  17174.36440678 17176.25931731 18007.27699531 18048.52540833]

# **Encoding date-time variables**

In [None]:
import pandas as pd

# The sale date starts out as a generic object (string) column.
print(cars['saledate'].dtypes)
# # OUTPUT
# dtype('O')

# Parse the strings into real datetimes so the .dt accessor becomes available.
cars['saledate'] = pd.to_datetime(cars['saledate'])
# #OUTPUT
# datetime64[ns, tzlocal()]

# create new variable for month
cars['month'] = cars['saledate'].dt.month

# create new variable for day of the week (Monday = 0 ... Sunday = 6)
# .dt.dayofweek is required here; .dt.day would return the day of the month.
cars['dayofweek'] = cars['saledate'].dt.dayofweek

# create new variable for difference between cars model year and year sold
cars['yearbuild_sold'] = cars['saledate'].dt.year - cars['year']

# Reference: accessors available through pandas .dt
# (kept as comments -- `df` and 'col' are placeholders, not runnable code)
# df['col'].dt.year                # Outputs the year
# df['col'].dt.day                 # Outputs the day number (day of the month)
# df['col'].dt.hour                # Outputs the hour from the time
# df['col'].dt.minute              # Outputs the minute from the time
# df['col'].dt.second              # Outputs the seconds from the time
# df['col'].dt.isocalendar().week  # Outputs the week ordinal of the year
#                                  # (replaces .dt.week, removed in pandas 2.0)
# df['col'].dt.dayofweek           # Outputs the day of the week with Monday = 0 & Sunday = 6