# CHAPTER-5 Handling Categorical Data

In [3]:
# Not all Categorical data is same.

# Nominal: A set of categories with no ordering is called "Nominal".

#     Ex:(Blue, Red, Green), (Man, Woman), (Apple, Banana, Orange)
    
# Ordinal: When a set of categories has a natural ordering it is called "Ordinal".

#     Ex: (Low, Medium, High), (Young, Old)


In [None]:
# k-nearest neighbor:
#     distance between 2 observations(Euclidean distance)
    
# ***Calculating this distance is not possible if we are working with strings.
#     Convert them to numerical data using a transformation that conveys the information in the categories

## 5.1 Encoding Nominal Categorical Features

Nominal variables: variables with no ordering

In [11]:
import numpy as np
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer

In [12]:
# Build the nominal feature as a single-column array (one observation per row)

feature = np.array(
    ["Texas", "California", "Texas", "Delaware", "Texas"]
).reshape(-1, 1)
# the distinct array elements are referred to as class names

In [13]:
# Instantiate the one-hot encoder; the label values and dense output are the
# library defaults, spelled out here so the encoding shown below is explicit

one_hot = LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)

In [33]:
# Learn the classes from the feature, then one-hot encode that same feature
one_hot.fit(feature).transform(feature)

# each row has a 1 in the column of the class that appears and 0s elsewhere

array([[0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 1, 0],
       [0, 0, 1]])

In [34]:
# View the classes learned by the encoder; in the outputs recorded in this notebook,
# column i of the one-hot array corresponds to classes_[i]
one_hot.classes_

array(['California', 'Delaware', 'Texas'], dtype='<U10')

In [35]:
# to reverse the one_hot encoding

one_hot.inverse_transform(one_hot.fit_transform(feature))

array(['Texas', 'California', 'Texas', 'Delaware', 'Texas'], dtype='<U10')

In [36]:
# one-hot encoding can also be done using pandas

import pandas as pd

In [37]:
# One indicator column per class, built from the single column of `feature`
pd.get_dummies(pd.Series(feature[:, 0]))

# each row has a 1 in the column of the class that appears and 0s elsewhere

Unnamed: 0,California,Delaware,Texas
0,0,0,1
1,1,0,0
2,0,0,1
3,0,1,0
4,0,0,1


In [38]:
# What differentiates scikit-learn from pandas here is its ability to handle
# observations that carry several classes at once.

# Multiclass feature: each observation is a (first class, second class) pair.
# NOTE(review): "Delware" looks like a typo for "Delaware"; it is kept as-is
# because the outputs recorded further down were produced with this spelling.
multiclass_feature = list(zip(
    ("Texas", "California", "Texas", "Delware", "Texas"),
    ("Florida", "Alabama", "Florida", "Florida", "Alabama"),
))

In [39]:
# Instantiate the multiclass one-hot encoder (classes inferred from the data,
# dense output; both are the library defaults, written out explicitly)

one_hot_multiclass = MultiLabelBinarizer(classes=None, sparse_output=False)

In [40]:
# Learn the set of classes, then encode every observation against it
one_hot_multiclass.fit(multiclass_feature).transform(multiclass_feature)

# each row has a 1 in the column of every class that appears and 0s elsewhere

array([[0, 0, 0, 1, 1],
       [1, 1, 0, 0, 0],
       [0, 0, 0, 1, 1],
       [0, 0, 1, 1, 0],
       [1, 0, 0, 0, 1]])

In [41]:
# View the classes learned by the multiclass encoder
one_hot_multiclass.classes_

array(['Alabama', 'California', 'Delware', 'Florida', 'Texas'],
      dtype=object)

## 5.2 Encoding ordinal categorical features

Ordinal variables: variables with natural ordering

In [47]:
# replace() method to transform string labels to numerical equivalents

In [50]:
# Creating a feature: a one-column frame of ordinal string labels
ordinal_df = pd.Series(
    ["Low", "Low", "Medium", "Medium", "High"], name="Score"
).to_frame()

In [64]:
# Mapping from each ordinal label to its numeric rank
score_mapper = dict(Low=1, Medium=2, High=3)

In [65]:
# Swap every label in the Score column for its numeric rank from the mapper.
# NOTE(review): the output recorded under this cell lists a sixth row
# ("Barely More Than Medium") that `ordinal_df` as defined in this notebook does
# not contain; it appears to be left over from an earlier kernel state. It does
# illustrate that a label missing from the mapper is passed through unchanged.
ordinal_df.loc[:, "Score"].replace(to_replace=score_mapper)

0                          1
1                          1
2                          2
3                          2
4                          3
5    Barely More Than Medium
Name: Score, dtype: object

In [66]:
# Same feature as before plus one extra label that sits between "Medium" and "High"
ordinal_df_1 = pd.Series(
    ["Low", "Low", "Medium", "Medium", "High", "Barely More Than Medium"],
    name="Score",
).to_frame()

In [67]:
# Score mapper that gives the extra label its own integer rank
score_mapper_1 = dict([
    ("Low", 1),
    ("Medium", 2),
    ("Barely More Than Medium", 3),
    ("High", 4),
])

In [68]:
# Encode the extended feature with the integer-ranked mapper
ordinal_df_1.loc[:, "Score"].replace(to_replace=score_mapper_1)

0    1
1    1
2    2
3    2
4    4
5    3
Name: Score, dtype: int64

In [69]:
# Score mapper that places the extra label just above "Medium" (2.1)
# instead of giving it a full integer step of its own
score_mapper_2 = dict([
    ("Low", 1),
    ("Medium", 2),
    ("Barely More Than Medium", 2.1),
    ("High", 3),
])

In [70]:
# Encode the extended feature with the fractional-rank mapper
ordinal_df_1.loc[:, "Score"].replace(to_replace=score_mapper_2)

0    1.0
1    1.0
2    2.0
3    2.0
4    3.0
5    2.1
Name: Score, dtype: float64

## 5.3 Encoding Dictionaries of Features

Converting a Dictionary into feature matrix

In [75]:
from sklearn.feature_extraction import DictVectorizer

In [99]:
# Creating the input: a list with one feature dictionary per observation

data_dict = [
    dict(Red=2, Blue=4),
    dict(Red=4, Blue=3),
    dict(Red=1, Yellow=2),
    dict(Red=2, Yellow=2),
]

In [100]:
# Dictionary vectorizer: transforms dict-like objects into feature vectors

# sparse is True by default; a sparse matrix stores only the non-zero elements,
# which is useful when we want to minimize memory. A dense array is requested
# here so the result can be displayed directly. sort=True is the library default.
dictvectorizer = DictVectorizer(sparse=False, sort=True)

In [101]:
# Learn the feature names from the dictionaries, then build the feature matrix
features = dictvectorizer.fit(data_dict).transform(data_dict)
features

array([[4., 2., 0.],
       [3., 4., 0.],
       [0., 1., 2.],
       [0., 2., 2.]])

In [102]:
# dictvectorizer = DictVectorizer().fit_transform(data_dict)
# dictvectorizer

In [104]:
# names of the generated features, i.e. the labels of the feature-matrix columns

feature_names = dictvectorizer.get_feature_names_out()
feature_names

array(['Blue', 'Red', 'Yellow'], dtype=object)

In [106]:
# Wrap the matrix in a DataFrame so each column is labelled with its feature name

pd.DataFrame(data=features, columns=feature_names)

Unnamed: 0,Blue,Red,Yellow
0,4.0,2.0,0.0
1,3.0,4.0,0.0
2,0.0,1.0,2.0
3,0.0,2.0,2.0


In [107]:
# Example: for a set of documents we have one dictionary each, holding the
# number of times every word appears in that document

# word-count dictionaries for four documents

doc1_wordCount = dict(Red=2, Blue=4)
doc2_wordCount = dict(Red=4, Blue=3)
doc3_wordCount = dict(Red=1, Yellow=2)
doc4_wordCount = dict(Red=2, Yellow=2)

In [113]:
# Creating a list of the four per-document word-count dictionaries

word_counts = [doc1_wordCount, doc2_wordCount, doc3_wordCount, doc4_wordCount]

In [114]:
# Turn the word-count dictionaries into a document-by-word feature matrix
dictvectorizer.fit(word_counts).transform(word_counts)

array([[4., 2., 0.],
       [3., 4., 0.],
       [0., 1., 2.],
       [0., 2., 2.]])

## 5.4 Imputing Missing class values

Replacing missing values with predicted values.
Can be done by training a machine learning classifier algorithm (KNN classifier)

In [120]:
from sklearn.neighbors import KNeighborsClassifier

In [121]:
# Feature matrix assembled column by column.
# The first column is the categorical feature; the other two are numeric.

X = np.column_stack((
    [0, 1, 0, 1],
    [2.10, 1.18, 1.22, -0.21],
    [1.45, 1.33, 1.27, -1.19],
))

In [122]:
# Observations whose categorical feature (first column) is missing

X_nan = np.column_stack((
    np.full(2, np.nan),
    [0.87, -0.67],
    [1.31, -0.22],
))

In [130]:
# Training the KNN classifier that will predict the missing class values

# n_neighbors=3: number of neighbors to use.
# weights: "uniform"  - all points in the neighborhood are weighted equally
#          "distance" - nearer points have more influence than points farther away
clf = KNeighborsClassifier(n_neighbors=3, weights="distance")

# training data: every column except the first (X[:, 1:])
# target data:   the categorical first column (X[:, 0])
trained_model = clf.fit(X[:, 1:], X[:, 0])

In [131]:
# Predict the missing class (first column) from the remaining feature columns

imputed_values = trained_model.predict(X=X_nan[:, 1:])

In [132]:
# Show the predicted class for each observation in X_nan
imputed_values

array([0., 1.])

In [136]:
# Put the predicted classes back as the first column, in front of the other features

# column_stack turns the 1-D prediction vector into a column before joining
# it side by side with the remaining feature columns
X_imputed = np.column_stack((imputed_values, X_nan[:, 1:]))

In [137]:
# Join the two feature matrices: imputed observations first, then the originals

np.concatenate((X_imputed, X), axis=0)
# concatenating along axis 0 appends the second matrix's rows below the first's

array([[ 0.  ,  0.87,  1.31],
       [ 1.  , -0.67, -0.22],
       [ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.21, -1.19]])

In [145]:
# ALTERNATIVE: We can also fill missing values with most frequent value in the data set

from sklearn.impute import SimpleImputer

In [147]:
# Stack the observations with missing classes on top of the complete ones
X_whole = np.concatenate((X_nan, X), axis=0)

# strategy: 'mean'          - replaces missing values with the mean of each column
#           'median'        - replaces missing values with the median of each column
#           'most_frequent' - replaces missing values with the most frequent value of the column
imputer = SimpleImputer(strategy="most_frequent")

In [148]:
# Fill the missing entries of X_whole using the imputer's 'most_frequent' strategy
imputer.fit_transform(X_whole)

array([[ 0.  ,  0.87,  1.31],
       [ 0.  , -0.67, -0.22],
       [ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.21, -1.19]])

## 5.5 Handling Imbalanced Classes

In [150]:
# When a target vector has highly imbalanced classes we can:
#     collect more data / change the evaluation metric / use built-in class weight parameters / upsample / downsample

In [164]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

In [165]:
# Load the iris dataset bundled with scikit-learn; the following cells read
# its `data` and `target` attributes
iris = load_iris()

In [166]:
# Pull the feature matrix and the target vector out of the loaded dataset

feature_iris, target_iris = iris.data, iris.target

In [167]:
# Iris dataset contains 3 balanced classes of 50 observations each.
# To unbalance the dataset, drop the first 40 observations.
#
# The slices are taken from the loaded `iris` object rather than from
# `feature_iris` / `target_iris` themselves, so the cell is idempotent:
# re-running it no longer strips another 40 rows on every execution.

feature_iris = iris.data[40:, :]
target_iris = iris.target[40:]

In [168]:
# Inspect the target vector after the first 40 observations were removed
target_iris

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [169]:
# Collapse the target to a binary vector: class 0 stays 0, every other class becomes 1
target_iris = (target_iris != 0).astype(int)

target_iris

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [174]:
# scikit-learn's RandomForestClassifier accepts a "class_weight" parameter
# for handling imbalance in the dataset

# explicit weight per class label: class 0 gets 0.9, class 1 gets 0.1
weights = dict([(0, 0.9), (1, 0.1)])

# random forest classifier that uses those weights
RandomForestClassifier(class_weight=weights)

In [176]:
# Alternatively, pass "balanced", which automatically creates weights inversely
# proportional to the class frequencies

RandomForestClassifier(class_weight = "balanced")

In [182]:
# We can also downsample the majority class or upsample the minority class

# Example

# Positions of the observations belonging to each class

iris_class0 = np.flatnonzero(target_iris == 0)
iris_class1 = np.flatnonzero(target_iris == 1)

In [188]:
# Number of observations in each class

n_class0, n_class1 = len(iris_class0), len(iris_class1)

In [192]:
# Downsample "class1" to the size of "class0", sampling without replacement.
# A seeded Generator is used instead of the unseeded global np.random state so
# the same subset is drawn on every Restart & Run All.

class1_downsampled = np.random.default_rng(0).choice(iris_class1, size=n_class0, replace=False)

In [194]:
# Target vector of the balanced sample: all of class0 followed by the downsampled class1

np.concatenate((target_iris[iris_class0], target_iris[class1_downsampled]))

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [196]:
# Feature matrix of the balanced sample (class0 rows, then downsampled class1 rows);
# only the first five rows are displayed

np.concatenate(
    (feature_iris[iris_class0, :], feature_iris[class1_downsampled, :]),
    axis=0,
)[:5]

array([[5. , 3.5, 1.3, 0.3],
       [4.5, 2.3, 1.3, 0.3],
       [4.4, 3.2, 1.3, 0.2],
       [5. , 3.5, 1.6, 0.6],
       [5.1, 3.8, 1.9, 0.4]])

In [197]:
# We can also upsample the minority class.

# For every observation in the majority class (class1) we randomly select, with
# replacement, an observation from the minority class (class0).
# A seeded Generator is used instead of the unseeded global np.random state so
# the same sample is drawn on every Restart & Run All.

class0_upsampled = np.random.default_rng(0).choice(iris_class0, size=n_class1, replace=True)

In [198]:
# Target vector of the balanced sample: upsampled class0 followed by all of class1
np.hstack((target_iris[class0_upsampled], target_iris[iris_class1]))

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1])

In [201]:
# Feature matrix of the balanced sample (upsampled class0 rows, then class1 rows);
# only the first five rows are displayed

np.concatenate(
    (feature_iris[class0_upsampled, :], feature_iris[iris_class1, :]),
    axis=0,
)[:5]

array([[5.3, 3.7, 1.5, 0.2],
       [4.4, 3.2, 1.3, 0.2],
       [5.3, 3.7, 1.5, 0.2],
       [4.4, 3.2, 1.3, 0.2],
       [5.1, 3.8, 1.9, 0.4]])

Strategies for handling imbalanced datasets:
    
    I. Collect more observations, especially of minority classes. (best strategy)
    II. Use a better-suited model evaluation metric. (accuracy, the usual performance measure, can be ill-suited to imbalanced classes)
    III. Use the class weighting parameter available in some models. (many scikit-learn classifiers have a class_weight parameter)
    IV/V. Upsampling and downsampling. (Choosing either option is context-specific and it's good to try both)

KEYPOINTS:
    
    *Machine learning algorithms expect data to be in the form of a matrix; one way to transform dictionaries into that form is scikit-learn's DictVectorizer
    
    *KNN is a Supervised ML, K-means is Unsupervised ML.

KEYWORDS:
    
    one_hot encoding- conversion of categorical data into a format that is fed into machine learning algorithm
    
    fit_transform- fit a transformer to the data and transform that same data in a single step
    
    LabelBinarizer- takes categorical data and returns numpy array
    
    MultiLabelBinarizer- takes multiclass categorical data and returns numpy array
    
    DictVectorizer- transforms dict-like objects to vectors
    
    KNeighborsClassifier- classifier that predicts the class of an observation from the classes of its k nearest neighbors