# Categorical data processing

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.feature_extraction import DictVectorizer
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# One nominal feature (a US state name) for five observations, laid out as a
# column vector with one row per observation.
feature = np.array(
    ["Texas", "California", "Texas", "Delaware", "Texas"]
).reshape(-1, 1)

In [3]:
# One-hot encode the nominal (unordered) feature: learn the distinct classes,
# then emit one indicator column per class.
one_hot = preprocessing.LabelBinarizer()
one_hot.fit(feature).transform(feature)

array([[0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 1, 0],
       [0, 0, 1]])

In [4]:
# Classes learned while fitting; their order is the column order of the
# one-hot matrix shown above.
one_hot.classes_

array(['California', 'Delaware', 'Texas'], dtype='<U10')

In [5]:
# Round trip: encode the feature once more, then map the indicator rows back
# to the original labels.
encoded = one_hot.transform(feature)
one_hot.inverse_transform(encoded)

array(['Texas', 'California', 'Texas', 'Delaware', 'Texas'], dtype='<U10')

In [6]:
# One-hot encode the nominal feature with pandas: one indicator column per
# distinct state name.
# dtype=int keeps the indicators as 0/1. Since pandas 2.0 the default dtype
# is bool, which would render the table as True/False instead.
pd.get_dummies(feature[:, 0], dtype=int)

Unnamed: 0,California,Delaware,Texas
0,0,0,1
1,1,0,0
2,0,0,1
3,0,1,0
4,0,0,1


In [7]:
# Multi-label data: every observation carries two state labels at once.
# Pairing the first and second labels position by position yields one tuple
# per observation.
multiclass_feature = list(zip(
    ["Texas", "California", "Texas", "Delaware", "Texas"],
    ["Florida", "Alabama", "Florida", "Florida", "Alabama"],
))

In [8]:
# One-hot encode multi-label observations: a row gets a 1 in the column of
# every label it carries.
one_hot_multiclass = preprocessing.MultiLabelBinarizer()
one_hot_multiclass.fit(multiclass_feature)
one_hot_multiclass.transform(multiclass_feature)

array([[0, 0, 0, 1, 1],
       [1, 1, 0, 0, 0],
       [0, 0, 0, 1, 1],
       [0, 0, 1, 1, 0],
       [1, 0, 0, 0, 1]])

In [9]:
# Labels learned while fitting; their order is the column order of the
# multi-label indicator matrix shown above.
one_hot_multiclass.classes_

array(['Alabama', 'California', 'Delaware', 'Florida', 'Texas'],
      dtype=object)

In [10]:
# Ordinal feature: the categories have a natural order (low < average < high).
dataframe = pd.DataFrame({"mark": ["low", "low", "average", "average", "high"]})

# Integer rank assigned to each category; a larger number means a higher mark.
scale_mapper = {"low": 1,
                "average": 2,
                "high": 3}

# Encode with Series.map, a plain per-value dictionary lookup that yields an
# integer Series. Series.replace on an object column relies on implicit
# downcasting to reach int64, which pandas 2.2 deprecates (FutureWarning).
# Note: unlike replace, map turns a category missing from scale_mapper into
# NaN instead of passing it through unchanged, so unmapped values stand out.
mark_encoded = dataframe["mark"].map(scale_mapper)
mark_encoded

0    1
1    1
2    2
3    2
4    3
Name: mark, dtype: int64

In [11]:
# One dictionary per observation, mapping a colour name to its value; a
# colour that is absent from a dictionary is simply not listed.
data_dict = [
    dict(red=2, blue=4),
    dict(red=4, blue=3),
    dict(red=1, yellow=2),
    dict(red=2, yellow=2),
]

In [12]:
# Encoder that turns a list of feature dictionaries into a feature matrix;
# sparse=False asks for a dense array instead of a sparse matrix.
dictvectorizer = DictVectorizer(sparse=False)

In [13]:
# Learn the feature names from the dictionaries and build the feature matrix.
# A key missing from an observation's dictionary shows up as 0 in its row
# (see the output below).
features = dictvectorizer.fit_transform(data_dict)
features

array([[4., 2., 0.],
       [3., 4., 0.],
       [0., 1., 2.],
       [0., 2., 2.]])

In [14]:
# Column names of the feature matrix, in column order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# .tolist() converts it back to a plain list of Python strings.
feature_names = dictvectorizer.get_feature_names_out().tolist()
feature_names

['blue', 'red', 'yellow']

In [15]:
# Show the encoded matrix as a table, labelling each column with its feature name.
pd.DataFrame(features, columns=feature_names)

Unnamed: 0,blue,red,yellow
0,4.0,2.0,0.0
1,3.0,4.0,0.0
2,0.0,1.0,2.0
3,0.0,2.0,2.0


In [16]:
# Complete observations: column 0 is the class (0 or 1), columns 1-2 are the
# two numeric features used to predict it.
X = np.column_stack((
    [0, 1, 0, 1],                  # class
    [2.10, 1.18, 1.22, -0.21],     # first feature
    [1.45, 1.33, 1.27, -1.19],     # second feature
))

In [17]:
# Observations whose class (column 0) is missing; the two features are known.
X_with_nan = np.column_stack((
    np.full(2, np.nan),
    [[0.87, 1.31],
     [-0.67, -0.22]],
))

In [18]:
# k-nearest-neighbours classifier using the 3 closest rows, weighted by distance.
clf = KNeighborsClassifier(n_neighbors=3, weights="distance")

# Fit on the complete rows: features are columns 1 onward, the target is the
# class in column 0.
trained_model = clf.fit(X=X[:, 1:], y=X[:, 0])

In [19]:
# Predict the missing class of each incomplete row from its two features.
imputed_values = trained_model.predict(X_with_nan[:, 1:])

# Put each predicted class back in front of the features it was predicted from.
X_with_imputed = np.column_stack((imputed_values, X_with_nan[:, 1:]))

# Show the imputed rows stacked on top of the original, complete rows.
np.concatenate((X_with_imputed, X), axis=0)

array([[ 0.  ,  0.87,  1.31],
       [ 1.  , -0.67, -0.22],
       [ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.21, -1.19]])