### Encoding Nominal Categorical Features

# Encoding Nominal Categorical Features

In [3]:
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
%matplotlib inline

In [4]:
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer

In [5]:
# Five observations of a single nominal feature (a US state), shaped as a
# 5x1 column so that every row is one sample.
feature = np.array(
    ["Texas", "California", "Texas", "Delaware", "Texas"]
).reshape(-1, 1)

In [7]:
# Encoder that turns one nominal feature into one-hot indicator columns.
model = LabelBinarizer()

In [9]:
# Learn the distinct labels in `feature` and encode every row in one step;
# each row gets a single 1 in the column of its label.
new = model.fit_transform(feature)
new

array([[0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 1, 0],
       [0, 0, 1]])

In [10]:
# Labels learned during fit; their order matches the indicator columns above.
model.classes_

array(['California', 'Delaware', 'Texas'], dtype='<U10')

In [11]:
# Map the indicator rows back to the original labels.
model.inverse_transform(new)

array(['Texas', 'California', 'Texas', 'Delaware', 'Texas'], dtype='<U10')

### pd.get_dummies

In [12]:
# Wrap the 5x1 label array in a DataFrame so pandas can encode it.
df = pd.DataFrame(feature)

In [14]:
# Give the single column a readable name.
df.columns=['places']

In [15]:
df

Unnamed: 0,places
0,Texas
1,California
2,Texas
3,Delaware
4,Texas


In [16]:
# One indicator column per distinct place. dtype=int pins 0/1 integer output:
# pandas 2.x otherwise returns True/False booleans, which would not match the
# 0/1 table recorded below.
pd.get_dummies(df['places'], dtype=int)

Unnamed: 0,California,Delaware,Texas
0,0,0,1
1,1,0,0
2,0,0,1
3,0,1,0
4,0,0,1


## Multiclass (multi-label) features

In [17]:
# Every observation carries two labels at once.
# NOTE(review): "Delware" looks like a typo for "Delaware"; it is left as-is
# because the recorded outputs further down use the same spelling.
multiclass_feature = [
    ("Texas", "Florida"),
    ("California", "Alabama"),
    ("Texas", "Florida"),
    ("Delware", "Florida"),
    ("Texas", "Alabama"),
]

In [18]:
# Tabular view of the label pairs: one row per observation, one column per
# tuple position (columns are auto-named 0 and 1).
df2 = pd.DataFrame(multiclass_feature)

In [19]:
df2

Unnamed: 0,0,1
0,Texas,Florida
1,California,Alabama
2,Texas,Florida
3,Delware,Florida
4,Texas,Alabama


In [20]:
# Encoder for observations that hold several labels each.
one_hot = MultiLabelBinarizer()

In [23]:
# One indicator column per distinct label; a row has a 1 in the column of
# every label it carries (two 1s per row for this data).
trans = one_hot.fit_transform(multiclass_feature)
trans

array([[0, 0, 0, 1, 1],
       [1, 1, 0, 0, 0],
       [0, 0, 0, 1, 1],
       [0, 0, 1, 1, 0],
       [1, 0, 0, 0, 1]])

In [22]:
# Distinct labels found across all observations, in indicator-column order.
one_hot.classes_

array(['Alabama', 'California', 'Delware', 'Florida', 'Texas'],
      dtype=object)

In [24]:
# Recover the label sets; each tuple comes back in classes_ order, not in the
# order the labels were originally written.
one_hot.inverse_transform(trans)

[('Florida', 'Texas'),
 ('Alabama', 'California'),
 ('Florida', 'Texas'),
 ('Delware', 'Florida'),
 ('Alabama', 'Texas')]

# Encoding Ordinal Categorical Features

In [25]:
# Ordinal feature: the labels have a natural order (low < medium < high).
data = pd.DataFrame(
    ['low', 'high', 'medium', 'low', 'high'],
    columns=['scores'],
)

In [27]:
# Show the raw ordinal labels before they are mapped to numbers.
data

Unnamed: 0,scores
0,low
1,high
2,medium
3,low
4,high


In [28]:
# Label -> rank lookup; the numbers encode the order of the categories.
scaler = dict(low=1, medium=2, high=3)

In [31]:
# Substitute every label with its rank from `scaler`. The result is only
# displayed here; it is not assigned back to `data`.
data.replace(scaler)

Unnamed: 0,scores
0,1
1,3
2,2
3,1
4,3


In [32]:
# Ordinal labels including an in-between level that sits just above "Medium".
dataframe = pd.DataFrame(
    ["Low", "Low", "Medium", "Medium", "High", "Barely More Than Medium"],
    columns=["Score"],
)

In [33]:
# Rank lookup where the gaps between values are chosen by hand: the
# in-between level is placed only slightly above 'Medium'.
scaler2 = dict([
    ('Low', 1),
    ('Medium', 2),
    ('Barely More Than Medium', 2.1),
    ('High', 3),
])

In [34]:
# Substitute each label with its value from `scaler2`; because one value is
# the non-integer 2.1, the whole column is shown as floats.
dataframe.replace(scaler2)

Unnamed: 0,Score
0,1.0
1,1.0
2,2.0
3,2.0
4,3.0
5,2.1


# Encoding Dictionaries of Features

In [37]:
from sklearn.feature_extraction import DictVectorizer

In [38]:
# One dictionary per observation; a key that is absent means the feature was
# not recorded for that observation.
data_dict = [
    dict(Red=2, Blue=4),
    dict(Red=4, Blue=3),
    dict(Red=1, Yellow=2),
    dict(Red=2, Yellow=2),
]

In [40]:
# pandas builds one column per key seen in any dictionary; keys missing from
# a row become NaN. Note that this rebinds the earlier `data` variable.
data = pd.DataFrame(data_dict)

In [42]:
# Treat "feature not present" as 0 (displayed only, not assigned back).
data.fillna(0)

Unnamed: 0,Red,Blue,Yellow
0,2,4.0,0.0
1,4,3.0,0.0
2,1,0.0,2.0
3,2,0.0,2.0


In [46]:
# sparse=True makes the vectorizer return a sparse matrix.
model = DictVectorizer(sparse=True)

In [49]:
# Printing the sparse result lists only the stored entries, one per line, as
# (row, column) followed by the value.
print(model.fit_transform(data_dict))

  (0, 0)	4.0
  (0, 1)	2.0
  (1, 0)	3.0
  (1, 1)	4.0
  (2, 1)	1.0
  (2, 2)	2.0
  (3, 1)	2.0
  (3, 2)	2.0


In [51]:
# sparse=False returns a regular dense NumPy array instead.
model2 = DictVectorizer(sparse=False)

In [53]:
# Features that are absent from a dictionary are filled with 0.
model2.fit_transform(data_dict)

array([[4., 2., 0.],
       [3., 4., 0.],
       [0., 1., 2.],
       [0., 2., 2.]])

In [55]:
# Column names of the encoded matrix. get_feature_names() was removed in
# scikit-learn 1.2; the fitted feature_names_ attribute holds the same list
# and is available in both older and current releases.
model2.feature_names_

['Blue', 'Red', 'Yellow']

# Imputing Missing Class Values

### KNeighborsClassifier Method

In [57]:
from sklearn.neighbors import KNeighborsClassifier

In [56]:
# Rows whose class is known: column 0 is the class label, columns 1-2 are the
# numeric features used to predict it.
X = np.array(
    [
        [0, 2.10, 1.45],
        [1, 1.18, 1.33],
        [0, 1.22, 1.27],
        [1, -0.21, -1.19],
    ],
    dtype=float,
)

In [58]:
# Rows whose class (column 0) is missing and has to be imputed.
X_with_nan = np.array(
    [
        [np.nan, 0.87, 1.31],
        [np.nan, -0.67, -0.22],
    ],
    dtype=float,
)

In [61]:
# 3-nearest-neighbours classifier in which closer neighbours count for more.
clf = KNeighborsClassifier(3, weights='distance')

In [65]:
# Train on the feature columns (1 onward) to predict the class in column 0.
fitted = clf.fit(X[:,1:], X[:,0])

In [69]:
# Predict the missing class from the feature columns of the incomplete rows.
imputed_values = fitted.predict(X_with_nan[:, 1:])
# Indexing with None adds an axis, turning the 1-D predictions into a column.
imputed_values[:, None]

array([[0.],
       [1.]])

In [70]:
# Alternative spelling that produces the same column shape.
imputed_values.reshape(-1,1)

array([[0.],
       [1.]])

In [73]:
# Put the predicted classes back in front of their feature columns.
X_with_imputed = np.hstack((imputed_values[:, None],X_with_nan[:, 1:]))

In [75]:
# Stack the imputed rows on top of the rows whose class was already known.
np.vstack((X_with_imputed, X))

array([[ 0.  ,  0.87,  1.31],
       [ 1.  , -0.67, -0.22],
       [ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.21, -1.19]])

### Imputer Method

In [77]:
# Known rows followed by the rows whose class is still NaN, as input for the
# imputer-based approach.
complete_data = np.vstack((X, X_with_nan))

In [78]:
complete_data

array([[ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.21, -1.19],
       [  nan,  0.87,  1.31],
       [  nan, -0.67, -0.22]])

In [81]:
from sklearn.impute import SimpleImputer

In [85]:
# Fill each column's NaNs with that column's most frequent value.
model = SimpleImputer(strategy='most_frequent')

In [86]:
# Only column 0 has gaps; its classes 0 and 1 are tied in frequency, and the
# recorded output shows 0 being chosen.
model.fit_transform(complete_data)

array([[ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.21, -1.19],
       [ 0.  ,  0.87,  1.31],
       [ 0.  , -0.67, -0.22]])

# Handling Imbalanced Classes

### Set the `class_weight` parameter

In [125]:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

In [126]:
# Load scikit-learn's bundled iris data set.
iris = load_iris()

In [127]:
# Feature matrix and integer class labels.
features = iris.data
target = iris.target

In [128]:
# Drop the first 40 samples (all class 0) to make class 0 the minority class.
# Slice from the untouched iris arrays rather than from `features`/`target`
# themselves, so that re-running this cell always yields the same rows
# instead of dropping another 40 on every execution.
features = iris.data[40:, :]
target = iris.target[40:]
target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [129]:
# Collapse the labels into a binary problem: class 0 stays 0, every other
# class becomes 1.
target = np.where(target != 0, 1, 0)

In [136]:
# Show the binarized labels: only the 10 leading samples remain class 0.
target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [131]:
weight = {'0':0.9, '1':0.1}

In [132]:
RandomForestClassifier(class_weight=weight)

RandomForestClassifier(class_weight={'0': 0.9, '1': 0.1})

In [133]:
# 'balanced' (not 'balance') is the preset that weights classes inversely to
# their frequencies; the misspelled value is rejected when the model is fitted.
RandomForestClassifier(class_weight='balanced')

RandomForestClassifier(class_weight='balanced')

### Alternatively, downsample the majority class or upsample the minority class

In [141]:
# Positions of the samples belonging to each class (1 = majority,
# 0 = minority in this data).
i_class1 = (target == 1).nonzero()[0]
i_class0 = (target == 0).nonzero()[0]

In [142]:
# Number of samples in each class.
n_class1, n_class0 = i_class1.size, i_class0.size

In [147]:
# Downsampling: randomly keep as many class-1 indices as there are class-0
# samples, without picking any index twice.
# NOTE(review): no random seed is set, so the chosen rows differ between runs.
i_class1_downsized = np.random.choice(i_class1, size=n_class0, replace=False)

In [149]:
# Balanced label vector. The class order (class 0 first, then the downsampled
# class 1) must match the order used for the downsampled feature matrix, which
# stacks the class-0 rows first; otherwise labels and rows are misaligned.
np.hstack((target[i_class0], target[i_class1_downsized]))

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [154]:
# Balanced feature matrix: all class-0 rows followed by the randomly kept
# class-1 rows.
np.vstack((features[i_class0,:], features[i_class1_downsized,:]))

array([[5. , 3.5, 1.3, 0.3],
       [4.5, 2.3, 1.3, 0.3],
       [4.4, 3.2, 1.3, 0.2],
       [5. , 3.5, 1.6, 0.6],
       [5.1, 3.8, 1.9, 0.4],
       [4.8, 3. , 1.4, 0.3],
       [5.1, 3.8, 1.6, 0.2],
       [4.6, 3.2, 1.4, 0.2],
       [5.3, 3.7, 1.5, 0.2],
       [5. , 3.3, 1.4, 0.2],
       [5.6, 2.8, 4.9, 2. ],
       [7.1, 3. , 5.9, 2.1],
       [6.7, 3. , 5.2, 2.3],
       [7.9, 3.8, 6.4, 2. ],
       [6.5, 3. , 5.5, 1.8],
       [6.9, 3.2, 5.7, 2.3],
       [5.5, 2.4, 3.8, 1.1],
       [6.4, 3.2, 4.5, 1.5],
       [5.5, 2.5, 4. , 1.3],
       [5.8, 2.8, 5.1, 2.4]])

In [155]:
# Upsampling: draw class-0 indices with replacement until there are as many
# as class-1 samples, so individual minority rows are repeated.
# NOTE(review): no random seed is set, so the drawn rows differ between runs.
i_class0_upsampled = np.random.choice(i_class0, size=n_class1, replace=True)

In [156]:
# Balanced label vector: upsampled class 0 first, then all of class 1.
np.hstack((target[i_class0_upsampled], target[i_class1]))

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1])

In [158]:
# Matching feature matrix in the same class order as the labels; only the
# first five rows are shown, where repeated minority rows are visible.
np.vstack((features[i_class0_upsampled], features[i_class1]))[:5]

array([[5.1, 3.8, 1.6, 0.2],
       [5. , 3.3, 1.4, 0.2],
       [5. , 3.3, 1.4, 0.2],
       [5. , 3.3, 1.4, 0.2],
       [4.6, 3.2, 1.4, 0.2]])