<h1><font color='blue' size=7>Feature Engineering</font></h1>

<h1><font color='red' size=5>Data Transformation</font></h1>

In [2]:
import pandas as pd
import numpy as np

import warnings

In [3]:
warnings.filterwarnings('ignore')

<h1><font color='black' size=4>Transformations using map</font></h1>

In [4]:
data = pd.DataFrame({
    'veggies' : ['tomatoe', 'onion', 'garlic', 'pumpkin', 'lettuce', 'eggplant', 'broccoli', 'carrot'],
    'kg_price' : [2.0, 1.5, 1.9, 3.0, 0.9, 2.7, 4.0, 2.1]
})
data

Unnamed: 0,veggies,kg_price
0,tomatoe,2.0
1,onion,1.5
2,garlic,1.9
3,pumpkin,3.0
4,lettuce,0.9
5,eggplant,2.7
6,broccoli,4.0
7,carrot,2.1


In [5]:
veggie_to_color = {
    'tomatoe' : 'red',
    'onion' : 'white',
    'garlic' : 'white',
    'pumpkin' : 'orange',
    'lettuce' : 'green',
    'eggplant' : 'purple',
    'broccoli' : 'green'
}

In [6]:
data['color'] = data['veggies'].map(veggie_to_color)
data

Unnamed: 0,veggies,kg_price,color
0,tomatoe,2.0,red
1,onion,1.5,white
2,garlic,1.9,white
3,pumpkin,3.0,orange
4,lettuce,0.9,green
5,eggplant,2.7,purple
6,broccoli,4.0,green
7,carrot,2.1,


<h1><font color='black' size=4>Discretization & Binning</font></h1>

In [7]:
ages = [21, 20, 19, 25, 27, 38, 50, 72, 99, 83, 65, 43, 100]
bins = [18, 25, 50, 70, 100]
categories = pd.cut(ages, bins)

In [8]:
categories

[(18, 25], (18, 25], (18, 25], (18, 25], (25, 50], ..., (70, 100], (70, 100], (50, 70], (25, 50], (70, 100]]
Length: 13
Categories (4, interval[int64]): [(18, 25] < (25, 50] < (50, 70] < (70, 100]]

In [9]:
categories.codes

array([0, 0, 0, 0, 1, 1, 1, 3, 3, 3, 2, 1, 3], dtype=int8)

<h1><font color='black' size=4>Dummy variables</font></h1>
<p>Create numerical variables based on categorical data</p>

In [10]:
df = pd.DataFrame({
    'animals' : ['bear', 'dog', 'tiger', 'wolf', 'dragon', 'snake'],
    'age' : range(6)
})
df

Unnamed: 0,animals,age
0,bear,0
1,dog,1
2,tiger,2
3,wolf,3
4,dragon,4
5,snake,5


In [11]:
pd.get_dummies(df)

Unnamed: 0,age,animals_bear,animals_dog,animals_dragon,animals_snake,animals_tiger,animals_wolf
0,0,1,0,0,0,0,0
1,1,0,1,0,0,0,0
2,2,0,0,0,0,1,0
3,3,0,0,0,0,0,1
4,4,0,0,1,0,0,0
5,5,0,0,0,1,0,0


<h1><font color='black' size=4>Label Encoder</font></h1>

In [15]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(['paris', 'paris', 'tokyo', 'amsterdam'])

list(le.classes_)

['amsterdam', 'paris', 'tokyo']

In [16]:
integer_encoded = le.transform(['tokyo', 'tokyo', 'paris', 'amsterdam'])
print('Transform: ', integer_encoded)
print('Inverse Transformation: ', list(le.inverse_transform([2, 2, 1, 0])))

Transform:  [2 2 1 0]
Inverse Transformation:  ['tokyo', 'tokyo', 'paris', 'amsterdam']


<h1><font color='black' size=4>One Hot Encoder</font></h1>

In [17]:
from sklearn.preprocessing import OneHotEncoder

one_hot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
print(integer_encoded)
onehot_encoded = one_hot_encoder.fit_transform(integer_encoded)

[[2]
 [2]
 [1]
 [0]]


In [None]:
onehot_encoded

<h1><font color='black' size=5>Label Encoder vs. One Hot Encoder</font></h1>

* http://queirozf.com/entries/one-hot-encoding-a-feature-on-a-pandas-dataframe-an-example
* https://www.algosome.com/articles/dummy-variable-trap-regression.html
* https://medium.com/@contactsunny/label-encoder-vs-one-hot-encoder-in-machine-learning-3fc273365621


In [18]:
data = pd.read_excel('./../../../dataset/xlsx/data_1.xlsx', sheet_name='LabelEncoder')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
Country      10 non-null object
Age          9 non-null float64
Salary       9 non-null float64
Purchased    10 non-null object
dtypes: float64(2), object(2)
memory usage: 448.0+ bytes


In [19]:
data.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [20]:
print(data.loc[:, 'Country'])

0     France
1      Spain
2    Germany
3      Spain
4    Germany
5     France
6      Spain
7     France
8    Germany
9     France
Name: Country, dtype: object


<h4><font size=4 color='green'>The country values have been replaced by the numbers 0, 1 and 2.</font></h4>
<p><b>label encoding introduces a new problem. For example, we have encoded a set of country names into numerical data. This is actually categorical data and there is no relation, of any kind, between the rows.</b></p>

<p><b>The problem here is, since there are different numbers in the same column, the model will misunderstand the data to be in some kind of order, 0 < 1 < 2. But this isn’t the case at all. To overcome this problem, we use One Hot Encoder.</b></p>

In [21]:
label_encoder = LabelEncoder()
label_encoder.fit(data.loc[:, 'Country'])
data.loc[:, 'Country'] = label_encoder.transform(data.loc[:, 'Country'])
label_encoder_2 = LabelEncoder()
data.loc[:, 'Purchased'] = label_encoder_2.fit_transform(data.loc[:, 'Purchased'])
data.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,0,44.0,72000.0,0
1,2,27.0,48000.0,1
2,1,30.0,54000.0,0
3,2,38.0,61000.0,0
4,1,40.0,,1


In [22]:
label_encoder.classes_

array(['France', 'Germany', 'Spain'], dtype=object)

In [23]:
label_encoder.inverse_transform([0, 1, 2])

array(['France', 'Germany', 'Spain'], dtype=object)

<h1><font color='green' size=4>One Hot Encoder</font></h1>

<p>What one hot encoding does is, it takes a column which has categorical data, which <b>has been label encoded</b>, and then splits the column into multiple columns. The numbers are replaced by 1s and 0s, depending on which column has what value. </p>

In [24]:
label_encoder = LabelEncoder()
data.iloc[:, 0] = label_encoder.fit_transform(data.iloc[:, 0])
hot_encoder = OneHotEncoder(categorical_features = [0])
data_array = hot_encoder.fit_transform(data).toarray()

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
encoded_data = pd.DataFrame(data=data_array, columns=['CountryFrance', 'CountryGermany', 'CountrySpain', 'Age', 'Salary', 'Purchased'])
encoded_data.head()

<h1><font color='blue' size=3>Using pandas.get_dummies</font></h1>

In [25]:
data_2 = pd.read_excel('./../../../dataset/xlsx/data_1.xlsx', sheet_name='LabelEncoder')
data_2.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [26]:
data_2 = pd.get_dummies(data_2)
data_2.head()

Unnamed: 0,Age,Salary,Country_France,Country_Germany,Country_Spain,Purchased_No,Purchased_Yes
0,44.0,72000.0,1,0,0,1,0
1,27.0,48000.0,0,0,1,0,1
2,30.0,54000.0,0,1,0,1,0
3,38.0,61000.0,0,0,1,1,0
4,40.0,,0,1,0,0,1


<h1><font color='black' size=5>Imputation</font></h1>

<h2><font color='blue' size=4>Univariate vs. Multivariate Imputation</font></h2>

<p>One type of imputation algorithm is univariate, which imputes values in the i-th feature dimension using only non-missing values in that feature dimension (e.g. <b>impute.SimpleImputer</b>). By contrast, multivariate imputation algorithms use the entire set of available feature dimensions to estimate the missing values (e.g. <b>impute.IterativeImputer</b>)</p>

<h2><font color='blue' size=4>Univariate Feature Imputation</font></h2>

In [4]:
data = pd.read_excel('./../../../dataset/xlsx/data_1.xlsx', sheet_name='Imputation')
data.head(5)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44,72000,No
1,Spain,27,48000,Yes
2,Germany,30,54000,No
3,Spain,33,61000,
4,Germany,40,54000,Yes


- #### missing_values: Placeholder for the missing values, all occurrences of missing values will be imputed, integer or NaN (default="NaN")
- #### strategy: (default="mean") 
    - "mean"
    - "median"
    - "most_frequent"
- #### axis: The axis along which to impute
    - 0: columns
    - 1: rows
- #### copy: If True a copy of X will be created, If False, imputation will be done in-place 

In [28]:
from sklearn.impute import SimpleImputer
from numpy import nan

In [29]:
data.isnull().sum()

Country      3
Age          4
Salary       6
Purchased    3
dtype: int64

In [30]:
foo = data['Country']
foo

0      France
1       Spain
2     Germany
3       Spain
4     Germany
5         NaN
6       Spain
7      France
8     Germany
9         NaN
10    Germany
11     France
12        NaN
13     France
14    Germany
15     France
Name: Country, dtype: object

In [31]:
foo = data['Country'].values.reshape(-1,1)
foo

array([['France'],
       ['Spain'],
       ['Germany'],
       ['Spain'],
       ['Germany'],
       [nan],
       ['Spain'],
       ['France'],
       ['Germany'],
       [nan],
       ['Germany'],
       ['France'],
       [nan],
       ['France'],
       ['Germany'],
       ['France']], dtype=object)

In [32]:
imputer = SimpleImputer(strategy='most_frequent')
data['Country'] = imputer.fit_transform(data['Country'].values.reshape(-1,1))

In [33]:
data.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,,61000.0,
4,Germany,40.0,,Yes


In [34]:
data.isnull().sum()

Country      0
Age          4
Salary       6
Purchased    3
dtype: int64

<h2><font color='blue' size=4>Multivariate Feature Imputation</font></h2>

<p>A more sophisticated approach is to use the <b>IterativeImputer</b> class, which models each feature with missing values as a function of other features, and uses that estimate for imputation. It does so in an iterated round-robin fashion: at each step, a feature column is designated as output y and the other feature columns are treated as inputs X. A regressor is fit on (X, y) for known y. Then, the regressor is used to predict the missing values of y. This is done for each feature in an iterative fashion, and then is repeated for max_iter imputation rounds. The results of the final imputation round are returned.</p>

<p><b>Note:</b> This estimator is still experimental for now: the predictions and the API might change without any deprecation cycle. To use it, you need to explicitly import enable_iterative_imputer.</p>

In [35]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

imputer = IterativeImputer(max_iter=10, random_state=0)
imputer.fit([[1,2], [3,6], [4,8], [np.nan, 3], [7, np.nan]])

IterativeImputer(add_indicator=False, estimator=None, imputation_order='ascending',
                initial_strategy='mean', max_iter=10, max_value=None, min_value=None,
                missing_values=nan, n_nearest_features=None, random_state=0, sample_posterior=False,
                tol=0.001, verbose=0)

IterativeImputer(add_indicator=False, estimator=None,
                 imputation_order='ascending', initial_strategy='mean',
                 max_iter=10, max_value=None, min_value=None,
                 missing_values=nan, n_nearest_features=None, random_state=0,
                 sample_posterior=False, tol=0.001, verbose=0)

In [36]:
X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]]
#the model learns that the second feature is double the first
print(np.round(imputer.transform(X_test)))

[[ 1.  2.]
 [ 6. 12.]
 [ 3.  6.]]


In [64]:
data = pd.read_excel('./../../../dataset/xlsx/data_1.xlsx', sheet_name='Imputation')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 4 columns):
Country      32 non-null object
Age          32 non-null int64
Salary       32 non-null int64
Purchased    22 non-null object
dtypes: int64(2), object(2)
memory usage: 1.1+ KB


In [65]:
from sklearn.model_selection import train_test_split

features = data.drop(['Purchased'], axis=1)
y = data['Purchased']

X = pd.get_dummies(features)

X.head()

Unnamed: 0,Age,Salary,Country_Colombia,Country_France,Country_Germany,Country_Spain
0,44,72000,0,1,0,0
1,27,48000,0,0,0,1
2,30,54000,0,0,1,0
3,33,61000,0,0,0,1
4,40,54000,0,0,1,0


In [66]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [67]:
X_test.head()

Unnamed: 0,Age,Salary,Country_Colombia,Country_France,Country_Germany,Country_Spain
29,39,64000,0,1,0,0
15,37,65000,0,1,0,0
24,50,23500,0,0,1,0
17,27,48000,0,0,0,1
8,50,49000,0,0,1,0


In [68]:
it_imputer = IterativeImputer(max_iter=10, random_state=0)
it_imputer.fit(X_train)

IterativeImputer(add_indicator=False, estimator=None,
                 imputation_order='ascending', initial_strategy='mean',
                 max_iter=10, max_value=None, min_value=None,
                 missing_values=nan, n_nearest_features=None, random_state=0,
                 sample_posterior=False, tol=0.001, verbose=0)

In [69]:
print(it_imputer.transform(X_test))

[[3.90e+01 6.40e+04 0.00e+00 1.00e+00 0.00e+00 0.00e+00]
 [3.70e+01 6.50e+04 0.00e+00 1.00e+00 0.00e+00 0.00e+00]
 [5.00e+01 2.35e+04 0.00e+00 0.00e+00 1.00e+00 0.00e+00]
 [2.70e+01 4.80e+04 0.00e+00 0.00e+00 0.00e+00 1.00e+00]
 [5.00e+01 4.90e+04 0.00e+00 0.00e+00 1.00e+00 0.00e+00]
 [3.70e+01 6.70e+04 0.00e+00 1.00e+00 0.00e+00 0.00e+00]
 [5.00e+01 8.30e+04 0.00e+00 0.00e+00 1.00e+00 0.00e+00]
 [3.70e+01 6.70e+04 1.00e+00 0.00e+00 0.00e+00 0.00e+00]]


<h1><font color='black' size=5>Pipeline</font></h1>

<p>Sequentially apply a <b>list of transforms</b> and a <b>final estimator</b>. <b>Intermediate</b> steps of the pipeline must be <b>‘transforms’</b>, that is, they <b>must implement fit and transform methods</b>. The <b>final estimator</b> only needs to implement <b>fit</b>. The transformers in the pipeline can be cached using memory argument.</p>

In [5]:
from sklearn import svm
from sklearn.datasets import samples_generator
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.pipeline import Pipeline

In [7]:
# generate some data to play with
X, y = samples_generator.make_classification(n_informative=5, n_redundant=0, random_state=42)

In [8]:
# ANOVA SVM-C
anova_filter = SelectKBest(f_regression, k=5)
clf = svm.SVC(kernel='linear')
anova_svm = Pipeline([('anova', anova_filter), ('svc', clf)])
# you can set the parameters using the names issued
# For instance, fit using a k of 10 in the SelectKBest
# and a parameter 'C' of the svm
anova_svm.set_params(anova__k=10, svc__C=.1).fit(X,y)

Pipeline(memory=None,
         steps=[('anova',
                 SelectKBest(k=10,
                             score_func=<function f_regression at 0x000001C428390488>)),
                ('svc',
                 SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
                     decision_function_shape='ovr', degree=3,
                     gamma='auto_deprecated', kernel='linear', max_iter=-1,
                     probability=False, random_state=None, shrinking=True,
                     tol=0.001, verbose=False))],
         verbose=False)

In [10]:
prediction = anova_svm.predict(X)
anova_svm.score(X, y)

0.83

In [12]:
anova_svm.score?

[1;31mSignature:[0m [0manova_svm[0m[1;33m.[0m[0mscore[0m[1;33m([0m[0mself[0m[1;33m,[0m [0mX[0m[1;33m,[0m [0my[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m [0msample_weight[0m[1;33m=[0m[1;32mNone[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Apply transforms, and score with the final estimator

Parameters
----------
X : iterable
    Data to predict on. Must fulfill input requirements of first step
    of the pipeline.

y : iterable, default=None
    Targets used for scoring. Must fulfill label requirements for all
    steps of the pipeline.

sample_weight : array-like, default=None
    If not None, this argument is passed as ``sample_weight`` keyword
    argument to the ``score`` method of the final estimator.

Returns
-------
score : float
[1;31mFile:[0m      c:\programdata\anaconda3\lib\site-packages\sklearn\pipeline.py
[1;31mType:[0m      function


In [13]:
# Getting the selected features chosen by anova_filter
anova_svm['anova'].get_support()

array([False, False,  True,  True, False, False,  True,  True, False,
        True, False,  True,  True, False,  True, False,  True,  True,
       False, False])

In [14]:
# Another way to get selected features chosen by anova_filter
anova_svm.named_steps.anova.get_support()

array([False, False,  True,  True, False, False,  True,  True, False,
        True, False,  True,  True, False,  True, False,  True,  True,
       False, False])

In [15]:
# Indexing can be used to extract a sub-pipeline
sub_pipeline = anova_svm[:1]
sub_pipeline

Pipeline(memory=None,
         steps=[('anova',
                 SelectKBest(k=10,
                             score_func=<function f_regression at 0x000001C428390488>))],
         verbose=False)

<h1><font color='black' size=5>Column Transformer with Mixed Types</font></h1>

<p>Handy for the case of datasets that contain heterogeneous data types, since we may want to scale the numeric features and one-hot encode the categorical ones.</p>

<p>In this example, the numeric data is standard-scaled after mean-imputation, while the categorical data is one-hot encoded after imputing missing values with a new category ('missing').</p>

https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html#sphx-glr-auto-examples-compose-plot-column-transformer-mixed-types-py

In [17]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

In [18]:
np.random.seed(0)

In [19]:
# read data from titanic dataset
titanic_url = 'https://raw.githubusercontent.com/amueller/scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv'
data = pd.read_csv(titanic_url)

In [21]:
# We will train our classifier with the following features:
# Numeric features
# - age: float
# - fare: float
# Categorical Features:
# - embarked: categories encoded as strings {'C', 'S', 'Q'}
# - sex: categories encoded as strings {'female', 'male'}
# - pclass: ordinal integers {1, 2, 3}

# We create the preprocessing pipelines for both numerical and categorical data
numeric_features = ['age', 'fare']
numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())])

categorical_features = ['embarked', 'sex', 'pclass']
categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                                         ('onehot', OneHotEncoder(handle_unknown='ignore'))]) 

preprocessor = ColumnTransformer(
                    transformers=[
                        ('num', numeric_transformer, numeric_features),
                        ('cat', categorical_transformer, categorical_features)
                    ])

In [22]:
# Append classifier to preprocessing pipeline
# Now we have a full prediction pipeline
clf = Pipeline(steps=[
                ('preprocessor', preprocessor),
                ('classifier', LogisticRegression(solver='lbfgs'))
              ])

X = data.drop('survived', axis=1)
y = data['survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf.fit(X_train, y_train)
print('model score: %.3f' % clf.score(X_test, y_test))

model score: 0.790


<h1><font color='black' size=5>Using the prediction pipeline in a grid search</font></h1>

In [26]:
param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'classifier__C': [0.1, 1.0, 10, 100],
}

grid_search = GridSearchCV(clf, param_grid, cv=10, iid=False)
grid_search.fit(X_train, y_train)

print(("best logistic regression from grid search: %.3f"
       % grid_search.score(X_test, y_test)))

best logistic regression from grid search: 0.798
