# Feature Engineering Exercise

Note: for brevity, splitting and scaling is omitted. 

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import pydataset
from sklearn.feature_selection import f_regression, SelectKBest, RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

## Tips Dataset

In [2]:
tips = pydataset.data('tips')
tips['price_per_person'] = tips.total_bill / tips['size']

X = tips[['size', 'total_bill', 'price_per_person']]
y = tips.tip

In [3]:
kbest = SelectKBest(f_regression, k=2)
kbest.fit(X, y)
print('Top 2 features according to k-best:')
X.columns[kbest.get_support()]

Top 2 features according to k-best:


Index(['size', 'total_bill'], dtype='object')

In [4]:
model = LinearRegression()
rfe = RFE(model, n_features_to_select=2).fit(X, y)
print('Top 2 features according to RFE:')
X.columns[rfe.get_support()]

Top 2 features according to RFE:


Index(['total_bill', 'price_per_person'], dtype='object')

In [5]:
def select_kbest(X, y, k):
    kbest = SelectKBest(k=k)
    kbest.fit(X, y)
    mask = kbest.get_support()
    return X.columns[mask]

In [6]:
select_kbest(tips[['total_bill', 'price_per_person', 'size']], tips.tip, 2)

Index(['total_bill', 'size'], dtype='object')

In [7]:
def rfe(X, y, k, model=LinearRegression()):
    rfe = RFE(model, n_features_to_select=k)
    rfe.fit(X, y)
    mask = rfe.get_support()
    return X.columns[mask]

rfe(tips[['total_bill', 'price_per_person', 'size']], tips.tip, 2)

Index(['total_bill', 'price_per_person'], dtype='object')

## Swiss Dataset

In [8]:
swiss = pydataset.data('swiss')
train_validate, test = train_test_split(swiss, random_state=123, test_size=.2)
train, validate = train_test_split(train_validate, random_state=123, test_size=.2)

print(f'train: {train.shape[0]} | validate: {validate.shape[0]} | test: {test.shape[0]}')

train: 29 | validate: 8 | test: 10


In [9]:
X_train = train.drop(columns='Fertility')
y_train = train.Fertility

In [10]:
select_kbest(X_train, y_train, 3)

  msw = sswn / float(dfwn)


Index(['Education', 'Catholic', 'Infant.Mortality'], dtype='object')

In [11]:
rfe(X_train, y_train, 3)

Index(['Examination', 'Education', 'Infant.Mortality'], dtype='object')

Taking it further and building some models:

In [12]:
from sklearn.metrics import mean_squared_error

In [13]:
kbest_cols = select_kbest(X_train, y_train, 3)
X_train_kbest = X_train[kbest_cols]

model_kbest = LinearRegression().fit(X_train_kbest, y_train)

mean_squared_error(
    validate.Fertility,
    model_kbest.predict(validate[kbest_cols])
)

  msw = sswn / float(dfwn)


112.94424700760267

In [14]:
rfe_cols = rfe(X_train, y_train, 3)
X_train_rfe = X_train[rfe_cols]

model_rfe = LinearRegression().fit(X_train_rfe, y_train)

mean_squared_error(
    validate.Fertility,
    model_rfe.predict(validate[rfe_cols])
)

133.25818546856837

Evaluate on test

In [15]:
mean_squared_error(
    test.Fertility,
    model_kbest.predict(test[kbest_cols])
)

94.63974916313865