# Feature Engineering Exercise

Note: for brevity, splitting and scaling is omitted. 

In [16]:
import pandas as pd
import numpy as np
import seaborn as sns
import pydataset
from sklearn.feature_selection import f_regression, SelectKBest, RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

## Tips Dataset

In [23]:
dict(a='b', c='d')

{'a': 'b', 'c': 'd'}

In [24]:
{'a': 'b', 'c': 'd'}

{'a': 'b', 'c': 'd'}

In [56]:
tips = pydataset.data('tips')
tips['price_per_person'] = tips.total_bill / tips.size
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,0.009947
2,10.34,1.66,Male,No,Sun,Dinner,3,0.006054
3,21.01,3.5,Male,No,Sun,Dinner,3,0.012301
4,23.68,3.31,Male,No,Sun,Dinner,2,0.013864
5,24.59,3.61,Female,No,Sun,Dinner,4,0.014397


In [51]:
X = tips[['size', 'total_bill', 'price_per_person']]
y = tips.tip

In [22]:
tips['tip_percentage'] = tips.tip / tips.total_bill

**_Be careful with the above!!_** We are creating a new feature based on our target. This might be useful for analysis, but we can't create this feature for prediction.

In [3]:
kbest = SelectKBest(f_regression, k=2)
kbest.fit(X, y)
print('Top 2 features according to k-best:')
X.columns[kbest.get_support()]

Top 2 features according to k-best:


Index(['size', 'total_bill'], dtype='object')

In [26]:
model = LinearRegression()
rfe = RFE(model, n_features_to_select=2).fit(X, y)
print('Top 2 features according to RFE:')
X.columns[rfe.get_support()]

Top 2 features according to RFE:


Index(['total_bill', 'price_per_person'], dtype='object')

In [27]:
rfe.get_support()

array([False,  True,  True])

In [28]:
rfe.support_

array([False,  True,  True])

In [5]:
def select_kbest(X, y, k):
    kbest = SelectKBest(k=k)
    kbest.fit(X, y)
    mask = kbest.get_support()
    return X.columns[mask]

In [6]:
select_kbest(tips[['total_bill', 'price_per_person', 'size']], tips.tip, 2)

Index(['total_bill', 'size'], dtype='object')

In [None]:
def rfe(X, y, k, model=LinearRegression()):
    rfe = RFE(model, n_features_to_select=k)
    rfe.fit(X, y)
    mask = rfe.get_support()
    return X.columns[mask]

In [18]:
rfe(tips[['total_bill', 'price_per_person', 'size']], tips.tip, 2)

Index(['total_bill', 'price_per_person'], dtype='object')

In [21]:
from sklearn.linear_model import LogisticRegression

rfe(tips[['total_bill', 'price_per_person', 'size']], tips.day, 2, model=LogisticRegression())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Index(['price_per_person', 'size'], dtype='object')

## Swiss Dataset

In [8]:
swiss = pydataset.data('swiss')
train_validate, test = train_test_split(swiss, random_state=123, test_size=.2)
train, validate = train_test_split(train_validate, random_state=123, test_size=.2)

print(f'train: {train.shape[0]} | validate: {validate.shape[0]} | test: {test.shape[0]}')

train: 29 | validate: 8 | test: 10


In [9]:
X_train = train.drop(columns='Fertility')
y_train = train.Fertility

In [10]:
select_kbest(X_train, y_train, 3)

  msw = sswn / float(dfwn)


Index(['Education', 'Catholic', 'Infant.Mortality'], dtype='object')

In [11]:
rfe(X_train, y_train, 3)

Index(['Examination', 'Education', 'Infant.Mortality'], dtype='object')

Taking it further and building some models:

In [12]:
from sklearn.metrics import mean_squared_error

In [13]:
kbest_cols = select_kbest(X_train, y_train, 3)
X_train_kbest = X_train[kbest_cols]

model_kbest = LinearRegression().fit(X_train_kbest, y_train)

mean_squared_error(
    validate.Fertility,
    model_kbest.predict(validate[kbest_cols])
)

  msw = sswn / float(dfwn)


112.94424700760267

In [14]:
rfe_cols = rfe(X_train, y_train, 3)
X_train_rfe = X_train[rfe_cols]

model_rfe = LinearRegression().fit(X_train_rfe, y_train)

mean_squared_error(
    validate.Fertility,
    model_rfe.predict(validate[rfe_cols])
)

133.25818546856837

Evaluate on test

In [15]:
mean_squared_error(
    test.Fertility,
    model_kbest.predict(test[kbest_cols])
)

94.63974916313865

How can we make scaled data easier to work with (use a dataframe)

In [42]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [35]:
swiss = pydataset.data('swiss')
X = swiss[['Education', 'Examination']]
y = swiss.Fertility

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123, test_size=.2)

In [50]:
scaler = MinMaxScaler()

scaler.fit(X_train)

X_train_scaled = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)