In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler

from pydataset import data

# Feature Engineering Exercises

##### 1. Load the tips dataset.

- a. Create a column named price_per_person. This should be the total bill divided by the party size.

In [8]:
# Load the tips dataset through pydataset's data() helper and preview the first rows.
df = data('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [9]:
# Bracket access is required for the party-size column: `df.size` would resolve
# to the DataFrame's own `size` attribute (element count), not the column.
df['price_per_person'] = df.total_bill / df['size']

- b. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?

I think total_bill would be most important for predicting the tip amount

- c. Use select k best to select the top 2 features for predicting tip amount. What are they?


In [10]:
# The target is the tip; the predictors are every other column.
y = df['tip']
X = df.drop(columns='tip')
X.head()

Unnamed: 0,total_bill,sex,smoker,day,time,size,price_per_person
1,16.99,Female,No,Sun,Dinner,2,8.495
2,10.34,Male,No,Sun,Dinner,3,3.446667
3,21.01,Male,No,Sun,Dinner,3,7.003333
4,23.68,Male,No,Sun,Dinner,2,11.84
5,24.59,Female,No,Sun,Dinner,4,6.1475


In [11]:
# Columns stored as object dtype (the categorical text columns) need encoding
# before they can be passed to the feature selectors.
dummies_list = X.select_dtypes(object).columns

# drop_first=True drops one level per category, so a column with n levels
# becomes n - 1 indicator columns.
dummy_df = pd.get_dummies(X[dummies_list], drop_first=True)
encoded = pd.concat([X, dummy_df], axis = 1)

# Replace the original text columns with their indicator columns.
X = encoded.drop(columns = dummies_list)
X.head()

Unnamed: 0,total_bill,size,price_per_person,sex_Male,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch
1,16.99,2,8.495,0,0,0,1,0,0
2,10.34,3,3.446667,1,0,0,1,0,0
3,21.01,3,7.003333,1,0,0,1,0,0
4,23.68,2,11.84,1,0,0,1,0,0
5,24.59,4,6.1475,0,0,0,1,0,0


In [12]:
# Univariate selection: score every predictor against the tip with f_regression
# and keep the two highest-scoring columns.
f_selector = SelectKBest(score_func=f_regression, k=2).fit(X, y)

# get_support() is a boolean mask aligned with X's columns.
feature_mask = f_selector.get_support()
f_feature = X.columns[feature_mask].tolist()
f_feature

['total_bill', 'size']

The top two features found when using SelectKBest are total_bill and size.

- d. Use recursive feature elimination to select the top 2 features for tip amount. What are they?

In [13]:
# Recursive feature elimination: fit a linear regression repeatedly, removing
# the least important predictor each round until two remain.
lm = LinearRegression()
rfe = RFE(estimator=lm, n_features_to_select=2).fit(X, y)

# support_ is a boolean mask aligned with X's columns.
feature_mask = rfe.support_
rfe_feature = X.columns[feature_mask].tolist()
rfe_feature

['total_bill', 'smoker_Yes']

The top two features found when using RFE were total_bill and smoker_Yes.

- e. Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?

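The two methods rank features in different ways: SelectKBest scores each feature on its own with a univariate F-test, whereas RFE repeatedly fits a linear regression and drops the feature with the smallest coefficient magnitude, so its ranking depends on the other features in the model and on each feature's scale. It could therefore have something to do with the fact that I didn't scale my data before running RFE.  I will scale my data now and then check whether RFE selects different features.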

In [25]:
# MinMaxScaler is imported in the first cell with the rest of the dependencies.
# Only the continuous columns are rescaled; the dummy columns are already 0/1.
columns_to_scale = ['total_bill', 'size', 'price_per_person']

scaler = MinMaxScaler()
# Work on a copy so the unscaled X stays available for comparison.
X_scaled = X.copy()

In [26]:
# fit_transform returns a bare array, so rebuild a frame carrying X's index and
# the scaled column names; the assignment then lines up row-for-row with X_scaled.
X_scaled[columns_to_scale] = pd.DataFrame(
    scaler.fit_transform(X[columns_to_scale]),
    columns=columns_to_scale,
    index=X.index,
)

X_scaled.head()

Unnamed: 0,total_bill,size,price_per_person,sex_Male,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch
1,0.291579,0.2,0.322989,0,0,0,1,0,0
2,0.152283,0.4,0.032854,1,0,0,1,0,0
3,0.375786,0.4,0.237261,1,0,0,1,0,0
4,0.431713,0.2,0.51523,1,0,0,1,0,0
5,0.450775,0.6,0.188075,0,0,0,1,0,0


In [32]:
# Same RFE procedure as before, this time on the min-max scaled predictors.
lm = LinearRegression()
rfe = RFE(estimator=lm, n_features_to_select=2).fit(X_scaled, y)

# support_ is a boolean mask aligned with X_scaled's columns.
feature_mask = rfe.support_
rfe_feature = X_scaled.columns[feature_mask].tolist()
rfe_feature

['total_bill', 'price_per_person']

After scaling the data, total_bill and price_per_person are found to be the two best features when using RFE.

##### 2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [33]:
def select_kbest(X, y, number_of_features_to_select = 3):
    """
    Return the names of the features chosen by SelectKBest using f_regression.

    X: DataFrame of predictors (column names are what gets returned).
    y: target values.
    number_of_features_to_select: value passed to SelectKBest as k (default 3).
    """
    selector = SelectKBest(score_func=f_regression, k=number_of_features_to_select).fit(X, y)
    # get_support() is a boolean mask aligned with X's columns.
    return X.columns[selector.get_support()].tolist()

In [39]:
# Same inputs as the manual SelectKBest cell (unscaled X, two features), so the output should match it.
select_kbest(X, y, number_of_features_to_select = 2)

['total_bill', 'size']

Same results as before.

##### 3. Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [40]:
def rfe_using_linear_regression(X, y, number_of_features_to_select = 3):
    """
    Return the names of the features kept by RFE wrapped around a LinearRegression.

    X: DataFrame of predictors (column names are what gets returned).
    y: target values.
    number_of_features_to_select: value passed to RFE as n_features_to_select (default 3).
    """
    eliminator = RFE(
        estimator=LinearRegression(),
        n_features_to_select=number_of_features_to_select,
    ).fit(X, y)
    # support_ is a boolean mask aligned with X's columns.
    return X.columns[eliminator.support_].tolist()

In [42]:
# Called on X_scaled, so the result is compared against the manual RFE run on the scaled predictors.
rfe_using_linear_regression(X_scaled, y, number_of_features_to_select=2)

['total_bill', 'price_per_person']

Same results from before when done manually.

##### 4. Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [44]:
# Load the swiss dataset through pydataset's data() helper; Fertility is the target in this exercise.
swiss = data('swiss')
swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [46]:
# Predictors: every column except the target, Fertility.
swiss_X = swiss.drop(columns='Fertility')
swiss_X.head()


Unnamed: 0,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,17.0,15,12,9.96,22.2
Delemont,45.1,6,9,84.84,22.2
Franches-Mnt,39.7,5,5,93.4,20.2
Moutier,36.5,12,7,33.77,20.3
Neuveville,43.5,17,15,5.16,20.6


In [47]:
# Target: the Fertility column.
swiss_y = swiss['Fertility']
swiss_y.head()

Courtelary      80.2
Delemont        83.1
Franches-Mnt    92.5
Moutier         85.8
Neuveville      76.9
Name: Fertility, dtype: float64

In [49]:
# NOTE: this rebinds `columns_to_scale` and `scaler`, the same names used for the
# tips data earlier in the notebook; re-running the tips scaling cell after this
# one would pick up the swiss values.
columns_to_scale = ['Agriculture', 'Examination', 'Education', 'Catholic', 'Infant.Mortality']

# A new scaler instance is created for the swiss predictors.
scaler = MinMaxScaler()
# Work on a copy so the unscaled swiss_X stays available.
swiss_X_scaled = swiss_X.copy()


In [50]:
# fit_transform returns a bare array, so rebuild a frame carrying swiss_X's index
# and the scaled column names; the assignment then lines up row-for-row.
swiss_X_scaled[columns_to_scale] = pd.DataFrame(
    scaler.fit_transform(swiss_X[columns_to_scale]),
    columns=columns_to_scale,
    index=swiss_X.index,
)

swiss_X_scaled.head()

Unnamed: 0,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,0.178531,0.352941,0.211538,0.079816,0.721519
Delemont,0.496045,0.088235,0.153846,0.845069,0.721519
Franches-Mnt,0.435028,0.058824,0.076923,0.93255,0.594937
Moutier,0.39887,0.264706,0.115385,0.323148,0.601266
Neuveville,0.477966,0.411765,0.269231,0.030761,0.620253


In [51]:
# The exercise asks for the top 3 features; pass that explicitly.
select_kbest(swiss_X_scaled, swiss_y, number_of_features_to_select=3)

['Examination', 'Education', 'Catholic']

In [52]:
# The exercise asks for the top 3 features; pass that explicitly.
rfe_using_linear_regression(swiss_X_scaled, swiss_y, number_of_features_to_select=3)

['Agriculture', 'Education', 'Infant.Mortality']

The top 3 features are NOT the same when using the kbest function and the RFE with linear regression function: only Education is selected by both.