In [1]:
import pandas as pd
import numpy as np
import pydataset

from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


import wrangle
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

# Feature Engineering Exercises
Do your work for this exercise in a Jupyter notebook named feature_engineering within the regression-exercises repo. Add, commit, and push your work.

## Exercise One: 
Load the tips dataset

In [2]:
# Load the tips dataset and recode two text columns as 0/1 flags:
# smoker is overwritten in place (1 where it was 'Yes'), and dinner is a
# new column derived from time (1 where time is 'Dinner').
tips = pydataset.data('tips')
tips['smoker'] = tips['smoker'].eq('Yes').astype(int)
tips['dinner'] = tips['time'].eq('Dinner').astype(int)

In [3]:
# Inspect the column names now that smoker has been recoded and dinner added.
tips.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size', 'dinner'], dtype='object')

### A:  Create a column named tip_percentage. This should be the tip amount divided by the total bill.

In [4]:
# Tip as a share of the bill. The value is stored as a fraction
# (0.16 means 16%); it is not multiplied by 100.
tips['tip_percentage'] = tips['tip'].div(tips['total_bill'])
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,dinner,tip_percentage
1,16.99,1.01,Female,0,Sun,Dinner,2,1,0.059447
2,10.34,1.66,Male,0,Sun,Dinner,3,1,0.160542
3,21.01,3.5,Male,0,Sun,Dinner,3,1,0.166587
4,23.68,3.31,Male,0,Sun,Dinner,2,1,0.13978
5,24.59,3.61,Female,0,Sun,Dinner,4,1,0.146808


### B: Create a column named price_per_person. This should be the total bill divided by the party size.

In [5]:
# Average spend per guest. The party-size column is read with brackets
# because tips.size resolves to the DataFrame's built-in size attribute
# instead of the column.
tips['price_per_person'] = tips['total_bill'].div(tips['size'])
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,dinner,tip_percentage,price_per_person
1,16.99,1.01,Female,0,Sun,Dinner,2,1,0.059447,8.495
2,10.34,1.66,Male,0,Sun,Dinner,3,1,0.160542,3.446667
3,21.01,3.5,Male,0,Sun,Dinner,3,1,0.166587,7.003333
4,23.68,3.31,Male,0,Sun,Dinner,2,1,0.13978,11.84
5,24.59,3.61,Female,0,Sun,Dinner,4,1,0.146808,6.1475


### C: Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount? The tip percentage?

I would expect tip percentage, total_bill, and time of day to be the most important features.

### Prep df for split

In [6]:
# Rebind tips to only the columns used for modeling. This drops the text
# columns sex, day and time (time is already captured by the dinner flag)
# and puts the remaining columns in the order listed here.
tips = tips.loc[:, [
    "total_bill", "tip", "size", "tip_percentage",
    "price_per_person", "smoker", "dinner",
]]
tips.head()

Unnamed: 0,total_bill,tip,size,tip_percentage,price_per_person,smoker,dinner
1,16.99,1.01,2,0.059447,8.495,0,1
2,10.34,1.66,3,0.160542,3.446667,0,1
3,21.01,3.5,3,0.166587,7.003333,0,1
4,23.68,3.31,2,0.13978,11.84,0,1
5,24.59,3.61,4,0.146808,6.1475,0,1


### Split data

In [7]:
# Predictors and target for the tip-amount model.
# NOTE(review): tip_percentage is computed as tip / total_bill, so it is
# derived from the target. Keeping it as a predictor leaks the target into
# the features and will bias feature selection towards it -- consider
# dropping it from X and re-running the cells below.
X = tips[[
    'total_bill', 'size', 'smoker', 'dinner',
    'tip_percentage', 'price_per_person',
]]
y = tips['tip']

# 80/20 train/test split with a fixed seed so the result is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=123
)

# Learn the scaling parameters from the training rows only, then apply
# the same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### D: Use select k best and recursive feature elimination to select the top 2 features for predicting tip amount. What are they?

In [8]:
# Score every scaled feature against the target with f_regression and
# keep the two highest-scoring ones.
kbest = SelectKBest(score_func=f_regression, k=2)
kbest.fit(X_train_scaled, y_train)

SelectKBest(k=2, score_func=<function f_regression at 0x7fed92f2d550>)

In [9]:
# SelectKBest result for tip amount: total_bill and size.
# The scaled training data is a plain array, so the selected positions are
# mapped back to names through the columns of the unscaled X_train.
X_train.columns[kbest.get_support(indices=True)]

Index(['total_bill', 'size'], dtype='object')

In [10]:
# Recursive feature elimination with a linear regression as the estimator,
# stopping once two features are left. The boolean mask shown below lines
# up position-by-position with the columns of X_train.
rfe = RFE(LinearRegression(), n_features_to_select=2).fit(X_train_scaled, y_train)
rfe.support_

array([ True, False, False, False,  True, False])

In [12]:
# RFE ranking per feature (rank 1 = selected). For tip amount the two
# selected features are total_bill and tip_percentage.
# NOTE(review): tip_percentage is tip / total_bill, i.e. built from the
# target, so its top rank here is likely an artifact of target leakage.
pd.Series(data=rfe.ranking_, index=X_train.columns)

total_bill          1
size                2
smoker              4
dinner              5
tip_percentage      1
price_per_person    3
dtype: int64

### E: Use select k best and recursive feature elimination to select the top 2 features for predicting tip percentage. What are they?

In [13]:
# Predictors and target for the tip-percentage model.
# NOTE(review): the target tip_percentage is tip / total_bill, and both tip
# and total_bill are in X, so the target can be reconstructed from the
# predictors. This is target leakage -- consider dropping tip from X and
# re-running the cells below.
X = tips[[
    'total_bill', 'size', 'smoker', 'dinner',
    'price_per_person', 'tip',
]]
y = tips['tip_percentage']

# 80/20 train/test split with a fixed seed so the result is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=123
)

# Learn the scaling parameters from the training rows only, then apply
# the same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [14]:
# Score every scaled feature against tip_percentage with f_regression and
# keep the two highest-scoring ones.
kbest = SelectKBest(score_func=f_regression, k=2)
kbest.fit(X_train_scaled, y_train)

SelectKBest(k=2, score_func=<function f_regression at 0x7fed92f2d550>)

In [15]:
# SelectKBest result for tip percentage: price_per_person and tip.
# The scaled training data is a plain array, so the selected positions are
# mapped back to names through the columns of the unscaled X_train.
X_train.columns[kbest.get_support(indices=True)]

Index(['price_per_person', 'tip'], dtype='object')

In [16]:
# Recursive feature elimination with a linear regression as the estimator,
# stopping once two features are left. The boolean mask shown below lines
# up position-by-position with the columns of X_train.
rfe = RFE(LinearRegression(), n_features_to_select=2).fit(X_train_scaled, y_train)
rfe.support_

array([ True, False, False, False, False,  True])

In [18]:
# RFE ranking per feature (rank 1 = selected). For tip percentage the two
# selected features are total_bill and tip.
# NOTE(review): tip_percentage is tip / total_bill, so these two predictors
# together determine the target; their selection reflects target leakage.
pd.Series(data=rfe.ranking_, index=X_train.columns)

total_bill          1
size                2
smoker              4
dinner              5
price_per_person    3
tip                 1
dtype: int64

### F: Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?

## Exercise Two:
Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

## Exercise Three:
Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

## Exercise Four:
Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).