## Exercise 1

Load the tips dataset.

In [1]:
#Import dependencies
import pandas as pd
from pydataset import data
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import seaborn as sns

In [2]:
#Read in the tips dataset through pydataset's data() loader
df = data('tips')
#Preview the first rows to confirm the columns loaded as expected
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


## Exercise 1a

Create a column named price_per_person. This should be the total bill divided by the party size.

In [3]:
#Derive the per-person spend: the total bill split evenly across the party
df['price_per_person'] = df['total_bill'].div(df['size'])

df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,8.495
2,10.34,1.66,Male,No,Sun,Dinner,3,3.446667
3,21.01,3.5,Male,No,Sun,Dinner,3,7.003333
4,23.68,3.31,Male,No,Sun,Dinner,2,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,6.1475


In [4]:
#Fix the random seed so the split is reproducible across runs
seed = 42

#Keep 80% of the rows for training; the remaining rows go to validate
train, validate = train_test_split(df, train_size=0.8,
                                   random_state=seed)

## Exercise 1b

I think the total bill and the price per person will be the most important features. They both account for how much money was spent at the restaurant, and most people tip a percentage of their total bill. If the bill is higher, the tip should be higher as well.

In [5]:
#Count how many training rows fall on each day
train['day'].value_counts()

Sat     70
Sun     61
Thur    47
Fri     17
Name: day, dtype: int64

In [6]:
#Count how many training rows are lunch vs. dinner
train['time'].value_counts()

Dinner    142
Lunch      53
Name: time, dtype: int64

In [7]:
#Create dummy columns of my categorical features
#(size is numeric, but it is one-hot encoded here too: one column per party size)
train_dummy = pd.get_dummies(columns=['sex', 'smoker', 'day', 'time', 'size'], data=train)
train_dummy

Unnamed: 0,total_bill,tip,price_per_person,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch,size_1,size_2,size_3,size_4,size_5,size_6
229,13.28,2.72,6.640000,0,1,1,0,0,1,0,0,1,0,0,1,0,0,0,0
209,24.27,2.03,12.135000,0,1,0,1,0,1,0,0,1,0,0,1,0,0,0,0
97,27.28,4.00,13.640000,0,1,0,1,1,0,0,0,1,0,0,1,0,0,0,0
168,31.71,4.50,7.927500,0,1,1,0,0,0,1,0,1,0,0,0,0,1,0,0
85,15.98,2.03,7.990000,0,1,1,0,0,0,0,1,0,1,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107,20.49,4.06,10.245000,0,1,0,1,0,1,0,0,1,0,0,1,0,0,0,0
15,14.83,3.02,7.415000,1,0,1,0,0,0,1,0,1,0,0,1,0,0,0,0
93,5.75,1.00,2.875000,1,0,0,1,1,0,0,0,1,0,0,1,0,0,0,0
180,34.63,3.55,17.315000,0,1,0,1,0,0,1,0,1,0,0,1,0,0,0,0


In [8]:
#Drop a couple columns that provide redundant information
#(sex_Female mirrors sex_Male, and smoker_No mirrors smoker_Yes)
#NOTE(review): the day_*, time_* and size_* groups still keep every level, so one column in
#each group is implied by the others - confirm whether those should be dropped as well.
#NOTE(review): inplace=True mutates train_dummy, so re-running this cell fails because the
#columns are already gone.
train_dummy.drop(columns=['sex_Female', 'smoker_No'], inplace=True)
train_dummy

Unnamed: 0,total_bill,tip,price_per_person,sex_Male,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch,size_1,size_2,size_3,size_4,size_5,size_6
229,13.28,2.72,6.640000,1,0,0,1,0,0,1,0,0,1,0,0,0,0
209,24.27,2.03,12.135000,1,1,0,1,0,0,1,0,0,1,0,0,0,0
97,27.28,4.00,13.640000,1,1,1,0,0,0,1,0,0,1,0,0,0,0
168,31.71,4.50,7.927500,1,0,0,0,1,0,1,0,0,0,0,1,0,0
85,15.98,2.03,7.990000,1,0,0,0,0,1,0,1,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107,20.49,4.06,10.245000,1,1,0,1,0,0,1,0,0,1,0,0,0,0
15,14.83,3.02,7.415000,0,0,0,0,1,0,1,0,0,1,0,0,0,0
93,5.75,1.00,2.875000,0,1,1,0,0,0,1,0,0,1,0,0,0,0
180,34.63,3.55,17.315000,1,1,0,0,1,0,1,0,0,1,0,0,0,0


In [9]:
#Min-max scale the two continuous predictors; the scaler is fitted on the training rows only
mms = MinMaxScaler().fit(train_dummy[['total_bill', 'price_per_person']])

#Overwrite the raw columns with their scaled values
train_dummy[['total_bill', 'price_per_person']] = mms.transform(
    train_dummy[['total_bill', 'price_per_person']]
)

train_dummy.head()

Unnamed: 0,total_bill,tip,price_per_person,sex_Male,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch,size_1,size_2,size_3,size_4,size_5,size_6
229,0.167111,2.72,0.216379,1,0,0,1,0,0,1,0,0,1,0,0,0,0
209,0.411008,2.03,0.532184,1,1,0,1,0,0,1,0,0,1,0,0,0,0
97,0.477807,4.0,0.618678,1,1,1,0,0,0,1,0,0,1,0,0,0,0
168,0.576121,4.5,0.290374,1,0,0,0,1,0,1,0,0,0,0,1,0,0
85,0.227031,2.03,0.293966,1,0,0,0,0,1,0,1,0,1,0,0,0,0


In [10]:
#Separate the target (tip) from the predictors
train_y = train_dummy['tip']
train_X = train_dummy.drop(columns='tip')

## Exercise 1c

Use select k best to select the top 2 features for predicting tip amount. What are they?

In [11]:
#Instantiate SelectKBest with the f_regression scorer and fit it on the training data
f_selector = SelectKBest(score_func=f_regression, k=2).fit(train_X, train_y)

#Boolean mask over the columns of train_X: True marks a selected feature
feature_mask = f_selector.get_support()

In [12]:
#Check out my feature mask (one boolean per column of train_X)
feature_mask

array([ True, False, False, False, False, False, False, False, False,
       False, False,  True, False, False, False, False])

In [13]:
#Use boolean masking on the column index to get the names of the selected features
train_X.columns[feature_mask]

Index(['total_bill', 'size_2'], dtype='object')

## Exercise 1d

Use recursive feature elimination to select the top 2 features for tip amount. What are they?

In [14]:
#Wrap a plain linear regression in RFE and keep the two surviving features
#NOTE: the name rfe is reused for a helper function later in this notebook, so the cells
#that read this object must run before that definition.
lm = LinearRegression()

rfe = RFE(estimator=lm, n_features_to_select=2)

X_rfe = rfe.fit_transform(train_X, train_y)

In [15]:
#Check out the returned mask (True marks a feature RFE kept)
mask = rfe.support_

mask

array([ True,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False])

In [16]:
#Use my mask on the column index to see the names of the two features RFE kept
train_X.columns[mask]

Index(['total_bill', 'price_per_person'], dtype='object')

In [17]:
#Look at the ranking of all features (the kept features have rank 1)
rfe.ranking_

array([ 1,  1, 13,  9,  8, 10,  6, 11, 15, 12,  4,  5, 14,  7,  3,  2])

In [18]:
#Create a dataframe pairing each feature name with its RFE ranking
rfe_results = pd.DataFrame({'Ranking': rfe.ranking_, 'Category': train_X.columns})

In [19]:
#Show the features ordered by ranking, best (1) first
rfe_results.sort_values('Ranking')

Unnamed: 0,Ranking,Category
0,1,total_bill
1,1,price_per_person
15,2,size_6
14,3,size_5
10,4,size_1
11,5,size_2
6,6,day_Sun
13,7,size_4
4,8,day_Fri
3,9,smoker_Yes


## Exercise 1e

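SelectKBest with f_regression scores each feature on its own, using a univariate F-test of its linear relationship with the target, and keeps the k highest-scoring features. RFE instead fits a linear regression on all of the features together, drops the feature with the smallest coefficient magnitude, and repeats until only the requested number of features remain. Because one method judges features individually and the other judges them jointly inside a model, the two can select different features.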

## Exercise 2

Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [20]:
#Define the SelectKBest function
def select_kbest(X, y, k):
    """Return the names of the k features picked by SelectKBest.

    Parameters
    ----------
    X : DataFrame of predictors; its column labels are what gets returned.
    y : target values aligned with the rows of X.
    k : number of features to keep.

    Returns
    -------
    list of the selected column labels, in the column order of X.
    """
    #Score every column with f_regression and keep the k best
    fitted = SelectKBest(score_func=f_regression, k=k).fit(X, y)

    #One boolean per column: True where the column was selected
    keep = fitted.get_support()

    return [name for name, chosen in zip(X.columns, keep) if chosen]

In [21]:
#Verify it works by asking for the top 5 features of the tips training data
select_kbest(train_X, train_y, 5)

['total_bill', 'price_per_person', 'size_2', 'size_4', 'size_6']

## Exercise 3

Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [22]:
#Define the RFE function
def rfe(X, y, k):
    """Return the names of the k features kept by recursive feature elimination.

    A LinearRegression model is used as the estimator inside RFE.

    Parameters
    ----------
    X : DataFrame of predictors; its column labels are what gets returned.
    y : target values aligned with the rows of X.
    k : number of features to keep.

    Returns
    -------
    list of the selected column labels, in the column order of X.
    """
    #Use a distinct local name so the selector does not shadow this function's own name
    selector = RFE(LinearRegression(), n_features_to_select=k)

    #Only the fitted support mask is needed, so fit without building the transformed array
    selector.fit(X, y)

    return list(X.columns[selector.support_])

In [23]:
#Test it by asking for the top 5 features of the tips training data
rfe(train_X, train_y, 5)

['total_bill', 'price_per_person', 'size_1', 'size_5', 'size_6']

## Exercise 4

Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [24]:
#Load in the swiss dataset through pydataset's data() loader
swiss = data('swiss')

#Preview the first rows; Fertility is the target for this exercise
swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [25]:
#Separate the target (Fertility) from the remaining predictor columns
y = swiss['Fertility']
X = swiss.drop(columns='Fertility')

In [26]:
#Collect the predictor column names so they can all be scaled together
X_columns = X.columns.tolist()

In [27]:
#Instantiate a scaler and fit/transform the columns
#(every predictor is overwritten in X with its min-max scaled value)
mms2 = MinMaxScaler()

X[X_columns] = mms2.fit_transform(X[X_columns])

X.head()

Unnamed: 0,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,0.178531,0.352941,0.211538,0.079816,0.721519
Delemont,0.496045,0.088235,0.153846,0.845069,0.721519
Franches-Mnt,0.435028,0.058824,0.076923,0.93255,0.594937
Moutier,0.39887,0.264706,0.115385,0.323148,0.601266
Neuveville,0.477966,0.411765,0.269231,0.030761,0.620253


In [28]:
#Run the select k best function on the scaled swiss predictors
select_kbest(X, y, 3)

['Examination', 'Education', 'Catholic']

In [29]:
#Run the rfe function on the scaled swiss predictors
rfe(X, y, 3)

['Agriculture', 'Education', 'Infant.Mortality']

In [30]:
#Rebuild the predictors and target straight from swiss, without scaling, for comparison
X_2 = swiss.drop(columns=['Fertility'])
y_2 = swiss['Fertility']

In [31]:
#Confirm the predictors are still in their original units
X_2.head()

Unnamed: 0,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,17.0,15,12,9.96,22.2
Delemont,45.1,6,9,84.84,22.2
Franches-Mnt,39.7,5,5,93.4,20.2
Moutier,36.5,12,7,33.77,20.3
Neuveville,43.5,17,15,5.16,20.6


In [32]:
#Select k best on the unscaled predictors; the recorded result matches the scaled run
select_kbest(X_2, y_2, 3)

['Examination', 'Education', 'Catholic']

In [33]:
#RFE on the unscaled predictors; the recorded result differs from the scaled run
#(Examination is chosen instead of Agriculture) - presumably because RFE ranks features
#by coefficient size, which depends on each feature's scale.
rfe(X_2, y_2, 3)

['Examination', 'Education', 'Infant.Mortality']