In [84]:
import numpy as np
import pandas as pd

from pydataset import data
import wrangle

from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest, RFE, \
f_regression

In [2]:
tips = data('tips')

In [3]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
tips.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 244 entries, 1 to 244
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 15.2+ KB


## Create a column named price_per_person. This should be the total bill divided by the party size.

In [5]:
# Cost per diner: the bill split evenly across the party.
# 'size' needs bracket access because DataFrame.size is a built-in attribute.
tips['price_per_person'] = tips['total_bill'].div(tips['size'])

In [6]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,8.495
2,10.34,1.66,Male,No,Sun,Dinner,3,3.446667
3,21.01,3.5,Male,No,Sun,Dinner,3,7.003333
4,23.68,3.31,Male,No,Sun,Dinner,2,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,6.1475


## Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?

From looking at `tips.head()`, it looks like the customer's sex might influence the tip amount.

## Use select k best to select the top 2 features for predicting tip amount. What are they?

In [25]:
# Boolean flag for the text column 'sex': True where the value is 'Male'.
tips['sex_male'] = tips['sex'].eq('Male')

In [28]:
# Encode smoker as a boolean flag (True for 'Yes').
# Values that are already True are also accepted, so re-running this cell
# keeps the column intact instead of resetting every row to False.
tips['smoker'] = tips['smoker'].isin(['Yes', True])

In [36]:
# Encode the day of the week as an integer: Thur=4, Fri=5, Sat=6, Sun=7.
# The identity entries (4: 4, ...) let already-encoded values pass through, so
# re-running this cell is a no-op instead of turning the whole column into 0.
# Any value not listed still becomes 0, exactly as before.
tips['day'] = (
    tips['day']
    .map({'Thur': 4, 'Fri': 5, 'Sat': 6, 'Sun': 7, 4: 4, 5: 5, 6: 6, 7: 7})
    .fillna(0)
    .astype('int64')
)

In [39]:
# Boolean flag for the text column 'time': True where the value is 'Dinner'.
tips['dinner_time'] = tips['time'].eq('Dinner')

In [40]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,sex_male,dinner_time
1,16.99,1.01,Female,False,7,Dinner,2,8.495,False,True
2,10.34,1.66,Male,False,7,Dinner,3,3.446667,True,True
3,21.01,3.5,Male,False,7,Dinner,3,7.003333,True,True
4,23.68,3.31,Male,False,7,Dinner,2,11.84,True,True
5,24.59,3.61,Female,False,7,Dinner,4,6.1475,False,True


In [96]:
tips.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 244 entries, 1 to 244
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   total_bill        244 non-null    float64
 1   tip               244 non-null    float64
 2   sex               244 non-null    object 
 3   smoker            244 non-null    bool   
 4   day               244 non-null    int64  
 5   time              244 non-null    object 
 6   size              244 non-null    int64  
 7   price_per_person  244 non-null    float64
 8   sex_male          244 non-null    bool   
 9   dinner_time       244 non-null    bool   
dtypes: bool(3), float64(3), int64(2), object(2)
memory usage: 16.0+ KB


In [38]:
tips.day.value_counts()

6    87
7    76
4    62
5    19
Name: day, dtype: int64

In [42]:
# Split the prepared tips frame into three subsets.
# NOTE(review): split_data comes from the local `wrangle` module, which is not
# shown here — confirm it fixes a random seed so the feature selections below
# are reproducible on a fresh kernel.
train, validate, test = wrangle.split_data(tips)

In [43]:
# Predictors: everything except the target and the two raw text columns,
# which are represented by the boolean flags sex_male and dinner_time.
X_train = train.drop(['tip', 'sex', 'time'], axis='columns')
# Target: the tip amount.
y_train = train['tip']

In [47]:
# Score each predictor against the tip with f_regression and keep the top two.
# fit() returns the fitted selector, so build and fit in one expression.
kbest = SelectKBest(score_func=f_regression, k=2).fit(X_train, y_train)

In [55]:
X_train.iloc[:,kbest.get_support()].head()

Unnamed: 0,total_bill,size
77,17.92,2
119,12.43,2
222,13.42,2
138,14.15,2
25,19.82,2


## What are they? `total_bill` and `size` (party size)

## Use recursive feature elimination to select the top 2 features for tip amount. What are they?

In [57]:
model = LinearRegression()

In [58]:
# Recursive feature elimination around the linear model: keep two predictors.
# fit() returns the fitted selector, so build and fit in one expression.
rfe = RFE(estimator=model, n_features_to_select=2).fit(X_train, y_train)

In [66]:
X_train.iloc[:,rfe.get_support()].head()

Unnamed: 0,total_bill,price_per_person
77,17.92,8.96
119,12.43,6.215
222,13.42,6.71
138,14.15,7.075
25,19.82,9.91


## What are they? 

total_bill and price_per_person

## Why do you think select k best and recursive feature elimination might give different answers for the top features? 

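SelectKBest scores each feature on its own with a univariate statistical test (here `f_regression`) and keeps the highest-scoring ones. RFE instead repeatedly fits the model it is given and drops the feature that contributes least to that model, so its choice depends on how the features behave together in the model.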

## Does this change as you change the number of features you are selecting?

In [67]:
# Same univariate selection as before, now keeping the top four predictors.
kbest = SelectKBest(score_func=f_regression, k=4).fit(X_train, y_train)

In [68]:
X_train.iloc[:,kbest.get_support()].head()

Unnamed: 0,total_bill,day,size,price_per_person
77,17.92,6,2,8.96
119,12.43,4,2,6.215
222,13.42,5,2,6.71
138,14.15,4,2,7.075
25,19.82,6,2,9.91


In [69]:
# Same recursive elimination as before, now keeping four predictors.
rfe = RFE(estimator=model, n_features_to_select=4).fit(X_train, y_train)

In [71]:
X_train.iloc[:,rfe.get_support()].head()

Unnamed: 0,total_bill,smoker,price_per_person,dinner_time
77,17.92,True,8.96,True
119,12.43,False,6.215,False
222,13.42,True,6.71,False
138,14.15,False,7.075,False
25,19.82,False,9.91,True


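Does this change as you change the number of features you are selecting? The two methods still select different features with 4 features: SelectKBest picks `total_bill`, `day`, `size` and `price_per_person`, while RFE picks `total_bill`, `smoker`, `price_per_person` and `dinner_time`. Only `total_bill` and `price_per_person` are chosen by both.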

## Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [80]:
X_train.iloc[:,kbest.get_support()].columns.to_list()

['total_bill', 'day', 'size', 'price_per_person']

In [86]:
def select_kbest(X, y, k):
    '''
    Return the names of the k predictors that SelectKBest keeps.

    X is the DataFrame of predictors, y is the target variable and k is the
    number of features to select. Features are scored with f_regression.
    The result is a plain list of column names from X.
    '''
    # build the selector and fit it in one step (fit hands back the selector)
    selector = SelectKBest(f_regression, k=k).fit(X, y)
    # boolean mask with one entry per column of X; True marks a kept feature
    keep_mask = selector.get_support()
    # look the kept labels up directly on the column index
    return X.columns[keep_mask].tolist()

In [83]:
select_kbest(X_train, y_train, 4)

['total_bill', 'day', 'size', 'price_per_person']

## Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [85]:
X_train.iloc[:,rfe.get_support()].columns.to_list()

['total_bill', 'smoker', 'price_per_person', 'dinner_time']

In [87]:
def rfe(X, y, k):
    '''
    Return the names of the k predictors kept by recursive feature elimination.

    X is the DataFrame of predictors, y is the target variable and k is the
    number of features to select. A fresh LinearRegression is the estimator
    handed to RFE. The result is a plain list of column names from X.
    '''
    # wrap a new linear model in RFE and fit it (fit hands back the selector)
    selector = RFE(LinearRegression(), n_features_to_select=k).fit(X, y)
    # boolean mask with one entry per column of X; True marks a kept feature
    keep_mask = selector.get_support()
    # look the kept labels up directly on the column index
    return X.columns[keep_mask].tolist()

In [88]:
rfe(X_train, y_train, 4)

['total_bill', 'smoker', 'price_per_person', 'dinner_time']

## Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [89]:
swiss = data('swiss')

In [90]:
swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [91]:
swiss.info()

<class 'pandas.core.frame.DataFrame'>
Index: 47 entries, Courtelary to Rive Gauche
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Fertility         47 non-null     float64
 1   Agriculture       47 non-null     float64
 2   Examination       47 non-null     int64  
 3   Education         47 non-null     int64  
 4   Catholic          47 non-null     float64
 5   Infant.Mortality  47 non-null     float64
dtypes: float64(4), int64(2)
memory usage: 2.6+ KB


In [92]:
# Split the swiss frame into three subsets.
# NOTE(review): split_data comes from the local `wrangle` module, which is not
# shown here — confirm it fixes a random seed; swiss has only 47 rows, so the
# selected features may be sensitive to how the split falls.
train_swiss, validate_swiss, test_swiss = wrangle.split_data(swiss)

In [93]:
# Fertility is the target; every remaining column is a predictor.
X_train_swiss = train_swiss.drop(['Fertility'], axis='columns')
y_train_swiss = train_swiss.Fertility

In [94]:
select_kbest(X_train_swiss, y_train_swiss, 3)

['Examination', 'Education', 'Catholic']

In [95]:
rfe(X_train_swiss, y_train_swiss, 3)

['Agriculture', 'Examination', 'Education']