In [95]:
import pandas as pd
import numpy as np
import os
import wrangle_grades
from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
import seaborn as sns
from pydataset import data

# Load the tips dataset.

In [30]:
# Pull the sample tips data from seaborn.
df = sns.load_dataset('tips')

# Recode the two-level categoricals as 0/1 flags; `dinner` is a new column derived from `time`.
df = df.assign(
    sex=df['sex'].map({'Female': 0, 'Male': 1}),
    smoker=df['smoker'].map({'No': 0, 'Yes': 1}),
    dinner=df['time'].map({'Dinner': 1, 'Lunch': 0}),
)

# One-hot encode `day`, leaving out the first level as the baseline.
dummies = pd.get_dummies(df[['day']], drop_first=True)

# Attach the dummy columns, then discard the source columns they replace.
df = pd.concat([df, dummies], axis=1).drop(columns=['time', 'day'])

df.head()


Unnamed: 0,total_bill,tip,sex,smoker,size,dinner,day_Fri,day_Sat,day_Sun
0,16.99,1.01,0,0,2,1,0,0,1
1,10.34,1.66,1,0,3,1,0,0,1
2,21.01,3.5,1,0,3,1,0,0,1
3,23.68,3.31,1,0,2,1,0,0,1
4,24.59,3.61,0,0,4,1,0,0,1


In [38]:
# Cast the three 0/1 flag columns to plain integers in a single call.
df = df.astype({'dinner': int, 'smoker': int, 'sex': int})

# Create a column named price_per_person. This should be the total bill divided by the party size.

In [39]:
# Spend per guest: the total bill divided by the party size.
df['price_per_person'] = df['total_bill'].div(df['size'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   total_bill        244 non-null    float64
 1   tip               244 non-null    float64
 2   sex               244 non-null    int64  
 3   smoker            244 non-null    int64  
 4   size              244 non-null    int64  
 5   dinner            244 non-null    int64  
 6   day_Fri           244 non-null    uint8  
 7   day_Sat           244 non-null    uint8  
 8   day_Sun           244 non-null    uint8  
 9   price_per_person  244 non-null    float64
dtypes: float64(3), int64(4), uint8(3)
memory usage: 14.2 KB


# Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?

    - Total bill
    - price_per_person

# Use select k best to select the top 2 features for predicting tip amount. What are they?

In [40]:
# Show the column names so the predictor list can be written out below.
list(df.columns)

['total_bill',
 'tip',
 'sex',
 'smoker',
 'size',
 'dinner',
 'day_Fri',
 'day_Sat',
 'day_Sun',
 'price_per_person']

In [42]:
# Predictors: every column except the target.
X = df.loc[:, [
    'total_bill',
    'sex',
    'smoker',
    'size',
    'dinner',
    'day_Fri',
    'day_Sat',
    'day_Sun',
    'price_per_person',
]]
# Target: the tip amount.
y = df.loc[:, 'tip']

In [43]:
# Score each predictor against tip with f_regression and keep the two best.
# fit() returns the fitted selector, so it can be bound directly.
kbest = SelectKBest(score_func=f_regression, k=2).fit(X, y)
kbest

SelectKBest(k=2, score_func=<function f_regression at 0x146032b80>)

In [56]:
# indices=True yields the positions of the kept columns rather than a boolean mask.
top2 = X.columns[kbest.get_support(indices=True)]
print(f'The top 2 features using k-best are {top2[0]} and {top2[1]}')

The top 2 features using k-best are total_bill and size


# Use recursive feature elimination to select the top 2 features for tip amount. What are they?

In [79]:
# Linear regression estimator; passed to RFE in the next cell.
model = LinearRegression()

In [80]:
# make the rfe
# Wrap the linear model in a recursive eliminator that stops at two features, and fit it.
# NOTE: the name `rfe` is rebound to a function further down this notebook.
rfe = RFE(estimator=model, n_features_to_select=2).fit(X, y)
rfe

RFE(estimator=LinearRegression(), n_features_to_select=2)

In [82]:
# One rank per predictor, in X's column order; rank 1 marks a selected feature.
rfe.ranking_

array([3, 7, 2, 5, 6, 1, 8, 1, 4])

In [66]:
# Label each rank with its column name, then keep only the rank-1 (selected) rows.
results = pd.DataFrame(rfe.ranking_, index=X.columns, columns=['rfe_ranking'])
results = results.loc[results['rfe_ranking'] == 1]
print(f'The top two features using RFE are {results.index[0]} and {results.index[1]}')

The top two features using RFE are day_Fri and day_Sun


# Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?

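They use different methods. SelectKBest scores each feature on its own with a univariate test (here `f_regression`) and keeps the k highest scores. RFE fits a model on all of the features together and repeatedly drops the feature the model weights least (for `LinearRegression`, the smallest coefficient). Because the predictors here are not scaled, coefficient size depends on each feature's units, which is likely why RFE favors the 0/1 day dummies. The feature selection does change as the number of features changes, and so can how much the two methods agree.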

# Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [73]:
def select_kbest(X, y, num):
    """Return the names of the `num` best columns of X for predicting y.

    Columns are scored with SelectKBest using f_regression. The selected
    names are returned as a list, in X's original column order.
    """
    selector = SelectKBest(score_func=f_regression, k=num).fit(X, y)
    keep_mask = selector.get_support()
    return X.columns[keep_mask].tolist()
    
    
    

In [78]:
# Same predictors and target as the manual SelectKBest run, so the same two features are expected.
select_kbest(X, y, 2)

['total_bill', 'size']

# Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [91]:
def rfe(X, y, num):
    """Return the names of the top `num` features chosen by recursive feature elimination.

    A LinearRegression estimator is wrapped in sklearn's RFE and fit on X and y.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictor columns; the returned names are taken from X.columns.
    y : array-like
        Target values.
    num : int
        Number of features to keep.

    Returns
    -------
    list
        Labels of the selected columns, in X's column order.
    """
    # Use a distinct local name so the fitted selector does not shadow this function.
    selector = RFE(LinearRegression(), n_features_to_select=num)
    selector.fit(X, y)
    # Return the selection instead of printing it so callers can reuse the
    # result, matching what select_kbest does.
    return X.columns[selector.get_support()].tolist()
    

In [92]:
# Same predictors and target as the manual RFE run, so the same two features are expected.
rfe(X,y,2)

['day_Fri', 'day_Sun']


# Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [96]:
# Load the 'swiss' dataset through pydataset; rows are labelled by name (e.g. Courtelary).
swiss = data('swiss')
swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [97]:
# Fertility is the target; every remaining column is a predictor.
swiss_y = swiss['Fertility']
swiss_X = swiss.drop(columns=['Fertility'])

In [99]:
# Top three predictors of Fertility according to SelectKBest.
select_kbest(swiss_X, swiss_y, 3)

['Examination', 'Education', 'Catholic']

In [100]:
# Top three predictors of Fertility according to recursive feature elimination.
rfe(swiss_X, swiss_y, 3)

['Examination', 'Education', 'Infant.Mortality']
