In [22]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pydataset
import seaborn as sns
import itertools
import wrangle

from pydataset import data
from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression

import warnings
warnings.filterwarnings("ignore")

# 1. 

Load the tips dataset.

In [2]:
df = data('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
df = df.rename(columns={'size': 'party_size'})

## A.

Create a column named price_per_person. This should be the total bill divided by the party size.

In [4]:
df['price_per_person'] = df.total_bill / df.party_size

In [5]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,party_size,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,8.495
2,10.34,1.66,Male,No,Sun,Dinner,3,3.446667
3,21.01,3.5,Male,No,Sun,Dinner,3,7.003333
4,23.68,3.31,Male,No,Sun,Dinner,2,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,6.1475


# B.

Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?


Party_size, total_bill, price_per_person, time

In [6]:
#find target
target = 'tip'

#quantitative features
quant_features = [col for col in df.columns if (df[col].dtype != 'object') & (col != target)]

# identify categorical features
categ_features = [col for col in df.columns if (df[col].dtype == 'object') & (col != target)]

In [7]:
#encode Categorical
for feature in categ_features:
    dummy_df = pd.get_dummies(df[feature],
                             prefix=f'enc_{df[feature].name}',
                             drop_first=True)
    df = pd.concat([df, dummy_df], axis=1)
    
enc_features = [col for col in df.columns if col.startswith('enc_')]

In [10]:
quant_features

['total_bill', 'party_size', 'price_per_person']

In [9]:
train, test, validate =wrangle.train_test_validate_split(df)

train	 n = 136
test	 n = 49
validate n = 59


In [15]:
#scale quant

#DB for scaled features
train_scaled = pd.DataFrame(index=train.index)
validate_scaled = pd.DataFrame(index = validate.index)
test_scaled = pd.DataFrame(index = test.index)

#make and fit scaler
scaler = MinMaxScaler().fit(train[quant_features])

#addd scaled featuress to DF
train_scaled[quant_features] = scaler.transform(train[quant_features])
validate_scaled[quant_features] = scaler.transform(validate[quant_features])
test_scaled[quant_features] = scaler.transform(test[quant_features])

# add 'scaled' prefix to columns
for feature in quant_features:
    train_scaled = train_scaled.rename(columns={feature: f'scaled_{feature}'})
    validate_scaled = validate_scaled.rename(columns={feature: f'scaled_{feature}'})
    test_scaled = test_scaled.rename(columns={feature: f'scaled_{feature}'})
    
# concat scaled features to original train, validate, test df's
train = pd.concat([train, train_scaled], axis=1)
validate = pd.concat([validate, validate_scaled], axis=1)
test = pd.concat([test, test_scaled], axis=1)

#identify scaled features
scaled_features = [col for col in train.columns if col.startswith('scaled_')]

In [17]:
train.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,party_size,price_per_person,enc_sex_Male,enc_smoker_Yes,enc_day_Sat,enc_day_Sun,enc_day_Thur,enc_time_Lunch,scaled_total_bill,scaled_party_size,scaled_price_per_person
126,29.8,4.2,Female,No,Thur,Lunch,6,4.966667,0,0,0,0,1,1,0.591372,1.0,0.142929
46,18.29,3.0,Male,No,Sun,Dinner,2,9.145,1,0,0,1,0,0,0.336726,0.2,0.4578
21,17.92,4.08,Male,No,Sat,Dinner,2,8.96,1,0,1,0,0,0,0.32854,0.2,0.443858
179,9.6,4.0,Female,Yes,Sun,Dinner,2,4.8,0,1,0,1,0,0,0.144469,0.2,0.130369
84,32.68,5.0,Male,Yes,Thur,Lunch,2,16.34,1,1,0,0,1,1,0.655088,0.2,1.0


In [19]:
scaled_features

['scaled_total_bill', 'scaled_party_size', 'scaled_price_per_person']

In [20]:
# divide samples into x and y using only model-appropriate (encoded and scaled) features for x

x_train = train[scaled_features + enc_features]
y_train = train[target]

x_validate = validate[scaled_features + enc_features]
y_validate = validate[target]

x_test = test[scaled_features + enc_features]
y_test = test[target]

# C. 

Use select k best to select the top 2 features for predicting tip amount. What are they?

In [28]:
#KBEST
kmodel = SelectKBest(f_regression, k = 3)
kmodel.fit(x_train,y_train)
feature_mask = kmodel.get_support()
k_best = x_train.iloc[:,feature_mask].columns.tolist()


k_best

['scaled_total_bill', 'scaled_party_size', 'scaled_price_per_person']

# D.

Use recursive feature elimination to select the top 2 features for tip amount. What are they?

In [31]:
# RFE
rfe = RFE(LinearRegression(), n_features_to_select=3)
x_rfe = rfe.fit_transform(x_train, y_train)
feature_mask = rfe.support_
rfe_features = x_train.loc[:,feature_mask].columns.to_list()

rfe_features

['scaled_total_bill', 'scaled_price_per_person', 'enc_day_Sat']

# E. 

Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features your are selecting?

# 2.

Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [32]:
from sklearn.feature_selection import SelectKBest, f_regression

def kbest(x, y, k):
    selector = SelectKBest(f_regression, k=k)
    selector.fit(x, y)
    feature_mask = selector.get_support()
    k_best_features = x_train.iloc[:,feature_mask].columns.tolist()
    return k_best_features

In [33]:
kbest(x_train,y_train, 3)

['scaled_total_bill', 'scaled_party_size', 'scaled_price_per_person']

# 3.

Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [34]:
from sklearn.feature_selection import RFE

def rfe(x, y, k):
    rfe = RFE(LinearRegression(), n_features_to_select=k)
    x_rfe = rfe.fit_transform(x_train, y_train)
    feature_mask = rfe.support_
    rfe_features = x_train.loc[:,feature_mask].columns.to_list()
    return rfe_features

In [35]:
rfe(x_train,y_train, 3)

['scaled_total_bill', 'scaled_price_per_person', 'enc_day_Sat']

In [47]:
def kbest_rfe(x, y, k):
    selector = SelectKBest(f_regression, k=k)
    selector.fit(x, y)
    feature_mask = selector.get_support()
    k_best_features = x_train.iloc[:,feature_mask].columns.tolist()
    
    rfe = RFE(LinearRegression(), n_features_to_select=k)
    x_rfe = rfe.fit_transform(x_train, y_train)
    feature_mask = rfe.support_
    rfe_features = x_train.loc[:,feature_mask].columns.to_list()
    
    return print(f'kbest is {k_best_features}\n rfe is {rfe_features}')

In [48]:
kbest_rfe(x_train,y_train, 3)

kbest is ['scaled_fertility', 'scaled_examination', 'scaled_education']
 rfe is ['scaled_fertility', 'scaled_agriculture', 'scaled_catholic']


# 4.

Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [36]:
df = data('swiss')

In [37]:
df.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [38]:
for col in df.columns:
    df = df.rename(columns={col: col.lower().replace('.', '_')})

In [40]:
df.head()

Unnamed: 0,fertility,agriculture,examination,education,catholic,infant_mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [41]:
target = 'fertility'
quant_features = [col for col in df.columns if (df[col].dtype != 'object') & (col != 'target')]

In [42]:
train,test,validate = wrangle.train_test_validate_split(df)

train	 n = 25
test	 n = 10
validate n = 12


In [43]:
#scale quant

#DB for scaled features
train_scaled = pd.DataFrame(index=train.index)
validate_scaled = pd.DataFrame(index = validate.index)
test_scaled = pd.DataFrame(index = test.index)

#make and fit scaler
scaler = MinMaxScaler().fit(train[quant_features])

#addd scaled featuress to DF
train_scaled[quant_features] = scaler.transform(train[quant_features])
validate_scaled[quant_features] = scaler.transform(validate[quant_features])
test_scaled[quant_features] = scaler.transform(test[quant_features])

# add 'scaled' prefix to columns
for feature in quant_features:
    train_scaled = train_scaled.rename(columns={feature: f'scaled_{feature}'})
    validate_scaled = validate_scaled.rename(columns={feature: f'scaled_{feature}'})
    test_scaled = test_scaled.rename(columns={feature: f'scaled_{feature}'})
    
# concat scaled features to original train, validate, test df's
train = pd.concat([train, train_scaled], axis=1)
validate = pd.concat([validate, validate_scaled], axis=1)
test = pd.concat([test, test_scaled], axis=1)

#identify scaled features
scaled_features = [col for col in train.columns if col.startswith('scaled_')]

In [44]:
# divide samples into x and y using only model-appropriate (encoded and scaled) features for x

x_train = train[scaled_features]
y_train = train[target]

x_validate = validate[scaled_features]
y_validate = validate[target]

x_test = test[scaled_features]
y_test = test[target]

In [49]:
kbest_rfe(x_train,y_train, 3)

kbest is ['scaled_fertility', 'scaled_examination', 'scaled_education']
 rfe is ['scaled_fertility', 'scaled_agriculture', 'scaled_catholic']
