In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from env import user, host, password
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, QuantileTransformer
from acquire import get_zillow_data, train_val_test, acquire_zillow, prep_zillow, wrangle_zillow
import acquire
from pydataset import data

In [2]:
df = data('tips')

In [3]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
# Create a column named price_per_person. This should be the total bill divided by the party size.
df['price_per_person'] = df.total_bill / df.size
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,0.009947
2,10.34,1.66,Male,No,Sun,Dinner,3,0.006054
3,21.01,3.50,Male,No,Sun,Dinner,3,0.012301
4,23.68,3.31,Male,No,Sun,Dinner,2,0.013864
5,24.59,3.61,Female,No,Sun,Dinner,4,0.014397
...,...,...,...,...,...,...,...,...
240,29.03,5.92,Male,No,Sat,Dinner,3,0.016996
241,27.18,2.00,Female,Yes,Sat,Dinner,2,0.015913
242,22.67,2.00,Male,Yes,Sat,Dinner,2,0.013273
243,17.82,1.75,Male,No,Sat,Dinner,2,0.010433


In [5]:
# Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?
# total bill and time

In [9]:
df['sex_encoded'] = df.sex.map({'Female': 1, 'Male': 0})
df['smoker_encoded'] = df.smoker.map({'No': 0, 'Yes': 1})
df['day_encoded'] = df.day.map({'Thur': 1, 'Fri': 2, 'Sat': 3, 'Sun': 4})
df['time_encoded'] = df.time.map({'Lunch': 0, 'Dinner': 1})

In [31]:
# Use Select K Best to select the top 2 features for predicting tip amount. What are they?
X_train = df.drop(columns=['tip', 'sex', 'smoker', 'day', 'time'])
y_train = df.tip

from sklearn.feature_selection import SelectKBest, f_regression

# parameters: f_regression stats test, give me 2 features
f_selector = SelectKBest(f_regression, k=2)

# find the top 8 X's correlated with y
f_selector.fit(X_train, y_train)

# boolean mask of whether the column was selected or not. 
feature_mask = f_selector.get_support()

# get list of top K features. 
f_feature = X_train.iloc[:,feature_mask].columns.tolist()


In [32]:
f_feature


['total_bill', 'price_per_person']

In [33]:
# Use Recursive Feature Elimination to select the top 2 features for tip amount. What are they?
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

# initialize the ML algorithm
lm = LinearRegression()

# create the rfe object, indicating the ML object (lm) and the number of features I want to end up with. 
rfe = RFE(lm, n_features_to_select=2)

# fit the data using RFE
rfe.fit(X_train,y_train)  

# get the mask of the columns selected
feature_mask = rfe.support_

# get list of the column names. 
rfe_feature = X_train.iloc[:,feature_mask].columns.tolist()


In [34]:
rfe_feature


['total_bill', 'size']

In [35]:
# Why do you think Select K Best and Recursive Feature Elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?
# because kbest is looking at correlation and rfe is looking at what makes the best linear regression model. yes, kbest gives the same as rfe if you use 3 instead of 2 features

In [37]:
# Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the 
# top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.
def select_kbest(X, y, k):
    ''' select k best takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the 
    top k selected features based on the SelectKBest class'''
    f_selector = SelectKBest(f_regression, k=k)
    f_selector.fit(X, y)
    feature_mask = f_selector.get_support()
    f_feature = X.iloc[:,feature_mask].columns.tolist()
    return f_feature



In [39]:
select_kbest(X_train, y_train, 2)

['total_bill', 'price_per_person']

In [40]:
# Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top n features based on the RFE class. 
# Test your function with the tips dataset. You should see the same results as when you did the process manually.
def rfe(X, y, k):
    lm = LinearRegression()
    rfe = RFE(lm, n_features_to_select=2)
    rfe.fit(X_train,y_train)  
    feature_mask = rfe.support_
    rfe_feature = X_train.iloc[:,feature_mask].columns.tolist()
    return rfe_feature

In [41]:
rfe(X_train, y_train, 2)

['total_bill', 'size']

In [43]:
# Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both Select K Best and Recursive Feature Elimination 
# (use the functions you just built to help you out).
df = data('swiss')
df.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [44]:
X_train = df.drop(columns=['Fertility'])
y_train = df.Fertility

In [45]:
select_kbest(X_train, y_train, 3)

['Examination', 'Education', 'Catholic']

In [46]:
rfe(X_train, y_train, 3)

['Education', 'Infant.Mortality']