In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
from pydataset import data

from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression

from wrangle import train_validate_test_split
from wrangle import scale_data

In [51]:
# Load the "tips" dataset through pydataset's data() loader.
df = data('tips')

In [52]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [53]:
# One-hot encode sex, smoker, and time. drop_first=True drops the first level of
# each variable, so a single indicator column remains per variable
# (sex_Male, smoker_Yes, time_Lunch).
df = pd.get_dummies(df, columns = ['sex','smoker','time'],drop_first=True)

In [54]:
# Encode day as an integer in week order (Thur=0, Fri=1, Sat=2, Sun=3):
# cast to categorical, pin the category order, then keep the integer codes.
df['day'] = (
    df['day']
    .astype('category')
    .cat.reorder_categories(['Thur', 'Fri', 'Sat', 'Sun'])
    .cat.codes
)

a.) Create a column named price_per_person. This should be the total bill divided by the party size.

In [55]:
# Bracket access is needed for 'size': df.size is the built-in DataFrame attribute
# (total number of elements), not the party-size column.
df['price_per_person'] = df.total_bill/df['size']

In [56]:
# Split the prepared frame into train/validate/test using the helper imported from
# the local wrangle module.
# NOTE(review): split proportions and any random seed live inside wrangle — confirm there.
train, validate, test = train_validate_test_split(df)

In [57]:
# Give train a fresh 0..n-1 index; the original row labels are kept as a new 'index'
# column. The manual-scaling cell further down concatenates a frame with a default
# 0..n-1 index, so it relies on this reset for row alignment.
# NOTE: not idempotent — running this cell a second time adds an extra 'level_0' column.
train = train.reset_index()

In [58]:
train.head()

Unnamed: 0,index,total_bill,tip,day,size,sex_Male,smoker_Yes,time_Lunch,price_per_person
0,19,16.97,3.5,3,3,0,0,0,5.656667
1,173,7.25,5.15,3,2,1,1,0,3.625
2,119,12.43,1.8,0,2,0,0,1,6.215
3,29,21.7,4.3,2,2,1,0,0,10.85
4,238,32.83,1.17,2,2,1,1,0,16.415


b.) Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?

total bill

c.) Use select k best to select the top 2 features for predicting tip amount. What are they?

In [59]:
# Numeric predictors to min-max scale (the 0/1 dummy columns are left out).
# scale_data(), defined in the next cell, reads this list as a global.
x_vars = ['total_bill','size','day','price_per_person']

In [61]:
def scale_data(train, validate, test, return_scaler=False, columns_to_scale=None):
    """Min-max scale selected columns of the train/validate/test splits.

    The scaler is fit on ``train`` only and then applied to all three splits.
    The input frames are not modified; scaled copies are returned.

    Note: this definition shadows the ``scale_data`` imported from ``wrangle``
    in the notebook's import cell.

    Parameters
    ----------
    train, validate, test : pandas.DataFrame
        Splits that each contain every column named in ``columns_to_scale``.
    return_scaler : bool, default False
        When True, the fitted ``MinMaxScaler`` is returned as the first element.
    columns_to_scale : iterable of str, optional
        Columns to scale. When omitted, the notebook-level ``x_vars`` list is
        used, which matches the original behaviour of this function.

    Returns
    -------
    tuple
        ``(train_scaled, validate_scaled, test_scaled)``, preceded by the
        fitted scaler when ``return_scaler`` is True.
    """
    # TODO: come back to this later — could not get it to work in feature engineering.
    if columns_to_scale is None:
        # Backward-compatible default: fall back to the global defined in the notebook.
        columns_to_scale = x_vars
    columns_to_scale = list(columns_to_scale)

    train_scaled = train.copy()
    validate_scaled = validate.copy()
    test_scaled = test.copy()

    # Fit on train only so validate/test do not influence the scaling range.
    scaler = MinMaxScaler()
    scaler.fit(train[columns_to_scale])

    train_scaled[columns_to_scale] = scaler.transform(train[columns_to_scale])
    validate_scaled[columns_to_scale] = scaler.transform(validate[columns_to_scale])
    test_scaled[columns_to_scale] = scaler.transform(test[columns_to_scale])

    if return_scaler:
        return scaler, train_scaled, validate_scaled, test_scaled
    return train_scaled, validate_scaled, test_scaled

In [62]:
# With return_scaler=False, scale_data returns a (train, validate, test) tuple of
# scaled copies. NOTE: the name scaled_data is reassigned in the manual-scaling cell
# that follows, so this tuple is not used downstream.
scaled_data = scale_data(train, validate, test, return_scaler=False)

In [64]:
# Min-max scale the numeric predictors of train and append them as *_scaled columns.
tips_cols_to_scale = ['total_bill', 'size', 'day', 'price_per_person']
tips_scaled_cols = [f'{col}_scaled' for col in tips_cols_to_scale]

scale = MinMaxScaler()
scale.fit(train[tips_cols_to_scale])
scaled_data = scale.transform(train[tips_cols_to_scale])

# Reuse train's own index so the concat below lines up row-for-row whatever labels
# train currently carries (a default 0..n-1 index would only match after reset_index).
scaled_data_df = pd.DataFrame(data=scaled_data, columns=tips_scaled_cols, index=train.index)

# Drop any *_scaled columns left over from a previous run so that re-running this
# cell replaces them instead of appending duplicate columns.
train = pd.concat(
    [train.drop(columns=tips_scaled_cols, errors='ignore'), scaled_data_df],
    axis=1,
)

In [65]:
train.head()

Unnamed: 0,index,total_bill,tip,day,size,sex_Male,smoker_Yes,time_Lunch,price_per_person,total_bill_scaled,size_scaled,day_scaled,price_per_person_scaled
0,19,16.97,3.5,3,3,0,0,0,5.656667,0.307114,0.4,1.0,0.150344
1,173,7.25,5.15,3,2,1,1,0,3.625,0.092355,0.2,1.0,0.032258
2,119,12.43,1.8,0,2,0,0,1,6.215,0.206805,0.2,0.0,0.182796
3,29,21.7,4.3,2,2,1,0,0,10.85,0.411622,0.2,0.666667,0.452194
4,238,32.83,1.17,2,2,1,1,0,16.415,0.657534,0.2,0.666667,0.775647


In [67]:
# Predictor matrix: the four min-max scaled numeric columns plus the three dummy indicators.
X_train_scaled = train[[
    'total_bill_scaled',
    'size_scaled',
    'price_per_person_scaled',
    'day_scaled',
    'sex_Male',
    'smoker_Yes',
    'time_Lunch',
]]
# Target: the tip amount.
y_train = train['tip']

In [72]:
# Score every predictor against the target with the univariate F-test (f_regression)
# and keep the two highest-scoring ones; fit() returns the fitted selector.
f_selector = SelectKBest(score_func=f_regression, k=2).fit(X_train_scaled, y_train)

# Boolean mask over the columns: True where the column was selected.
feature_mask = f_selector.get_support()

# Names of the selected columns.
f_feature = X_train_scaled.columns[feature_mask].tolist()

In [73]:
f_feature

['total_bill_scaled', 'size_scaled']

In [74]:
print(f'Top 2 features based on SelectKBest are {f_feature}')

Top 2 features based on SelectKBest are ['total_bill_scaled', 'size_scaled']


In [75]:
# Estimator whose coefficients RFE uses to rank the features.
lm = LinearRegression()

# Recursive feature elimination down to two features; fit() returns the fitted
# selector. NOTE: the name rfe is rebound to a function of the same name later
# in the notebook.
rfe = RFE(estimator=lm, n_features_to_select=2).fit(X_train_scaled, y_train)

# Boolean mask over the columns: True where the column survived elimination.
feature_mask = rfe.support_

# Names of the surviving columns.
rfe_feature = X_train_scaled.columns[feature_mask].tolist()

In [76]:
rfe_feature

['total_bill_scaled', 'price_per_person_scaled']

In [78]:
print(f'Top 2 features based on RFE {rfe_feature}')

Top 2 features based on RFE ['total_bill_scaled', 'price_per_person_scaled']


2.) Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [103]:
def select_kbest(X, y, k):
    """Return the names of the top ``k`` features according to SelectKBest.

    Features are scored with the ``f_regression`` univariate test.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictor columns; the column labels are what gets returned.
    y : array-like
        Target values, one per row of ``X``.
    k : int
        Number of features to keep.

    Returns
    -------
    list
        Column names of the ``k`` selected features, in ``X``'s column order.
        The same list is also printed, as the function did before.
    """
    f_selector = SelectKBest(f_regression, k=k)
    f_selector.fit(X, y)

    # Boolean mask over X's columns: True where the column was selected.
    feature_mask = f_selector.get_support()
    f_feature = X.columns[feature_mask].tolist()

    # Previously this was `return print(...)`, which always returned None;
    # keep the message but hand the names back to the caller.
    print(f'Top features based on SelectKBest are {f_feature}')
    return f_feature

In [89]:
select_kbest(X_train_scaled, y_train, k=2)

Top 2 features based on SelectKBest are ['total_bill_scaled', 'size_scaled']


3.) Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [102]:
def rfe(X_train_scaled, y_train, n_features_to_select):
    """Return the names of the features kept by recursive feature elimination.

    A ``LinearRegression`` estimator is used to rank the features.

    Parameters
    ----------
    X_train_scaled : pandas.DataFrame
        Predictor columns; the column labels are what gets returned.
    y_train : array-like
        Target values, one per row of ``X_train_scaled``.
    n_features_to_select : int
        Number of features to keep.

    Returns
    -------
    list
        Column names of the selected features, in the frame's column order.
        The same list is also printed, as the function did before.
    """
    lm = LinearRegression()

    # Named `selector` rather than `rfe` so it does not shadow this function.
    selector = RFE(lm, n_features_to_select=n_features_to_select)
    selector.fit(X_train_scaled, y_train)

    # Boolean mask over the columns: True where the column survived elimination.
    feature_mask = selector.support_
    rfe_feature = X_train_scaled.columns[feature_mask].tolist()

    # Previously this was `return print(...)`, which always returned None;
    # keep the message but hand the names back to the caller.
    print(f'Top features based on RFE {rfe_feature}')
    return rfe_feature

In [92]:
rfe(X_train_scaled, y_train, n_features_to_select=2)

Top 2 features based on RFE ['total_bill_scaled', 'price_per_person_scaled']


4.) Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [94]:
# Load the "swiss" dataset through pydataset's data() loader.
swiss = data('swiss')

In [95]:
# Split swiss with the wrangle helper. NOTE: this reuses the names train/validate/test,
# so the tips splits from earlier in the notebook are overwritten from here on.
# NOTE(review): split proportions and any random seed live inside wrangle — confirm there.
train, validate, test = train_validate_test_split(swiss)

In [96]:
# Give train a fresh 0..n-1 index; the original row labels are kept as a new 'index'
# column. The scaling cell below concatenates a frame with a default 0..n-1 index,
# so it relies on this reset for row alignment.
# NOTE: not idempotent — running this cell a second time adds an extra 'level_0' column.
train = train.reset_index()

In [97]:
train.columns

Index(['index', 'Fertility', 'Agriculture', 'Examination', 'Education',
       'Catholic', 'Infant.Mortality'],
      dtype='object')

In [99]:
# Min-max scale every predictor (all columns except the target, Fertility) and
# append the results to train as *_scaled columns.
swiss_predictors = ['Agriculture', 'Examination', 'Education', 'Catholic', 'Infant.Mortality']
swiss_scaled_cols = [f'{col}_scaled' for col in swiss_predictors]

scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(train[swiss_predictors])

# Reuse train's own index so the concat below lines up row-for-row whatever labels
# train currently carries (a default 0..n-1 index would only match after reset_index).
scaled_data_df = pd.DataFrame(data=scaled_data, columns=swiss_scaled_cols, index=train.index)

# Join the original and scaled columns. Stale *_scaled columns from a previous run
# are dropped first so that re-running this cell does not append duplicates.
train = pd.concat(
    [train.drop(columns=swiss_scaled_cols, errors='ignore'), scaled_data_df],
    axis=1,
)

In [100]:
# Predictor matrix: the five min-max scaled swiss columns.
X_train_data = train[[
    'Agriculture_scaled',
    'Examination_scaled',
    'Education_scaled',
    'Catholic_scaled',
    'Infant.Mortality_scaled',
]]
# Target: Fertility. NOTE: this rebinds y_train, which held the tips target earlier.
y_train = train['Fertility']

In [104]:
select_kbest(X_train_data, y_train, 3)

Top features based on SelectKBest are ['Examination_scaled', 'Catholic_scaled', 'Infant.Mortality_scaled']


In [105]:
rfe(X_train_data, y_train, 3)

Top features based on RFE ['Agriculture_scaled', 'Examination_scaled', 'Infant.Mortality_scaled']
