In [31]:
import math

import pandas as pd
import numpy as np
import seaborn as sns 
from pydataset import data 
import matplotlib.pyplot as plt
import sklearn.preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.feature_selection import SelectKBest, f_regression, RFE
from evaluate import plot_residuals, regression_errors, baseline_mean_errors, better_than_the_baseline
from wrangle import split_data

## Feature Engineering: Regression Exercises

#### Part I: Load the tips dataset

In [2]:
tips_df = data('tips')
tips_df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


##### Part I(a): Create a column named tip_percentage. This should be the tip amount divided by the total bill.

In [3]:
tips_df['tip_percentage'] = (tips_df.tip / tips_df.total_bill)
tips_df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage
1,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
2,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
3,21.01,3.50,Male,No,Sun,Dinner,3,0.166587
4,23.68,3.31,Male,No,Sun,Dinner,2,0.139780
5,24.59,3.61,Female,No,Sun,Dinner,4,0.146808
...,...,...,...,...,...,...,...,...
240,29.03,5.92,Male,No,Sat,Dinner,3,0.203927
241,27.18,2.00,Female,Yes,Sat,Dinner,2,0.073584
242,22.67,2.00,Male,Yes,Sat,Dinner,2,0.088222
243,17.82,1.75,Male,No,Sat,Dinner,2,0.098204


##### Part I(b): Create a column named price_per_person. This should be the total bill divided by the party size.


In [6]:
tips_df['price_per_person'] = tips_df.total_bill / tips_df['size']
tips_df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,0.059447,8.495
2,10.34,1.66,Male,No,Sun,Dinner,3,0.160542,3.446667
3,21.01,3.5,Male,No,Sun,Dinner,3,0.166587,7.003333
4,23.68,3.31,Male,No,Sun,Dinner,2,0.13978,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,0.146808,6.1475


##### Part I(c): Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount? The tip percentage?


 -The most important feature for calculating tip amount would be the total bill and tip percentage
 -For tip percentage I would say the most important features are going to be total_bill and tip amount.

##### Part I(d): Use all the other numeric features to predict tip amount. Use select k best and recursive feature elimination to select the top 2 features. What are they?


In [11]:
train, validate, test = split_data(tips_df)
train.shape, validate.shape, test.shape

((136, 9), (59, 9), (49, 9))

In [12]:
train.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage,price_per_person
19,16.97,3.5,Female,No,Sun,Dinner,3,0.206246,5.656667
173,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345,3.625
119,12.43,1.8,Female,No,Thur,Lunch,2,0.144811,6.215
29,21.7,4.3,Male,No,Sat,Dinner,2,0.198157,10.85
238,32.83,1.17,Male,Yes,Sat,Dinner,2,0.035638,16.415


In [13]:
cols = ['total_bill', 'size', 'tip_percentage', 'price_per_person']

X_train, y_train = train[cols], train.tip
X_validate, y_validate = validate[cols], validate.tip
X_test, y_test = test[cols], test.tip

In [15]:
def Min_Max_Scaler(X_train, X_validate, X_test):
    """
    Takes in X_train, X_validate and X_test dfs with numeric values only
    Returns scaler, X_train_scaled, X_validate_scaled, X_test_scaled dfs 
    """
    scaler = sklearn.preprocessing.MinMaxScaler().fit(X_train)
    X_train_scaled = pd.DataFrame(scaler.transform(X_train), index = X_train.index, columns = X_train.columns)
    X_validate_scaled = pd.DataFrame(scaler.transform(X_validate), index = X_validate.index, columns = X_validate.columns)
    X_test_scaled = pd.DataFrame(scaler.transform(X_test), index = X_test.index, columns = X_test.columns)
    
    return scaler, X_train_scaled, X_validate_scaled, X_test_scaled

In [18]:
Min_Max_Scaler(X_train, X_validate, X_test)

(MinMaxScaler(),
      total_bill  size  tip_percentage  price_per_person
 19     0.307114   0.4        0.252863          0.150344
 173    0.092355   0.2        1.000000          0.032258
 119    0.206805   0.2        0.161808          0.182796
 29     0.411622   0.2        0.240873          0.452194
 238    0.657534   0.2        0.000000          0.775647
 208    0.787892   0.6        0.061984          0.384336
 184    0.444101   0.6        0.362968          0.158239
 61     0.380468   0.2        0.181661          0.411218
 42     0.317941   0.2        0.162793          0.328974
 161    0.407203   0.6        0.188456          0.133973
 98     0.197967   0.2        0.131983          0.171171
 71     0.197746   0.2        0.190091          0.170881
 244    0.347106   0.2        0.183941          0.367335
 177    0.327441   0.2        0.112873          0.341471
 141    0.318162   0.2        0.244114          0.329265
 180    0.697304   0.2        0.099116          0.827957
 12     0.7112

In [40]:
f_selector = SelectKBest(score_func=f_regression, k=3)
f_selector.fit(X_train, y_train)

SelectKBest(k=3, score_func=<function f_regression at 0x7ffbf2f50550>)

In [41]:
mask = f_selector.get_support()
X_train.columns[mask]

Index(['total_bill', 'tip', 'price_per_person'], dtype='object')

 - The two most important features in selecting in predicting tip amount are total_bill and size

##### Part 1(e) Use all the other numeric features to predict tip percentage. Use select k best and recursive feature elimination to select the top 2 features. What are they?


In [25]:
tips_df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,0.059447,8.495
2,10.34,1.66,Male,No,Sun,Dinner,3,0.160542,3.446667
3,21.01,3.5,Male,No,Sun,Dinner,3,0.166587,7.003333
4,23.68,3.31,Male,No,Sun,Dinner,2,0.13978,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,0.146808,6.1475


In [26]:
cols = ['total_bill', 'size', 'tip', 'price_per_person']

X_train, y_train = train[cols], train.tip_percentage
X_validate, y_validate = validate[cols], validate.tip_percentage
X_test, y_test = test[cols], test.tip_percentage

In [27]:
X_train.head()

Unnamed: 0,total_bill,size,tip,price_per_person
19,16.97,3,3.5,5.656667
173,7.25,2,5.15,3.625
119,12.43,2,1.8,6.215
29,21.7,2,4.3,10.85
238,32.83,2,1.17,16.415


In [28]:
y_train.head()

19     0.206246
173    0.710345
119    0.144811
29     0.198157
238    0.035638
Name: tip_percentage, dtype: float64

In [36]:
lm = LinearRegression()
rfe = RFE(estimator=lm, n_features_to_select = 3)
rfe.fit(X_train, y_train)

RFE(estimator=LinearRegression(), n_features_to_select=3)

In [37]:
rfe.support_

array([ True,  True,  True, False])

In [38]:
X_train.columns[rfe.support_]

Index(['total_bill', 'size', 'tip'], dtype='object')

 - The best features to predict the tip_percentage are total_bill and tip

##### Part I(f): Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features your are selecting?


 - The way the algorithim sorts through the features. The k best method looks for the strongest correlation between the features and the target. Where as the Regression method actually looks for patterns and distance between features and target

#### Question 2: Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.



In [42]:
def select_kbest(x, y, n):
    '''
    This function is a tool that uses the SelectKBest method to pull down 
    the best features to predict a target variable from the features 
    in a dataframe
    '''
    f_selector = SelectKBest(score_func=f_regression, k=n)
    f_selector.fit(x, y)
    
    mask = f_selector.get_support()
    return x.columns[mask]
    

In [43]:
select_kbest(X_train, y_train, 4)

Index(['total_bill', 'size', 'tip', 'price_per_person'], dtype='object')

#### Question 3: Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.



In [44]:
def rfe(x, y, n):
    '''
    This function uses the recursive feature elimination method to predict
    the best features for predicting a target variable for a Regression model
    by taking in X_train and y_train and number of features wanted to select
    '''
    lm = LinearRegression()
    rfe = RFE(estimator=lm, n_features_to_select = n)
    rfe.fit(x, y)
    
    return x.columns[rfe.support_]
    

In [45]:
rfe(X_train, y_train, 3)

Index(['total_bill', 'size', 'tip'], dtype='object')

#### Question 4: Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).



In [49]:
swiss_df = data('swiss')
swiss_df.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [59]:
train, validate, test = split_data(swiss_df)
train.shape, validate.shape, test.shape

((25, 6), (12, 6), (10, 6))

In [60]:
X_train, y_train = train.drop(columns = 'Fertility'), train.Fertility
X_validate, y_validate = validate.drop(columns = 'Fertility'), validate.Fertility
X_test, y_test = test.drop(columns = 'Fertility'), test.Fertility

In [61]:
Min_Max_Scaler(X_train, X_validate, X_test)

(MinMaxScaler(),
               Agriculture  Examination  Education  Catholic  Infant.Mortality
 Rolle            0.647561      0.40625   0.290323  0.054508          0.122449
 Lavaux           0.796341      0.50000   0.258065  0.004508          0.500000
 Nyone            0.526829      0.59375   0.354839  0.130533          0.163265
 Conthey          0.953659      0.00000   0.032258  0.997029          0.000000
 Yverdon          0.509756      0.37500   0.225806  0.037910          0.755102
 Oron             0.774390      0.28125   0.000000  0.000000          0.602041
 Cossonay         0.751220      0.59375   0.129032  0.004303          0.367347
 St Maurice       0.831707      0.18750   0.258065  0.990369          0.275510
 Franches-Mnt     0.390244      0.06250   0.129032  0.932377          0.520408
 Orbe             0.565854      0.53125   0.161290  0.018443          0.020408
 Sarine           0.457317      0.40625   0.387097  0.911680          0.948980
 La Chauxdfnd     0.000000      0.8

In [62]:
select_kbest(X_train, y_train, 3)

Index(['Examination', 'Catholic', 'Infant.Mortality'], dtype='object')

In [63]:
rfe(X_train, y_train, 3)

Index(['Agriculture', 'Examination', 'Infant.Mortality'], dtype='object')