In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

from pydataset import data

# Exercises

1. Load the `tips` dataset.

In [10]:
tips = data('tips')

In [11]:
#tips.info()

- a. Create a column named `tip_percentage`. This should be the tip amount divided by the total bill.

In [12]:
tips['tip_percentage'] = tips.tip / tips.total_bill

- b. Create a column named `price_per_person`. This should be the total bill divided by the party size.

In [13]:
tips['price_per_person'] = tips.total_bill / tips['size']

In [14]:
# drop total_bill and size (accounted for in 'price per person')
#tips = tips.drop(columns=['total_bill', 'size'])

- c. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount? The tip percentage?

In [15]:
tips.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size',
       'tip_percentage', 'price_per_person'],
      dtype='object')

#### Feature Engineering Prep
- check unique values for each object columns
- encode categorical variables
- split data into train, test, validate
- split into X and y dfs
- scale the data
- add scaled data to df as columns
- run the selectkbest/rfe algorithms

In [16]:
#check unique values for each object columns
tips.nunique()

total_bill          229
tip                 123
sex                   2
smoker                2
day                   4
time                  2
size                  6
tip_percentage      242
price_per_person    235
dtype: int64

In [17]:
tips['day'].value_counts()

Sat     87
Sun     76
Thur    62
Fri     19
Name: day, dtype: int64

In [18]:
# encode categorical variables into dummies (sex, smoker, day, time)
dummy_vars = tips[['sex', 'smoker', 'day', 'time']]
dummy_df = pd.get_dummies(dummy_vars, dummy_na=False, drop_first=True)
df = pd.concat([tips, dummy_df], axis=1)
df = df.drop(columns=dummy_vars)

In [19]:
# split data into train, test, validate
from sklearn.model_selection import train_test_split

train_validate, test = train_test_split(df, test_size=.2, 
                                        random_state=123)

train, validate = train_test_split(train_validate, 
                                   test_size=.3, random_state=123)
train.shape, validate.shape, test.shape

((136, 11), (59, 11), (49, 11))

In [20]:
df.head()

Unnamed: 0,total_bill,tip,size,tip_percentage,price_per_person,sex_Male,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch
1,16.99,1.01,2,0.059447,8.495,0,0,0,1,0,0
2,10.34,1.66,3,0.160542,3.446667,1,0,0,1,0,0
3,21.01,3.5,3,0.166587,7.003333,1,0,0,1,0,0
4,23.68,3.31,2,0.13978,11.84,1,0,0,1,0,0
5,24.59,3.61,4,0.146808,6.1475,0,0,0,1,0,0


In [21]:
# split into X and y dfs
X_train = train.drop(columns=['tip'])
X_validate = validate.drop(columns=['tip'])
X_test = test.drop(columns=['tip'])

y_train = train[['tip']]
y_validate = validate[['tip']]
y_test = test[['tip']]

In [22]:
# scale the data (creates arrays)
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(copy=True).fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_validate_scaled = scaler.transform(X_validate)
X_test_scaled = scaler.transform(X_test)

In [23]:
# turn scaled data arrays to df
X_train_scaled = pd.DataFrame(X_train_scaled, 
                              columns=X_train.columns.values).\
                            set_index([X_train.index.values])

X_validate_scaled = pd.DataFrame(X_validate_scaled, 
                                columns=X_validate.columns.values).\
                            set_index([X_validate.index.values])

X_test_scaled = pd.DataFrame(X_test_scaled, 
                                columns=X_test.columns.values).\
                            set_index([X_test.index.values])

In [24]:
X_train_scaled.head()

Unnamed: 0,total_bill,size,tip_percentage,price_per_person,sex_Male,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch
19,0.307114,0.4,0.252863,0.150344,0.0,0.0,0.0,1.0,0.0,0.0
173,0.092355,0.2,1.0,0.032258,1.0,1.0,0.0,1.0,0.0,0.0
119,0.206805,0.2,0.161808,0.182796,0.0,0.0,0.0,0.0,1.0,1.0
29,0.411622,0.2,0.240873,0.452194,1.0,0.0,1.0,0.0,0.0,0.0
238,0.657534,0.2,0.0,0.775647,1.0,1.0,1.0,0.0,0.0,0.0


In [25]:
# run the selectkbest/rfe algorithms

- d. Use all the other numeric features to predict tip amount. Use select k best and recursive feature elimination to select the top 2 features. What are they?

#### SelectKBest

In [26]:
from sklearn.feature_selection import SelectKBest, f_regression
# Initialize the f_selector object, which defines the test for scoring the features 
# and the number of features we want to keep, k
f_selector = SelectKBest(f_regression, k=2)

# Fit the object to our data. Our selector is scoring, ranking, and identifying the top k features.
f_selector.fit(X_train_scaled, y_train)

# Transform our dataset to reduce to the k best features.
X_reduced = f_selector.transform(X_train_scaled)

print(X_train_scaled.shape)
print(X_reduced.shape)

#  list of booleans that relate to the feature indices.
f_support = f_selector.get_support()
print(type(f_support))
print(f_support)

# columns to keep
f_feature = X_train_scaled.iloc[:,f_support].columns.tolist()
f_feature

(136, 10)
(136, 2)
<class 'numpy.ndarray'>
[ True  True False False False False False False False False]


['total_bill', 'size']

#### Recursive Feature Elimination

In [27]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

# Initialize the linear regression object
lm = LinearRegression()

# Initialize the RFE object, setting the hyperparameters to be our linear model above (lm), 
# and the number of features we want returned.
rfe = RFE(lm, 2)

X_rfe = rfe.fit_transform(X_train_scaled, y_train)

# Save the X_rfe for later, to feed into a model.
mask = rfe.support_
X_reduced_scaled_rfe = X_train_scaled.iloc[:,mask]

# features selected using rfe
X_reduced_scaled_rfe.columns.tolist()

['total_bill', 'tip_percentage']

- e. Use all the other numeric features to predict tip percentage. Use select k best and recursive feature elimination to select the top 2 features. What are they?

In [28]:
# split into X and y dfs
X_train2 = train.drop(columns=['tip_percentage'])
X_validate2 = validate.drop(columns=['tip_percentage'])
X_test2 = test.drop(columns=['tip_percentage'])

y_train2 = train[['tip_percentage']]
y_validate2 = validate[['tip_percentage']]
y_test2 = test[['tip_percentage']]

In [29]:
# scale the data (creates arrays)
from sklearn.preprocessing import MinMaxScaler

scaler2 = MinMaxScaler(copy=True).fit(X_train2)

X_train_scaled2 = scaler2.transform(X_train2)
X_validate_scaled2 = scaler2.transform(X_validate2)
X_test_scaled2 = scaler2.transform(X_test2)

In [30]:
# turn scaled data arrays to df
X_train_scaled2 = pd.DataFrame(X_train_scaled2, 
                              columns=X_train2.columns.values).\
                            set_index([X_train2.index.values])

X_validate_scaled2 = pd.DataFrame(X_validate_scaled2, 
                                columns=X_validate2.columns.values).\
                            set_index([X_validate2.index.values])

X_test_scaled2 = pd.DataFrame(X_test_scaled2, 
                                columns=X_test2.columns.values).\
                            set_index([X_test2.index.values])

#### SelectKBest

In [31]:
from sklearn.feature_selection import SelectKBest, f_regression
# Initialize the f_selector object, which defines the test for scoring the features 
# and the number of features we want to keep, k
f_selector2 = SelectKBest(f_regression, k=2)

# Fit the object to our data. Our selector is scoring, ranking, and identifying the top k features.
f_selector2.fit(X_train_scaled2, y_train2)

# Transform our dataset to reduce to the k best features.
X_reduced2 = f_selector2.transform(X_train_scaled2)

print(X_train_scaled2.shape)
print(X_reduced2.shape)

#  list of booleans that relate to the feature indices.
f_support2 = f_selector2.get_support()
print(type(f_support2))
print(f_support2)

# columns to keep
f_feature2 = X_train_scaled2.iloc[:,f_support2].columns.tolist()
f_feature2

(136, 10)
(136, 2)
<class 'numpy.ndarray'>
[False  True False  True False False False False False False]


['tip', 'price_per_person']

#### RFE

In [32]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

# Initialize the linear regression object
lm2 = LinearRegression()

# Initialize the RFE object, setting the hyperparameters to be our linear model above (lm), 
# and the number of features we want returned.
rfe2 = RFE(lm2, 2)

X_rfe2 = rfe2.fit_transform(X_train_scaled2, y_train2)

# Save the X_rfe for later, to feed into a model.
mask2 = rfe2.support_
X_reduced_scaled_rfe2 = X_train_scaled2.iloc[:,mask2]

# features selected using rfe
X_reduced_scaled_rfe2.columns.tolist()

['total_bill', 'tip']

- f. Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features your are selecting?
    - SKB selects features based on correlation between each feature and the target. 
    - RFE uses the top features that would perform best on a linear regression model
    - the top 2-3 features are usually the same in this case but as you increase number of features to 3+ the rank changes

2. Write a function named `select_kbest` that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the `SelectKBest` class. Test your function with the `tips` dataset. You should see the same results as when you did the process manually.

In [33]:
def select_kbest(X, y, n):
    from sklearn.feature_selection import SelectKBest, f_regression
    f_selector = SelectKBest(f_regression, k=n).fit(X, y)
    X_reduced = f_selector.transform(X)
    f_support = f_selector.get_support()
    f_feature = X.iloc[:,f_support].columns.tolist()
    return f_feature

In [34]:
select_kbest(X_train_scaled, y_train, 2)

['total_bill', 'size']

In [35]:
select_kbest(X_train_scaled2, y_train2, 2)

['tip', 'price_per_person']

3. Write a function named `rfe` that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the `RFE` class. Test your function with the `tips` dataset. You should see the same results as when you did the process manually.

In [37]:
rfe(X_train_scaled, y_train, 2)

['total_bill', 'tip_percentage']

In [38]:
rfe(X_train_scaled2, y_train2, 2)

['total_bill', 'tip']

4. Load the `swiss` dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [39]:
from pydataset import data

swiss = data('swiss')
swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [47]:
swiss.info()

<class 'pandas.core.frame.DataFrame'>
Index: 47 entries, Courtelary to Rive Gauche
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Fertility         47 non-null     float64
 1   Agriculture       47 non-null     float64
 2   Examination       47 non-null     int64  
 3   Education         47 non-null     int64  
 4   Catholic          47 non-null     float64
 5   Infant.Mortality  47 non-null     float64
dtypes: float64(4), int64(2)
memory usage: 2.6+ KB


In [48]:
swiss.nunique()

Fertility           46
Agriculture         47
Examination         22
Education           19
Catholic            46
Infant.Mortality    37
dtype: int64

In [49]:
swiss.describe()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
count,47.0,47.0,47.0,47.0,47.0,47.0
mean,70.142553,50.659574,16.489362,10.978723,41.14383,19.942553
std,12.491697,22.711218,7.977883,9.615407,41.70485,2.912697
min,35.0,1.2,3.0,1.0,2.15,10.8
25%,64.7,35.9,12.0,6.0,5.195,18.15
50%,70.4,54.1,16.0,8.0,15.14,20.0
75%,78.45,67.65,22.0,12.0,93.125,21.7
max,92.5,89.7,37.0,53.0,100.0,26.6


In [50]:
X_swiss = swiss.drop(columns='Fertility')
y_swiss = swiss[['Fertility']]
scaler_swiss = MinMaxScaler(copy=True).fit(X_swiss)
columns_to_scale = X_swiss.columns

In [51]:
X_swiss_scaled = scaler_swiss.transform(X_swiss)
X_swiss_scaled = pd.DataFrame(X_swiss_scaled, 
                              columns=X_swiss.columns.values).\
                            set_index([X_swiss.index.values])

In [52]:
select_kbest(X_swiss, y_swiss, 3)

['Examination', 'Education', 'Catholic']

In [53]:
rfe(X_swiss, y_swiss, 3)

['Examination', 'Education', 'Infant.Mortality']