# Feature Engineering Exercises

In [53]:
import pandas as pd
import numpy as np

import seaborn as sns
import wrangle
from pydataset import data

import warnings
warnings.filterwarnings("ignore")

from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.preprocessing import MinMaxScaler

from sklearn.feature_selection import SelectKBest, f_regression

1. Load the tips dataset.

 a. Create a column named tip_percentage. This should be the tip amount divided by the total bill.\
 b. Create a column named price_per_person. This should be the total bill divided by the party size.\
 c. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount? The tip percentage?\
 d. Use all the other numeric features to predict tip amount. Use select k best and recursive feature elimination to select the top 2 features. What are they?\
 e. Use all the other numeric features to predict tip percentage. Use select k best and recursive feature elimination to select the top 2 features. What are they?\
 f. Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features your are selecting?\
2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

3. Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

4. Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [2]:
# load the tips dataset
tips = sns.load_dataset("tips")

In [3]:
tips.head(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3


In [4]:
tips.dtypes

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
dtype: object

In [5]:
# create a column named tip_percentage
tips['tip_percentage'] = tips.tip / tips.total_bill

In [6]:
# create a column named price_per_person
tips['price_per_person'] = tips.total_bill / tips.size

In [7]:
tips.sample(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage,price_per_person
196,10.34,2.0,Male,Yes,Thur,Lunch,2,0.193424,0.005297
60,20.29,3.21,Male,Yes,Sat,Dinner,2,0.158206,0.010394
52,34.81,5.2,Female,No,Sun,Dinner,4,0.149382,0.017833


In [8]:
train, validate, test = wrangle.split_data(tips)

train---> (136, 9)
validate---> (59, 9)
test---> (49, 9)


In [9]:
train.dtypes

total_bill           float64
tip                  float64
sex                 category
smoker              category
day                 category
time                category
size                   int64
tip_percentage       float64
price_per_person     float64
dtype: object

In [10]:
# Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount? 
# The tip percentage?

# let's create a quick loop to look at value counts
columns = list(train.columns)
for col in columns:
    print(train[col].value_counts())
    print('-----------')

7.25     2
13.42    2
15.98    2
18.29    2
8.58     1
        ..
12.66    1
14.15    1
9.60     1
48.33    1
10.77    1
Name: total_bill, Length: 132, dtype: int64
-----------
2.00    13
3.00    12
4.00     5
2.50     5
1.50     5
        ..
2.47     1
1.68     1
5.07     1
3.21     1
5.85     1
Name: tip, Length: 82, dtype: int64
-----------
Male      90
Female    46
Name: sex, dtype: int64
-----------
No     90
Yes    46
Name: smoker, dtype: int64
-----------
Sat     49
Sun     42
Thur    32
Fri     13
Name: day, dtype: int64
-----------
Dinner    99
Lunch     37
Name: time, dtype: int64
-----------
2    91
4    22
3    15
6     3
1     3
5     2
Name: size, dtype: int64
-----------
0.078927    1
0.206140    1
0.291990    1
0.198157    1
0.185185    1
           ..
0.158749    1
0.147059    1
0.139424    1
0.211509    1
0.171875    1
Name: tip_percentage, Length: 136, dtype: int64
-----------
0.003714    2
0.006875    2
0.008186    2
0.009370    2
0.015574    1
           ..
0.01230

In [20]:
# Use all the other numeric features to predict tip amount. 
# Use select k best and recursive feature elimination to select the top 2 features. 
# What are they?

# first assign my x_train and y_trains
# later with more time create dummies
cols=['total_bill', 'price_per_person', 'size']
x_train = train[cols]
x_validate = validate[cols]
x_test = test[cols]

y_train1 = train.tip
y_train2 = train.tip_percentage

In [21]:
x_train.sample(3)

Unnamed: 0,total_bill,price_per_person,size
171,15.81,0.008099,2
54,25.56,0.013094,4
167,31.71,0.016245,4


In [22]:
# scale my dataframes
cols=['total_bill', 'price_per_person', 'size']
x_train_scaled, x_validate_scaled, x_test_scaled = wrangle.min_max_scale(x_train, x_validate, x_test, cols)

In [23]:
x_train_scaled.shape, y_train1.shape

((136, 3), (136,))

In [24]:
# using kbest
f_selector = SelectKBest(score_func=f_regression, k=2)
f_selector.fit(x_train_scaled, y_train1)

SelectKBest(k=2, score_func=<function f_regression at 0x7fe0b3f133a0>)

In [31]:
# display my two most important features
mask = f_selector.get_support()
x_train_scaled.columns[mask]

Index(['total_bill', 'price_per_person'], dtype='object')

In [32]:
# now using recursive feature elimination
lm = LinearRegression()
rfe = RFE(estimator=lm, n_features_to_select=2)
rfe.fit(x_train_scaled, y_train1)

RFE(estimator=LinearRegression(), n_features_to_select=2)

In [33]:
# display the two most weighted features
x_train_scaled.columns[rfe.support_]

Index(['total_bill', 'price_per_person'], dtype='object')

Now use kbest and RFE to find best features for tip percentages

In [34]:
# Use all the other numeric features to predict tip percentage. 
# Use select k best and recursive feature elimination to select the top 2 features. What are they?

# using kbest
f_selector = SelectKBest(score_func=f_regression, k=2)
f_selector.fit(x_train_scaled, y_train2)

# display the two most important features
mask = f_selector.get_support()
x_train_scaled.columns[mask]

Index(['total_bill', 'price_per_person'], dtype='object')

In [36]:
# now using recursive feature elimination
lm = LinearRegression()
rfe = RFE(estimator=lm, n_features_to_select=2)
rfe.fit(x_train_scaled, y_train2)

# now using recursive feature elimination
x_train_scaled.columns[rfe.support_]

Index(['total_bill', 'price_per_person'], dtype='object')

- I got similar answers perhaps because my features are almost the same. 

2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [67]:
def select_kbest(X_train, y_train, no_features):
    
    # using kbest
    f_selector = SelectKBest(score_func=f_regression, k=no_features)
    
    # fit
    f_selector.fit(X_train, y_train)

    # display the two most important features
    mask = f_selector.get_support()
    
    return X_train.columns[mask]

In [51]:
# test our function
select_kbest(x_train_scaled, y_train1, 2)

Index(['total_bill', 'price_per_person'], dtype='object')

3. Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [48]:
def rfe(x_train, y_train, no_features):
    # now using recursive feature elimination
    lm = LinearRegression()
    rfe = RFE(estimator=lm, n_features_to_select=no_features)
    rfe.fit(x_train, y_train)

    # returning the top chosen features
    return x_train.columns[rfe.support_]

In [49]:
rfe(x_train_scaled, y_train1, 2)

Index(['total_bill', 'price_per_person'], dtype='object')

4. Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [54]:
swiss = data('swiss')

In [55]:
swiss.sample(3)

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Vevey,58.3,26.8,25,19,18.46,20.9
Paysd'enhaut,72.0,63.5,6,3,2.56,18.0
Le Locle,72.7,16.7,22,13,11.22,18.9


In [56]:
swiss.dtypes

Fertility           float64
Agriculture         float64
Examination           int64
Education             int64
Catholic            float64
Infant.Mortality    float64
dtype: object

In [58]:
X_train = swiss.drop(columns='Fertility')
y_train = swiss.Fertility

In [68]:
select_kbest(X_train, y_train, 3)

Index(['Examination', 'Education', 'Catholic'], dtype='object')

In [61]:
rfe(X_train, y_train, 3)

Index(['Examination', 'Education', 'Infant.Mortality'], dtype='object')