In [85]:
import numpy as np
import pandas as pd
import prepare
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector
import sklearn.preprocessing
from pydataset import data

**Exercises**
Do your work for this exercise in a jupyter notebook named feature_engineering within the regression-exercises repo. Add, commit, and push your work.



1. Load the tips dataset.

    

In [86]:
# prepare.prep_tips() is a project-local helper; judging by the preview below it
# returns the tips data with the categorical columns already encoded as numbers.
tips = prepare.prep_tips()
tips.head()


Unnamed: 0,total_bill,tip,sex,smoker,day,is_dinner,size
1,16.99,1.01,1,0,7,1,2
2,10.34,1.66,0,0,7,1,3
3,21.01,3.5,0,0,7,1,3
4,23.68,3.31,0,0,7,1,2
5,24.59,3.61,1,0,7,1,4


    a. Create a column named price_per_person. This should be the total bill divided by the party size.


In [87]:
# Per-person spend = total bill / party size.
# Bracket access is required for 'size': `tips.size` is the DataFrame attribute
# (element count), not the column.
tips['price_per_person'] = tips['total_bill'] / tips['size']
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,is_dinner,size,price_per_person
1,16.99,1.01,1,0,7,1,2,8.495
2,10.34,1.66,0,0,7,1,3,3.446667
3,21.01,3.5,0,0,7,1,3,7.003333
4,23.68,3.31,0,0,7,1,2,11.84
5,24.59,3.61,1,0,7,1,4,6.1475


    b. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?

In [88]:
# I think total_bill will be the most important, followed by price_per_person


    c. Use select k best to select the top 2 features for predicting tip amount. What are they?


In [89]:
# Split via the project helper; 'tip' is the target that gets separated into the y_* outputs.
(train, validate, test,
 X_train, y_train,
 X_validate, y_validate,
 X_test, y_test) = prepare.split_data(tips, 'tip', stratify=False)

# Columns to min-max scale, and the names their scaled copies will carry.
columns_to_scale = ['total_bill', 'price_per_person']
new_column_names = ['scaled_total_bill', 'scaled_price_per_person']

# Fit on the training split only, so validate and test are scaled with
# train's min and max rather than their own.
scaler = sklearn.preprocessing.MinMaxScaler()
scaler.fit(X_train[columns_to_scale])


def _scaled_copy(split):
    """Return the scaled columns of `split` as a DataFrame sharing its index.

    Uses the `scaler`, `columns_to_scale` and `new_column_names` defined above.
    """
    return pd.DataFrame(
        scaler.transform(split[columns_to_scale]),
        index=split.index,
        columns=new_column_names,
    )


# Scale each split with the train-fitted scaler.
X_train_scaled = _scaled_copy(X_train)
X_validate_scaled = _scaled_copy(X_validate)
X_test_scaled = _scaled_copy(X_test)

# Append the scaled columns alongside the originals (the originals are kept).
X_train = pd.concat([X_train, X_train_scaled], axis=1)
X_validate = pd.concat([X_validate, X_validate_scaled], axis=1)
X_test = pd.concat([X_test, X_test_scaled], axis=1)

In [90]:
# Check that the scaled columns were appended next to the unscaled originals.
X_train.head(3)

Unnamed: 0,total_bill,sex,smoker,day,is_dinner,size,price_per_person,scaled_total_bill,scaled_price_per_person
13,15.42,0,0,7,1,2,7.71,0.187557,0.298393
149,9.78,0,0,4,0,2,4.89,0.058081,0.092403
53,34.81,1,0,7,1,4,8.7025,0.632691,0.370891


In [91]:
# Build and fit the selector in one step.
# k=3 rather than 2: X_train holds both total_bill and scaled_total_bill, which
# receive identical scores, so with k=2 both slots would go to total_bill.
kbest = SelectKBest(score_func=f_regression, k=3).fit(X_train, y_train)

# One row per feature: the univariate F statistic and its p-value.
kbest_results = pd.DataFrame(
    {'p': kbest.pvalues_, 'f': kbest.scores_},
    index=X_train.columns,
)
kbest_results

Unnamed: 0,p,f
total_bill,4.300454e-19,109.453315
sex,0.1685244,1.916679
smoker,0.9796329,0.000654
day,0.03069661,4.770199
is_dinner,0.03724088,4.427099
size,2.630124e-09,40.737539
price_per_person,0.0001886191,14.749561
scaled_total_bill,4.300454e-19,109.453315
scaled_price_per_person,0.0001886191,14.749561


In [92]:
X_train.columns[kbest.get_support()]
# total_bill ranks first, as predicted. size ranks above price_per_person
# (F of about 40.7 vs 14.7 in the table above), so my second guess was wrong.
# My reading: party size drives the total bill more than per-person spend does.
# k=3 was used so the result holds something besides total_bill and its scaled copy.

Index(['total_bill', 'size', 'scaled_total_bill'], dtype='object')

    d. Use recursive feature elimination to select the top 2 features for tip amount. What are they?


In [93]:
# RFE repeatedly fits the estimator and drops the weakest feature until 2 remain.
model = LinearRegression()
rfe = RFE(estimator=model, n_features_to_select=2).fit(X_train, y_train)

# Rank 1 marks the selected features; larger ranks were eliminated earlier.
# size and price_per_person are best
pd.Series(rfe.ranking_, index=X_train.columns, name='rfe_ranking').to_frame()

Unnamed: 0,rfe_ranking
total_bill,3
sex,2
smoker,6
day,5
is_dinner,4
size,1
price_per_person,1
scaled_total_bill,8
scaled_price_per_person,7


    e. Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?


In [94]:
# SelectKBest doesn't take multiple features into consideration, whereas RFE does: size * price_per_person gives all the info of total_bill plus more.

2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.


In [95]:
def select_kbest(X, y, k=2):
    """Return the names of the top-k features chosen by SelectKBest.

    Features are scored with `f_regression`.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictor columns; `X.columns` supplies the returned names.
    y : array-like
        Target values.
    k : int, default 2
        Number of features to keep.

    Returns
    -------
    pandas.Index
        Column labels of the selected features, in their original column order.
    """
    selector = SelectKBest(score_func=f_regression, k=k).fit(X, y)
    return X.columns[selector.get_support()]

In [96]:
# Same k=3 as the manual run above, so the result should match it exactly.
select_kbest(X_train,y_train, k=3)

Index(['total_bill', 'size', 'scaled_total_bill'], dtype='object')


3. Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.


In [97]:
def select_RFE(X, y, k=2):
    """Return the names of the top-k features chosen by recursive feature elimination.

    A LinearRegression estimator drives the elimination.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictor columns; `X.columns` supplies the returned names.
    y : array-like
        Target values.
    k : int, default 2
        Number of features to keep.

    Returns
    -------
    pandas.Index
        Column labels of the selected features, in their original column order.
    """
    eliminator = RFE(estimator=LinearRegression(), n_features_to_select=k).fit(X, y)
    return X.columns[eliminator.get_support()]

In [98]:
# Same k=2 as the manual RFE run above, so the result should match it exactly.
select_RFE(X_train,y_train,k=2)

Index(['size', 'price_per_person'], dtype='object')


4. Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [99]:
# Print the pydataset documentation for the swiss dataset.
data('swiss', show_doc=True)

swiss

PyDataset Documentation (adopted from R Documentation. The displayed examples are in R)

## Swiss Fertility and Socioeconomic Indicators (1888) Data

### Description

Standardized fertility measure and socio-economic indicators for each of 47
French-speaking provinces of Switzerland at about 1888.

### Usage

    data(swiss)

### Format

A data frame with 47 observations on 6 variables, each of which is in percent,
i.e., in [0,100].

[,1] Fertility Ig, "common standardized fertility measure" [,2] Agriculture
[,3] Examination nation [,4] Education [,5] Catholic [,6] Infant.Mortality
live births who live less than 1 year.

All variables but 'Fert' give proportions of the population.

### Source

Project "16P5", pages 549-551 in

Mosteller, F. and Tukey, J. W. (1977) “Data Analysis and Regression: A Second
Course in Statistics”. Addison-Wesley, Reading Mass.

indicating their source as "Data used by permission of Franice van de Walle.
Office of Population Research, Princeton Univer

In [100]:
# Load the swiss dataset into a DataFrame; Fertility is the target for this exercise.
swiss = data('swiss')
swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [101]:
# split
# NOTE: this rebinds train/validate/test and the X_*/y_* names that previously
# held the tips split, so from here on they refer to the swiss data.
train, validate, test, X_train, y_train, X_validate, y_validate, X_test, y_test = prepare.split_data(swiss,'Fertility', stratify=False)

In [102]:
# Top 3 swiss features by univariate F-test (unscaled predictors).
select_kbest(X_train,y_train, k=3)

Index(['Examination', 'Education', 'Infant.Mortality'], dtype='object')

In [103]:
# Top 3 swiss features by RFE with linear regression (unscaled predictors).
select_RFE(X_train,y_train,k=3)

Index(['Examination', 'Education', 'Infant.Mortality'], dtype='object')

In [104]:
# Check dtypes and null counts before scaling.
swiss.info()


<class 'pandas.core.frame.DataFrame'>
Index: 47 entries, Courtelary to Rive Gauche
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Fertility         47 non-null     float64
 1   Agriculture       47 non-null     float64
 2   Examination       47 non-null     int64  
 3   Education         47 non-null     int64  
 4   Catholic          47 non-null     float64
 5   Infant.Mortality  47 non-null     float64
dtypes: float64(4), int64(2)
memory usage: 2.6+ KB


In [106]:
# Let's try the swiss dataset again with every predictor min-max scaled.
scaler = sklearn.preprocessing.MinMaxScaler()

# Learn each column's min and max from the training split only.
scaler.fit(X_train)

# Transform all three splits with the train-fitted scaler, keeping each split's
# own index and reusing the training column names.
X_train_scaled, X_validate_scaled, X_test_scaled = (
    pd.DataFrame(scaler.transform(split), index=split.index, columns=X_train.columns)
    for split in (X_train, X_validate, X_test)
)

In [107]:
# Scaled training values should all fall in [0, 1].
X_train_scaled.head()

Unnamed: 0,Agriculture,Examination,Education,Catholic,Infant.Mortality
Herens,1.0,0.0,0.0,1.0,0.214286
Glane,0.752542,0.28125,0.117647,0.970776,1.0
Sarine,0.497175,0.34375,0.215686,0.911299,0.940476
Monthey,0.719774,0.0625,0.019608,0.981683,0.440476
Gruyere,0.588701,0.21875,0.098039,0.976024,0.535714


In [108]:
# SelectKBest on the scaled predictors, for comparison with the unscaled run.
select_kbest(X_train_scaled,y_train, k=3)

Index(['Examination', 'Education', 'Infant.Mortality'], dtype='object')

In [109]:
# RFE on the scaled predictors, for comparison with the unscaled run.
select_RFE(X_train_scaled,y_train, k=3)

Index(['Agriculture', 'Education', 'Catholic'], dtype='object')

In [118]:
# Side-by-side comparison of the top-3 picks from each selector, on raw and
# on min-max-scaled predictors.
swiss_results = pd.DataFrame({
    'k_best': select_kbest(X_train, y_train, k=3),
    'k_best_scaled': select_kbest(X_train_scaled, y_train, k=3),
    'RFE': select_RFE(X_train, y_train, k=3),
    'RFE_scaled': select_RFE(X_train_scaled, y_train, k=3),
})
swiss_results

Unnamed: 0,k_best,k_best_scaled,RFE,RFE_scaled
0,Examination,Examination,Examination,Agriculture
1,Education,Education,Education,Education
2,Infant.Mortality,Infant.Mortality,Infant.Mortality,Catholic
