In [1]:
from pydataset import data

import numpy as np
import pandas as pd
from wrangle import tts_con

from sklearn.feature_selection import SelectKBest, f_regression
import sklearn.preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

In [2]:
# Load the "tips" dataset through pydataset's data() helper.
df = data("tips")

In [3]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


### Create a column named price_per_person. This should be the total bill divided by the party size.


In [4]:
# Per-person cost: total bill divided by party size.
# This adds a column to df in place; re-running the cell simply recomputes it.
df["price_per_person"] = df["total_bill"] / df["size"]

In [5]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,8.495
2,10.34,1.66,Male,No,Sun,Dinner,3,3.446667
3,21.01,3.5,Male,No,Sun,Dinner,3,7.003333
4,23.68,3.31,Male,No,Sun,Dinner,2,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,6.1475


### Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?


In [6]:
#total bill and size. Maybe time.

### Use select k best to select the top 2 features for predicting tip amount. What are they?


In [7]:
# Split the tips data into train / validate / test frames with the helper from wrangle.
# NOTE(review): split proportions and any random seed live inside tts_con -- confirm it is
# seeded, otherwise the row samples shown in the outputs below will not reproduce.
dftrain, dfval, dftest = tts_con(df)

In [8]:
# Separate predictors from the target on the training split.
# Only the numeric columns are used as predictors for SelectKBest.
X_train = dftrain[["total_bill", "size", "price_per_person"]]
y_train = dftrain["tip"]

In [9]:
# Create the scaler (StandardScaler from sklearn.preprocessing).
ss = sklearn.preprocessing.StandardScaler()

In [10]:
# Fit the scaler on the training predictors only (X_train).
ss.fit(X_train)

In [11]:
# Apply the fitted scaler to X_train to obtain the scaled training predictors.
X_train_scaled = ss.transform(X_train)

In [12]:
# Build the selector: keep the 2 features with the best f_regression scores.
f_selector = SelectKBest(score_func=f_regression, k=2)

In [13]:
# Fit the selector on the scaled predictors and the target, keeping only the
# selected columns; the shape confirms how many features remain.
X_reduced = f_selector.fit_transform(X_train_scaled, y_train)
X_reduced.shape

(136, 2)

In [14]:
# Boolean mask from the fitted selector: True marks a selected feature.
f_support = f_selector.get_support()
f_support

array([ True,  True, False])

In [15]:
# Apply the mask to the columns of the unscaled frame so the selected
# features are shown with their names and original values.
features = X_train.iloc[:, f_support]
features.head()

Unnamed: 0,total_bill,size
36,24.06,3
168,31.71,4
194,15.48,2
142,34.3,6
231,24.01,4


**Takeaways**  
Using SelectKBest, the two best features are `total_bill` and `size`.

### Use recursive feature elimination to select the top 2 features for tip amount. What are they?


In [16]:
dftrain.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
36,24.06,3.6,Male,No,Sat,Dinner,3,8.02
168,31.71,4.5,Male,No,Sun,Dinner,4,7.9275
194,15.48,2.02,Male,Yes,Thur,Lunch,2,7.74
142,34.3,6.7,Male,No,Thur,Lunch,6,5.716667
231,24.01,2.0,Male,Yes,Sat,Dinner,4,6.0025


In [17]:
# Separate predictors from the target again, this time keeping every
# non-target column (numeric and categorical) as a candidate for RFE.
X_train_rfe = dftrain.drop(columns=["tip"])
y_train = dftrain["tip"]

In [18]:
# One-hot encode the categorical columns.
# The list must be passed as `columns=`: the second positional parameter of
# pd.get_dummies is `prefix`, so passing the list positionally only worked
# because its length and order happened to match the object columns.
X_train_rfe_dumm = pd.get_dummies(
    X_train_rfe, columns=["sex", "smoker", "day", "time"]
)

In [19]:
X_train_rfe_dumm.head()

Unnamed: 0,total_bill,size,price_per_person,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch
36,24.06,3,8.02,0,1,1,0,0,1,0,0,1,0
168,31.71,4,7.9275,0,1,1,0,0,0,1,0,1,0
194,15.48,2,7.74,0,1,0,1,0,0,0,1,0,1
142,34.3,6,5.716667,0,1,1,0,0,0,0,1,0,1
231,24.01,4,6.0025,0,1,0,1,0,1,0,0,1,0


In [20]:
# Create the linear regression estimator that RFE will use.
lm = LinearRegression()

In [21]:
# Wrap the linear model in recursive feature elimination, keeping 2 features.
# NOTE: a later cell defines a function that is also named `rfe`, which rebinds
# this name; re-run this cell before re-running the cells that use the RFE object.
rfe = RFE(estimator=lm, n_features_to_select=2)

In [22]:
# Fit RFE on the dummy-encoded predictors and keep only the selected columns.
X_rfe = rfe.fit_transform(X_train_rfe_dumm, y_train)

In [23]:
# Fit the linear model on the RFE-selected columns (X_rfe).
# The feature mask inspected below is read from `rfe`, not from `lm`.
lm.fit(X_rfe, y_train)

In [24]:
# Boolean mask from the fitted RFE object: True marks a kept feature.
mask = rfe.support_
mask

array([False, False, False, False, False, False, False, False, False,
       False, False,  True,  True])

In [25]:
# Apply the RFE mask to the dummy-encoded frame to see the kept columns by name.
rfe_features = X_train_rfe_dumm.iloc[:, mask]
rfe_features.head()

Unnamed: 0,time_Dinner,time_Lunch
36,1,0
168,1,0
194,0,1
142,0,1
231,1,0


**Takeaways**  
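RFE selects different features than SelectKBest: `time_Dinner` and `time_Lunch`. Note that these two dummy columns are complements of each other (every row has exactly one of them set), so together they carry only a single piece of information.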

### Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?


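The two methods can disagree because they measure importance differently: SelectKBest scores each feature on its own against the target, whereas RFE repeatedly fits a model on all remaining features and drops the weakest one. Here the candidate features also differed: SelectKBest was given only the three numeric columns, while RFE was additionally given the categorical variables in the form of dummies. The number of features being selected changes the result as well.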

# Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.



In [26]:
def select_kbest(x, y, k=2):
    """Return the columns of ``x`` chosen by SelectKBest with ``f_regression``.

    Parameters
    ----------
    x : DataFrame-like
        Predictor columns; must support ``.iloc`` column selection.
    y : array-like
        Target values aligned with the rows of ``x``.
    k : int, default 2
        Number of top-scoring features to keep. The default matches the value
        that used to be hard-coded, so two-argument calls behave as before.

    Returns
    -------
    The subset of ``x`` restricted to the ``k`` selected columns, unscaled.
    Use ``.columns`` on the result to get the feature names.
    """
    # Standardize the predictors before scoring, as in the manual walkthrough.
    x_scaled = sklearn.preprocessing.StandardScaler().fit_transform(x)

    selector = SelectKBest(f_regression, k=k)
    selector.fit(x_scaled, y)

    # get_support() is a boolean mask over the columns of x.
    return x.iloc[:, selector.get_support()]



In [27]:
select_kbest(X_train, y_train).head()

Unnamed: 0,total_bill,size
36,24.06,3
168,31.71,4
194,15.48,2
142,34.3,6
231,24.01,4


# Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.



In [28]:
def rfe(x, y, z):
    """Return the columns of ``x`` kept by recursive feature elimination.

    Parameters
    ----------
    x : DataFrame-like
        Predictor columns; must support ``.iloc`` column selection.
    y : array-like
        Target values aligned with the rows of ``x``.
    z : int
        Number of features to keep (passed to RFE as ``n_features_to_select``).

    Returns
    -------
    The subset of ``x`` restricted to the ``z`` selected columns.

    Notes
    -----
    Defining this function rebinds the notebook-level name ``rfe``, which an
    earlier cell bound to an RFE instance.
    """
    # Use a distinct local name so the function does not shadow itself.
    selector = RFE(LinearRegression(), n_features_to_select=z)
    selector.fit(x, y)

    # support_ is a boolean mask over the columns of x. No separate fit of the
    # linear model is needed to read it.
    return x.iloc[:, selector.support_]

In [29]:
rfe(X_train_rfe_dumm, y_train, 2).head()

Unnamed: 0,time_Dinner,time_Lunch
36,1,0
168,1,0
194,0,1
142,0,1
231,1,0


# Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).



In [30]:
# Load the "swiss" dataset through pydataset's data() helper.
swiss = data("swiss")

In [31]:
swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [32]:
# Split the swiss data into train / validate / test frames with the same
# wrangle helper used for the tips data.
swisstrain, swissval, swisstest = tts_con(swiss)

In [33]:
# Separate the swiss training split into predictors (everything except
# Fertility) and the Fertility target.
SX_train = swisstrain.drop(columns="Fertility")
Sy_train = swisstrain["Fertility"]

In [34]:
#running the function using kbest
# NOTE(review): the exercise asks for the top 3 features, but this call does not
# request 3, and the output below shows only 2 selected columns.
select_kbest(SX_train, Sy_train).head()

Unnamed: 0,Examination,Education
Glane,14,8
Porrentruy,9,7
Herens,5,2
Echallens,18,2
Le Locle,22,13


In [35]:
#running the function using rfe
rfe(SX_train, Sy_train, 3).head()

Unnamed: 0,Agriculture,Education,Infant.Mortality
Glane,67.8,8,24.9
Porrentruy,35.3,7,26.6
Herens,89.7,2,18.3
Echallens,72.6,2,21.2
Le Locle,16.7,13,18.9
