In [128]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import warnings
warnings.filterwarnings("ignore")

from pydataset import data


Load the tips dataset.

In [67]:
df= data('tips')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 244 entries, 1 to 244
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 15.2+ KB


Create a column named tip_percentage. This should be the tip amount divided by the total bill.

In [68]:
df['tip_percentage']=df.tip/df.total_bill
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage
1,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
2,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
3,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
4,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
5,24.59,3.61,Female,No,Sun,Dinner,4,0.146808


Create a column named price_per_person. This should be the total bill divided by the party size.

In [69]:
df['price_per_person']= df.total_bill/df['size']
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,0.059447,8.495
2,10.34,1.66,Male,No,Sun,Dinner,3,0.160542,3.446667
3,21.01,3.5,Male,No,Sun,Dinner,3,0.166587,7.003333
4,23.68,3.31,Male,No,Sun,Dinner,2,0.13978,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,0.146808,6.1475


In [70]:
mask=np.array(df.dtypes=="object")
mask

array([False, False,  True,  True,  True,  True, False, False, False])

In [71]:
obj_df=df.iloc[:, mask]

for col in obj_df.columns:
    print(obj_df[col].value_counts())
    print("\n")

Male      157
Female     87
Name: sex, dtype: int64


No     151
Yes     93
Name: smoker, dtype: int64


Sat     87
Sun     76
Thur    62
Fri     19
Name: day, dtype: int64


Dinner    176
Lunch      68
Name: time, dtype: int64




In [72]:
# create df with new dummy vars
dummy_df = pd.get_dummies(obj_df, dummy_na=False, drop_first=True)

In [73]:
# concatenate the dataframe with dummies to our original dataframe
# via column (axis=1)
df = pd.concat([df, dummy_df], axis=1)

In [74]:
# drop object columns from df
df.drop(columns=obj_df.columns, inplace=True)

In [75]:
df.head()

Unnamed: 0,total_bill,tip,size,tip_percentage,price_per_person,sex_Male,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch
1,16.99,1.01,2,0.059447,8.495,0,0,0,1,0,0
2,10.34,1.66,3,0.160542,3.446667,1,0,0,1,0,0
3,21.01,3.5,3,0.166587,7.003333,1,0,0,1,0,0
4,23.68,3.31,2,0.13978,11.84,1,0,0,1,0,0
5,24.59,3.61,4,0.146808,6.1475,0,0,0,1,0,0


In [76]:
# split into train and test
from sklearn.model_selection import train_test_split
train_validate, test = train_test_split(df, test_size=.2, random_state=123)
train, validate = train_test_split(train_validate, test_size=.3, random_state=123)

Use all the other numeric features to predict tip amount. Use select k best and recursive feature elimination to select the top 2 features. What are they?

In [77]:
# x df's are all cols except tips
X_train = train.drop(columns=['tip'])
X_validate = validate.drop(columns=['tip'])
X_test = test.drop(columns=['tip'])

# y df's are just tips
y_train = train[['tip']]
y_validate = validate[['tip']]
y_test = test[['tip']]

In [78]:
#Scale the data
scaler=MinMaxScaler(copy=True).fit(X_train)
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_validate_scaled = scaler.transform(X_validate)
X_test_scaled = scaler.transform(X_test)

In [79]:
# Making a data frame of the scaled data
X_train_scaled=pd.DataFrame(
    X_train_scaled, columns=X_train.columns.values).\
    set_index([X_train.index.values])

X_validate_scaled=pd.DataFrame(
    X_validate_scaled, columns=X_validate.columns.values).\
    set_index([X_validate.index.values])

X_test_scaled= pd.DataFrame(
    X_test_scaled, columns=X_test.columns.values).\
    set_index([X_test.index.values])

X_train_scaled = pd.DataFrame(X_train_scaled)

Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount? The tip percentage?

I think time of day and party size would be most important for predicting the tip amount. I think sex would be most important for determining tip percentage.

In [129]:
from sklearn.feature_selection import SelectKBest, f_regression

In [81]:
## Initialize the f_selector object, which defines the test for scoring the features and the number of features we want to keep.)
f_selector = SelectKBest(f_regression, k=2)

In [82]:
## Fit the object to our data. In doing this, our selector is scoring, ranking, and identifying the top k features.
f_selector.fit(X_train, y_train)

SelectKBest(k=2, score_func=<function f_regression at 0x7fafb365b830>)

In [83]:
##Transform our dataset to reduce to the k best features.

X_reduced = f_selector.transform(X_train)

print(X_train.shape)
print(X_reduced.shape)

(136, 10)
(136, 2)


In [84]:
##one line of code to get same result as above
X_reduced2 = SelectKBest(f_regression, k=2).fit_transform(X_train, y_train)
print(X_reduced2.shape)

(136, 2)


In [85]:
#a list of booleans that relate to the feature indices.
f_support = f_selector.get_support()

print(f_support) 

[ True  True False False False False False False False False]


In [86]:
'''We get a list of the feature names selected from X_train using .loc with our mask, 
using .columns to get the column names, and convert the values to a list using .tolist().
'''
f_feature = X_train.loc[:,f_support].columns.tolist()

print(str(len(f_feature)), 'selected features')
print(f_feature)
print("To summarize, we used the SelectKBest method to select the top k features, and these features are scored and ranked using a statistical test, which we used the f-regression test in this case. We found total bill and size are the top 2 features related to tip.")

2 selected features
['total_bill', 'size']
To summarize, we used the SelectKBest method to select the top k features, and these features are scored and ranked using a statistical test, which we used the f-regression test in this case. We found total bill and size are the top 2 features related to tip.


In [87]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

In [88]:
##Initialize the linear regression object
lm = LinearRegression()

In [89]:
#Initialize the RFE object, setting the hyperparameters to be our linear regression object created above (as the algorithm to test the features on) and the number of features to return to be 2.
rfe = RFE(lm, 2)

In [90]:
# Transforming data using RFE
X_rfe = rfe.fit_transform(X_train,y_train)  

In [91]:
#Fitting the data to model
lm.fit(X_rfe,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [92]:
mask = rfe.support_

In [93]:
rfe_features = X_train.loc[:,mask].columns.tolist()

In [94]:
print(str(len(rfe_features)), 'selected features')
print(rfe_features)

2 selected features
['tip_percentage', 'day_Sun']


Use all the other numeric features to predict tip percentage. Use select k best and recursive feature elimination to select the top 2 features. What are they?

In [95]:
# x df's are all cols except tips
X_train2 = train.drop(columns=['tip_percentage'])
X_validate2 = validate.drop(columns=['tip_percentage'])
X_test2 = test.drop(columns=['tip_percentage'])

# y df's are just tips
y_train2 = train[['tip_percentage']]
y_validate2 = validate[['tip_percentage']]
y_test2 = test[['tip_percentage']]

In [96]:
#Scale the data
scaler2=MinMaxScaler(copy=True).fit(X_train2)
scaler2.fit(X_train2)
X_train_scaled2 = scaler.transform(X_train2)
X_validate_scaled2 = scaler.transform(X_validate2)
X_test_scaled2 = scaler.transform(X_test2)

In [97]:
# Making a data frame of the scaled data
X_train_scaled2=pd.DataFrame(
    X_train_scaled, columns=X_train2.columns.values).\
    set_index([X_train2.index.values])

X_validate_scaled2=pd.DataFrame(
    X_validate_scaled, columns=X_validate2.columns.values).\
    set_index([X_validate2.index.values])

X_test_scaled2= pd.DataFrame(
    X_test_scaled, columns=X_test2.columns.values).\
    set_index([X_test2.index.values])

X_train_scaled2 = pd.DataFrame(X_train_scaled2)

In [98]:
##one line of code to get same result as above
X_reduced3 = SelectKBest(f_regression, k=2).fit_transform(X_train2, y_train2)
print(X_reduced3.shape)

(136, 2)


In [99]:
#a list of booleans that relate to the feature indices.
f_support = f_selector.get_support()

print(f_support) 

[ True  True False False False False False False False False]


In [100]:
'''We get a list of the feature names selected from X_train using .loc with our mask, 
using .columns to get the column names, and convert the values to a list using .tolist().
'''
f_feature = X_train.loc[:,f_support].columns.tolist()

print(str(len(f_feature)), 'selected features')
print(f_feature)
print("To summarize, we used the SelectKBest method to select the top k features, and these features are scored and ranked using a statistical test, which we used the f-regression test in this case. We found total bill and size are the top 2 features related to tip.")

2 selected features
['total_bill', 'size']
To summarize, we used the SelectKBest method to select the top k features, and these features are scored and ranked using a statistical test, which we used the f-regression test in this case. We found total bill and size are the top 2 features related to tip.


In [101]:
##Initialize the linear regression object
lm = LinearRegression()

In [102]:
#Initialize the RFE object, setting the hyperparameters to be our linear regression object created above (as the algorithm to test the features on) and the number of features to return to be 2.
rfe = RFE(lm, 2)

In [103]:
# Transforming data using RFE
X_rfe = rfe.fit_transform(X_train2,y_train2)  

In [104]:
#Fitting the data to model
lm.fit(X_rfe,y_train2)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [105]:
mask = rfe.support_

In [106]:
rfe_features = X_train2.loc[:,mask].columns.tolist()
print(str(len(rfe_features)), 'selected features')
print(rfe_features)

2 selected features
['tip', 'size']


Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features your are selecting?
select k best removes all but the  highest scoring features.
recursive feature elimination assigns weights to features and recursively evaluates the features by weights.

Redoing the RFE and Select K Best for tips 
with removing total_price and size in order to reduce multicollinearity

In [107]:
df.drop(columns=['total_bill', 'size'])

Unnamed: 0,tip,tip_percentage,price_per_person,sex_Male,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch
1,1.01,0.059447,8.495000,0,0,0,1,0,0
2,1.66,0.160542,3.446667,1,0,0,1,0,0
3,3.50,0.166587,7.003333,1,0,0,1,0,0
4,3.31,0.139780,11.840000,1,0,0,1,0,0
5,3.61,0.146808,6.147500,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...
240,5.92,0.203927,9.676667,1,0,1,0,0,0
241,2.00,0.073584,13.590000,0,1,1,0,0,0
242,2.00,0.088222,11.335000,1,1,1,0,0,0
243,1.75,0.098204,8.910000,1,0,1,0,0,0


In [108]:
df.head()

Unnamed: 0,total_bill,tip,size,tip_percentage,price_per_person,sex_Male,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch
1,16.99,1.01,2,0.059447,8.495,0,0,0,1,0,0
2,10.34,1.66,3,0.160542,3.446667,1,0,0,1,0,0
3,21.01,3.5,3,0.166587,7.003333,1,0,0,1,0,0
4,23.68,3.31,2,0.13978,11.84,1,0,0,1,0,0
5,24.59,3.61,4,0.146808,6.1475,0,0,0,1,0,0


In [109]:
train_validate, test = train_test_split(df, test_size=.2, random_state=123)
train, validate = train_test_split(train_validate, test_size=.3, random_state=123)

In [110]:
# x df's are all cols except tips
X_train = train.drop(columns=['tip'])
X_validate = validate.drop(columns=['tip'])
X_test = test.drop(columns=['tip'])

# y df's are just tips
y_train = train[['tip']]
y_validate = validate[['tip']]
y_test = test[['tip']]

In [111]:
#Scale the data
scaler=MinMaxScaler(copy=True).fit(X_train)
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_validate_scaled = scaler.transform(X_validate)
X_test_scaled = scaler.transform(X_test)

In [112]:
# Making a data frame of the scaled data
X_train_scaled=pd.DataFrame(
    X_train_scaled, columns=X_train.columns.values).\
    set_index([X_train.index.values])

X_validate_scaled=pd.DataFrame(
    X_validate_scaled, columns=X_validate.columns.values).\
    set_index([X_validate.index.values])

X_test_scaled= pd.DataFrame(
    X_test_scaled, columns=X_test.columns.values).\
    set_index([X_test.index.values])

X_train_scaled = pd.DataFrame(X_train_scaled)

In [113]:
## Initialize the f_selector object, which defines the test for scoring the features and the number of features we want to keep.)
f_selector = SelectKBest(f_regression, k=2)

In [114]:
## Fit the object to our data. In doing this, our selector is scoring, ranking, and identifying the top k features.
f_selector.fit(X_train, y_train)

SelectKBest(k=2, score_func=<function f_regression at 0x7fafb365b830>)

In [115]:
##Transform our dataset to reduce to the k best features.

X_reduced = f_selector.transform(X_train)

print(X_train.shape)
print(X_reduced.shape)

(136, 10)
(136, 2)


In [116]:
##one line of code to get same result as above
X_reduced2 = SelectKBest(f_regression, k=2).fit_transform(X_train, y_train)
print(X_reduced2.shape)

(136, 2)


In [117]:
#a list of booleans that relate to the feature indices.
f_support = f_selector.get_support()

print(f_support) 

[ True  True False False False False False False False False]


In [118]:
'''We get a list of the feature names selected from X_train using .loc with our mask, 
using .columns to get the column names, and convert the values to a list using .tolist().
'''
f_feature = X_train.loc[:,f_support].columns.tolist()

print('There are ', str(len(f_feature)), 'selected features. The features are', f_feature )
print("To summarize, we used the SelectKBest method to select the top k features, and these features are scored and ranked using a statistical test, which we used the f-regression test in this case. We found tip_percentage and price_per_person are the top 2 features related to tip when correcting for multicolinearity")

There are  2 selected features. The features are ['total_bill', 'size']
To summarize, we used the SelectKBest method to select the top k features, and these features are scored and ranked using a statistical test, which we used the f-regression test in this case. We found tip_percentage and price_per_person are the top 2 features related to tip when correcting for multicolinearity


In [119]:
#Initialize the RFE object, setting the hyperparameters to be our linear regression object created above (as the algorithm to test the features on) and the number of features to return to be 2.
lm = LinearRegression()
rfe = RFE(lm, 2)

In [120]:
# Transforming data using RFE
X_rfe = rfe.fit_transform(X_train,y_train)  

In [121]:
#Fitting the data to model
lm.fit(X_rfe,y_train)
mask = rfe.support_

In [122]:
rfe_features = X_train.loc[:,mask].columns.tolist()
print(str(len(rfe_features)), 'selected features')
print(rfe_features)

2 selected features
['tip_percentage', 'day_Sun']


Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [130]:
def select_kbest(x, y, k):
    kbest= sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_regression, k)
    kbest.fit(X_train, y_train, 2)
    return X.columns[k.best.get_support()]

In [131]:
select_kbest(X_train, y_train, 2)

NameError: name 'sklearn' is not defined

Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [None]:
def rfe (x,y,k):
    lm = LinearRegression()
    rfe = RFE(lm, k)
    rfe.fit(X_train, y_train)
    X_train.columns[rfe.support_]
    rfe_features = X_train.loc[:,mask].columns.tolist()
    return rfe_features

In [None]:
rfe(X_train, y_train, 2)

In [None]:
swiss= pydataset.