In [1]:
#doing things with data
import pandas as pd
import itertools
import numpy as np
import scipy.stats as stats
from scipy.stats import pearsonr

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import QuantileTransformer
#import r2
from sklearn.metrics import r2_score

import env
import wrangle as w
import explore as e
import prepare as p
import evaluate as eva

import warnings
# NOTE(review): this silences every warning for the whole kernel session,
# including deprecation notices from pandas / scikit-learn.
warnings.filterwarnings("ignore")

In [2]:
from pydataset import data

# 1 

In [3]:
# Load the 'tips' dataset through pydataset's data() helper and display it
df = data('tips')
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.50,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
240,29.03,5.92,Male,No,Sat,Dinner,3
241,27.18,2.00,Female,Yes,Sat,Dinner,2
242,22.67,2.00,Male,Yes,Sat,Dinner,2
243,17.82,1.75,Male,No,Sat,Dinner,2


In [4]:
# Inspect column dtypes; the object (string) columns are encoded in later cells
df.dtypes

total_bill    float64
tip           float64
sex            object
smoker         object
day            object
time           object
size            int64
dtype: object

Create a column named price_per_person. This should be the total bill divided by the party size.

In [5]:
# Average spend per guest: bill total divided by party size, kept to 2 decimals
per_person = df['total_bill'] / df['size']
df['price_per_person'] = per_person.round(2)
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,8.49
2,10.34,1.66,Male,No,Sun,Dinner,3,3.45
3,21.01,3.50,Male,No,Sun,Dinner,3,7.00
4,23.68,3.31,Male,No,Sun,Dinner,2,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,6.15
...,...,...,...,...,...,...,...,...
240,29.03,5.92,Male,No,Sat,Dinner,3,9.68
241,27.18,2.00,Female,Yes,Sat,Dinner,2,13.59
242,22.67,2.00,Male,Yes,Sat,Dinner,2,11.34
243,17.82,1.75,Male,No,Sat,Dinner,2,8.91


In [6]:
# Count how many rows fall in each meal time
df['time'].value_counts()

Dinner    176
Lunch      68
Name: time, dtype: int64

In [7]:
# Encode the two-level categorical columns as 0/1 integers.
# Each inner dict maps the original label to its integer code.
binary_codes = {
    'smoker': {'No': 0, "Yes": 1},
    'sex': {'Female': 0, 'Male': 1},
    'time': {'Lunch': 0, 'Dinner': 1},
}
for column, codes in binary_codes.items():
    df[column] = df[column].replace(codes).astype(int)
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
1,16.99,1.01,0,0,Sun,1,2,8.49
2,10.34,1.66,1,0,Sun,1,3,3.45
3,21.01,3.50,1,0,Sun,1,3,7.00
4,23.68,3.31,1,0,Sun,1,2,11.84
5,24.59,3.61,0,0,Sun,1,4,6.15
...,...,...,...,...,...,...,...,...
240,29.03,5.92,1,0,Sat,1,3,9.68
241,27.18,2.00,0,1,Sat,1,2,13.59
242,22.67,2.00,1,1,Sat,1,2,11.34
243,17.82,1.75,1,0,Sat,1,2,8.91


In [8]:
# One-hot encode the day of the week and swap it in for the original column.
# dtype=int keeps the indicators as 0/1 integers; without it pandas >= 2.0
# returns booleans instead.
# NOTE(review): all four day indicators are kept (drop_first=False), so they
# always sum to 1 for every row -- consider dropping one level if the linear
# model's coefficients are going to be interpreted.
dummy_df = pd.get_dummies(df['day'], drop_first=False, dtype=int)
df = pd.concat([df, dummy_df], axis=1)
df = df.drop(columns=['day'])
df

Unnamed: 0,total_bill,tip,sex,smoker,time,size,price_per_person,Fri,Sat,Sun,Thur
1,16.99,1.01,0,0,1,2,8.49,0,0,1,0
2,10.34,1.66,1,0,1,3,3.45,0,0,1,0
3,21.01,3.50,1,0,1,3,7.00,0,0,1,0
4,23.68,3.31,1,0,1,2,11.84,0,0,1,0
5,24.59,3.61,0,0,1,4,6.15,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
240,29.03,5.92,1,0,1,3,9.68,0,1,0,0
241,27.18,2.00,0,1,1,2,13.59,0,1,0,0
242,22.67,2.00,1,1,1,2,11.34,0,1,0,0
243,17.82,1.75,1,0,1,2,8.91,0,1,0,0


In [9]:
# Split into train, validate and test sets using the project's prepare module.
# NOTE(review): split proportions and any random seed are set inside
# prepare.split_data, which is not visible in this notebook -- confirm there.
train, validate, test = p.split_data(df)
train.head()

Unnamed: 0,total_bill,tip,sex,smoker,time,size,price_per_person,Fri,Sat,Sun,Thur
225,13.42,1.58,1,1,0,2,6.71,1,0,0,0
182,23.33,5.65,1,1,1,2,11.66,0,0,1,0
103,44.3,2.5,0,1,1,3,14.77,0,1,0,0
165,17.51,3.0,0,1,1,2,8.76,0,0,1,0
74,25.28,5.0,0,1,1,2,12.64,0,1,0,0


In [10]:
# Scale the features and separate the target for each split.
# The helper returns six values. The third must be unpacked into its own name:
# unpacking it into X_validate_scaled a second time overwrites the validate
# features and leaves X_test_scaled undefined.
# NOTE(review): return order is assumed to be train / validate / test for both
# X and y -- confirm against prepare.scaled_df_tips.
X_train_scaled, X_validate_scaled, X_test_scaled, y_train, y_validate, y_test = p.scaled_df_tips(train, validate, test)
X_train_scaled.head()

Unnamed: 0,total_bill,sex,smoker,time,size,price_per_person,Fri,Sat,Sun,Thur
0,0.228679,1.0,1.0,0.0,0.2,0.211628,1.0,0.0,0.0,0.0
1,0.447636,1.0,1.0,1.0,0.2,0.499419,0.0,0.0,1.0,0.0
2,0.910959,0.0,1.0,1.0,0.4,0.680233,0.0,1.0,0.0,0.0
3,0.319046,0.0,1.0,1.0,0.2,0.330814,0.0,0.0,1.0,0.0
4,0.49072,0.0,1.0,1.0,0.2,0.556395,0.0,1.0,0.0,0.0


In [11]:
# Peek at the target values returned for the training split
y_train.head()

225    1.58
182    5.65
103    2.50
165    3.00
74     5.00
Name: tip, dtype: float64

Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?

- party size, time, day

Use select k best to select the top 2 features for predicting tip amount. What are they?

In [12]:
# Build a SelectKBest selector scored with f_regression that keeps the 2 best
# features, and fit it on the scaled training data in one step.
# fit() returns the selector itself, so it can be assigned directly.
kbest = SelectKBest(score_func=f_regression, k=2).fit(X_train_scaled, y_train)
kbest

SelectKBest(k=2, score_func=<function f_regression at 0x7f967b7d1790>)

In [13]:
# Score (F-statistic from f_regression) for each feature,
# in the same order as the X_train_scaled columns
kbest.scores_

array([1.11115028e+02, 5.96049171e-01, 8.45071808e-02, 2.18955757e+00,
       5.76071336e+01, 1.38356417e+01, 1.34704421e+00, 2.50128222e-02,
       3.28560161e+00, 9.78681779e-01])

In [14]:
# p-value associated with each feature's score
kbest.pvalues_

array([1.30561953e-19, 4.41354619e-01, 7.71698118e-01, 1.41133637e-01,
       3.66901154e-12, 2.85187703e-04, 2.47716010e-01, 8.74556685e-01,
       7.19730319e-02, 3.24183852e-01])

In [15]:
# Column names the selector saw when it was fit
kbest.feature_names_in_

array(['total_bill', 'sex', 'smoker', 'time', 'size', 'price_per_person',
       'Fri', 'Sat', 'Sun', 'Thur'], dtype=object)

In [16]:
# Tabulate each feature's p-value and F-score side by side, labelled by feature name
kbest_results = pd.DataFrame(index=X_train_scaled.columns).assign(
    p=kbest.pvalues_,
    f=kbest.scores_,
)
kbest_results

Unnamed: 0,p,f
total_bill,1.30562e-19,111.115028
sex,0.4413546,0.596049
smoker,0.7716981,0.084507
time,0.1411336,2.189558
size,3.669012e-12,57.607134
price_per_person,0.0002851877,13.835642
Fri,0.247716,1.347044
Sat,0.8745567,0.025013
Sun,0.07197303,3.285602
Thur,0.3241839,0.978682


In [17]:
# get_support() returns a boolean mask marking which features were selected
kbest.get_support()

array([ True, False, False, False,  True, False, False, False, False,
       False])

In [18]:
# Apply the mask to the columns of the frame the selector was fit on
# to recover the names of the selected features
X_train_scaled.columns[kbest.get_support()]

Index(['total_bill', 'size'], dtype='object')

In [19]:
# transform() reduces the data to just the selected features,
# but the result is a plain numpy array with no column names
kbest.transform(X_train_scaled)[:5]

array([[0.22867875, 0.2       ],
       [0.44763588, 0.2       ],
       [0.9109589 , 0.4       ],
       [0.31904551, 0.2       ],
       [0.49072028, 0.2       ]])

In [20]:
# So let's turn it into a df
# Names of the columns SelectKBest kept
kb_columns = X_train_scaled.columns[kbest.get_support()]

# Wrap the transformed array back into a DataFrame that carries the selected
# column names and the same row index as the scaled training features
X_train_KBtransformed = pd.DataFrame(
    data=kbest.transform(X_train_scaled),
    index=X_train_scaled.index,
    columns=kb_columns,
)
X_train_KBtransformed.head()

Unnamed: 0,total_bill,size
0,0.228679,0.2
1,0.447636,0.2
2,0.910959,0.4
3,0.319046,0.2
4,0.49072,0.2


**Answer:** the two best features according to SelectKBest are `total_bill` and `size` (party size).

Use recursive feature elimination to select the top 2 features for tip amount. What are they?


In [21]:
# Recursive feature elimination wrapped around a plain linear regression,
# keeping 2 features. fit() returns the fitted selector, so it is
# assigned directly and then displayed.
model = LinearRegression()
rfe = RFE(estimator=model, n_features_to_select=2).fit(X_train_scaled, y_train)
rfe

RFE(estimator=LinearRegression(), n_features_to_select=2)

In [22]:
# Feature ranking from RFE, in the same order as the X_train_scaled columns.
# Selected features are assigned rank 1.
rfe.ranking_


array([1, 3, 9, 6, 7, 1, 4, 5, 2, 8])

In [23]:
# Label each RFE rank with its feature name and show it as a one-column table
pd.Series(
    rfe.ranking_,
    index=X_train_scaled.columns,
    name='rfe_ranking',
).to_frame()

Unnamed: 0,rfe_ranking
total_bill,1
sex,3
smoker,9
time,6
size,7
price_per_person,1
Fri,4
Sat,5
Sun,2
Thur,8


In [24]:
# Boolean mask of the features RFE kept
rfe.get_support()

array([ True, False, False, False, False,  True, False, False, False,
       False])

In [25]:
# Names of the columns RFE kept
rfe_columns = X_train_scaled.columns[rfe.support_]

# Rebuild a labelled DataFrame from the array returned by transform()
X_train_RFEtransformed = pd.DataFrame(
    data=rfe.transform(X_train_scaled),
    columns=rfe_columns,
    index=X_train_scaled.index,
)
X_train_RFEtransformed.head()

Unnamed: 0,total_bill,price_per_person
0,0.228679,0.211628
1,0.447636,0.499419
2,0.910959,0.680233
3,0.319046,0.330814
4,0.49072,0.556395


Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?



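- SelectKBest and RFE pick different top features because they evaluate features differently. SelectKBest scores each feature on its own against the target, while RFE fits a model on all features together and repeatedly removes the weakest one, so it accounts for what each feature contributes alongside the others. For example, `size` has the second-highest F-score on its own, yet RFE ranks it 7th. Yes, the answer changes with the number of features selected: the two methods can disagree for small k, but the selections must overlap more as k grows, and they are identical when k equals the total number of features.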

# 2
Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [26]:
# Call the reusable helper from the evaluate module; the result should match
# the manual SelectKBest selection made earlier (total_bill, size)
selected_features = eva.select_kbest(X_train_scaled, y_train, k=2)
selected_features



Index(['total_bill', 'size'], dtype='object')

# 3

Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [27]:
# Call the reusable RFE helper from the evaluate module; the result should
# match the manual RFE selection made earlier (total_bill, price_per_person).
# NOTE: this rebinds selected_features, replacing the SelectKBest result.
selected_features = eva.rfe(X_train_scaled, y_train, k=2)
selected_features

Index(['total_bill', 'price_per_person'], dtype='object')

# 4
Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [28]:
# Load the 'swiss' dataset through pydataset's data() helper.
# NOTE: this rebinds df, replacing the tips frame used in the earlier cells.
df = data('swiss')
df.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [29]:
# Check column dtypes before modeling
df.dtypes

Fertility           float64
Agriculture         float64
Examination           int64
Education             int64
Catholic            float64
Infant.Mortality    float64
dtype: object

In [30]:
# Split the swiss data into train, validate and test sets.
# NOTE: train / validate / test are rebound here and no longer refer to the tips splits.
train, validate, test = p.split_data(df)
train.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Rive Droite,44.7,46.6,16,29,50.43,18.2
Aubonne,66.9,67.5,14,7,2.27,19.1
Rolle,60.5,60.8,16,10,7.72,16.3
Lavaux,65.1,73.0,19,9,2.84,20.0
Nyone,56.6,50.9,22,12,15.14,16.7


In [32]:
# Scale the swiss features and separate the target for each split.
# The helper returns six values. The third must be unpacked into its own name:
# unpacking it into X_validate_scaled a second time overwrites the validate
# features and leaves X_test_scaled holding stale data (or undefined).
# NOTE(review): return order is assumed to be train / validate / test for both
# X and y -- confirm against prepare.scaled_df_swiss.
X_train_scaled, X_validate_scaled, X_test_scaled, y_train, y_validate, y_test = p.scaled_df_swiss(train, validate, test)
X_train_scaled.head()

Unnamed: 0,Agriculture,Examination,Education,Catholic,Infant.Mortality
0,0.47439,0.40625,0.903226,0.492786,0.316327
1,0.729268,0.34375,0.193548,0.0,0.408163
2,0.647561,0.40625,0.290323,0.055766,0.122449
3,0.796341,0.5,0.258065,0.005832,0.5
4,0.526829,0.59375,0.354839,0.131689,0.163265


In [33]:
# Top 3 predictors of the swiss target according to the SelectKBest helper
selected_features = eva.select_kbest(X_train_scaled, y_train, k=3)
selected_features

Index(['Examination', 'Catholic', 'Infant.Mortality'], dtype='object')

In [34]:
# Top 3 predictors of the swiss target according to the RFE helper.
# NOTE: this rebinds selected_features, replacing the SelectKBest result.
selected_features = eva.rfe(X_train_scaled, y_train, k=3)
selected_features

Index(['Examination', 'Education', 'Infant.Mortality'], dtype='object')