In [1]:
import numpy as np
import pandas as pd
import wrangle
from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector
from pydataset import data
from sklearn.linear_model import LinearRegression

In [2]:
df = data('tips')

In [3]:
df['price_per_person'] = df.total_bill / df['size']

dummy_df = pd.get_dummies(df[['sex', 'smoker', 'day', 'time']])
df = pd.concat([df,dummy_df], axis=1)
df = df.drop(columns =['sex', 'smoker', 'day', 'time'])

In [4]:
train, validate, test = wrangle.split_data(df, df.total_bill)
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 146 entries, 225 to 167
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   total_bill        146 non-null    float64
 1   tip               146 non-null    float64
 2   size              146 non-null    int64  
 3   price_per_person  146 non-null    float64
 4   sex_Female        146 non-null    uint8  
 5   sex_Male          146 non-null    uint8  
 6   smoker_No         146 non-null    uint8  
 7   smoker_Yes        146 non-null    uint8  
 8   day_Fri           146 non-null    uint8  
 9   day_Sat           146 non-null    uint8  
 10  day_Sun           146 non-null    uint8  
 11  day_Thur          146 non-null    uint8  
 12  time_Dinner       146 non-null    uint8  
 13  time_Lunch        146 non-null    uint8  
dtypes: float64(3), int64(1), uint8(10)
memory usage: 7.1 KB


### Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?

- tip
- price per person
- size

# Use select k best to select the top 2 features for predicting tip amount. What are they?

In [5]:
X_train, y_train = train.iloc[:], train.tip
X_train = X_train.drop(columns = 'tip')
kbest = SelectKBest(f_regression, k=2)

kbest.fit(X_train, y_train)

In [6]:
kbest.feature_names_in_

array(['total_bill', 'size', 'price_per_person', 'sex_Female', 'sex_Male',
       'smoker_No', 'smoker_Yes', 'day_Fri', 'day_Sat', 'day_Sun',
       'day_Thur', 'time_Dinner', 'time_Lunch'], dtype=object)

In [7]:
kbest.pvalues_

array([1.30561953e-19, 3.66901154e-12, 2.85709015e-04, 4.41354619e-01,
       4.41354619e-01, 7.71698118e-01, 7.71698118e-01, 2.47716010e-01,
       8.74556685e-01, 7.19730319e-02, 3.24183852e-01, 1.41133637e-01,
       1.41133637e-01])

In [8]:
kbest.scores_

array([1.11115028e+02, 5.76071336e+01, 1.38318779e+01, 5.96049171e-01,
       5.96049171e-01, 8.45071808e-02, 8.45071808e-02, 1.34704421e+00,
       2.50128222e-02, 3.28560161e+00, 9.78681779e-01, 2.18955757e+00,
       2.18955757e+00])

In [9]:
kbest_results = pd.DataFrame(
                dict(p=kbest.pvalues_, f=kbest.scores_),
                index = X_train.columns)

In [10]:
kbest_results

Unnamed: 0,p,f
total_bill,1.30562e-19,111.115028
size,3.669012e-12,57.607134
price_per_person,0.000285709,13.831878
sex_Female,0.4413546,0.596049
sex_Male,0.4413546,0.596049
smoker_No,0.7716981,0.084507
smoker_Yes,0.7716981,0.084507
day_Fri,0.247716,1.347044
day_Sat,0.8745567,0.025013
day_Sun,0.07197303,3.285602


In [11]:
X_train.columns[kbest.get_support()]

Index(['total_bill', 'size'], dtype='object')

### Tip and price per person

# Use recursive feature elimination to select the top 2 features for tip amount. What are they?

In [12]:
model = LinearRegression()

In [13]:
rfe = RFE(model, n_features_to_select=2)

In [14]:
rfe.fit(X_train, y_train)

In [15]:
the_df = pd.DataFrame(
{'rfe_ranking':rfe.ranking_},
index=X_train.columns)

the_df[the_df['rfe_ranking'] == 1]

Unnamed: 0,rfe_ranking
day_Fri,1
day_Sun,1


## size and price per person

# Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?

Because kbest compares 1x to 1y
and rfe compares all the x's to 1 y
there is different math behind the scenes becasuse kbest uses statistical tests and 
rfe uses a logistic regression model

# Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [16]:
def select_kbest(X_train, y_train, the_k):
    kbest = SelectKBest(f_regression, k=the_k)
    kbest.fit(X_train, y_train)
    return X_train.columns[kbest.get_support()]

In [17]:
select_kbest(X_train, y_train, the_k= 2)

Index(['total_bill', 'size'], dtype='object')

# Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [18]:
def rfe(X_train, y_train, the_k):
    model = LinearRegression()
    rfe = RFE(model, n_features_to_select=the_k)
    rfe.fit(X_train, y_train)
    the_df = pd.DataFrame(
    {'rfe_ranking':rfe.ranking_},
    index=X_train.columns)
    return the_df[the_df['rfe_ranking'] == 1]

In [19]:
rfe(X_train, y_train, 2)

Unnamed: 0,rfe_ranking
day_Fri,1
day_Sun,1


# Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [20]:
df = data('swiss')

In [21]:
df.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [22]:
train, validate, test = wrangle.split_data(df, df.Fertility)

In [23]:
X_train, y_train = train.iloc[:], train.Fertility
X_train = X_train.drop(columns = 'Fertility')
X_train.shape

(27, 5)

In [24]:
select_kbest(X_train, y_train, 3)

Index(['Examination', 'Catholic', 'Infant.Mortality'], dtype='object')

In [25]:
rfe(X_train, y_train, 3)

Unnamed: 0,rfe_ranking
Examination,1
Education,1
Infant.Mortality,1


In [26]:
## changing column names back to the origonal names

X_train_array, X_validate, X_test = wrangle.scaler_robust(X_train, X_train, X_train)
the_df = pd.DataFrame(X_train_array)

columns_list = list(X_train.columns) 
for i in range(len(the_df.columns)):
    the_df = the_df.rename({i: columns_list[i]}, axis='columns')

In [27]:
select_kbest(X_train, y_train, 3)

Index(['Examination', 'Catholic', 'Infant.Mortality'], dtype='object')

In [28]:
rfe(X_train, y_train, 3)

Unnamed: 0,rfe_ranking
Examination,1
Education,1
Infant.Mortality,1


In [29]:
X_train_array, X_validate, X_test = wrangle.standard_scaler(X_train, X_validate, X_test)
the_df = pd.DataFrame(X_train_array)
columns_list = list(X_train.columns) 
for i in range(len(the_df.columns)):
    the_df = the_df.rename({i: columns_list[i]}, axis='columns')

In [30]:
select_kbest(X_train, y_train, 3)

Index(['Examination', 'Catholic', 'Infant.Mortality'], dtype='object')

In [31]:
rfe(X_train, y_train, 3)

Unnamed: 0,rfe_ranking
Examination,1
Education,1
Infant.Mortality,1


In [32]:
X_train_array, X_validate, X_test = wrangle.scaler_min_max(X_train, X_validate, X_test)
the_df = pd.DataFrame(X_train_array)
columns_list = list(X_train.columns) 
for i in range(len(the_df.columns)):
    the_df = the_df.rename({i: columns_list[i]}, axis='columns')

In [33]:
select_kbest(X_train, y_train, 3)

Index(['Examination', 'Catholic', 'Infant.Mortality'], dtype='object')

In [34]:
rfe(X_train, y_train, 3)

Unnamed: 0,rfe_ranking
Examination,1
Education,1
Infant.Mortality,1
