In [40]:
import numpy as np
import os
import seaborn as sns
import scipy.stats as stat
from scipy.stats import pearsonr
from scipy.stats import pointbiserialr
import pandas as pd
import matplotlib.pyplot as plt
from pydataset import data
import warnings
warnings.filterwarnings("ignore")
import wrangle as wra
import env
import explore as exp
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest, f_classif

### 1. Load the tips dataset

In [2]:
# Load the "tips" sample dataset through pydataset's data() helper and
# preview the first rows.
tips_df = data('tips')
tips_df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


* a. Create a column named price_per_person. This should be the total bill divided by the party size.


In [3]:
# Per-person cost: the total bill divided by the party size.
tips_df['price_per_person'] = tips_df['total_bill'] / tips_df['size']

In [10]:
# One-hot encode each categorical column; the prefix yields names such as
# "sex_Male", which are lowercased below to "sex_male".
sex_dummies = pd.get_dummies(tips_df['sex'], prefix='sex')
smoker_dummies = pd.get_dummies(tips_df['smoker'], prefix='smoker')
day_dummies = pd.get_dummies(tips_df['day'], prefix='day')
time_dummies = pd.get_dummies(tips_df['time'], prefix='time')

# Convert the column names of the dummy variables to lowercase
dummy_frames = [sex_dummies, smoker_dummies, day_dummies, time_dummies]
for frame in dummy_frames:
    frame.columns = frame.columns.str.lower()

# Concatenate the dummy variables with the original DataFrame.
# Any indicator columns left over from a previous run of this cell are dropped
# first, so re-running the cell does not append a second copy of every
# indicator column to tips_df.
dummy_cols = [col for frame in dummy_frames for col in frame.columns]
tips_df = pd.concat(
    [tips_df.drop(columns=dummy_cols, errors='ignore'), *dummy_frames],
    axis=1,
)


In [11]:
# Split the encoded frame into train / validate / test using the project-local
# wrangle helper.
# NOTE(review): split_data's proportions and random seed are not visible from
# this notebook -- confirm it is seeded so the split is reproducible.
train, validate, test = wra.split_data(tips_df)

In [14]:
# Single source of truth for the predictor columns and the target, so the
# three splits cannot drift apart.
# NOTE(review): the exercise text asks for the best predictors of the *tip*
# amount, but this setup uses total_bill as the target and keeps tip among the
# features. It is left unchanged here so the recorded results below still
# match; to follow the exercise, swap 'tip' and 'total_bill' and re-run.
FEATURE_COLS = [
    'tip', 'sex_male', 'smoker_yes',
    'day_fri', 'day_sat', 'day_sun', 'day_thur',
    'time_dinner', 'size', 'price_per_person',
]
TARGET_COL = 'total_bill'

X_train, y_train = train[FEATURE_COLS], train[TARGET_COL]
X_validate, y_validate = validate[FEATURE_COLS], validate[TARGET_COL]
X_test, y_test = test[FEATURE_COLS], test[TARGET_COL]

In [15]:
train.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,sex_Female,sex_Male,...,sex_female,sex_male,smoker_no,smoker_yes,day_fri,day_sat,day_sun,day_thur,time_dinner,time_lunch
225,13.42,1.58,Male,Yes,Fri,Lunch,2,6.71,0,1,...,0,1,0,1,1,0,0,0,0,1
182,23.33,5.65,Male,Yes,Sun,Dinner,2,11.665,0,1,...,0,1,0,1,0,0,1,0,1,0
103,44.3,2.5,Female,Yes,Sat,Dinner,3,14.766667,1,0,...,1,0,0,1,0,1,0,0,1,0
165,17.51,3.0,Female,Yes,Sun,Dinner,2,8.755,1,0,...,1,0,0,1,0,0,1,0,1,0
74,25.28,5.0,Female,Yes,Sat,Dinner,2,12.64,1,0,...,1,0,0,1,0,1,0,0,1,0


* b. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?

    -- total_bill, tip, size

* c. Use select k best to select the top 2 features for predicting tip amount. What are they?

In [16]:
# MAKE the thing: a univariate selector that keeps the 2 features with the
# highest f_regression scores
kbest = SelectKBest(f_regression, k=2)

# FIT the thing: score every column of X_train against y_train
kbest.fit(X_train, y_train)

In [17]:
# statistical f-value / feature's scores:
kbest.scores_

array([111.1150282 ,   3.74425143,   0.41180105,   4.30613732,
         1.36931763,   1.38682491,   1.4136872 ,   4.16732867,
        92.20000781, 100.79420314])

In [18]:
# p value: 
kbest.pvalues_

array([1.30561953e-19, 5.49495680e-02, 5.22076950e-01, 3.97543703e-02,
       2.43862187e-01, 2.40884666e-01, 2.36402231e-01, 4.30366766e-02,
       3.52935999e-17, 2.62393956e-18])

In [19]:
kbest.feature_names_in_

array(['tip', 'sex_male', 'smoker_yes', 'day_fri', 'day_sat', 'day_sun',
       'day_thur', 'time_dinner', 'size', 'price_per_person'],
      dtype=object)

In [20]:
# Tabulate each feature's p-value and F-score, labelled by feature name.
kbest_results = pd.DataFrame(
    {'p': kbest.pvalues_, 'f': kbest.scores_},
    index=X_train.columns,
)

In [21]:
kbest_results

Unnamed: 0,p,f
tip,1.30562e-19,111.115028
sex_male,0.05494957,3.744251
smoker_yes,0.5220769,0.411801
day_fri,0.03975437,4.306137
day_sat,0.2438622,1.369318
day_sun,0.2408847,1.386825
day_thur,0.2364022,1.413687
time_dinner,0.04303668,4.167329
size,3.52936e-17,92.200008
price_per_person,2.62394e-18,100.794203


### Takeaways:
  * tip and price_per_person have the highest feature scores (scored against `total_bill`, the target used in the X/y split above)
  * They also have the smallest p-values!

In [22]:
# get_support() will output a boolean mask to tell me which features were selected
kbest.get_support()

array([ True, False, False, False, False, False, False, False, False,
        True])

In [23]:
# we can apply this mask to the columns in our original dataframe
X_train.columns[kbest.get_support()]

Index(['tip', 'price_per_person'], dtype='object')

In [24]:
# kbest transform will convert our information to the selected feature subspace
# its just a numpy array
kbest.transform(X_train)[:5]

array([[ 1.58      ,  6.71      ],
       [ 5.65      , 11.665     ],
       [ 2.5       , 14.76666667],
       [ 3.        ,  8.755     ],
       [ 5.        , 12.64      ]])

In [25]:
# So let's turn it into a df: keep the original row index and label the
# columns with the names of the features the selector kept
kbest_mask = kbest.get_support()
X_train_KBtransformed = pd.DataFrame(
    data=kbest.transform(X_train),
    index=X_train.index,
    columns=X_train.columns[kbest_mask],
)

In [27]:
X_train_KBtransformed.head()

Unnamed: 0,tip,price_per_person
225,1.58,6.71
182,5.65,11.665
103,2.5,14.766667
165,3.0,8.755
74,5.0,12.64


* d. Use recursive feature elimination to select the top 2 features for tip amount. What are they?

In [29]:
# make a model object to use in RFE process.
# The model is here to give us metrics on feature importance and model score
# allowing us to recursively reduce the number of features to reach our desired space

model = LinearRegression()

In [30]:
# MAKE the thing: RFE wrapped around the linear model, keeping 2 features
# NOTE: a later cell in this notebook defines a function that is also named
# `rfe`; once that cell runs, this name no longer refers to the fitted
# selector, so re-run this cell before using the attribute cells below.
rfe = RFE(model, n_features_to_select=2)

# FIT the thing
rfe.fit(X_train, y_train)

In [31]:
# Get feature ranking
# Selected features are assigned a rank 1

rfe.ranking_

array([3, 8, 4, 9, 2, 5, 6, 7, 1, 1])

In [32]:
# Dataframe of rankings
pd.DataFrame(
{'rfe_ranking':rfe.ranking_},
index=X_train.columns)

Unnamed: 0,rfe_ranking
tip,3
sex_male,8
smoker_yes,4
day_fri,9
day_sat,2
day_sun,5
day_thur,6
time_dinner,7
size,1
price_per_person,1


In [33]:
rfe.get_support()

array([False, False, False, False, False, False, False, False,  True,
        True])

In [34]:
# Rebuild a labelled DataFrame from RFE's transformed array, using the
# support mask to recover the names of the kept columns.
rfe_selected_cols = X_train.columns[rfe.support_]
X_train_RFEtransformed = pd.DataFrame(
    data=rfe.transform(X_train),
    columns=rfe_selected_cols,
    index=X_train.index,
)

In [36]:
X_train_RFEtransformed.head()

Unnamed: 0,size,price_per_person
225,2.0,6.71
182,2.0,11.665
103,3.0,14.766667
165,2.0,8.755
74,2.0,12.64


* e. Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?
    * SelectKBest scores each feature on its own: with `f_regression` it ranks features by the strength of their individual linear relationship with the target and ignores every other feature. RFE instead fits a model on all of the features together and repeatedly drops the feature with the weakest coefficient, so its ranking reflects how much each feature contributes alongside the others (and, for a linear regression on unscaled data, the units of each feature). A feature can therefore score well by itself yet add little once a correlated feature is already in the model, which is why the two methods disagree here (SelectKBest keeps `tip`, RFE keeps `size`). Yes, this changes with the number of features selected: with a small k there is the most room for the two methods to pick different features, while as k approaches the total number of features (10 here) the selections necessarily overlap more, and at k = 10 both methods return every feature.

### 2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [42]:
def select_kbest(X, y, k):
    """
    Return the names of the k features chosen by SelectKBest with f_regression.

    Parameters:
    X (pandas.DataFrame): The predictors; must expose ``.columns`` because the
        selected feature names are read from it
    y (array-like): The target variable
    k (int): The number of features to select

    Returns:
    list: The names of the k selected features, in X's column order
    """
    # Score every predictor against the target and keep the k best
    fitted_selector = SelectKBest(f_regression, k=k).fit(X, y)

    # Boolean mask over X's columns: True where the feature was kept
    keep_mask = fitted_selector.get_support()

    return list(X.columns[keep_mask])

In [43]:
# Call the select_kbest function
top_k_features = select_kbest(X_train, y_train, k=2)

# Print the names of the top k selected features
print(top_k_features)

['tip', 'price_per_person']


### 3. Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually

In [44]:
def rfe(X, y, k):
    """
    Return the names of the k features chosen by recursive feature elimination
    around a LinearRegression estimator.

    Note: defining this function rebinds the notebook-level name ``rfe``, which
    earlier cells use for a fitted RFE object.

    Parameters:
    X (pandas.DataFrame): The predictors; must expose ``.columns`` because the
        selected feature names are read from it
    y (array-like): The target variable
    k (int): The number of features to select

    Returns:
    list: The names of the k selected features, in X's column order
    """
    # Wrap a fresh linear model in RFE and eliminate down to k features
    fitted_selector = RFE(LinearRegression(), n_features_to_select=k).fit(X, y)

    # Boolean mask over X's columns: True where the feature survived
    keep_mask = fitted_selector.get_support()

    return list(X.columns[keep_mask])


In [45]:
rfe(X_train, y_train, k=2)

['size', 'price_per_person']

### 4. Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).