## 1. Load the tips dataset.

   a. Create a column named price_per_person. This should be the total bill divided by the party size.
   
   b. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?
   
          answer: party size, dinner or lunch
   
   c. Use select k best to select the top 2 features for predicting tip amount. What are they?
   
          answer: size and total bill
   
   d. Use recursive feature elimination to select the top 2 features for tip amount. What are they?
         
         answer: time and size.
    
   e. Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?
   
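    answer: Because SelectKBest scores each column in isolation, by the strength of its individual linear relationship with the target variable, whereas recursive feature elimination repeatedly fits the model (here, linear regression) on all remaining features and drops the feature that is least important to that model. The selected features did change when I removed the size and time attributes from the candidate features (see "Modifying features" below).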

In [11]:
import pandas as pd 
import env as env
import wrangle as w
import explore as e
from pydataset import data
# scaling and modeling
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# feature engineering
from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector

In [2]:
df=data('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 244 entries, 1 to 244
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 15.2+ KB


In [4]:
# Average spend per guest, rounded to two decimals. Bracket access is needed for
# `size` because `df.size` is the DataFrame attribute (element count), not the column.
df['price_per_person'] = (df['total_bill'] / df['size']).round(2)
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,8.49
2,10.34,1.66,Male,No,Sun,Dinner,3,3.45
3,21.01,3.5,Male,No,Sun,Dinner,3,7.0
4,23.68,3.31,Male,No,Sun,Dinner,2,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,6.15


In [5]:
df.time.value_counts()

Dinner    176
Lunch      68
Name: time, dtype: int64

In [6]:
df.day.value_counts()

Sat     87
Sun     76
Thur    62
Fri     19
Name: day, dtype: int64

In [7]:
# Two-level categoricals -> 0/1 indicator columns (the original text columns are kept).
df['sex_encoded'] = df.sex.map({'Female': 1, 'Male': 0})
df['smoker_encoded'] = df.smoker.map({'Yes': 1, 'No': 0})
df['time_encoded'] = df.time.map({'Dinner': 1, 'Lunch': 0})

# One-hot encode `day`; drop_first=True leaves Fri as the implicit baseline
# (only day_Sat, day_Sun and day_Thur are produced).
dummy_df = pd.get_dummies(df[['day']], drop_first=True)

# Remove dummy columns left behind by an earlier run of this cell before
# concatenating, so re-running the cell does not append duplicate day_* columns.
df = pd.concat(
    [df.drop(columns=dummy_df.columns, errors='ignore'), dummy_df],
    axis=1,
)
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,sex_encoded,smoker_encoded,time_encoded,day_Sat,day_Sun,day_Thur
1,16.99,1.01,Female,No,Sun,Dinner,2,8.49,1,0,1,0,1,0
2,10.34,1.66,Male,No,Sun,Dinner,3,3.45,0,0,1,0,1,0
3,21.01,3.5,Male,No,Sun,Dinner,3,7.0,0,0,1,0,1,0
4,23.68,3.31,Male,No,Sun,Dinner,2,11.84,0,0,1,0,1,0
5,24.59,3.61,Female,No,Sun,Dinner,4,6.15,1,0,1,0,1,0


In [8]:
# Split the encoded frame into three partitions using the project-local wrangle helper.
# NOTE(review): the return order of w.split_data is not visible in this notebook; split
# helpers commonly return (train, validate, test) — confirm that it matches this
# (train, test, validate) unpacking. Also confirm the helper fixes a random seed so
# the split is reproducible.
train,test,validate=w.split_data(df)

In [9]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 146 entries, 225 to 167
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   total_bill        146 non-null    float64
 1   tip               146 non-null    float64
 2   sex               146 non-null    object 
 3   smoker            146 non-null    object 
 4   day               146 non-null    object 
 5   time              146 non-null    object 
 6   size              146 non-null    int64  
 7   price_per_person  146 non-null    float64
 8   sex_encoded       146 non-null    int64  
 9   smoker_encoded    146 non-null    int64  
 10  time_encoded      146 non-null    int64  
 11  day_Sat           146 non-null    uint8  
 12  day_Sun           146 non-null    uint8  
 13  day_Thur          146 non-null    uint8  
dtypes: float64(3), int64(4), object(4), uint8(3)
memory usage: 14.1+ KB


In [10]:
# Candidate predictors, defined once so the three splits can never drift apart.
# price_per_person and the raw text columns are not part of this list.
feature_cols = [
    'total_bill', 'sex_encoded', 'smoker_encoded', 'time_encoded',
    'size', 'day_Sat', 'day_Sun', 'day_Thur',
]

x_train = train[feature_cols]
y_train = train.tip

x_validate = validate[feature_cols]
y_validate = validate.tip

x_test = test[feature_cols]
y_test = test.tip


### SelectKBest, 2 features:

In [14]:
kbest = SelectKBest(f_regression, k=2)
kbest.fit(x_train,y_train)


SelectKBest(k=2, score_func=<function f_regression at 0x7fcef08d3670>)

In [15]:
kbest.scores_

array([1.11115028e+02, 5.96049171e-01, 8.45071808e-02, 2.18955757e+00,
       5.76071336e+01, 2.50128222e-02, 3.28560161e+00, 9.78681779e-01])

In [16]:
kbest.pvalues_

array([1.30561953e-19, 4.41354619e-01, 7.71698118e-01, 1.41133637e-01,
       3.66901154e-12, 8.74556685e-01, 7.19730319e-02, 3.24183852e-01])

In [17]:
kbest.get_support()

array([ True, False, False, False,  True, False, False, False])

In [18]:
x_train.columns[kbest.get_support()]

Index(['total_bill', 'size'], dtype='object')

## RFE, using Linear Regression model

In [19]:
# Fit recursive feature elimination, keeping 2 features, with a plain linear
# regression as the model used to rank them.
# NOTE(review): `rfe` is also the name of the helper function defined later in this
# notebook. Whichever was executed last owns the name, so `rfe.ranking_` in the
# next cells only works while `rfe` is still this fitted RFE object.
# NOTE(review): x_train is passed unscaled (no scaler is applied in the cells above);
# with a linear model the ranking can be influenced by each feature's units.
model = LinearRegression()
rfe = RFE(model, n_features_to_select=2)
rfe.fit(x_train,y_train)

RFE(estimator=LinearRegression(), n_features_to_select=2)

In [20]:
rfe.ranking_

array([5, 2, 6, 1, 1, 3, 7, 4])

In [21]:
# One row per candidate feature, labelled by column name; rank 1 marks a feature RFE kept.
pd.Series(rfe.ranking_, index=x_train.columns, name='rfe_ranking').to_frame()

Unnamed: 0,rfe_ranking
total_bill,5
sex_encoded,2
smoker_encoded,6
time_encoded,1
size,1
day_Sat,3
day_Sun,7
day_Thur,4


### Modifying features:


In [41]:
# Same splits, but with the size and time attributes removed from the predictors.
# NOTE: this rebinds x_train / x_validate / x_test. The cells further down that call
# select_kbest / rfe / sequential_feature_selector on x_train were executed BEFORE this
# cell (see the execution counts) and their saved outputs reflect the full feature set.
# On a top-to-bottom "Restart & Run All" this section must run after those cells (and
# after the function definitions it relies on), otherwise their results will change.
reduced_feature_cols = [
    'total_bill', 'sex_encoded', 'smoker_encoded',
    'day_Sat', 'day_Sun', 'day_Thur',
]

x_train = train[reduced_feature_cols]
y_train = train.tip

x_validate = validate[reduced_feature_cols]
y_validate = validate.tip

x_test = test[reduced_feature_cols]
y_test = test.tip

In [42]:
select_kbest(x_train,y_train,2)

['total_bill', 'day_Sun']

In [43]:
rfe(x_train,y_train,2)

['day_Sat', 'day_Thur']

In [44]:
sequential_feature_selector(x_train,y_train,2)

['total_bill', 'smoker_encoded']

2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [24]:
def select_kbest(X, y, k):
    """
    Return the names of the k columns of X that SelectKBest keeps.

    Every predictor is scored against the target with f_regression and the
    k best-scoring columns are retained.

    Args:
        X  The predictors; must expose `.columns` (e.g. a pandas DataFrame).
        y  The target.
        k  The number of features to select.
    Returns:
        List of str: The names of the top k selected features.
    """
    fitted = SelectKBest(score_func=f_regression, k=k).fit(X, y)

    # Boolean mask aligned with X's columns: True where the column was kept
    keep_mask = fitted.get_support()

    # Walk columns and mask together, keeping only the selected names
    return [name for name, kept in zip(X.columns, keep_mask) if kept]

In [25]:
select_kbest(x_train,y_train,2)

['total_bill', 'size']

3. Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [26]:
def rfe(X, y, k, estimator=None):
    """
    Select the top k features based on the Recursive Feature Elimination (RFE) class.

    Args:
        X          The predictors; must expose `.columns` (e.g. a pandas DataFrame).
        y          The target.
        k          The number of features to select.
        estimator  Optional unfitted model that RFE uses to rank the features.
                   Defaults to a new LinearRegression(), which was previously
                   the hard-coded choice.
    Returns:
        List of str: The names of the top k selected features.

    Note:
        With a linear model, RFE compares features by the size of their fitted
        coefficients, which depends on each feature's units. Put the predictors
        on a common scale before calling if units should not drive the ranking.
    """
    # Build the default inside the call so each invocation gets its own instance
    if estimator is None:
        estimator = LinearRegression()

    # Create the RFE object and fit it to the data
    selector = RFE(estimator, n_features_to_select=k)
    selector.fit(X, y)

    # Positions of the retained columns -> their names
    feature_indices = selector.get_support(indices=True)
    return list(X.columns[feature_indices])


In [27]:
rfe(x_train,y_train,2)

['time_encoded', 'size']

4. Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [29]:
swiss=data('swiss')
swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [30]:
swiss.info()

<class 'pandas.core.frame.DataFrame'>
Index: 47 entries, Courtelary to Rive Gauche
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Fertility         47 non-null     float64
 1   Agriculture       47 non-null     float64
 2   Examination       47 non-null     int64  
 3   Education         47 non-null     int64  
 4   Catholic          47 non-null     float64
 5   Infant.Mortality  47 non-null     float64
dtypes: float64(4), int64(2)
memory usage: 2.6+ KB


In [31]:
# Target is Fertility; every other column of the swiss frame is a predictor.
y_target = swiss['Fertility']
x_features = swiss.drop(columns=['Fertility'])

In [32]:
select_kbest(x_features,y_target,3)

['Examination', 'Education', 'Catholic']

In [34]:
rfe(x_features,y_target,3)

['Examination', 'Education', 'Infant.Mortality']

## Sequential Feature Selector

In [37]:
def sequential_feature_selector(X, y, k, estimator=None, direction='forward',
                                scoring='neg_mean_squared_error', cv=5):
    """
    Select the top k features based on the Sequential Feature Selector class.

    Args:
        X          The predictors; must expose `.columns` (e.g. a pandas DataFrame).
        y          The target.
        k          The number of features to select.
        estimator  Optional unfitted model used to score candidate feature sets.
                   Defaults to a new LinearRegression().
        direction  Passed to SequentialFeatureSelector; defaults to 'forward'.
        scoring    Passed to SequentialFeatureSelector; defaults to
                   'neg_mean_squared_error'.
        cv         Passed to SequentialFeatureSelector; defaults to 5.
    Returns:
        List of str: The names of the top k selected features.

    The defaults reproduce the previously hard-coded configuration, so existing
    three-argument calls behave exactly as before.
    """
    # Build the default inside the call so each invocation gets its own instance
    if estimator is None:
        estimator = LinearRegression()

    # Create the SequentialFeatureSelector object and fit it to the data
    selector = SequentialFeatureSelector(
        estimator,
        n_features_to_select=k,
        direction=direction,
        scoring=scoring,
        cv=cv,
    )
    selector.fit(X, y)

    # Boolean mask of the retained columns -> their names
    feature_mask = selector.get_support()
    return list(X.columns[feature_mask])



In [38]:
# Tips dataset
sequential_feature_selector(x_train,y_train,2)

['total_bill', 'size']

In [39]:
# swiss dataset
sequential_feature_selector(x_features,y_target,3)

['Agriculture', 'Examination', 'Infant.Mortality']