# Imports

In [1]:
# Basics
import pandas as pd
import numpy as np
from pydataset import data

# Visuals
import matplotlib.pyplot as plt
import seaborn as sns

# Metric Tools
import sklearn.linear_model
import sklearn.feature_selection
import sklearn.preprocessing

# Custom
import wrangle as wrg

# Exercise 1.1

- Load the tips dataset. Create a column named price_per_person. This should be the total bill divided by the party size.

In [2]:
df = data('tips')
df.head(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3


In [3]:
# Rename `size` to `number_of_people`: `.size` is already a DataFrame
# attribute, so the column cannot be reached with dot access (df.size).

df = df.rename(columns={'size':'number_of_people'})

In [4]:
# Per-person share of the bill, rounded to 2 decimal places

df['price_per_person'] = df['total_bill'].div(df['number_of_people']).round(2)
df.head(2)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,number_of_people,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,8.49
2,10.34,1.66,Male,No,Sun,Dinner,3,3.45


In [5]:
# A bit more data prep: one-hot encode the categorical columns.
# A single get_dummies call with `columns=` encodes every listed column,
# prefixes each dummy with its source column name, drops the original
# columns, and appends the dummies in list order -- so there is no need to
# concat and drop inside a loop.
# drop_first=True removes the first level of each column (k-1 dummies).

cat_columns = ['sex', 'day', 'time', 'smoker']

df = pd.get_dummies(df, columns=cat_columns, drop_first=True)
    

In [6]:
df.head(2)

Unnamed: 0,total_bill,tip,number_of_people,price_per_person,sex_Male,day_Sat,day_Sun,day_Thur,time_Lunch,smoker_Yes
1,16.99,1.01,2,8.49,0,0,1,0,0,0
2,10.34,1.66,3,3.45,1,0,1,0,0,0


In [7]:
# For good measure, let's split the data as well.
# NOTE(review): split_data comes from the local `wrangle` module, which is
# not shown here -- presumably it fixes a random seed and prints the shape
# of each split; confirm before relying on reproducibility.

train, validate, test = wrg.split_data(df)

train <> (136, 10)
validate <> (59, 10)
test <> (49, 10)


In [8]:
# Variables for constructing models.
# The target for the tips data is `tip`. It is removed from every predictor
# frame and used as the y for the matching split, so the target never leaks
# into the features and all three splits share the same columns.
target = 'tip'

x_train = train.drop(columns=[target])
y_train = train[target]

x_validate = validate.drop(columns=[target])
y_validate = validate[target]

x_test = test.drop(columns=[target])
y_test = test[target]

x_train.head(1)

Unnamed: 0,total_bill,number_of_people,price_per_person,sex_Male,day_Sat,day_Sun,day_Thur,time_Lunch,smoker_Yes
195,16.58,2,8.29,1,0,0,1,1,1


In [9]:
# Scale the predictors with a min-max scaler.
# The scaler is fit on the training split only; validate and test are then
# transformed with the parameters learned from train.
scaler = sklearn.preprocessing.MinMaxScaler().fit(x_train)

x_train_scaled, x_validate_scaled, x_test_scaled = (
    scaler.transform(split) for split in (x_train, x_validate, x_test)
)

In [10]:
x_train_scaled

array([[0.24034621, 0.2       , 0.37465374, ..., 1.        , 1.        ,
        1.        ],
       [0.60097648, 0.2       , 0.93767313, ..., 0.        , 0.        ,
        1.        ],
       [0.40523746, 0.6       , 0.21606648, ..., 0.        , 0.        ,
        1.        ],
       ...,
       [0.65490457, 0.6       , 0.41135734, ..., 0.        , 0.        ,
        0.        ],
       [0.5337328 , 1.        , 0.14473684, ..., 1.        , 1.        ,
        0.        ],
       [0.37727474, 0.2       , 0.58864266, ..., 0.        , 0.        ,
        0.        ]])

# Exercise 1.2

- Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount? 

#### I would guess the total bill and number of people would be the largest predictors

---------------------------------------------------

# Exercise 1.3

- Use select k best to select the top 2 features for predicting tip amount. What are they?

In [11]:
# Top 2 features for tip via SelectKBest, scored with f_regression

# Build the selector and fit it on the scaled training data in one step
kbest = sklearn.feature_selection.SelectKBest(
    score_func=sklearn.feature_selection.f_regression,
    k=2,
).fit(x_train_scaled, y_train)

# Map the boolean support mask back onto the unscaled frame's column names
kbest_features = x_train.columns[kbest.get_support()].tolist()

print('The two best features to build our model on will be', kbest_features)

The two best features to build our model on will be ['total_bill', 'number_of_people']


# Exercise 1.4

- Use recursive feature elimination to select the top 2 features for tip amount. What are they?

In [12]:
# Top 2 features for tip via recursive feature elimination (RFE)

# Wrap a plain linear regression in RFE and fit on the scaled training data
lm = sklearn.linear_model.LinearRegression()
rfe = sklearn.feature_selection.RFE(
    estimator=lm,
    n_features_to_select=2,
).fit(x_train_scaled, y_train)

# support_ is a boolean mask over the columns; use it to recover the names
rfe_columns = x_train.columns[rfe.support_].tolist()

print('According to RFE, the two best features to build our model on will be', rfe_columns)

According to RFE, the two best features to build our model on will be ['total_bill', 'price_per_person']


# Exercise 1.5

- Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?

-------------------------------------------

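#### The two methods rank features differently: SelectKBest scores each feature on its own against the target, while RFE fits a model on all of the features and repeatedly drops the weakest one, so features are judged together. The selected features do change slightly as the number of features to select changes.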

In [13]:
# [WITH 3 FEATURES] Tips features via RFE
n_features = 3

# Make it
lm = sklearn.linear_model.LinearRegression()
rfe = sklearn.feature_selection.RFE(lm, n_features_to_select=n_features)

# Fit it
rfe.fit(x_train_scaled, y_train)

# Use it
rfe_columns = x_train.columns[rfe.support_].tolist()

# Report the real feature count rather than the copy-pasted "two"
print(f'According to RFE, the {n_features} best features to build our model on will be', rfe_columns)

According to RFE, the two best features to build our model on will be ['total_bill', 'number_of_people', 'price_per_person']


In [14]:
# [WITH 4 FEATURES] Tips features via RFE
n_features = 4

# Make it
lm = sklearn.linear_model.LinearRegression()
rfe = sklearn.feature_selection.RFE(lm, n_features_to_select=n_features)

# Fit it
rfe.fit(x_train_scaled, y_train)

# Use it
rfe_columns = x_train.columns[rfe.support_].tolist()

# Report the real feature count rather than the copy-pasted "two"
print(f'According to RFE, the {n_features} best features to build our model on will be', rfe_columns)

According to RFE, the two best features to build our model on will be ['total_bill', 'number_of_people', 'price_per_person', 'day_Sun']


# Exercise 2

- Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [15]:
# Build the function

def select_kbest(x_scaled, x, y, k):
    '''
    Return the names of the k features kept by SelectKBest (f_regression).

    x_scaled : predictors the selector is fit on
    x        : object whose .columns name the features of x_scaled, in the
               same order; only the column names are used
    y        : target values
    k        : number of features to select
    '''
    score_func = sklearn.feature_selection.f_regression
    selector = sklearn.feature_selection.SelectKBest(score_func=score_func, k=k).fit(x_scaled, y)

    # Turn the boolean support mask into a plain list of column names
    keep = selector.get_support()
    return x.columns[keep].tolist()

select_kbest(x_train_scaled, x_train, y_train, 2)

['total_bill', 'number_of_people']

# Exercise 3

- Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [16]:
# Build the function

def rfe(x_scaled, x, y, k):
    '''
    Return the names of the k features kept by RFE around a linear regression.

    x_scaled : predictors the selector is fit on
    x        : object whose .columns name the features of x_scaled, in the
               same order; only the column names are used
    y        : target values
    k        : number of features to select
    '''
    estimator = sklearn.linear_model.LinearRegression()
    selector = sklearn.feature_selection.RFE(estimator, n_features_to_select=k).fit(x_scaled, y)

    # support_ is a boolean mask over the columns; index with it to get names
    return x.columns[selector.support_].tolist()

In [17]:
rfe(x_train_scaled, x_train, y_train, 3)

['total_bill', 'number_of_people', 'price_per_person']

In [18]:
def rfe_feature_rankings(x_scaled, x, y, k):
    '''
    Fit RFE around a linear regression and return every feature with its rank.

    x_scaled : predictors the selector is fit on
    x        : object whose .columns name the features of x_scaled, in the
               same order; only the column names are used
    y        : target values
    k        : number of features RFE should keep

    Returns a DataFrame with columns 'Var' (feature name) and 'Rank'
    (RFE ranking_), sorted by rank ascending.
    '''
    # Make it
    lm = sklearn.linear_model.LinearRegression()
    rfe = sklearn.feature_selection.RFE(lm, n_features_to_select=k)

    # Fit it
    rfe.fit(x_scaled, y)

    # Take the names from the `x` argument, not the notebook-level x_train,
    # so the function works for any dataset passed in.
    ranks = pd.DataFrame({'Var': x.columns.tolist(), 'Rank': rfe.ranking_})
    return ranks.sort_values(by="Rank", ascending=True)

In [19]:
rfe_feature_rankings(x_train_scaled, x_train, y_train, 2)

Unnamed: 0,Var,Rank
0,total_bill,1
2,price_per_person,1
1,number_of_people,2
5,day_Sun,3
4,day_Sat,4
7,time_Lunch,5
8,smoker_Yes,6
6,day_Thur,7
3,sex_Male,8


# Exercise 4

- Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [22]:
df = data('swiss')
df.head(10)

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6
Porrentruy,76.1,35.3,9,7,90.57,26.6
Broye,83.8,70.2,16,7,92.85,23.6
Glane,92.4,67.8,14,8,97.16,24.9
Gruyere,82.4,53.3,12,7,97.67,21.0
Sarine,82.9,45.2,16,13,91.38,24.4


In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 47 entries, Courtelary to Rive Gauche
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Fertility         47 non-null     float64
 1   Agriculture       47 non-null     float64
 2   Examination       47 non-null     int64  
 3   Education         47 non-null     int64  
 4   Catholic          47 non-null     float64
 5   Infant.Mortality  47 non-null     float64
dtypes: float64(4), int64(2)
memory usage: 2.6+ KB


In [None]:
train, validate, test = wrg.split_data(df)