In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the "tips" example dataset through seaborn's dataset loader.
df = sns.load_dataset('tips')

In [3]:
# Derived columns: the tip as a percentage of the bill, and the bill divided by party size.
df['tip_percentage'] = df['tip'].div(df['total_bill']).mul(100)
df['price_per_person'] = df['total_bill'].div(df['size'])
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage,price_per_person
0,16.99,1.01,Female,No,Sun,Dinner,2,5.944673,8.495
1,10.34,1.66,Male,No,Sun,Dinner,3,16.054159,3.446667
2,21.01,3.5,Male,No,Sun,Dinner,3,16.658734,7.003333
3,23.68,3.31,Male,No,Sun,Dinner,2,13.978041,11.84
4,24.59,3.61,Female,No,Sun,Dinner,4,14.680765,6.1475


# Initial Thoughts

- I feel like tip percentage, price per person, total bill, and party size will be the most important features

- So far, when someone eats there, whether they are male or female, and whether or not they are a smoker don't seem to make a huge difference

In [4]:
from sklearn.model_selection import train_test_split

# Two-stage split: 20% of the rows are held out as test, then 30% of the
# remainder becomes validate. random_state pins both shuffles.
train_validate, test = train_test_split(df, test_size = .2, random_state = 123)
train, validate = train_test_split(train_validate, test_size = .3, random_state = 123)
train.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage,price_per_person
18,16.97,3.5,Female,No,Sun,Dinner,3,20.624632,5.656667
172,7.25,5.15,Male,Yes,Sun,Dinner,2,71.034483,3.625
118,12.43,1.8,Female,No,Thur,Lunch,2,14.481094,6.215
28,21.7,4.3,Male,No,Sat,Dinner,2,19.815668,10.85
237,32.83,1.17,Male,Yes,Sat,Dinner,2,3.563814,16.415


In [5]:
# Features: drop the target (tip) and the non-numeric columns from every split.
# NOTE(review): tip_percentage stays in the features although it is computed
# from tip, the target — confirm that this leakage is intended.
x_train, x_validate, x_test = (
    part.drop(columns=['tip', 'sex', 'smoker', 'day', 'time'])
    for part in (train, validate, test)
)

# Targets are kept as single-column DataFrames.
y_train, y_validate, y_test = (part[['tip']] for part in (train, validate, test))

In [6]:
from sklearn.preprocessing import MinMaxScaler

# Fit the min-max scaler on the training features only, then apply that same
# fitted scaler to every split.
scaler = MinMaxScaler(copy=True)
scaler.fit(x_train)

x_train_scaled, x_validate_scaled, x_test_scaled = (
    scaler.transform(part) for part in (x_train, x_validate, x_test)
)

In [7]:
# Re-wrap the scaled arrays as DataFrames. Column labels and the row index of
# each split are handed straight to the constructor rather than being attached
# afterwards through a set_index() round trip.
x_train_scaled = pd.DataFrame(x_train_scaled, columns=x_train.columns, index=x_train.index)
x_validate_scaled = pd.DataFrame(x_validate_scaled, columns=x_validate.columns, index=x_validate.index)
x_test_scaled = pd.DataFrame(x_test_scaled, columns=x_test.columns, index=x_test.index)

In [8]:
from sklearn.feature_selection import SelectKBest, f_regression

In [9]:
# Univariate selector configured to keep the 2 features with the best f_regression score.
f_selector = SelectKBest(f_regression, k=2)

In [10]:
# Fit on the scaled training features; y_train.tip supplies the target as a Series
# rather than the single-column DataFrame held in y_train.
f_selector = f_selector.fit(x_train_scaled, y_train.tip)

In [11]:
# Apply the fitted selector, then show the feature-matrix shape before and after selection.
x_train_reduced = f_selector.transform(x_train_scaled)
print(x_train.shape, x_train_reduced.shape, sep='\n')

(136, 4)
(136, 2)


In [12]:
# Boolean mask over the columns of x_train_scaled: True where the selector kept the feature.
f_support = f_selector.get_support()
f_support

array([ True,  True, False, False])

In [13]:
# Names of the columns flagged True in the SelectKBest mask.
f_feature = x_train_scaled.columns[f_support].tolist()
f_feature

['total_bill', 'size']

In [14]:
# Restrict the scaled training frame to the columns chosen by SelectKBest.
x_reduced_scaled = x_train_scaled.iloc[:, f_support]
x_reduced_scaled.head()

Unnamed: 0,total_bill,size
18,0.307114,0.4
172,0.092355,0.2
118,0.206805,0.2
28,0.411622,0.2
237,0.657534,0.2


In [15]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

In [16]:
# Linear model handed to RFE in the next cell as its estimator.
lm = LinearRegression()

In [17]:
# Recursive feature elimination around the linear model, keeping 2 features.
# n_features_to_select is passed by keyword: current scikit-learn releases
# no longer accept it positionally.
rfe = RFE(lm, n_features_to_select=2)



In [18]:
# Fit RFE on the scaled training features and keep the reduced feature array.
# y_train is a single-column DataFrame here.
x_rfe = rfe.fit_transform(x_train_scaled, y_train)

In [19]:
# Boolean mask of the columns RFE retained.
mask = rfe.support_

In [20]:
# Scaled training frame limited to the RFE-selected columns.
x_reduced_scaled_rfe = x_train_scaled.iloc[:, mask]

In [21]:
# NOTE(review): this indexes with f_support (the SelectKBest mask), so it shows
# the SelectKBest columns again. The RFE selection lives in x_reduced_scaled_rfe
# (built from `mask` in the previous cell) and is never displayed — confirm
# which frame was meant to be shown here.
x_reduced_scaled = x_train_scaled.iloc[:, f_support]
x_reduced_scaled.head()

Unnamed: 0,total_bill,size
18,0.307114,0.4
172,0.092355,0.2
118,0.206805,0.2
28,0.411622,0.2
237,0.657534,0.2


# Takeaways
    - KBest returned the best features based on the statistical algorithms it runs in the background, so, based on the math, it returns the ones that it thinks are the best.
    
    - RFE can potentially return different features because it fits mini models in the background; by using these mini models, it can return a more accurate set of features.

In [22]:
def select_kbest(dfx, dfy, k):
    """Return the k columns of dfx that score best under SelectKBest(f_regression).

    The features are min-max scaled first, with the scaler fit on dfx itself,
    so the returned frame holds the scaled values rather than the originals.

    Parameters
    ----------
    dfx : pandas.DataFrame
        The x dataframe to assess, e.g. x_train or x_validate.
    dfy : array-like
        The y data to assess, e.g. y_train or y_validate. A Series or a
        single-column DataFrame are both accepted.
    k : int
        The number of best features to return (passed to SelectKBest as k).

    Returns
    -------
    pandas.DataFrame
        The scaled version of dfx restricted to the selected columns,
        labelled with dfx's index.
    """
    scaler = MinMaxScaler(copy=True).fit(dfx)
    dfx_scaled = pd.DataFrame(scaler.transform(dfx), columns=dfx.columns, index=dfx.index)

    # f_regression wants a 1-D target. Flatten a single-column frame/array so
    # scikit-learn does not emit its column-vector warning; anything else is
    # passed through untouched.
    target = np.asarray(dfy)
    if target.ndim == 2 and target.shape[1] == 1:
        target = target.ravel()

    # k must be given by keyword: current scikit-learn releases no longer
    # accept it positionally.
    f_selector = SelectKBest(f_regression, k=k).fit(dfx_scaled, target)
    return dfx_scaled.iloc[:, f_selector.get_support()]

In [23]:
# Re-target the exercise: predict tip_percentage instead of tip.
# NOTE(review): this rebinds x_train / y_train, so the earlier cells that used
# them now refer to different data on a partial re-run. The features also
# still contain tip and total_bill, from which tip_percentage was computed —
# confirm that is intended.
x_train = train.drop(columns= ['tip_percentage', 'sex', 'smoker', 'day', 'time'])
y_train = train[['tip_percentage']]
select_kbest(x_train, y_train, 2)

  return f(**kwargs)


Unnamed: 0,tip,price_per_person
18,0.31250,0.150344
172,0.51875,0.032258
118,0.10000,0.182796
28,0.41250,0.452194
237,0.02125,0.775647
...,...,...
233,0.05875,0.134554
6,0.12500,0.076431
7,0.26500,0.212148
115,0.31250,0.324615


In [24]:
def rfe(dfx, dfy, k):
    """Return the k columns of dfx kept by RFE wrapped around a LinearRegression.

    The features are min-max scaled first, with the scaler fit on dfx itself,
    so the returned frame holds the scaled values rather than the originals.

    Parameters
    ----------
    dfx : pandas.DataFrame
        The x dataframe to assess, e.g. x_train or x_validate.
    dfy : array-like
        The y data to assess, e.g. y_train or y_validate.
    k : int
        The number of features to keep (passed to RFE as n_features_to_select).

    Returns
    -------
    pandas.DataFrame
        The scaled version of dfx restricted to the selected columns,
        labelled with dfx's index.
    """
    scaler = MinMaxScaler(copy=True).fit(dfx)
    dfx_scaled = pd.DataFrame(scaler.transform(dfx), columns=dfx.columns, index=dfx.index)

    # The local is called `selector` so it does not shadow this function's own
    # name. n_features_to_select must be given by keyword: current scikit-learn
    # releases no longer accept it positionally.
    selector = RFE(LinearRegression(), n_features_to_select=k)
    selector.fit(dfx_scaled, dfy)
    return dfx_scaled.iloc[:, selector.support_]

In [25]:
# Run the RFE helper on the tip_percentage features/target from cell 23.
# NOTE(review): `rfe` now names the helper function; the RFE object bound to
# the same name in cell 17 has been replaced, so cells 18-19 cannot be re-run
# after this point without re-running cell 17 first.
rfe(x_train, y_train, 2)



Unnamed: 0,total_bill,tip
18,0.307114,0.31250
172,0.092355,0.51875
118,0.206805,0.10000
28,0.411622,0.41250
237,0.657534,0.02125
...,...,...
233,0.170128,0.05875
6,0.125939,0.12500
7,0.526072,0.26500
115,0.314627,0.31250


In [26]:
from pydataset import data

In [27]:
# Load the 'swiss' dataset from pydataset.
# NOTE(review): this rebinds df, replacing the tips frame used by every cell above.
df = data('swiss')

In [28]:
# Fertility is the target; every other column is a candidate feature.
x = df.drop(columns= ['Fertility'])
y = df[['Fertility']]
select_kbest(x, y, 2).head()

  return f(**kwargs)


Unnamed: 0,Examination,Education
Courtelary,0.352941,0.211538
Delemont,0.088235,0.153846
Franches-Mnt,0.058824,0.076923
Moutier,0.264706,0.115385
Neuveville,0.411765,0.269231


In [29]:
# Same features/target through the RFE helper, to compare against the SelectKBest pick above.
rfe(x, y, 2).head()



Unnamed: 0,Education,Infant.Mortality
Courtelary,0.211538,0.721519
Delemont,0.153846,0.721519
Franches-Mnt,0.076923,0.594937
Moutier,0.115385,0.601266
Neuveville,0.269231,0.620253
