In [1]:
import pandas as pd
import numpy as np
from pydataset import data

from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.linear_model import LinearRegression

import split_scale

In [2]:
df = data('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
df['tip_percentage'] = df.tip / df.total_bill
df['price_per_person'] = df.total_bill / df['size']
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,0.059447,8.495
2,10.34,1.66,Male,No,Sun,Dinner,3,0.160542,3.446667
3,21.01,3.5,Male,No,Sun,Dinner,3,0.166587,7.003333
4,23.68,3.31,Male,No,Sun,Dinner,2,0.13978,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,0.146808,6.1475


# D - tip amount

In [4]:
X = df[['total_bill', 'size', 'tip_percentage', 'price_per_person']]
y = df['tip']
X_train, X_test, y_train, y_test = (
                    split_scale.split_my_data_twice(X, y, .8))
X_train.head()

Unnamed: 0,total_bill,size,tip_percentage,price_per_person
229,13.28,2,0.204819,6.64
209,24.27,2,0.083642,12.135
97,27.28,2,0.146628,13.64
168,31.71,4,0.141911,7.9275
85,15.98,2,0.127034,7.99


I predict total bill will be the best predictor of tip amount, and same for the tip percentage

In [5]:
selector = SelectKBest(f_regression, k = 2).fit(X_train, y_train)
X_kbest = selector.transform(X_train)
support = selector.get_support()
X_train.columns[support].tolist()

['total_bill', 'size']

In [6]:
rfe = RFE(LinearRegression(), 2)
X_rfe = rfe.fit_transform(X_train, y_train)
mask = rfe.support_
X_train.columns[mask].tolist()

['total_bill', 'tip_percentage']

# E - tip percentage

In [7]:
X = df[['total_bill', 'size', 'tip', 'price_per_person']]
y = df['tip_percentage']
X_train, X_test, y_train, y_test = (
                    split_scale.split_my_data_twice(X, y, .8))
X_train.head()

Unnamed: 0,total_bill,size,tip,price_per_person
229,13.28,2,2.72,6.64
209,24.27,2,2.03,12.135
97,27.28,2,4.0,13.64
168,31.71,4,4.5,7.9275
85,15.98,2,2.03,7.99


In [8]:
selector = SelectKBest(f_regression, k = 2)
selector.fit(X_train, y_train)
X_kbest = selector.transform(X_train)
support = selector.get_support()
X_train.loc[:, support]

Unnamed: 0,tip,price_per_person
229,2.72,6.640000
209,2.03,12.135000
97,4.00,13.640000
168,4.50,7.927500
85,2.03,7.990000
...,...,...
107,4.06,10.245000
15,3.02,7.415000
93,1.00,2.875000
180,3.55,17.315000


In [9]:
rfe = RFE(LinearRegression(), 2)
X_rfe = rfe.fit_transform(X_train, y_train)
mask = rfe.support_
X_train.loc[:, mask].columns.tolist()

['total_bill', 'tip']

RFE goes more in depth because it constantly reevaluates its' data as it continues, so it has more accurate data

In [10]:
X = df[['total_bill', 'size', 'tip_percentage', 'price_per_person']]
y = df['tip']
X_train, X_test, y_train, y_test = (
                    split_scale.split_my_data_twice(X, y, .8))
X_train.head()

Unnamed: 0,total_bill,size,tip_percentage,price_per_person
229,13.28,2,0.204819,6.64
209,24.27,2,0.083642,12.135
97,27.28,2,0.146628,13.64
168,31.71,4,0.141911,7.9275
85,15.98,2,0.127034,7.99


In [11]:
def select_kbest(X, y, k):
    selector = SelectKBest(f_regression, k).fit(X, y)
    X_kbest = selector.transform(X)
    support = selector.get_support()
    best_features = X.columns[support]
    return X_kbest, best_features

In [12]:
def select_rfe(X, y, k):
    rfe = RFE(LinearRegression(), k)
    X_rfe = rfe.fit_transform(X, y)
    mask = rfe.support_
    best_features = X.columns[mask]
    return X_rfe, best_features

In [13]:
X_kbest, k_features = select_kbest(X_train, y_train, 2)
k_features

Index(['total_bill', 'size'], dtype='object')

In [14]:
X_rfe, rfe_features = select_rfe(X_train, y_train, 2)
rfe_features

Index(['total_bill', 'tip_percentage'], dtype='object')

In [15]:
df2 = data('swiss')
df2.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [28]:
X2 = df2.drop(columns='Fertility')
y2 = df2['Fertility']
X2_train, X2_test, y2_train, y2_test = (
                    split_scale.split_my_data_twice(X2, y2, .8))
X2_train.head()

Unnamed: 0,Agriculture,Examination,Education,Catholic,Infant.Mortality
Gruyere,53.3,12,7,97.67,21.0
Moutier,36.5,12,7,33.77,20.3
Broye,70.2,16,7,92.85,23.6
Le Locle,16.7,22,13,11.22,18.9
Martigwy,78.2,12,6,98.96,19.4


In [29]:
X2_kbest, k2_features = select_kbest(X2_train, y2_train, 3)
k2_features

Index(['Examination', 'Education', 'Infant.Mortality'], dtype='object')

In [18]:
X2_rfe, rfe2_features = select_rfe(X2_train, y2_train, 3)
rfe2_features

Index(['Examination', 'Education', 'Infant.Mortality'], dtype='object')

In [19]:
rfe = RFE(LinearRegression(), 3)
X_rfe = rfe.fit_transform(X2_train, y2_train)
mask = rfe.support_
best_features = X2_train.loc[:, mask].columns.tolist()
best_features

['Examination', 'Education', 'Infant.Mortality']

In [20]:
var_ranks = rfe.ranking_
var_ranks

array([2, 1, 1, 3, 1])