In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
import evaluate
from acquire import *
import math
from scipy import stats

import matplotlib.pyplot as plt

from statsmodels.formula.api import ols

from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE, SelectKBest, f_regression
from math import sqrt

from scipy import stats

warnings.filterwarnings('ignore')

In [2]:
from pydataset import data

tips = data('tips')
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
tips['tip_percentage'] = tips.total_bill / tips.tip
tips['price_per_person'] = tips.total_bill / tips['size']
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,16.821782,8.495
2,10.34,1.66,Male,No,Sun,Dinner,3,6.228916,3.446667
3,21.01,3.5,Male,No,Sun,Dinner,3,6.002857,7.003333
4,23.68,3.31,Male,No,Sun,Dinner,2,7.154079,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,6.811634,6.1475


In [4]:
# Create a replacement key to 
replacement_key = dict()
revert_key = dict()
for col in tips.columns:
    if tips[col].dtype == 'object':
        temp_dict = {n : key for n, key in enumerate(tips[col].unique())}
        revert_key[col] = temp_dict
        replacement_key[col] = {v: k for k, v in temp_dict.items()}
        
tips = tips.replace(replacement_key)
tips.head()        

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage,price_per_person
1,16.99,1.01,0,0,0,0,2,16.821782,8.495
2,10.34,1.66,1,0,0,0,3,6.228916,3.446667
3,21.01,3.5,1,0,0,0,3,6.002857,7.003333
4,23.68,3.31,1,0,0,0,2,7.154079,11.84
5,24.59,3.61,0,0,0,0,4,6.811634,6.1475


In [5]:
target = 'tip'
train, validate, test = train_validate_test_split(tips, target)

x_cols = [col for col in train.columns if col not in [target, 'tip_percentage']]

x_train = train[[*x_cols]]
y_train = train[[target]]
x_validate = validate[[*x_cols]]
y_validate = validate[[target]]
x_test = test[[*x_cols]]
y_test =test[[target]]

x_train.head()

Unnamed: 0,total_bill,sex,smoker,day,time,size,price_per_person
19,16.97,0,0,0,0,3,5.656667
173,7.25,1,1,0,0,2,3.625
119,12.43,0,0,2,1,2,6.215
29,21.7,1,0,1,0,2,10.85
238,32.83,1,1,1,0,2,16.415


In [6]:
def show_features_ranking(X, rfe):
    var_ranks = rfe.ranking_
    var_names = X.columns.tolist()
    return pd.DataFrame({'feature': var_names, 'rank': var_ranks}).sort_values(by='rank')

In [7]:
def select_kbest(X, y, k):
    kbest = SelectKBest(f_regression, k=k)
    kbest.fit(X, y)
    return X.columns[(kbest.get_support())].to_list()

In [10]:
def select_rft(X_train, y_train, k):
    pass

In [8]:
select_kbest(x_train, y_train, 2)

['total_bill', 'size']

In [9]:
lr = LinearRegression()
rfe = RFE(lr, 2)
rfe.fit(x_train, y_train)

show_features_ranking(x_train, rfe)

Unnamed: 0,feature,rank
0,total_bill,1
1,sex,1
6,price_per_person,2
5,size,3
4,time,4
2,smoker,5
3,day,6
