In [1]:
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
import pandas as pd
from scipy import stats
from pydataset import data
import numpy as np
import env
import matplotlib.pyplot as plt
import os
import prepare
import wrangle
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
import explore
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
from math import sqrt
from sklearn.feature_selection import SelectKBest, f_regression, RFE

In [2]:
def get_auto_mpg(path='auto-mpg.data'):
    '''Acquire, clean, and return the auto-mpg dataset.

    Parameters
    ----------
    path : str or path-like, default 'auto-mpg.data'
        Location of the fixed-width data file. The default keeps the
        original behavior of reading 'auto-mpg.data' from the working
        directory.

    Returns
    -------
    pandas.DataFrame
        One row per car with the columns mpg, cylinders, displ, horsepower,
        weight, acc, model_year, origin and name. Rows whose horsepower is
        the placeholder '?' are removed and horsepower is cast to float.
        The original row index is kept (it is not reset after filtering).
    '''
    column_names = ['mpg', 'cylinders', 'displ', 'horsepower', 'weight', 'acc',
                    'model_year', 'origin', 'name']

    # The file has no header row, so the names are attached after parsing.
    df = pd.read_fwf(path, header=None)
    df.columns = column_names

    # '?' marks a missing horsepower value. Take an explicit copy of the
    # filtered rows so the column assignment below writes to an independent
    # frame instead of a slice of the raw one (the SettingWithCopy pattern).
    df = df[df['horsepower'] != '?'].copy()

    df['horsepower'] = df['horsepower'].astype('float')

    return df

In [3]:
# Load the cleaned auto-mpg frame and preview its first five rows.
df = get_auto_mpg()
df.head(n=5)

Unnamed: 0,mpg,cylinders,displ,horsepower,weight,acc,model_year,origin,name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,"""chevrolet chevelle malibu"""
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,"""buick skylark 320"""
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,"""plymouth satellite"""
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,"""amc rebel sst"""
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,"""ford torino"""


In [4]:
# Unpack the three frames returned by the project helper.
# NOTE(review): presumably a train / validation / test split; the split
# proportions and any random seed are defined in wrangle.train_val_test - confirm there.
train, val, test = wrangle.train_val_test(df)

In [5]:
# Columns to rescale; listed once so the fit and the write-back stay in sync.
scaled_cols = ['displ', 'horsepower', 'weight', 'acc']

# Fit the min-max scaler on the training rows and transform them in one step.
mms = MinMaxScaler()
scaled_values = mms.fit_transform(train[scaled_cols])

# Overwrite the raw columns in `train` with their scaled values.
train[scaled_cols] = scaled_values
train.head()

Unnamed: 0,mpg,cylinders,displ,horsepower,weight,acc,model_year,origin,name
216,31.5,4,0.077519,0.122905,0.11828,0.625,77,3,"""honda accord cvcc"""
348,37.7,4,0.054264,0.089385,0.119773,0.553571,81,3,"""toyota tercel"""
232,16.0,8,0.731266,0.575419,0.80227,0.386905,77,1,"""ford thunderbird"""
201,18.5,6,0.470284,0.357542,0.596177,0.488095,76,1,"""pontiac ventura sj"""
102,26.0,4,0.074935,0.0,0.089904,0.77381,73,2,"""volkswagen super beetle"""


## SelectKBest


In [6]:
# Predictors: the four columns that were min-max scaled in `train`.
x_train_scaled = train.loc[:, ['displ', 'horsepower', 'weight', 'acc']]
# Target: fuel economy.
y_train = train.loc[:, 'mpg']

In [7]:
# Keep the two features with the highest univariate F-regression score.
f_selector = SelectKBest(score_func=f_regression, k=2)

In [8]:
# Score every scaled predictor against mpg.
f_selector.fit(X=x_train_scaled, y=y_train)

In [9]:
# Boolean mask over the input columns: True where the selector kept the feature.
f_select_mask = f_selector.get_support(indices=False)

In [10]:
# Names of the columns kept by SelectKBest (mask applied to the column index).
x_train_scaled.columns[f_select_mask]

Index(['displ', 'weight'], dtype='object')

## Recursive Feature Elimination (RFE)

In [17]:
# Predictor frame for RFE: everything in `train` except the target (mpg),
# model_year and the free-text car name.
x_train = train.drop(columns=['mpg', 'model_year', 'name'])

In [21]:
# One-hot encode the categorical predictors (cylinders, origin).
# The frame is rebuilt from `train` rather than re-assigned from x_train
# itself, so the cell is idempotent: re-running it no longer raises a
# KeyError because 'cylinders' / 'origin' were already encoded away.
x_train = pd.get_dummies(
    train.drop(columns=['mpg', 'model_year', 'name']),
    columns=['cylinders', 'origin'],
)

In [22]:
# Linear model used as the estimator whose coefficients drive the elimination.
lm = LinearRegression()

# Recursively drop features until seven remain.
rfe = RFE(estimator=lm, n_features_to_select=7)

In [23]:
# Run the elimination on the encoded predictors against mpg.
rfe.fit(X=x_train, y=y_train)

In [24]:
# RFE rank per feature (1 = selected), in the same order as x_train's columns.
ranks = rfe.ranking_
columns = list(x_train.columns)

In [27]:
# Pair each feature name with its RFE rank for display.
feature_ranks = pd.DataFrame(
    {
        'ranking': ranks,
        'feature': columns,
    }
)

In [28]:
# Show the features ordered from best (rank 1) to worst.
feature_ranks.sort_values(by='ranking')

Unnamed: 0,ranking,feature
0,1,displ
1,1,horsepower
2,1,weight
4,1,cylinders_3
6,1,cylinders_5
7,1,cylinders_6
11,1,origin_3
3,2,acc
5,3,cylinders_4
8,4,cylinders_8
