In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import math

from pydataset import data
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector

from wrangle import *

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = clean_data()
df.head()

Unnamed: 0,cms_certification_number_(ccn),city,state,county_name,cms_region,condition,number_of_instances,footnote
0,13025,BIRMINGHAM,AL,Jefferson,4,All other conditions,346,0.0
1,13025,BIRMINGHAM,AL,Jefferson,4,Brain disease or condition (non-traumatic),150,0.0
2,13025,BIRMINGHAM,AL,Jefferson,4,Brain injury (traumatic),38,0.0
3,13025,BIRMINGHAM,AL,Jefferson,4,Hip or femur fracture,156,0.0
4,13025,BIRMINGHAM,AL,Jefferson,4,"Hip or knee replacement, amputation or other bone or joint condition",329,0.0


In [3]:
train, validate, test = wrangle(df)

data has been split
(5931, 69) (2543, 69) (2119, 69)


In [4]:
train

Unnamed: 0,cms_certification_number_(ccn),city,state,county_name,cms_region,condition,number_of_instances,footnote,state_ak,state_al,...,state_wy,condition_all_other_conditions,condition_brain_disease_or_condition_(non_traumatic),condition_brain_injury_(traumatic),condition_hip_or_femur_fracture,"condition_hip_or_knee_replacement,_amputation_or_other_bone_or_joint_condition",condition_nervous_system_disorder_(excluding_stroke),condition_spinal_cord_disease_or_condition_(non_traumatic),condition_spinal_cord_injury_(traumatic),condition_stroke
6737,36T068,SYLVANIA,OH,Lucas,5,Nervous system disorder (excluding stroke),10,1.0,0,0,...,0,0,0,0,0,0,1,0,0,0
8915,45T184,HOUSTON,TX,Harris,6,Nervous system disorder (excluding stroke),29,0.0,0,0,...,0,0,0,0,0,0,1,0,0,0
86,01T033,BIRMINGHAM,AL,Jefferson,4,Nervous system disorder (excluding stroke),12,0.0,0,1,...,0,0,0,0,0,0,1,0,0,0
1216,05T438,PASADENA,CA,Los Angeles,9,Brain disease or condition (non-traumatic),10,1.0,0,0,...,0,0,1,0,0,0,0,0,0,0
8870,45T119,EDINBURG,TX,Hidalgo,6,Nervous system disorder (excluding stroke),41,0.0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5382,293032,HENDERSON,NV,Clark,9,All other conditions,154,0.0,0,0,...,0,1,0,0,0,0,0,0,0,0
5102,26T040,SPRINGFIELD,MO,Greene,7,Stroke,47,0.0,0,0,...,0,0,0,0,0,0,0,0,0,1
6441,35T006,MINOT,ND,Ward,8,Spinal cord disease or condition (non-traumatic),10,1.0,0,0,...,0,0,0,0,0,0,0,1,0,0
8921,45T193,HOUSTON,TX,Harris,6,Brain injury (traumatic),10,1.0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [5]:
train.columns

Index(['cms_certification_number_(ccn)', 'city', 'state', 'county_name',
       'cms_region', 'condition', 'number_of_instances', 'footnote',
       'state_ak', 'state_al', 'state_ar', 'state_az', 'state_ca', 'state_co',
       'state_ct', 'state_dc', 'state_de', 'state_fl', 'state_ga', 'state_hi',
       'state_ia', 'state_id', 'state_il', 'state_in', 'state_ks', 'state_ky',
       'state_la', 'state_ma', 'state_md', 'state_me', 'state_mi', 'state_mn',
       'state_mo', 'state_ms', 'state_mt', 'state_nc', 'state_nd', 'state_ne',
       'state_nh', 'state_nj', 'state_nm', 'state_nv', 'state_ny', 'state_oh',
       'state_ok', 'state_or', 'state_pa', 'state_pr', 'state_ri', 'state_sc',
       'state_sd', 'state_tn', 'state_tx', 'state_ut', 'state_va', 'state_vt',
       'state_wa', 'state_wi', 'state_wv', 'state_wy',
       'condition_all_other_conditions',
       'condition_brain_disease_or_condition_(non_traumatic)',
       'condition_brain_injury_(traumatic)', 'condition_hip_or_femu

In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5931 entries, 6737 to 1233
Data columns (total 69 columns):
 #   Column                                                                          Non-Null Count  Dtype 
---  ------                                                                          --------------  ----- 
 0   cms_certification_number_(ccn)                                                  5931 non-null   object
 1   city                                                                            5931 non-null   object
 2   state                                                                           5931 non-null   object
 3   county_name                                                                     5931 non-null   object
 4   cms_region                                                                      5931 non-null   object
 5   condition                                                                       5931 non-null   object
 6   number_of_instances  

Remove all object columns to be able to fit them in the model

In [7]:
train.drop(columns=['cms_certification_number_(ccn)', 'city', 'state','county_name', 'condition', 'footnote', 'cms_region'], inplace=True)
validate.drop(columns=['cms_certification_number_(ccn)', 'city', 'state','county_name', 'condition', 'footnote', 'cms_region'], inplace=True)
test.drop(columns=['cms_certification_number_(ccn)', 'city', 'state','county_name', 'condition', 'footnote', 'cms_region'], inplace=True)

Split train into X and y

In [8]:
X_train = train.drop(columns=['number_of_instances'])
y_train = train.number_of_instances

X_validate = validate.drop(columns=['number_of_instances'])
y_validate = validate.number_of_instances

X_test = test.drop(columns=['number_of_instances'])
y_test = test.number_of_instances

Select K best

In [10]:
f_selector = SelectKBest(f_regression, k=8)

# find the top 8 X's correlated with y
f_selector.fit(X_train, y_train)

# boolean mask of whether the column was selected or not. 
feature_mask = f_selector.get_support()

# get list of top K features. 
f_feature = X_train.iloc[:,feature_mask].columns.tolist()

In [11]:
f_feature

['state_fl',
 'state_ma',
 'state_nj',
 'condition_all_other_conditions',
 'condition_brain_injury_(traumatic)',
 'condition_spinal_cord_disease_or_condition_(non_traumatic)',
 'condition_spinal_cord_injury_(traumatic)',
 'condition_stroke']

Recursive Feature Elimination

In [15]:
# initialize the ML algorithm
lm = LinearRegression()

# create the rfe object, indicating the ML object (lm) and the number of features I want to end up with. 
rfe = RFE(lm, n_features_to_select=4)

# fit the data using RFE
rfe.fit(X_train,y_train)  

# get the mask of the columns selected
feature_mask = rfe.support_

# get list of the column names. 
rfe_feature = X_train.iloc[:,feature_mask].columns.tolist()

In [16]:
rfe_feature

['condition_all_other_conditions',
 'condition_hip_or_knee_replacement,_amputation_or_other_bone_or_joint_condition',
 'condition_nervous_system_disorder_(excluding_stroke)',
 'condition_stroke']

In [9]:
# Fit a linear regresion model (ordinary least squares) and compute yhat, predictions of tip using total_bill

model = LinearRegression().fit(df[['total_bill']], df.tip)
predictions = model.predict(df[['total_bill']])
df['yhat'] = predictions

KeyError: "None of [Index(['total_bill'], dtype='object')] are in the [columns]"