## Modeling Heart Disease

#### Import libraries

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn_pandas import DataFrameMapper

In [67]:
data = pd.read_csv('../heart_clean.csv')

### Split data into training/test sets so test data does not influence z-score normalization

In [97]:
X_train, X_test, y_train, y_test = train_test_split(data.drop(columns='target'), data.target,
                                                    test_size=0.3, stratify=data.target, random_state=3)

### Normalize continuous data (z-score) and one hot encode categorical

In [98]:
categorical = ['cp', 'restecg', 'slope', 'ca', 'thal']
binary_cat = ['sex', 'fbs', 'exang'] ## 'target' is omitted
numerical = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

In [99]:
SS = preprocessing.StandardScaler()
OH = preprocessing.OneHotEncoder(categories = 'auto', sparse=False)
mapper = DataFrameMapper([([b], None) for b in binary_cat] + [([c], OH) for c in categorical] +
                         [([n], SS) for n in numerical], df_out=True)

In [100]:
print(X_train.shape, X_test.shape)

(210, 13) (91, 13)


In [101]:
X_train = mapper.fit_transform(X_train.astype('float'))
X_test = mapper.transform(X_test.astype('float'))

In [102]:
X_train.columns

Index(['sex', 'fbs', 'exang', 'cp_x0_0.0', 'cp_x0_1.0', 'cp_x0_2.0',
       'cp_x0_3.0', 'restecg_x0_0.0', 'restecg_x0_1.0', 'restecg_x0_2.0',
       'slope_x0_0.0', 'slope_x0_1.0', 'slope_x0_2.0', 'ca_x0_0.0',
       'ca_x0_1.0', 'ca_x0_2.0', 'ca_x0_3.0', 'ca_x0_4.0', 'thal_x0_0.0',
       'thal_x0_1.0', 'thal_x0_2.0', 'thal_x0_3.0', 'age', 'trestbps', 'chol',
       'thalach', 'oldpeak'],
      dtype='object')

In [103]:
X_test.columns

Index(['sex', 'fbs', 'exang', 'cp_x0_0.0', 'cp_x0_1.0', 'cp_x0_2.0',
       'cp_x0_3.0', 'restecg_x0_0.0', 'restecg_x0_1.0', 'restecg_x0_2.0',
       'restecg_x0_3.0', 'slope_x0_0.0', 'slope_x0_1.0', 'slope_x0_2.0',
       'slope_x0_3.0', 'ca_x0_0.0', 'ca_x0_1.0', 'ca_x0_2.0', 'ca_x0_3.0',
       'thal_x0_0.0', 'thal_x0_1.0', 'thal_x0_2.0', 'thal_x0_3.0', 'age',
       'trestbps', 'chol', 'thalach', 'oldpeak'],
      dtype='object')

In [104]:
mapper.fit_transform?

In [54]:
from sklearn.svm import SVC

In [63]:
clf = SVC()
clf.fit(X = X_train, y = y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [64]:
SVC()

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [66]:
clf.predict(X_test)

ValueError: X.shape[1] = 28 should be equal to 27, the number of features at training time