In [1]:
%reload_ext nb_black

<IPython.core.display.Javascript object>

## Day 33 Lecture 2 Assignment

In this assignment, we will learn about nonlinear SVM models. We will use the heart disease dataset loaded below and analyze the models fitted to it.

In [23]:
import warnings

import numpy as np
import pandas as pd

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    make_scorer,
    f1_score,
)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

<IPython.core.display.Javascript object>

In [3]:
# Load the Framingham heart-disease CSV straight from its hosted URL.
heart = pd.read_csv(
    filepath_or_buffer="https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/framingham_heart_disease.csv"
)

<IPython.core.display.Javascript object>

In [4]:
# (rows, columns) of the dataset as loaded, before any cleaning.
heart.shape

(4238, 16)

<IPython.core.display.Javascript object>

In [5]:
# Preview the first few rows to check column names and value encodings.
heart.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


<IPython.core.display.Javascript object>

This dataset helps us predict the probability of coronary heart disease (CHD) in the next 10 years given the risk factors for each subject in the study. Our target variable is `TenYearCHD`.

We'll start off by removing any rows containing missing data.

In [6]:
# answer below:
# Drop every row that has at least one missing value. Note that this rebinds
# `heart`, so the unfiltered frame is only recoverable by re-running the load cell.
heart = heart.dropna(axis="index", how="any")


<IPython.core.display.Javascript object>

In [9]:
# Count how many subjects fall into each `education` level; this column is
# listed in `cat_cols` and one-hot encoded during preprocessing.
heart["education"].value_counts()

1.0    1526
2.0    1101
3.0     606
4.0     423
Name: education, dtype: int64

<IPython.core.display.Javascript object>

In [7]:
# (rows, columns) of `heart` at this point, to see how many rows remain.
heart.shape

(3656, 16)

<IPython.core.display.Javascript object>

Then, we split the data into train and test with 20% of the data in the test subset.

In [8]:
# answer below:
# Separate the predictors from the target. `columns=` is used instead of the
# positional axis argument (`drop('TenYearCHD', 1)`), which pandas deprecated
# and later removed; the keyword form works on old and new pandas alike.
X = heart.drop(columns="TenYearCHD")
y = heart["TenYearCHD"]

# Hold out 20% of the rows for testing. Stratifying on `y` keeps the class
# proportions the same in both subsets; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


<IPython.core.display.Javascript object>

We will then scale the data using the standard scaler. Do this in the cell below.

In [24]:
# Binary flag columns. These are not named in the ColumnTransformer below;
# they reach the model unchanged through `remainder="passthrough"`.
bin_cols = [
    "currentSmoker",
    "prevalentStroke",
    "prevalentHyp",
    "diabetes",
    "male",
]

# Numeric (and ordinal) columns, which get standardized.
num_cols = ["age", "cigsPerDay", "BPMeds"]
num_cols += ["totChol", "sysBP", "diaBP", "BMI"]
num_cols += ["heartRate", "glucose"]

# Categorical columns to one-hot encode, paired position-by-position with the
# category dropped from each one.
cat_cols, drop_cats = ["education"], [1.0]

<IPython.core.display.Javascript object>

In [40]:
# Column-wise preprocessing shared by the SVM pipelines:
#   * "scale"          - standardize the columns in `num_cols`
#   * "one_hot_encode" - one-hot encode `cat_cols`, dropping the categories
#                        listed in `drop_cats`
# Any column not named in either list is passed through untouched.
preprocessing = ColumnTransformer(
    transformers=[
        ("scale", StandardScaler(), num_cols),
        ("one_hot_encode", OneHotEncoder(drop=drop_cats), cat_cols),
    ],
    remainder="passthrough",
)

<IPython.core.display.Javascript object>

Generate a polynomial SVC model and an RBF SVC model. Compare the performance and the runtime of the two models.

In [41]:
# answer below:
# Preprocessing followed by an SVM with a polynomial kernel (all other SVC
# settings left at their defaults), fitted on the training split.
pipeline_poly = Pipeline(
    steps=[
        ("preprocessing", preprocessing),
        ("svm", SVC(kernel="poly")),
    ]
)
pipeline_poly.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessing',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('scale',
                                                  StandardScaler(copy=True,
                                                                 with_mean=True,
                                                                 with_std=True),
                                                  ['age', 'cigsPerDay',
                                                   'BPMeds', 'totChol', 'sysBP',
                                                   'diaBP', 'BMI', 'heartRate',
                                                   'glucose']),
                                                 ('one_hot_encode',
                                                  OneHotEncoder(categories...
             

<IPython.core.display.Javascript object>

In [42]:
# Accuracy of the polynomial-kernel pipeline on the train and test splits.
train_score, test_score = (
    pipeline_poly.score(features, labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

print(f"Train score: {train_score}")
print(f"Test score: {test_score}")

Train score: 0.8683310533515732
Test score: 0.855191256830601


<IPython.core.display.Javascript object>

In [43]:
# answer below:
# Same preprocessing, but with an RBF-kernel SVM (default SVC settings).
# NOTE(review): `preprocessing` is the same object handed to `pipeline_poly`;
# presumably fitting here refits that shared transformer in place - harmless
# while both pipelines train on the same X_train, but worth confirming.
pipeline_rbf = Pipeline(
    steps=[
        ("preprocessing", preprocessing),
        ("svm", SVC(kernel="rbf")),
    ]
)
pipeline_rbf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessing',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('scale',
                                                  StandardScaler(copy=True,
                                                                 with_mean=True,
                                                                 with_std=True),
                                                  ['age', 'cigsPerDay',
                                                   'BPMeds', 'totChol', 'sysBP',
                                                   'diaBP', 'BMI', 'heartRate',
                                                   'glucose']),
                                                 ('one_hot_encode',
                                                  OneHotEncoder(categories...
             

<IPython.core.display.Javascript object>

In [44]:
# Accuracy of the RBF-kernel pipeline on the train and test splits.
train_score, test_score = (
    pipeline_rbf.score(features, labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

print(f"Train score: {train_score}")
print(f"Test score: {test_score}")

Train score: 0.8577291381668947
Test score: 0.8483606557377049


<IPython.core.display.Javascript object>

Which model overfits more? How would you reduce the overfitting?

Look at a classification report and confusion matrix. How does the class balance affect your results?

In [46]:
# Test-set predictions from both fitted pipelines, in (poly, rbf) order.
y_pred_poly, y_pred_rbf = (
    fitted.predict(X_test) for fitted in (pipeline_poly, pipeline_rbf)
)

<IPython.core.display.Javascript object>

In [47]:
# Confusion matrix for the polynomial kernel
# (rows: actual class, columns: predicted class).
confusion_matrix(y_true=y_test, y_pred=y_pred_poly)

array([[615,   5],
       [101,  11]], dtype=int64)

<IPython.core.display.Javascript object>

In [48]:
# Confusion matrix for the RBF kernel
# (rows: actual class, columns: predicted class).
confusion_matrix(y_true=y_test, y_pred=y_pred_rbf)

array([[620,   0],
       [111,   1]], dtype=int64)

<IPython.core.display.Javascript object>

In [49]:
# Per-class precision / recall / F1 for the polynomial-kernel predictions.
print(classification_report(y_true=y_test, y_pred=y_pred_poly))

              precision    recall  f1-score   support

           0       0.86      0.99      0.92       620
           1       0.69      0.10      0.17       112

    accuracy                           0.86       732
   macro avg       0.77      0.55      0.55       732
weighted avg       0.83      0.86      0.81       732



<IPython.core.display.Javascript object>

In [50]:
# Per-class precision / recall / F1 for the RBF-kernel predictions.
print(classification_report(y_true=y_test, y_pred=y_pred_rbf))

              precision    recall  f1-score   support

           0       0.85      1.00      0.92       620
           1       1.00      0.01      0.02       112

    accuracy                           0.85       732
   macro avg       0.92      0.50      0.47       732
weighted avg       0.87      0.85      0.78       732



<IPython.core.display.Javascript object>

In [45]:
# answer below:
# RBF-kernel SVM again, this time with C=0.1 instead of the default, i.e. a
# more strongly regularized model than `pipeline_rbf`.
pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessing),
        ("svm", SVC(kernel="rbf", C=0.1)),
    ]
)
pipeline.fit(X_train, y_train)


Pipeline(memory=None,
         steps=[('preprocessing',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('scale',
                                                  StandardScaler(copy=True,
                                                                 with_mean=True,
                                                                 with_std=True),
                                                  ['age', 'cigsPerDay',
                                                   'BPMeds', 'totChol', 'sysBP',
                                                   'diaBP', 'BMI', 'heartRate',
                                                   'glucose']),
                                                 ('one_hot_encode',
                                                  OneHotEncoder(categories...
             

<IPython.core.display.Javascript object>

In [39]:
# Accuracy of the C=0.1 RBF pipeline on the train and test splits.
# NOTE(review): the saved output of this cell has execution count 39, which is
# lower than the counts of the cells that define `preprocessing` (40) and
# `pipeline` (45) - re-run top to bottom to make sure the numbers are current.
train_score, test_score = (
    pipeline.score(features, labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

print(f"Train score: {train_score}")
print(f"Test score: {test_score}")

Train score: 0.8478112175102599
Test score: 0.8469945355191257


<IPython.core.display.Javascript object>