# SVM


In [1]:
import pandas as pd
import sys
import matplotlib
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d
from tqdm import tqdm
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.colors import ListedColormap
from sklearn.decomposition import PCA

## Preparing data

In [2]:
rawData = pd.read_csv('~/Documents/GaTech/2022 summer/Project/healthcare-dataset-stroke-data.csv')
print("The raw Data:")
print(rawData.head())
print("==============================================================================")
print('In the raw data, the target value and corrsponding number of data points are:')
print(rawData['stroke'].value_counts())

The raw Data:
      id  gender   age  hypertension  heart_disease ever_married  \
0   9046    Male  67.0             0              1          Yes   
1  51676  Female  61.0             0              0          Yes   
2  31112    Male  80.0             0              1          Yes   
3  60182  Female  49.0             0              0          Yes   
4   1665  Female  79.0             1              0          Yes   

       work_type Residence_type  avg_glucose_level   bmi   smoking_status  \
0        Private          Urban             228.69  36.6  formerly smoked   
1  Self-employed          Rural             202.21   NaN     never smoked   
2        Private          Rural             105.92  32.5     never smoked   
3        Private          Urban             171.23  34.4           smokes   
4  Self-employed          Rural             174.12  24.0     never smoked   

   stroke  
0       1  
1       1  
2       1  
3       1  
4       1  
In the raw data, the target value and corr

In [3]:
## Drop Patient ID feature & Target:stroke
processed = rawData.drop(['id'], axis = 1)


## Label encoder:
label_encoder = preprocessing.LabelEncoder()

for feature in processed:
    if (processed[feature].dtype == object):
        processed[feature]= label_encoder.fit_transform(processed[feature])

## fill NaN data in BMI with colum average
meanBMI = processed['bmi'].mean()
processed['bmi'] = processed['bmi'].fillna(value=meanBMI)

#processed = processed0.drop(['stroke'], axis = 1)

print (processed.head())

   gender   age  hypertension  heart_disease  ever_married  work_type  \
0       1  67.0             0              1             1          2   
1       0  61.0             0              0             1          3   
2       1  80.0             0              1             1          2   
3       0  49.0             0              0             1          2   
4       0  79.0             1              0             1          3   

   Residence_type  avg_glucose_level        bmi  smoking_status  stroke  
0               1             228.69  36.600000               1       1  
1               0             202.21  28.893237               2       1  
2               0             105.92  32.500000               2       1  
3               1             171.23  34.400000               3       1  
4               0             174.12  24.000000               2       1  


In [4]:
## Balance the data using SMOTE

from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state = 42)

feature = processed.iloc[:,:-1]
target = processed.iloc[:,-1]

balanced, target_balanced = sm.fit_resample(feature, target)
balanced['stroke'] = target_balanced

#balanced = balanced0.drop(['stroke'], axis = 1)

print("The balanced processed data:")
print(balanced)
print("==============================================================================")
print('In the balanced data, the target value and corrsponding number of data points are:')
print(balanced['stroke'].value_counts())

The balanced processed data:
      gender        age  hypertension  heart_disease  ever_married  work_type  \
0          1  67.000000             0              1             1          2   
1          0  61.000000             0              0             1          3   
2          1  80.000000             0              1             1          2   
3          0  49.000000             0              0             1          2   
4          0  79.000000             1              0             1          3   
...      ...        ...           ...            ...           ...        ...   
9717       0  79.871507             1              0             1          2   
9718       0  70.908861             0              0             1          0   
9719       1  80.000000             0              0             1          2   
9720       0  74.953742             0              0             1          3   
9721       0  60.467509             0              0             1          2   

In [7]:
## Normalized data
sc = StandardScaler()
unbNorm = sc.fit_transform(processed.iloc[:,:-1]) # normalzied unbalanced processed data
balNorm = sc.fit_transform(balanced.iloc[:,:-1]) # normalzied balanced data


In [9]:
## Balanced dataset X and target (with gender)
X_all = balanced.drop('stroke', axis=1)
y = balanced['stroke']
print(X_all)
print(y)

      gender        age  hypertension  heart_disease  ever_married  work_type  \
0          1  67.000000             0              1             1          2   
1          0  61.000000             0              0             1          3   
2          1  80.000000             0              1             1          2   
3          0  49.000000             0              0             1          2   
4          0  79.000000             1              0             1          3   
...      ...        ...           ...            ...           ...        ...   
9717       0  79.871507             1              0             1          2   
9718       0  70.908861             0              0             1          0   
9719       1  80.000000             0              0             1          2   
9720       0  74.953742             0              0             1          3   
9721       0  60.467509             0              0             1          2   

      Residence_type  avg_g

In [12]:
## Balanced dataset X and target (without gender)
X_all = balanced.drop('stroke', axis=1)
X_no_gender = X_all.drop('gender', axis=1)

y = balanced['stroke']
print(X_no_gender)
print(y)

            age  hypertension  heart_disease  ever_married  work_type  \
0     67.000000             0              1             1          2   
1     61.000000             0              0             1          3   
2     80.000000             0              1             1          2   
3     49.000000             0              0             1          2   
4     79.000000             1              0             1          3   
...         ...           ...            ...           ...        ...   
9717  79.871507             1              0             1          2   
9718  70.908861             0              0             1          0   
9719  80.000000             0              0             1          2   
9720  74.953742             0              0             1          3   
9721  60.467509             0              0             1          2   

      Residence_type  avg_glucose_level        bmi  smoking_status  
0                  1         228.690000  36.600000    

In [13]:
## Train Test Split

### Balanced dataset X and target (with gender)
from sklearn.model_selection import train_test_split
X_all_train, X_all_test, y_all_train, y_all_test = train_test_split(X_all, y, test_size = 0.20)


### Balanced dataset X and target (without gender)
from sklearn.model_selection import train_test_split
X_no_gender_train, X_no_gender_test, y_no_gender_train, y_no_gender_test = train_test_split(X_no_gender, y, test_size = 0.20)


## Training the Algorithm
### Balanced dataset X (without gender)
#### Hard SVM (C = 1.0 or greater)

In [17]:
# Simple SVM (no kernel)

from sklearn.svm import SVC
svclassifier = SVC(kernel='linear')
svclassifier.fit(X_no_gender_train, y_no_gender_train)

## Test
y_no_gender_pred = svclassifier.predict(X_no_gender_test)

## Evaluating
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_no_gender_test, y_no_gender_pred))
print(classification_report(y_no_gender_test, y_no_gender_pred))

### Expensive Calculation


[[754 228]
 [133 830]]
              precision    recall  f1-score   support

           0       0.85      0.77      0.81       982
           1       0.78      0.86      0.82       963

    accuracy                           0.81      1945
   macro avg       0.82      0.81      0.81      1945
weighted avg       0.82      0.81      0.81      1945



In [14]:
# Polynomial Kernel
## degree of 10

from sklearn.svm import SVC
svclassifier = SVC(kernel='poly', degree=10)
svclassifier.fit(X_no_gender_train, y_no_gender_train)

## Test
y_no_gender_pred = svclassifier.predict(X_no_gender_test)

## Evaluating
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_no_gender_test, y_no_gender_pred))
print(classification_report(y_no_gender_test, y_no_gender_pred))


### Expensive Calculation

[[839 143]
 [341 622]]
              precision    recall  f1-score   support

           0       0.71      0.85      0.78       982
           1       0.81      0.65      0.72       963

    accuracy                           0.75      1945
   macro avg       0.76      0.75      0.75      1945
weighted avg       0.76      0.75      0.75      1945



In [18]:
# Polynomial Kernel
## degree of 9

from sklearn.svm import SVC
svclassifier = SVC(kernel='poly', degree=9)
svclassifier.fit(X_no_gender_train, y_no_gender_train)

## Test
y_no_gender_pred = svclassifier.predict(X_no_gender_test)

## Evaluating
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_no_gender_test, y_no_gender_pred))
print(classification_report(y_no_gender_test, y_no_gender_pred))


### Expensive Calculation

[[821 161]
 [310 653]]
              precision    recall  f1-score   support

           0       0.73      0.84      0.78       982
           1       0.80      0.68      0.73       963

    accuracy                           0.76      1945
   macro avg       0.76      0.76      0.76      1945
weighted avg       0.76      0.76      0.76      1945



In [19]:
# Polynomial Kernel
## degree of 11

from sklearn.svm import SVC
svclassifier = SVC(kernel='poly', degree=11)
svclassifier.fit(X_no_gender_train, y_no_gender_train)

## Test
y_no_gender_pred = svclassifier.predict(X_no_gender_test)

## Evaluating
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_no_gender_test, y_no_gender_pred))
print(classification_report(y_no_gender_test, y_no_gender_pred))


### Expensive Calculation (so expensive)
### (don't try to run it again, it will take a long long time)

[[854 128]
 [378 585]]
              precision    recall  f1-score   support

           0       0.69      0.87      0.77       982
           1       0.82      0.61      0.70       963

    accuracy                           0.74      1945
   macro avg       0.76      0.74      0.73      1945
weighted avg       0.76      0.74      0.74      1945



In [15]:
# Gaussian Kernel

svclassifier = SVC(kernel='rbf')
svclassifier.fit(X_no_gender_train, y_no_gender_train)

## Test
y_no_gender_pred = svclassifier.predict(X_no_gender_test)

## Evaluating
print(confusion_matrix(y_no_gender_test, y_no_gender_pred))
print(classification_report(y_no_gender_test, y_no_gender_pred))

[[660 322]
 [121 842]]
              precision    recall  f1-score   support

           0       0.85      0.67      0.75       982
           1       0.72      0.87      0.79       963

    accuracy                           0.77      1945
   macro avg       0.78      0.77      0.77      1945
weighted avg       0.78      0.77      0.77      1945



In [16]:
# Sigmoid Kernel

svclassifier = SVC(kernel='sigmoid')
svclassifier.fit(X_no_gender_train, y_no_gender_train)

## Test
y_no_gender_pred = svclassifier.predict(X_no_gender_test)

## Evaluating
print(confusion_matrix(y_no_gender_test, y_no_gender_pred))
print(classification_report(y_no_gender_test, y_no_gender_pred))


[[393 589]
 [576 387]]
              precision    recall  f1-score   support

           0       0.41      0.40      0.40       982
           1       0.40      0.40      0.40       963

    accuracy                           0.40      1945
   macro avg       0.40      0.40      0.40      1945
weighted avg       0.40      0.40      0.40      1945



In [21]:
#  Radial Basis Function (RBF) kernel SVM

svclassifier = SVC(C = 1, gamma = 0.1)
svclassifier.fit(X_no_gender_train, y_no_gender_train)

## Test
y_no_gender_pred = svclassifier.predict(X_no_gender_test)

## Evaluating
print(confusion_matrix(y_no_gender_test, y_no_gender_pred))
print(classification_report(y_no_gender_test, y_no_gender_pred))

[[902  80]
 [ 34 929]]
              precision    recall  f1-score   support

           0       0.96      0.92      0.94       982
           1       0.92      0.96      0.94       963

    accuracy                           0.94      1945
   macro avg       0.94      0.94      0.94      1945
weighted avg       0.94      0.94      0.94      1945



In [20]:
#  Radial Basis Function (RBF) kernel SVM

svclassifier = SVC(C = 10, gamma = 0.1)
svclassifier.fit(X_no_gender_train, y_no_gender_train)

## Test
y_no_gender_pred = svclassifier.predict(X_no_gender_test)

## Evaluating
print(confusion_matrix(y_no_gender_test, y_no_gender_pred))
print(classification_report(y_no_gender_test, y_no_gender_pred))



#### Overall best performance

[[906  76]
 [ 29 934]]
              precision    recall  f1-score   support

           0       0.97      0.92      0.95       982
           1       0.92      0.97      0.95       963

    accuracy                           0.95      1945
   macro avg       0.95      0.95      0.95      1945
weighted avg       0.95      0.95      0.95      1945



In [29]:
#  Radial Basis Function (RBF) kernel SVM

svclassifier = SVC(C = 10, gamma = 1)
svclassifier.fit(X_no_gender_train, y_no_gender_train)

## Test
y_no_gender_pred = svclassifier.predict(X_no_gender_test)

## Evaluating
print(confusion_matrix(y_no_gender_test, y_no_gender_pred))
print(classification_report(y_no_gender_test, y_no_gender_pred))

[[979   3]
 [149 814]]
              precision    recall  f1-score   support

           0       0.87      1.00      0.93       982
           1       1.00      0.85      0.91       963

    accuracy                           0.92      1945
   macro avg       0.93      0.92      0.92      1945
weighted avg       0.93      0.92      0.92      1945



In [31]:
#  Radial Basis Function (RBF) kernel SVM

svclassifier = SVC(C = 10, gamma = 0.001)
svclassifier.fit(X_no_gender_train, y_no_gender_train)

## Test
y_no_gender_pred = svclassifier.predict(X_no_gender_test)

## Evaluating
print(confusion_matrix(y_no_gender_test, y_no_gender_pred))
print(classification_report(y_no_gender_test, y_no_gender_pred))




[[741 241]
 [ 77 886]]
              precision    recall  f1-score   support

           0       0.91      0.75      0.82       982
           1       0.79      0.92      0.85       963

    accuracy                           0.84      1945
   macro avg       0.85      0.84      0.84      1945
weighted avg       0.85      0.84      0.84      1945



In [None]:
svclassifier = SVC(C = 10, gamma = 'auto')
svclassifier.fit(X_no_gender_train, y_no_gender_train)

## Test
y_no_gender_pred = svclassifier.predict(X_no_gender_test)

## Evaluating
print(confusion_matrix(y_no_gender_test, y_no_gender_pred))
print(classification_report(y_no_gender_test, y_no_gender_pred))

Gamma (default='scale' = 1 / (n_features * X.var()). ('auto'= 1 / n_features)
    Value of gamma: defines how far the influence of a single training example reaches, with low values meaning ‘far’ and high values meaning ‘close’. The gamma parameters can be seen as the inverse of the radius of influence of samples selected by the model as support vectors.
    
C (default = 1.0) : The C parameter trades off correct classification of training examples against maximization of the decision function’s margin. For larger values of C, a smaller margin will be accepted if the decision function is better at classifying all training points correctly. A lower C will encourage a larger margin, therefore a simpler decision function, at the cost of training accuracy. In other words C behaves as a regularization parameter in the SVM.

### Soft SVM
#### C<1

In [34]:
# Simple SVM (no kernel)

from sklearn.svm import SVC
svclassifier = SVC(kernel='linear',C = 0.01)
svclassifier.fit(X_no_gender_train, y_no_gender_train)

## Test
y_no_gender_pred = svclassifier.predict(X_no_gender_test)

## Evaluating
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_no_gender_test, y_no_gender_pred))
print(classification_report(y_no_gender_test, y_no_gender_pred))

### same result with hrad SVM

[[745 237]
 [136 827]]
              precision    recall  f1-score   support

           0       0.85      0.76      0.80       982
           1       0.78      0.86      0.82       963

    accuracy                           0.81      1945
   macro avg       0.81      0.81      0.81      1945
weighted avg       0.81      0.81      0.81      1945



In [40]:
# Polynomial Kernel
## degree of 9

from sklearn.svm import SVC
svclassifier = SVC(kernel='poly', degree=9,C=0.01)
svclassifier.fit(X_no_gender_train, y_no_gender_train)

## Test
y_no_gender_pred = svclassifier.predict(X_no_gender_test)

## Evaluating
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_no_gender_test, y_no_gender_pred))
print(classification_report(y_no_gender_test, y_no_gender_pred))


### Expensive Calculation
### similar with hard

[[907  75]
 [567 396]]
              precision    recall  f1-score   support

           0       0.62      0.92      0.74       982
           1       0.84      0.41      0.55       963

    accuracy                           0.67      1945
   macro avg       0.73      0.67      0.65      1945
weighted avg       0.73      0.67      0.65      1945



In [35]:
# Gaussian Kernel

svclassifier = SVC(kernel='rbf',C=0.01)
svclassifier.fit(X_no_gender_train, y_no_gender_train)

## Test
y_no_gender_pred = svclassifier.predict(X_no_gender_test)

## Evaluating
print(confusion_matrix(y_no_gender_test, y_no_gender_pred))
print(classification_report(y_no_gender_test, y_no_gender_pred))

### worse than hard

[[595 387]
 [ 99 864]]
              precision    recall  f1-score   support

           0       0.86      0.61      0.71       982
           1       0.69      0.90      0.78       963

    accuracy                           0.75      1945
   macro avg       0.77      0.75      0.75      1945
weighted avg       0.77      0.75      0.74      1945



In [37]:
# Sigmoid Kernel

svclassifier = SVC(kernel='sigmoid', C=0.01)
svclassifier.fit(X_no_gender_train, y_no_gender_train)

## Test
y_no_gender_pred = svclassifier.predict(X_no_gender_test)

## Evaluating
print(confusion_matrix(y_no_gender_test, y_no_gender_pred))
print(classification_report(y_no_gender_test, y_no_gender_pred))

### similar with hard

[[245 737]
 [408 555]]
              precision    recall  f1-score   support

           0       0.38      0.25      0.30       982
           1       0.43      0.58      0.49       963

    accuracy                           0.41      1945
   macro avg       0.40      0.41      0.40      1945
weighted avg       0.40      0.41      0.40      1945



In [39]:
#  Radial Basis Function (RBF) kernel SVM

svclassifier = SVC(C = 0.1, gamma = 0.1)
svclassifier.fit(X_no_gender_train, y_no_gender_train)

## Test
y_no_gender_pred = svclassifier.predict(X_no_gender_test)

## Evaluating
print(confusion_matrix(y_no_gender_test, y_no_gender_pred))
print(classification_report(y_no_gender_test, y_no_gender_pred))

### Worse, decreasing C will even worsen the result

[[922  60]
 [208 755]]
              precision    recall  f1-score   support

           0       0.82      0.94      0.87       982
           1       0.93      0.78      0.85       963

    accuracy                           0.86      1945
   macro avg       0.87      0.86      0.86      1945
weighted avg       0.87      0.86      0.86      1945

