# SVM notebook

### Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.svm import SVC, NuSVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, accuracy_score, recall_score, precision_score
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB

### Load datasets

In [2]:
wpbc_original = pd.read_csv(
    '../dataset/wpbc_original.csv',
    sep=',',
    header=0
)

wpbc_normalized = pd.read_csv(
    '../dataset/wpbc_normalized.csv',
    sep=',',
    header=0
)

wpbc_best_features = pd.read_csv(
    '../dataset/wpbc_best_features.csv',
    sep=',',
    header=0
)

In [3]:
display(
    wpbc_original.head(),
    wpbc_normalized.head(),
    wpbc_best_features.head()
)

Unnamed: 0,ID,OUTCOME,RADIUS_1,TEXTURE_1,PERIMETER_1,AREA_1,SMOOTHNESS_1,COMPACTNESS_1,CONCAVITY_1,CONCAVE_POINTS_1,...,PERIMETER_3,AREA_3,SMOOTHNESS_3,COMPACTNESS_3,CONCAVITY_3,CONCAVE_POINTS_3,SYMMETRY_3,FRACTAL_DIMENSIONS_3,TUMOR_SIZE,LYMPH_NODE_STATUS
0,119513,0,18.02,27.6,117.5,1013.0,0.09489,0.1036,0.1086,0.07055,...,139.7,1436.0,0.1195,0.1926,0.314,0.117,0.2677,0.08113,5.0,5.0
1,8423,0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,3.0,2.0
2,842517,0,21.37,17.44,137.5,1373.0,0.08836,0.1189,0.1255,0.0818,...,159.1,1949.0,0.1188,0.3449,0.3414,0.2032,0.4334,0.09067,2.5,0.0
3,843483,0,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,2.0,0.0
4,843584,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,3.5,0.0


Unnamed: 0,ID,OUTCOME,RADIUS_1,TEXTURE_1,PERIMETER_1,AREA_1,SMOOTHNESS_1,COMPACTNESS_1,CONCAVITY_1,CONCAVE_POINTS_1,...,PERIMETER_3,AREA_3,SMOOTHNESS_3,COMPACTNESS_3,CONCAVITY_3,CONCAVE_POINTS_3,SYMMETRY_3,FRACTAL_DIMENSIONS_3,TUMOR_SIZE,LYMPH_NODE_STATUS
0,119513,0,0.434542,0.595848,0.413793,0.344948,0.285673,0.216883,0.210069,0.277738,...,0.371176,0.273322,0.267183,0.140351,0.253067,0.336803,0.2192,0.171127,0.479167,0.185185
1,8423,0,0.432698,0.0,0.461887,0.338594,0.622831,0.872621,0.685467,0.700923,...,0.676411,0.44505,0.570687,0.610208,0.600269,0.904711,0.598462,0.418864,0.270833,0.074074
2,842517,0,0.640443,0.244291,0.595281,0.535586,0.192026,0.274543,0.252023,0.33993,...,0.503059,0.424431,0.262208,0.291639,0.276976,0.666679,0.545831,0.233701,0.21875,0.0
3,843483,0,0.028888,0.346021,0.051543,0.012974,0.96845,0.896363,0.539745,0.469291,...,0.09361,0.017556,0.90902,0.809574,0.578454,0.874479,1.0,0.773711,0.166667,0.0
4,843584,1,0.574063,0.137024,0.573503,0.49534,0.363258,0.326927,0.432004,0.464315,...,0.456152,0.314266,0.394413,0.152669,0.328109,0.510926,0.1575,0.142595,0.322917,0.0


Unnamed: 0,ID,OUTCOME,RADIUS_1,PERIMETER_1,AREA_1,PERIMETER_2,AREA_2,RADIUS_3,PERIMETER_3,AREA_3,TUMOR_SIZE,LYMPH_NODE_STATUS
0,119513,0,18.02,117.5,1013.0,3.972,71.55,21.63,139.7,1436.0,5.0,5.0
1,8423,0,17.99,122.8,1001.0,8.589,153.4,25.38,184.6,2019.0,3.0,2.0
2,842517,0,21.37,137.5,1373.0,3.928,82.15,24.9,159.1,1949.0,2.5,0.0
3,843483,0,11.42,77.58,386.1,3.445,27.23,14.91,98.87,567.7,2.0,0.0
4,843584,1,20.29,135.1,1297.0,5.438,94.44,22.54,152.2,1575.0,3.5,0.0


In [4]:
(original_train, original_test) = train_test_split(wpbc_original, test_size=0.25, train_size=0.75, shuffle=True)
(normalized_train, normalized_test) = train_test_split(wpbc_normalized, test_size=0.25, train_size=0.75, shuffle=True)
(best_train, best_test) = train_test_split(wpbc_best_features, test_size=0.25, train_size=0.75, shuffle=True)

original_train_labels = original_train['OUTCOME']
original_train_features = original_train.iloc[:,2:]
original_test_labels = original_test['OUTCOME']
original_test_features = original_test.iloc[:,2:]

normalized_train_labels = normalized_train['OUTCOME']
normalized_train_features = normalized_train.iloc[:,2:]
normalized_test_labels = normalized_test['OUTCOME']
normalized_test_features = normalized_test.iloc[:,2:]

best_train_labels = best_train['OUTCOME']
best_train_features = best_train.iloc[:,2:]
best_test_labels = best_test['OUTCOME']
best_test_features =best_test.iloc[:,2:]

# Model development

## Original dataset

### SVC dev

In [5]:
svc_classifier = SVC(
    C=5.5,
    kernel='linear',
    gamma='scale',
    class_weight='balanced'
)

svc_classifier.fit(
    X=original_train_features,
    y=original_train_labels
)

predictions = svc_classifier.predict(
    X=original_test_features
)
acc = accuracy_score(y_true=original_test_labels, y_pred=predictions)
f1 = f1_score(y_true=original_test_labels, y_pred=predictions, zero_division=0)
rec = recall_score(y_true=original_test_labels, y_pred=predictions, zero_division=0)
prec = precision_score(y_true=original_test_labels, y_pred=predictions, zero_division=0)
print(acc, f1, rec, prec)

0.54 0.41025641025641024 0.8888888888888888 0.26666666666666666


In [6]:
svc_classifier = SVC(
    C=5.5,
    kernel='linear',
    gamma='scale',
    class_weight='balanced'
)

svc_classifier.fit(
    X=normalized_train_features,
    y=normalized_train_labels
)

predictions = svc_classifier.predict(
    X=normalized_test_features
)
acc = accuracy_score(y_true=normalized_test_labels, y_pred=predictions)
f1 = f1_score(y_true=normalized_test_labels, y_pred=predictions, zero_division=0)
rec = recall_score(y_true=normalized_test_labels, y_pred=predictions, zero_division=0)
prec = precision_score(y_true=normalized_test_labels, y_pred=predictions, zero_division=0)
print(acc, f1, rec, prec)

0.64 0.47058823529411764 0.5 0.4444444444444444


In [21]:
svc_classifier = SVC(
    C=5.5,
    kernel='linear',
    gamma='scale',
    class_weight='balanced',
    random_state=42
)

svc_classifier.fit(
    X=best_train_features,
    y=best_train_labels
)

predictions = svc_classifier.predict(
    X=best_test_features
)
acc = accuracy_score(y_true=best_test_labels, y_pred=predictions)
f1 = f1_score(y_true=best_test_labels, y_pred=predictions, zero_division=0)
rec = recall_score(y_true=best_test_labels, y_pred=predictions, zero_division=0)
prec = precision_score(y_true=best_test_labels, y_pred=predictions, zero_division=0)
print(acc, f1, rec, prec)

0.56 0.3888888888888889 0.5833333333333334 0.2916666666666667
