In [1]:
import pandas as pd
import numpy as np
import warnings

from sklearn.exceptions import UndefinedMetricWarning
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import (recall_score, accuracy_score, 
                             precision_score, roc_auc_score, confusion_matrix)

# Load data

In [2]:
def load_dataset(csv_path):

    """
    Load dataset from csv file, display dataframe, and return data value and target
    
    Parameters:
        csv_path (str): csv file path
    Returns:
        X (np.ndarray): dataset
        y (np.ndarray): labels
    """

    # Load dataset
    df = pd.read_csv(data_path, index_col= "id")
    # Replace targets labels by binary values
    df['diagnosis'] = df['diagnosis'].replace({'B': 0, 'M': 1})
    # Indexes of the 10 most important features
    features_idx = [22, 23, 7, 6, 27, 20, 2, 0, 3, 13]
    df_new_features = pd.concat([df.iloc[:,features_idx], df[['diagnosis']]], axis = 1)
    display(df_new_features)
    # extract output targets and values
    y = df['diagnosis'].values # labels
    df_vals = df_new_features.drop(['diagnosis'],axis=1).values

    # replacing Nan values (only present when no 3 points attempts have been performed by a player)
    for x in np.argwhere(np.isnan(df_vals)):
        df_vals[x]=0.0

    # normalize dataset
    X = MinMaxScaler().fit_transform(df_vals)

    return X, y

# Original data

In [3]:
# Extract data and target
data_path = "../data/breast-cancer.csv"
X_original, y_original = load_dataset(data_path)

Unnamed: 0_level_0,texture_worst,perimeter_worst,concavity_mean,compactness_mean,concavity_worst,fractal_dimension_se,texture_mean,diagnosis,perimeter_mean,perimeter_se,diagnosis
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
842302,17.33,184.60,0.30010,0.27760,0.7119,0.006193,10.38,1,122.80,8.589,1
842517,23.41,158.80,0.08690,0.07864,0.2416,0.003532,17.77,1,132.90,3.398,1
84300903,25.53,152.50,0.19740,0.15990,0.4504,0.004571,21.25,1,130.00,4.585,1
84348301,26.50,98.87,0.24140,0.28390,0.6869,0.009208,20.38,1,77.58,3.445,1
84358402,16.67,152.20,0.19800,0.13280,0.4000,0.005115,14.34,1,135.10,5.438,1
...,...,...,...,...,...,...,...,...,...,...,...
926424,26.40,166.10,0.24390,0.11590,0.4107,0.004239,22.39,1,142.00,7.673,1
926682,38.25,155.00,0.14400,0.10340,0.3215,0.002498,28.25,1,131.20,5.203,1
926954,34.12,126.70,0.09251,0.10230,0.3403,0.003892,28.08,1,108.30,3.425,1
927241,39.42,184.60,0.35140,0.27700,0.9387,0.006185,29.33,1,140.10,5.772,1


# SVM classifier grid search

In [4]:
# Using gridserch for SVM hyperparameter finetuning

param_grid = {'C': [0.1, 1, 10, 100, 1000], 
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001, 'auto', 'scale'],
              'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}

grid_original = GridSearchCV(SVC(), param_grid, refit = 'recall', verbose = 0, scoring = ['recall', 'accuracy', 'precision'], cv=3)
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
    grid_original.fit(X_original, y_original)
svm_original = grid_original.best_estimator_
print(svm_original)

SVC(C=100, kernel='poly')


# SVM classifier train and test

In [5]:
# Split data to train and test
X_train, X_test, y_train, y_test = train_test_split(X_original, y_original, test_size=0.2, random_state=0)

# normalize dataset
minmaxscaler = MinMaxScaler()
X_train_scaled = minmaxscaler.fit_transform(X_train)
X_test_scaled = minmaxscaler.transform(X_test)

# Train SVM classifier
svm_original.fit(X_train_scaled, y_train)

# Predict test data
y_hut = svm_original.predict(X_test_scaled)

# test results
confusion_mat = confusion_matrix(y_test, y_hut)
print(f"confusion matrix = {confusion_mat}")
recall = recall_score(y_test, y_hut)
print(f"test recall = {recall}")
accuracy = accuracy_score(y_test, y_hut)
print(f"test accuracy = {accuracy}")
precision = precision_score(y_test, y_hut)
print(f"test precision = {precision}")

# 'precision', 'roc_auc'

confusion matrix = [[65  2]
 [ 4 43]]
test recall = 0.9148936170212766
test accuracy = 0.9473684210526315
test precision = 0.9555555555555556
