In [1]:
# load part of the toolkit
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import warnings
warnings.filterwarnings('ignore')

In [2]:
# load data
data = pd.read_csv('Coimbra_breast_cancer_dataset.csv')
data.head()

Unnamed: 0,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1,Classification
0,48,23.5,70,2.707,0.467409,8.8071,9.7024,7.99585,417.114,1
1,83,20.690495,92,3.115,0.706897,8.8438,5.429285,4.06405,468.786,1
2,82,23.12467,91,4.498,1.009651,17.9393,22.43204,9.27715,554.697,1
3,68,21.367521,77,3.226,0.612725,9.8827,7.16956,12.766,928.22,1
4,86,21.111111,92,3.549,0.805386,6.6994,4.81924,10.57635,773.92,1


In [3]:
# view info:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116 entries, 0 to 115
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             116 non-null    int64  
 1   BMI             116 non-null    float64
 2   Glucose         116 non-null    int64  
 3   Insulin         116 non-null    float64
 4   HOMA            116 non-null    float64
 5   Leptin          116 non-null    float64
 6   Adiponectin     116 non-null    float64
 7   Resistin        116 non-null    float64
 8   MCP.1           116 non-null    float64
 9   Classification  116 non-null    int64  
dtypes: float64(7), int64(3)
memory usage: 9.2 KB


In [4]:
# modify Classification so we have 0 and 1 values:

data['Classification'] = data['Classification'] - 1

# Define X and y:

X = data.iloc[:,:-1]
y = data.iloc[:,-1]

In [5]:
# train test split:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=639)

In [7]:
# standar scaler for z-score scaling

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [9]:
# in this first aproximation, we use only SVC withouth any dimensionality treatment

from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

svm_model= SVC()

param_grid = {
                    'C': [i for i in range(1,20)],
                    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                    'degree': [i for i in range(1,10)],
                    'random_state': [639],
                    'probability': [True]
}

grid_search = GridSearchCV(svm_model, param_grid, cv=10, scoring='roc_auc')
grid_search.fit(X_train, y_train)
print(f'Best hyperparameters: {grid_search.best_params_}')

best_model = grid_search.best_estimator_
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
best_auc = roc_auc_score(y_test, y_pred_proba)
print(f'AUC on the test set: {best_auc:.4f}')

Best hyperparameters: {'C': 2, 'degree': 1, 'kernel': 'rbf', 'probability': True, 'random_state': 639}
AUC on the test set: 0.9037


In [11]:
# let´s try now a dimensionality reduction using PCA

from sklearn.decomposition import PCA

n_comp = []
auc = []
for i in range(1,10):

    pca = PCA(n_components=i)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)

    svm_model= SVC()
    param_grid = {
                        'C': [i for i in range(1,20)],
                        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                        'degree': [i for i in range(1,10)],
                        'random_state': [639],
                        'probability': [True]
    }

    grid_search = GridSearchCV(svm_model, param_grid, cv=10, scoring='roc_auc')
    grid_search.fit(X_train_pca, y_train)
    print(f'Best hyperparameters: {grid_search.best_params_}, with {i} components')

    best_model = grid_search.best_estimator_
    y_pred_proba = best_model.predict_proba(X_test_pca)[:, 1]
    best_auc = roc_auc_score(y_test, y_pred_proba)
    print(f'AUC on the test set: {best_auc:.4f}')
    n_comp.append(i)
    auc.append(best_auc)

Best hyperparameters: {'C': 1, 'degree': 1, 'kernel': 'linear', 'probability': True, 'random_state': 639}, with 1 components
AUC on the test set: 0.6519
Best hyperparameters: {'C': 1, 'degree': 3, 'kernel': 'poly', 'probability': True, 'random_state': 639}, with 2 components
AUC on the test set: 0.7407
Best hyperparameters: {'C': 1, 'degree': 3, 'kernel': 'poly', 'probability': True, 'random_state': 639}, with 3 components
AUC on the test set: 0.8000
Best hyperparameters: {'C': 17, 'degree': 1, 'kernel': 'rbf', 'probability': True, 'random_state': 639}, with 4 components
AUC on the test set: 0.7778
Best hyperparameters: {'C': 10, 'degree': 3, 'kernel': 'poly', 'probability': True, 'random_state': 639}, with 5 components
AUC on the test set: 0.8148
Best hyperparameters: {'C': 6, 'degree': 1, 'kernel': 'rbf', 'probability': True, 'random_state': 639}, with 6 components
AUC on the test set: 0.8444
Best hyperparameters: {'C': 11, 'degree': 1, 'kernel': 'rbf', 'probability': True, 'random_s

In [12]:
from sklearn.model_selection import RandomizedSearchCV

n_comp = []
auc = []
for i in range(1,10):

    pca = PCA(n_components=i)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)

    svm_model= SVC()
    param_grid = {
                        'C': [i for i in range(1,20)],
                        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                        'degree': [i for i in range(1,10)],
                        'random_state': [639],
                        'probability': [True]
    }

    rand_search = RandomizedSearchCV(svm_model, param_grid, n_iter=10, scoring='roc_auc')
    rand_search.fit(X_train_pca, y_train)
    print(f'Best hyperparameters: {rand_search.best_params_}, with {i} components')

    best_model = rand_search.best_estimator_
    y_pred_proba = best_model.predict_proba(X_test_pca)[:, 1]
    best_auc = roc_auc_score(y_test, y_pred_proba)
    print(f'AUC on the test set: {best_auc:.4f}')
    n_comp.append(i)
    auc.append(best_auc)

Best hyperparameters: {'random_state': 639, 'probability': True, 'kernel': 'linear', 'degree': 5, 'C': 10}, with 1 components
AUC on the test set: 0.6519
Best hyperparameters: {'random_state': 639, 'probability': True, 'kernel': 'linear', 'degree': 6, 'C': 3}, with 2 components
AUC on the test set: 0.7111
Best hyperparameters: {'random_state': 639, 'probability': True, 'kernel': 'linear', 'degree': 6, 'C': 18}, with 3 components
AUC on the test set: 0.8148
Best hyperparameters: {'random_state': 639, 'probability': True, 'kernel': 'rbf', 'degree': 5, 'C': 11}, with 4 components
AUC on the test set: 0.7926
Best hyperparameters: {'random_state': 639, 'probability': True, 'kernel': 'rbf', 'degree': 3, 'C': 8}, with 5 components
AUC on the test set: 0.7556
Best hyperparameters: {'random_state': 639, 'probability': True, 'kernel': 'rbf', 'degree': 4, 'C': 12}, with 6 components
AUC on the test set: 0.8370
Best hyperparameters: {'random_state': 639, 'probability': True, 'kernel': 'rbf', 'degr