# Dataset Preparation


### Setup

In [108]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.datasets import load_breast_cancer

warnings.filterwarnings('ignore')
seed = 1855


## Data Collection


In [109]:
cancer = load_breast_cancer()
dataset = pd.DataFrame(np.c_[cancer['data'], cancer['target']],
                  columns= np.append(cancer['feature_names'], ['target']))


dataset.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0.0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0.0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0.0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0.0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0.0


## Handling missing values


In [110]:
from pandas.api.types import is_numeric_dtype

dataset = dataset.fillna(lambda x: x.fillna(x.median()) if is_numeric_dtype(x) else x.fillna(x.median()[0]))

## Encoding categorical features with One-Hot Encoding


## Encoding binary class label


In [111]:
# reencode target from 0/1 to -1/1

dataset['target'] = dataset['target'].apply(lambda x: -1 if x == 0.0 else 1)

dataset.target.value_counts()

target
 1    357
-1    212
Name: count, dtype: int64

# Model Building


In [112]:
from sklearn.model_selection import *
from sklearn.tree import *
from sklearn.ensemble import *
from sklearn.metrics import *
from sklearn.tree import *
from sklearn.naive_bayes import *
from sklearn.linear_model import *

## Splitting the dataset into training and testing sets


In [113]:
x = dataset.drop('target', axis=1)
y = dataset['target']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=seed)

## Evaluation function


In [114]:
def evaluate(expected, predicted):
  results = {
      'accuracy': accuracy_score(expected, predicted),
      'precision': precision_score(expected, predicted),
      'recall': recall_score(expected, predicted),
      'f1': f1_score(expected, predicted)
  }
  for metric, score in results.items():
    print(f'{metric}: {score:.2f}')

## Model Building


In [115]:
model = LogisticRegression()

## Model Evaluation


### Cross Validation

In [116]:
cross_validation_results = cross_validate(model, x_train, y_train, cv=5, scoring=['accuracy', 'precision', 'recall', 'f1'])

for metric, scores in cross_validation_results.items():
  if 'time' not in metric:
    name  = metric.replace('test_','' ).capitalize()
    print(f'{name}: {np.mean(scores):.2f} += {np.std(scores):.2f}')
  

Accuracy: 0.95 += 0.01
Precision: 0.96 += 0.03
Recall: 0.96 += 0.03
F1: 0.96 += 0.01


### Cross Validation with KFold

In [117]:

k_fold = KFold(n_splits=10, random_state=seed, shuffle=True)
cross_validation_results = cross_validate(model, x_train, y_train, cv=k_fold, scoring=['accuracy', 'precision', 'recall', 'f1'])

for metric, scores in cross_validation_results.items():
  if 'time' not in metric:
    name  = metric.replace('test_','' ).capitalize()
    print(f'{name}: {np.mean(scores):.2f} += {np.std(scores):.2f}')
  

Accuracy: 0.94 += 0.03
Precision: 0.94 += 0.04
Recall: 0.96 += 0.04
F1: 0.95 += 0.02


### Cross Validation with StratifiedKFold

In [118]:

k_fold = StratifiedKFold(n_splits=10, random_state=seed, shuffle=True)
cross_validation_results = cross_validate(model, x_train, y_train, cv=k_fold, scoring=['accuracy', 'precision', 'recall', 'f1'])

for metric, scores in cross_validation_results.items():
  if 'time' not in metric:
    name  = metric.replace('test_','' ).capitalize()
    print(f'{name}: {np.mean(scores):.2f} += {np.std(scores):.2f}')
  

Accuracy: 0.94 += 0.03
Precision: 0.95 += 0.05
Recall: 0.96 += 0.03
F1: 0.95 += 0.02


## Model Building with Hyperparameter Tuning


In [119]:
params = {
    'C': [0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']
}


grid_search = GridSearchCV(model, param_grid=params, cv=k_fold, scoring='accuracy')

grid_search.fit(x_train, y_train)

print(grid_search.best_params_)

{'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}



## Model Evaluation


In [120]:
best_model = grid_search.best_estimator_

y_pred = best_model.predict(x_test)

evaluate(y_test, y_pred)

accuracy: 0.96
precision: 0.96
recall: 0.97
f1: 0.97


# Comparing Different Models Performance

## Model Building

In [121]:
models = {
    'LogisticRegression': LogisticRegression(),
    'DecisionTree': DecisionTreeClassifier(),
    'RandomForest': RandomForestClassifier(),
    'NaiveBayes': GaussianNB()
}

## Model Comparison

In [122]:

results = {}

for name, model in models.items():
    results[name] = cross_val_score(model, x_train, y_train, cv=k_fold, scoring='accuracy')


result_df = pd.DataFrame(results).transpose()

result_df['mean'] = result_df.mean(axis=1)
result_df['std'] = result_df.std(axis=1)
result_df = result_df.sort_values(by=['mean', 'std'], ascending=False)
result_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,mean,std
RandomForest,0.978261,0.956522,0.956522,1.0,0.956522,0.977778,0.933333,0.933333,0.888889,0.977778,0.955894,0.029853
LogisticRegression,0.956522,0.956522,0.869565,0.978261,0.956522,0.911111,0.955556,0.933333,0.955556,0.955556,0.94285,0.02961
NaiveBayes,0.978261,0.913043,0.913043,0.978261,0.934783,0.977778,0.955556,0.955556,0.911111,0.911111,0.94285,0.02812
DecisionTree,0.934783,0.913043,0.869565,0.891304,0.913043,0.955556,0.955556,0.933333,0.911111,0.911111,0.918841,0.02556
