# 

# <center> MACHINE LEARNING - CLASSIFIERS

## Imports

In [5]:
import warnings 
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

---

## Data

In [14]:
# Load data
from sklearn.datasets import load_breast_cancer

df = load_breast_cancer()
X, y = df.data, df.target

# Get target names
target_names = df.target_names

---

## EDA - Exploratoory Data Analisys

In [73]:
# Features
df.feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [85]:
# Dataset
data = pd.concat([pd.DataFrame(X, columns=df.feature_names), pd.DataFrame(y, columns=["target"])], axis=1)

data

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,0
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,0
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,0
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,0
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,0
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,0
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,0


In [None]:
# BaLance

---

## Features Engeneering

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [18]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
# Scale data
scaler = MinMaxScaler().fit(X_train)

In [25]:
# Scale data
std_scaler = StandardScaler()
X_train_std = std_scaler.fit_transform(X_train)
X_test_std = std_scaler.fit_transform(X_train)

---

## Fit Funcion

In [28]:
def fit_models(X_train, X_test, y_train, y_test, models, scaled_required):

    acc = []
    mse = []
    names = []

    # Scale data
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # fit models
    for name, model in models:
        model.fit(X_train, y_train) if name not in scaled_required else model.fit(X_train_scaled, y_train)
        result = model.score(X_test, y_test) if name not in scaled_required else model.score(X_test_scaled, y_test)

        y_pred = model.predict(X_test) if name not in scaled_required else model.predict(X_test_scaled)
        m_s_e = mean_squared_error(y_test, y_pred)

        acc.append(result)
        mse.append(m_s_e)
        names.append(name)

    # Resume base line
    result_dict = {
        "model": names,
        "r2": acc,
        "mse": mse
    }

    resume_models = pd.DataFrame(result_dict).sort_values(by="r2", ascending=False)

    return resume_models

---

## Base Line Models

In [7]:
# Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# Metrics
from sklearn.metrics import r2_score, mean_squared_error

In [29]:
# Base line models list
base_line_models = []

base_line_models.append(("DT", DecisionTreeClassifier()))
base_line_models.append(("KNN", KNeighborsClassifier()))
base_line_models.append(("LogReg", LogisticRegression()))

# Scale data required
scale_required = ["KNN"]

In [32]:
# fit
fit_models(X_train, X_test, y_train, y_test, base_line_models, scale_required)

Unnamed: 0,model,r2,mse
1,KNN,0.964912,0.035088
2,LogReg,0.964912,0.035088
0,DT,0.947368,0.052632


---

## Models

In [10]:
# Models
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier

In [34]:
# Models List
models = []
models.append(('SVM', SVC()))
models.append(('RandomForest', RandomForestClassifier()))
models.append(('ExtreTrees', ExtraTreesClassifier()))
models.append(('AdaBoost', AdaBoostClassifier()))
models.append(('GradientBoost', GradientBoostingClassifier()))

# Scale data required
models_scaled_required = ['SVM']

In [35]:
# fit
fit_models(X_train, X_test, y_train, y_test, models, models_scaled_required)

Unnamed: 0,model,r2,mse
0,SVM,0.973684,0.026316
2,ExtreTrees,0.973684,0.026316
3,AdaBoost,0.973684,0.026316
1,RandomForest,0.964912,0.035088
4,GradientBoost,0.95614,0.04386


---

## Tuning Hyperparameters

In [12]:
# Imports
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [40]:
SVM_Params = [{'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],   #Specifies the kernel type to be used in the algorithm.
               'tol': [1e-6, 1e-4, 1e-2, 1e-0, 1e2, 1e4, 1e6],                  #Tolerance for stopping criterion.
               'degree': [1,2,3],                                               #Degree of the polynomial kernel function (‘poly’). Ignored by all other kernels.
               'C': [50, 10, 1.0, 0.1, 0.01],                                   #Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive.
               'decision_function_shape': ['ovo', 'ovr'],                       #The parameter is ignored for binary classification. Whether to return a one-vs-rest (‘ovr’) decision function of shape (n_samples, n_classes) as all other classifiers, or the original one-vs-one (‘ovo’) decision function of libsvm which has shape (n_samples, n_classes * (n_classes - 1) / 2). 
               'gamma': ['scale', 'auto'] }]    

### Random Search Parameter Tuning

In [55]:
# model
model = SVC()

# Scale data
X_train_scaled = scaler.transform(X_train)
    
# Hyperparameters
rcv = RandomizedSearchCV(model, SVM_Params, n_iter=100, cv=10, scoring="roc_auc")

# Fit
rcv.fit(X_train_scaled, y_train)

# Results
best_rcv_paramns = rcv.best_params_

In [56]:
print(f"best_r2: {rcv.best_score_.round(4)}")
best_rcv_paramns

best_r2: 0.9937


{'tol': 1e-06,
 'kernel': 'linear',
 'gamma': 'auto',
 'degree': 3,
 'decision_function_shape': 'ovo',
 'C': 10}

### Grid Search Parameter Tuning

In [58]:
SVM_Params_Grid = [{'kernel': ['linear'],                  #Specifies the kernel type to be used in the algorithm.
                    'tol': [1e-8, 1e-6, 1e-4],                  #Tolerance for stopping criterion.
                    'degree': [2,3,4],                                               #Degree of the polynomial kernel function (‘poly’). Ignored by all other kernels.
                    'C': [5,10,15],                                   #Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive.
                    'decision_function_shape': ['ovo'],                       #The parameter is ignored for binary classification. Whether to return a one-vs-rest (‘ovr’) decision function of shape (n_samples, n_classes) as all other classifiers, or the original one-vs-one (‘ovo’) decision function of libsvm which has shape (n_samples, n_classes * (n_classes - 1) / 2). 
                    'gamma': ['auto'] }]  

In [59]:
# model
model = SVC()

# Hyperparameters
gscv = GridSearchCV(model, SVM_Params_Grid, cv=10, scoring='roc_auc', n_jobs=2)

# fit
gscv.fit(X_train, y_train)

# Results
best_gscv_parameters = gscv.best_params_

In [60]:
print(f"best_r2: {rcv.best_score_.round(4)}")
best_rcv_paramns

best_r2: 0.9937


{'tol': 1e-06,
 'kernel': 'linear',
 'gamma': 'auto',
 'degree': 3,
 'decision_function_shape': 'ovo',
 'C': 10}

---

## Save Model

In [13]:
import pickle

In [64]:
file = 'best_classifier.sav'
pickle.dump(gscv, open(file, 'wb'))
print("Model saved successfully!")

Model saved successfully!


---

## Load Model

In [65]:
# Load model
file = 'best_classifier.sav'
final_model = pickle.load(open(file, 'rb'))
print("Model loaded successfully!")

Model loaded successfully!


## Making Predictions

In [68]:
# Make Predictions
X_test_scaled = scaler.transform(X_test)
y_pred = final_model.predict(X_test_scaled).round(1)

y_test_names = ['Malign' if x == 0 else 'Benign' for x in y_test]
y_pred_names = ['Malign' if x == 0 else 'Benign' for x in y_pred]

In [69]:
# View Predictions
dict_df = {
    "y_test": y_test_names,
    "y_pred": y_pred_names
}

pd.DataFrame(dict_df)

Unnamed: 0,y_test,y_pred
0,Benign,Benign
1,Malign,Benign
2,Malign,Benign
3,Benign,Benign
4,Benign,Benign
...,...,...
109,Benign,Benign
110,Malign,Benign
111,Benign,Benign
112,Benign,Benign


---