# SVM classifier

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
 
# Sklearn modules & classes
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import datasets
from sklearn import metrics

In [2]:
# Load the data set
bc = datasets.load_breast_cancer(as_frame=True)
X = bc.data
y = bc.target
 
# Create training and test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)

The breast cancer dataset is a classic and very easy binary classification dataset.

Classes:2

Samples per class:212(M),357(B)

Samples total:569

Dimensionality:30

Features:real, positive

lets see some discreption of the data 

In [3]:
print(bc.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

In [4]:
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

kernel parameters selects the type of hyperplane used to separate the data. 
Using ‘linear’ will use a linear hyperplane (a line in the case of 2D data). 
‘rbf’ and ‘poly’ uses a non linear hyper-plane

In [5]:
kernels = ["linear", "rbf", "poly"]
for kernel in kernels:
    svc= SVC(kernel=kernel).fit(X_train_std, y_train)
    print(" kernel: "+ str(kernel))
    print('svc model intercept: {}'
     .format(svc.intercept_))
#     print('svc model coeff:\n{}'
#      .format(svc.coef_))
    print('R-squared score (training): {:.3f}'
     .format(svc.score(X_train_std, y_train)))
    print('R-squared score (test): {:.3f}'
     .format(svc.score(X_test_std, y_test)))

 kernel: linear
svc model intercept: [0.31885692]
R-squared score (training): 0.990
R-squared score (test): 0.953
 kernel: rbf
svc model intercept: [-0.25627728]
R-squared score (training): 0.990
R-squared score (test): 0.977
 kernel: poly
svc model intercept: [0.6672142]
R-squared score (training): 0.902
R-squared score (test): 0.901


gamma is a parameter for non linear hyperplanes.
The higher the gamma value it tries to exactly fit the training data set

overfit عند ارتفاع قيمتها نلاحظ حصول

In [6]:
gammas = ["auto",0.1, 1, 10, 100]
for gamma in gammas:
    svc = SVC(kernel="rbf", gamma=gamma).fit(X_train_std, y_train)
    print(" gammas' value : "+ str(gamma))
    print('svc model intercept: {}'
     .format(svc.intercept_))
#     print('svc model coeff:\n{}'
#      .format(svc.coef_))
    print('R-squared score (training): {:.3f}'
     .format(svc.score(X_train_std, y_train)))
    print('R-squared score (test): {:.3f}'
     .format(svc.score(X_test_std, y_test)))

 gammas' value : auto
svc model intercept: [-0.25627728]
R-squared score (training): 0.990
R-squared score (test): 0.977
 gammas' value : 0.1
svc model intercept: [-0.20591511]
R-squared score (training): 0.995
R-squared score (test): 0.942
 gammas' value : 1
svc model intercept: [0.32223834]
R-squared score (training): 1.000
R-squared score (test): 0.626
 gammas' value : 10
svc model intercept: [0.4079998]
R-squared score (training): 1.000
R-squared score (test): 0.626
 gammas' value : 100
svc model intercept: [0.408]
R-squared score (training): 1.000
R-squared score (test): 0.626


C is the penalty parameter of the error term.
It controls the trade off between smooth decision boundary and classifying the training points correctly.


regularization يمثل هذا المتغير قيمة ال

In [7]:
cs = [0.1, 1, 10, 100, 1000]
for c in cs:
    svc = SVC(kernel="rbf",gamma="auto", C=c).fit(X_train_std, y_train)
    print("C =" + str(c))
    print('svc model intercept: {}'
     .format(svc.intercept_))
    print('R-squared score (training): {:.3f}'
     .format(svc.score(X_train_std, y_train)))
    print('R-squared score (test): {:.3f}'
     .format(svc.score(X_test_std, y_test)))

C =0.1
svc model intercept: [-0.22787928]
R-squared score (training): 0.950
R-squared score (test): 0.947
C =1
svc model intercept: [-0.25627728]
R-squared score (training): 0.990
R-squared score (test): 0.977
C =10
svc model intercept: [-0.30622816]
R-squared score (training): 0.995
R-squared score (test): 0.971
C =100
svc model intercept: [-0.35306207]
R-squared score (training): 1.000
R-squared score (test): 0.971
C =1000
svc model intercept: [-0.35306207]
R-squared score (training): 1.000
R-squared score (test): 0.971


عند ارتفاع قيمته نلاحظ حصول overfit 
وأفضل قيمة له هي 1

best parameter (kernel = rbf,C = 1, gamma = auto)

    -svc model intercept: [-0.25627728]
    -R-squared score (training): 0.990
    -R-squared score (test): 0.977

In [10]:
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score

classifier_svm = SVC (kernel = 'rbf', random_state = 0)
classifier_svm.fit (X_train, y_train)
Y_pred_svm = classifier_svm.predict (X_test)
acc_svm = accuracy_score (y_test, Y_pred_svm)

In [11]:
acc_svm

0.9239766081871345

In [13]:
from sklearn.model_selection import GridSearchCV

parameters = [{'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'kernel': ['linear', 'rbf'],
                'gamma': ['auto',0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]}]
grid_search = GridSearchCV(estimator = classifier_svm,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 10,
                           n_jobs = -1)
grid_search.fit(X_train, y_train)
best_accuracy_svm = grid_search.best_score_
best_parameters = grid_search.best_params_
print(best_accuracy_svm)
print(best_parameters)

KeyboardInterrupt: 

# Logistic Regression

In [3]:
# Load the data set
bc = datasets.load_breast_cancer(as_frame=True)
X = bc.data
y = bc.target
 
# Create training and test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)

In [4]:
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import confusion_matrix, accuracy_score

classifier_log = LogisticRegression ()
classifier_log.fit (X_train, y_train)
Y_pred_log = classifier_log.predict (X_test)
acc_log = accuracy_score (y_test, Y_pred_log)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [8]:
acc_log

0.935672514619883

In [10]:
from sklearn.model_selection import GridSearchCV
# solver --> optimization
# C --> regularization strength >> Smaller values specify stronger regularization
# penalty --> regularization
parameters = [{'penalty': ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}]
grid_search = GridSearchCV(estimator = classifier_log,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 10,
                           n_jobs = -1)
grid_search.fit(X_train, y_train)
best_accuracy_log = grid_search.best_score_
best_parameters = grid_search.best_params_
print(best_accuracy_log)
print(best_parameters)

210 fits failed out of a total of 700.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
70 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\HP\anaconda3\envs\nlp\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\HP\anaconda3\envs\nlp\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\HP\anaconda3\envs\nlp\lib\site-packages\sklearn\linear_model\_logistic.py", line 449, in _check_solver
    % (solver, penalty)
ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.

--

0.9673717948717948
{'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}




## knn

In [11]:
# Load the data set
bc = datasets.load_breast_cancer(as_frame=True)
X = bc.data
y = bc.target
 
# Create training and test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)

In [12]:
from sklearn.neighbors import KNeighborsClassifier
classifier_knn = KNeighborsClassifier ()
classifier_knn.fit (X_train, y_train)
Y_pred_knn = classifier_knn.predict (X_test)
acc_knn = accuracy_score (y_test, Y_pred_knn)

In [13]:
acc_knn

0.9298245614035088

In [14]:
# n_neighbors --> represents the number of neighbors to use for kneighbors queries
# weights --> how weight should be distributed between neighbor values. uniform: weights to be distributed equally
# p --> for the Minkowski metric. p = 1 -> manhattan_distance (l1), euclidean_distance (l2) for p = 2.
parameters = [{'n_neighbors': [3,5,7,10,13,15], 'weights': ['uniform', 'distance'],
                'p': [1,2]}]
grid_search = GridSearchCV(estimator = classifier_knn,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 10,
                           n_jobs = -1)
grid_search.fit(X_train, y_train)
best_accuracy_knn = grid_search.best_score_
best_parameters = grid_search.best_params_
print(best_accuracy_knn)
print(best_parameters)

0.9498076923076922
{'n_neighbors': 5, 'p': 1, 'weights': 'distance'}


## naive bayes

In [19]:
from sklearn.naive_bayes import GaussianNB
classifier_nb = GaussianNB ()
classifier_nb.fit (X_train, y_train)
Y_pred_nb = classifier_nb.predict (X_test)
cm_nb = confusion_matrix (y_test, Y_pred_nb)
acc_nb = accuracy_score (y_test, Y_pred_nb)

In [20]:
acc_nb

0.935672514619883

there are no parameters to tune 

## Decision Tree

In [21]:
from sklearn.tree import DecisionTreeClassifier
classifier_dtc = DecisionTreeClassifier (criterion = 'entropy', random_state = 0)
classifier_dtc.fit (X_train, y_train)
Y_pred_dtc = classifier_dtc.predict (X_test)
acc_dtc = accuracy_score (y_test, Y_pred_dtc)

In [22]:
acc_dtc

0.9473684210526315

In [23]:
# criterion --> measure the quality of a split
#min_samples_split --> minimum number of samples required to split an internal node:
#max_depth -> deep of the tree

parameters = [{'criterion':['gini','entropy'],'max_depth':[4,5,6,7,8,9,10,11,12,15,20,30,40,50,70,90,120,150], 
                'max_leaf_nodes': [2,4,6,10,15,30,40,50,100], 'min_samples_split': [2, 3, 4]}]
grid_search = GridSearchCV(estimator = classifier_dtc,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 10,
                           n_jobs = -1)
grid_search.fit(X_train, y_train)
best_accuracy_dtc = grid_search.best_score_
best_parameters = grid_search.best_params_
print(best_accuracy_dtc)
print(best_parameters)

0.9475
{'criterion': 'gini', 'max_depth': 5, 'max_leaf_nodes': 15, 'min_samples_split': 2}


## Random Forest

In [27]:


from sklearn.ensemble import RandomForestClassifier
classifier_rfc = RandomForestClassifier (n_estimators = 100, criterion = 'entropy', random_state = 1)
classifier_rfc.fit (X_train, y_train)
Y_pred_rfc = classifier_rfc.predict (X_test)
acc_rfc = accuracy_score (y_test, Y_pred_rfc)

In [28]:
acc_rfc

0.9590643274853801

In [29]:
parameters = [{'n_estimators': [100,200,300],
               'max_features': ['auto', 'sqrt'],
               'max_depth': [10,25,50,'none'],
               'min_samples_leaf': [1, 2], 
               'min_samples_split': [2, 5]}]
grid_search = GridSearchCV(estimator = classifier_rfc,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 10,
                           n_jobs = -1)
grid_search.fit(X_train, y_train)
best_accuracy_rfc = grid_search.best_score_
best_parameters = grid_search.best_params_
print(best_accuracy_rfc)
print(best_parameters)

240 fits failed out of a total of 960.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
240 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\HP\anaconda3\envs\nlp\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\HP\anaconda3\envs\nlp\lib\site-packages\sklearn\ensemble\_forest.py", line 467, in fit
    for i, t in enumerate(trees)
  File "C:\Users\HP\anaconda3\envs\nlp\lib\site-packages\joblib\parallel.py", line 1048, in __call__
    if self.dispatch_one_batch(iterator):
  File "C:\Users\HP\anaconda3\envs\nlp\lib\site-packages\joblib\parallel.py", line 864, in dispatch_one_batch
    self._dispatc

0.9674358974358974
{'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
