In [14]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings("ignore")

In [15]:
data = pd.read_csv(r"../Data/03_Prepared_data.csv")

In [16]:
random_state = 1

In [17]:
data

Unnamed: 0,Clump Thickness,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,5,1,1,2,1,3,1,1,0
1,5,4,5,7,10,3,2,1,0
2,3,1,1,2,2,3,1,1,0
3,6,8,1,3,4,3,7,1,0
4,4,1,3,2,1,3,1,1,0
...,...,...,...,...,...,...,...,...,...
694,3,1,1,3,2,1,1,1,0
695,2,1,1,2,1,1,1,1,0
696,5,10,3,7,3,8,10,2,1
697,4,6,4,3,4,10,6,1,1


In [29]:
features = data.drop(["Class"], axis=1)
labels = data["Class"]

In [19]:
# # From all data split train (X, y) and test (X_test, y_test)
X, X_test, y, y_test = train_test_split(features, labels, test_size=0.3, random_state=random_state)

# Creating an scaler object
std_slc = StandardScaler()

# Creating a pca object
pca = PCA()

# Creating a logistic regression object
logistic_Reg = LogisticRegression(random_state=random_state)

In [20]:
# Creating a pipeline of three steps
# First, standardizing the data
# Second, transforming the data with PCA
# Third, training a logistic regression on the data
pipe = Pipeline(steps=[('std_slc', std_slc),
                       ('pca', pca),
                       ('logistic_Reg', logistic_Reg)])

In [21]:
# Creating a list of a sequence of integers from 1 to 8 (the number of features in X + 1)
n_components = list(range(1, X.shape[1] + 1, 1))
# Creating a list of values of the regularization parameter
# C = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
c = np.logspace(-4, 4, 50)
# Creating a list of options for the regularization penalty
penalty = ['l1', 'l2']
# Creating a dictionary of all the parameter options
hyper_parameters = dict(pca__n_components=n_components,
                        logistic_Reg__C=c,
                        logistic_Reg__penalty=penalty)

In [22]:
# Creating a grid search object
# First split (X,y) into (X_train, y_train) and (X_val, y_val)
cv = KFold(n_splits=10, random_state=random_state, shuffle=True)
# score = ['recall', 'accuracy', 'roc_auc']
clf_GS = GridSearchCV(pipe, hyper_parameters, cv=cv, scoring='recall')
# clf_GS_acc = GridSearchCV(pipe, hyper_parameters, cv=cv, scoring='accuracy')
# clf_GS_auc = GridSearchCV(pipe, hyper_parameters, cv=cv, scoring='roc_auc')

# Fitting the grid search
clf_GS.fit(X, y)

GridSearchCV(cv=KFold(n_splits=10, random_state=1, shuffle=True),
             estimator=Pipeline(steps=[('std_slc', StandardScaler()),
                                       ('pca', PCA()),
                                       ('logistic_Reg',
                                        LogisticRegression(random_state=1))]),
             param_grid={'logistic_Reg__C': array([1.00000000e-04, 1.45634848e-04, 2.12095089e-04, 3.08884360e-04,
       4.49843267e-04, 6.55128557e-04, 9.54095476e-04, 1.38949549e-03,
       2.02358...
       1.67683294e+01, 2.44205309e+01, 3.55648031e+01, 5.17947468e+01,
       7.54312006e+01, 1.09854114e+02, 1.59985872e+02, 2.32995181e+02,
       3.39322177e+02, 4.94171336e+02, 7.19685673e+02, 1.04811313e+03,
       1.52641797e+03, 2.22299648e+03, 3.23745754e+03, 4.71486636e+03,
       6.86648845e+03, 1.00000000e+04]),
                         'logistic_Reg__penalty': ['l1', 'l2'],
                         'pca__n_components': [1, 2, 3, 4, 5, 6, 7, 8]},
        

In [24]:
# Viewing The Best Parameters
# print('Best Penalty:', clf_GS.best_estimator_.get_params()['logistic_Reg__penalty'])
# print('Best C:', clf_GS.best_estimator_.get_params()['logistic_Reg__C'])
# print('Best Number Of Components:', clf_GS.best_estimator_.get_params()['pca__n_components'])

# Choose best estimator
best_model = clf_GS.best_estimator_.named_steps['logistic_Reg']

# Confirm transformations into X_test data StandardScaler() and PCA()
test_full_imp = clf_GS.best_estimator_.named_steps['std_slc'].transform(X_test)
test_final = clf_GS.best_estimator_.named_steps['pca'].transform(test_full_imp)

In [25]:
# Use best model for prediction
y_pred = best_model.predict(test_final)

In [26]:
# Print classification report
target_names = ['Malignant', 'Benign']
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

   Malignant       0.95      0.98      0.96       137
      Benign       0.96      0.90      0.93        73

    accuracy                           0.95       210
   macro avg       0.95      0.94      0.95       210
weighted avg       0.95      0.95      0.95       210



## Saving the model via pickle

In [30]:
# import pickle
# filename = r'../Trained models/LogisticRegression_trained.sav'
# pickle.dump(best_model, open(filename, 'wb'))