# Import required libaries

In [111]:
# Numerical Python and Pandas for data manipulation
import numpy as np
import pandas as pd

In [115]:
# Necessary libraries from Sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import learning_curve

In [116]:
# Algorithms
from sklearn.linear_model import LogisticRegression
from xgboost import plot_importance

In [117]:
# Visualization
from matplotlib import pyplot
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

In [118]:
# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Import Data

In [119]:
# Load data
data = pd.read_excel("breast_cancer_dataset_2.xlsx")

In [120]:
# Sneak peak
data.head(3)

Unnamed: 0,id,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Diagnosis
0,842302,11.0,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,1
1,842517,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,1
2,84300903,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,1


In [121]:
# Data rows vs. columns
data.shape

(569, 32)

In [122]:
# Calculation class distribution
target_counts = data.groupby("Diagnosis").size()
target_counts

Diagnosis
0    357
1    212
dtype: int64

# Training the Machine Learning Algorithm

In [123]:
# Based on XGBoost feature importance, the following are the 3 most important features:
X = data[['texture_worst', 'texture_mean', 'concave points_mean']]

In [124]:
# Defining y
y = data.Diagnosis 
y.head(2)

0    1
1    1
Name: Diagnosis, dtype: int64

In [125]:
validation_size = 0.2
seed = 12
X_train, X_test, y_train, y_test, = train_test_split(X, y, test_size=validation_size, random_state=seed)

In [126]:
num_fold = 10
kfold = KFold(n_splits=10, random_state=12)
seed = 12

In [127]:
# MinMaxScale (without it, accuracy would be around 60%!)
mms = MinMaxScaler()

In [128]:
# Avoid date leakage by scaling train and test seperately
X_train_norm = mms.fit_transform(X_train)
X_test_norm = mms.transform(X_test)

In [129]:
model = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=0.95, fit_intercept=True, 
                           intercept_scaling=1, class_weight=None, random_state=12, 
                           solver='warn', max_iter=100, multi_class='warn', verbose=0, 
                           warm_start=False, n_jobs=None)

# C = 0.95. C is the inverse of regularization strength. Large values of C give more freedom (higher variance).
# L2 Regularization adds a penalty to the cost function -> in less extreme weight values.
# class_weight = "balanced" will automatically weigh classes inversely proportional to their frequency
# Random state = 12 as always for the pseudorandom number generator.

In [130]:
model.fit(X_train_norm, y_train)

LogisticRegression(C=0.95, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=12, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [131]:
num_folds = 3
seed = 12

kfold = KFold(n_splits=num_folds, random_state=seed)
results = cross_val_score(model, X_test_norm, y_test, cv=kfold)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))

Accuracy: 92.982% (4.473%)


# Preparation for connecting with Tableau

In [132]:
# We need to pickle the model
import pickle

In [133]:
pickle.dump(model, open('/Users/francoarda/Documents/Private/DBA/Thesis/Breast_Cancer/cancer_model.p', 'wb'))

In [134]:
# _arg1 = texture_worsts, _arg2 = texture_mean, _arg3 = concave poits_mean
def Breast_Cancer(_arg1, _arg2, _arg3):

    import numpy as np
    import pandas as pd
    df_ = np.column_stack((_arg1, _arg2, _arg3))
    
    # low & high limit values
    df_limits = pd.DataFrame([[12.020000, 9.710000, 0.000000],
                [49.540000, 39.280000, 0.201200]])
    df_ = pd.DataFrame(df_).append(df_limits)
    
    from sklearn.preprocessing import MinMaxScaler
    mmsNew = MinMaxScaler()
    df_ = mmsNew.fit_transform(df_)
    
    import pickle
    model = pickle.load(open('/Users/francoarda/Documents/Private/DBA/Thesis/Breast_Cancer/cancer_model.p', 'rb'))
    predict = model.predict_proba(df_)

    # skip prediction for the limits which are the last two rows
    pred = [x[1] for x in predict][:-2]
    return pred

In [135]:
import tabpy_client
connection = tabpy_client.Client('http://localhost:9004/')

In [136]:
connection.deploy('Breast_Cancer', Breast_Cancer, 'Breast_Cancer', override = True)