In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, classification_report
from sklearn.preprocessing import StandardScaler

In [None]:
def train_lr_model(X_train, y_train, n_cross_train):
    n_cross_train = 10
    c_range = np.arange(0.01, 0.1, 0.02)
    param_grid = [ 
        {'penalty': ['l1'], 'solver': [ 'liblinear' ], 'C': c_range, 'n_jobs': [1]},
        {'penalty': ['l2'], 'solver': [ 'liblinear' ], 'C': c_range, 'n_jobs': [1]},
        {'penalty': ['l1'], 'solver': [ 'saga'], 'C': c_range,  'n_jobs': [-1]},
        {'penalty': ['l2'], 'solver': [ 'lbfgs', 'newton-cg', 'sag'], 'C': c_range,  'n_jobs': [-1]}
    ]
    scoring = {'AUC': 'roc_auc', 'Accuracy': make_scorer(accuracy_score)}
    log_clf = LogisticRegression(random_state = 0, max_iter = 400)
    log_clf = GridSearchCV(log_clf, param_grid, cv = n_cross_train, refit='AUC', return_train_score = True, n_jobs = -1, scoring = scoring)
    log_clf = log_clf.fit(X_train, y_train)
    return log_clf

def test_model(classifier, X_test_df, y_tes_df):
    y_pred = classifier.predict(X_test)

    print("Global Precision:")
    print(accuracy_score(y_test, y_pred))
    print("\n")

    print("General Report:")
    print(classification_report(y_test, y_pred, target_names=['yes','no']))
    print("\n")

    print("Confusion Matrix:")
    matriz_confusion = confusion_matrix(y_test, y_pred)
    
    table = pd.DataFrame(matriz_confusion)
    table.columns = sorted(set(y_test))
    table.index = sorted(set(y_test))
    print(table)

    show_roc_cuve(y_test, y_pred)
    
def scale_data(X):
    columns = list(X)
    scaler = StandardScaler()
    scaler = scaler.fit(X)
    scaled_df = scaler.fit_transform(X)
    return pd.DataFrame(scaled_df, columns=columns)

def show_train_results(classifier, features):
    print("Best Params:", classifier.best_params_, classifier.best_score_)
    print()
    
    params = classifier.cv_results_['params']
    
    mean_train_auc = classifier.cv_results_['mean_train_AUC']
    mean_train_acc  = classifier.cv_results_['std_train_Accuracy']
    print('Train Scores')
    print_scores(mean_train_auc, mean_train_acc, params)
    
    mean_test_auc = classifier.cv_results_['mean_test_AUC']
    mean_test_acc  = classifier.cv_results_['std_test_Accuracy']
    print('Test Scores')
    print_scores(mean_test_auc, mean_test_acc, params)
     
    return [mean_test_auc, mean_test_acc]

def show_roc_cuve(y_test, y_pred):
    fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred)
    plt.figure(1)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(fpr_rf, tpr_rf, label='Model')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC Curve')
    plt.legend(loc='best')
    plt.show()

def print_scores(means, stds, clf_params):
    for mean, std, params in zip(means, stds, clf_params):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

In [3]:
database_folder = file_name = '../../Databases/Sinteticas'
database_path = '/breast-cancer'

In [4]:
#Load Attributes
file_name = database_folder + database_path + '/database_attr.csv'
X = pd.read_csv(file_name, sep=',')
X.head()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [9]:
#Load Gold Standard Label
file_name = database_folder + database_path + '/database_labels.csv'
labels = pd.read_csv(file_name, sep=',')
y = labels[['y']]
y.head()

Unnamed: 0,y
0,True
1,True
2,True
3,True
4,True


In [10]:
#Pre Process Data
X_scaled = scale_data(X)
X_scaled.describe()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,-1.256562e-16,1.049736e-16,-1.272171e-16,-1.900452e-16,1.045834e-16,2.528733e-16,-1.08876e-16,-5.619407e-17,2.14435e-16,5.80672e-16,...,-7.988142e-16,-1.834112e-17,-4.015534e-16,-2.848727e-17,-2.189227e-16,-2.579464e-16,1.143393e-16,2.98531e-16,2.064351e-16,2.25947e-16
std,1.00088,1.00088,1.00088,1.00088,1.00088,1.00088,1.00088,1.00088,1.00088,1.00088,...,1.00088,1.00088,1.00088,1.00088,1.00088,1.00088,1.00088,1.00088,1.00088,1.00088
min,-2.029648,-2.229249,-1.984504,-1.454443,-3.112085,-1.610136,-1.114873,-1.26182,-2.744117,-1.819865,...,-1.726901,-2.223994,-1.693361,-1.222423,-2.682695,-1.443878,-1.305831,-1.745063,-2.16096,-1.601839
25%,-0.6893853,-0.7259631,-0.6919555,-0.6671955,-0.7109628,-0.747086,-0.7437479,-0.7379438,-0.7032397,-0.7226392,...,-0.6749213,-0.7486293,-0.6895783,-0.6421359,-0.6912304,-0.6810833,-0.7565142,-0.7563999,-0.6418637,-0.6919118
50%,-0.2150816,-0.1046362,-0.23598,-0.2951869,-0.03489108,-0.2219405,-0.3422399,-0.3977212,-0.0716265,-0.1782793,...,-0.2690395,-0.04351564,-0.2859802,-0.3411812,-0.04684277,-0.2695009,-0.2182321,-0.2234689,-0.1274095,-0.2164441
75%,0.4693926,0.5841756,0.4996769,0.3635073,0.636199,0.4938569,0.5260619,0.6469351,0.5307792,0.4709834,...,0.5220158,0.6583411,0.540279,0.3575891,0.5975448,0.5396688,0.5311411,0.71251,0.4501382,0.4507624
max,3.971288,4.651889,3.97613,5.250529,4.770911,4.568425,4.243589,3.92793,4.484751,4.910919,...,4.094189,3.885905,4.287337,5.930172,3.955374,5.112877,4.700669,2.685877,6.046041,6.846856


In [11]:
# Split in Train and Test
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state = 0, shuffle = True, stratify = y)

In [12]:
lr_model = train_lr_model(X_train, y_train, 5)
train_results = show_train_results(lr_model, X_train.columns)

Best Params: {'C': 0.08999999999999998, 'n_jobs': 1, 'penalty': 'l2', 'solver': 'liblinear'} 0.9954438860971524

Train Scores
0.973 (+/-0.012) for {'C': 0.01, 'n_jobs': 1, 'penalty': 'l1', 'solver': 'liblinear'}
0.987 (+/-0.011) for {'C': 0.03, 'n_jobs': 1, 'penalty': 'l1', 'solver': 'liblinear'}
0.991 (+/-0.009) for {'C': 0.049999999999999996, 'n_jobs': 1, 'penalty': 'l1', 'solver': 'liblinear'}
0.993 (+/-0.007) for {'C': 0.06999999999999999, 'n_jobs': 1, 'penalty': 'l1', 'solver': 'liblinear'}
0.995 (+/-0.005) for {'C': 0.08999999999999998, 'n_jobs': 1, 'penalty': 'l1', 'solver': 'liblinear'}
0.995 (+/-0.005) for {'C': 0.01, 'n_jobs': 1, 'penalty': 'l2', 'solver': 'liblinear'}
0.996 (+/-0.005) for {'C': 0.03, 'n_jobs': 1, 'penalty': 'l2', 'solver': 'liblinear'}
0.997 (+/-0.004) for {'C': 0.049999999999999996, 'n_jobs': 1, 'penalty': 'l2', 'solver': 'liblinear'}
0.997 (+/-0.005) for {'C': 0.06999999999999999, 'n_jobs': 1, 'penalty': 'l2', 'solver': 'liblinear'}
0.997 (+/-0.004) for {'

  y = column_or_1d(y, warn=True)


In [13]:
test_model(lr_model, X_test, y_test)

Global Precision:
0.9590643274853801


General Report:
              precision    recall  f1-score   support

         yes       0.97      0.96      0.97       107
          no       0.94      0.95      0.95        64

   micro avg       0.96      0.96      0.96       171
   macro avg       0.96      0.96      0.96       171
weighted avg       0.96      0.96      0.96       171



Confusion Matrix:


ValueError: Length mismatch: Expected axis has 2 elements, new values have 1 elements