In [24]:
# Importing modules
import pandas as pd
import numpy as np

from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from statistics import mean, stdev

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE 
from imblearn.pipeline import Pipeline

from sklearn.metrics import confusion_matrix

<div class="alert alert-block alert-success">
<b>Part one</b> </div>

In [28]:
# Row labels of the two result tables: one row per feature set (first list) and one
# row per class-imbalance strategy (second list)
set_labels_1 = [
    'Full set',
    '95% variance set',
    '25 PCA set',
    '16 PCA set',
    '9 PCA set',
    '4 PCA set',
]
set_labels_2 = [
    'Full set',
    'Balanced Class Wt.',
    'Balanced Subsample Class Wt.',
    'Under-sampling',
    'Over-sampling',
    'Over/Under Combination',
]

# Column headers of the accuracy table and of the precision/recall table
set_columns_1 = [
    'Average Accuracy',
    'Accuracy Deviation',
]
set_columns_2 = [
    'Avg. Prec.',
    'Prec. Dev.',
    'Avg. Recall',
    'Recall Dev.',
]

In [33]:
def TransformDataForTable(accuracies):
    """Summarise every score series as a (mean, sample standard deviation) row.

    Parameters
    ----------
    accuracies : iterable of sequences of numbers
        One sequence of scores per table row.

    Returns
    -------
    numpy.ndarray
        One row per input sequence; column 0 holds the mean and column 1 the sample
        standard deviation, both rounded to two decimals.

    Raises
    ------
    statistics.StatisticsError
        If a sequence is too short for ``mean`` / ``stdev`` (``stdev`` needs at least
        two values).
    """
    # A single pass keeps the mean-then-deviation evaluation order for every series.
    summary = [(round(mean(scores), 2), round(stdev(scores), 2)) for scores in accuracies]

    averages = [average for average, _ in summary]
    deviations = [deviation for _, deviation in summary]

    # Stacked as [averages, deviations] and transposed so that each series becomes a row.
    return np.transpose([averages, deviations])

In [34]:
def PrecisionAndRecallCalculator(matricies):
    """Compute precision and recall of class 1 for a series of 2x2 confusion matrices.

    Every matrix is indexed as ``matrix[row, column]``. Following the layout produced by
    ``confusion_matrix`` (rows = true class, columns = predicted class), ``[1, 1]`` is
    used as the true positives, ``[0, 1]`` as the false positives and ``[1, 0]`` as the
    false negatives.

    Parameters
    ----------
    matricies : iterable of 2-D arrays
        Confusion matrices supporting ``matrix[i, j]`` indexing.

    Returns
    -------
    tuple of (list of float, list of float)
        The precisions and the recalls, in the order of the input matrices.
    """
    precisions, recalls = [], []

    for matrix in matricies:
        true_positives = matrix[1, 1]
        predicted_positives = matrix[0, 1] + true_positives
        actual_positives = matrix[1, 0] + true_positives

        # When a denominator is zero the ratio is undefined; the true-positive count
        # itself is recorded instead of dividing.
        precision = (float(true_positives / predicted_positives)
                     if predicted_positives != 0 else float(true_positives))
        recall = (float(true_positives / actual_positives)
                  if actual_positives != 0 else float(true_positives))

        precisions.append(precision)
        recalls.append(recall)

    return precisions, recalls

In [35]:
def PrecisionAndRecallReportGenerator(matricies):
    """Build one precision/recall summary row per group of confusion matrices.

    Parameters
    ----------
    matricies : iterable of collections of confusion matrices
        One collection per table row; each collection is handed to
        ``PrecisionAndRecallCalculator``.

    Returns
    -------
    list of list
        For every group: ``[mean precision, precision deviation, mean recall,
        recall deviation]``, each rounded to two decimals.
    """
    def summarise(group):
        # Precision and recall of every matrix in the group, then their mean / stdev.
        precision, recall = PrecisionAndRecallCalculator(group)
        return [
            round(mean(precision), 2),
            round(stdev(precision), 2),
            round(mean(recall), 2),
            round(stdev(recall), 2),
        ]

    return [summarise(group) for group in matricies]

In [36]:
def PrecisionAndRecallTableConstuctor(matricies, labels, columns):
    """Print the precision/recall summary of ``matricies`` as a labelled table.

    Parameters
    ----------
    matricies : iterable of collections of confusion matrices
        Passed to ``PrecisionAndRecallReportGenerator``; one collection per row.
    labels : sequence
        Row labels of the table.
    columns : sequence
        Column headers of the table.
    """
    report_rows = PrecisionAndRecallReportGenerator(matricies)
    table = pd.DataFrame(report_rows, index = pd.Index(labels), columns = columns)
    print(table)

In [30]:
def ReportConstructor(accuracies, labels, columns):
    """Print the mean / deviation summary of ``accuracies`` as a labelled table.

    Parameters
    ----------
    accuracies : iterable of sequences of numbers
        Passed to ``TransformDataForTable``; one sequence per row.
    labels : sequence
        Row labels of the table.
    columns : sequence
        Column headers of the table.
    """
    summary_rows = TransformDataForTable(accuracies)
    table = pd.DataFrame(summary_rows, index = pd.Index(labels), columns = columns)
    print(table)

In [5]:
# Empty per-run score lists, one per feature set (full, 95% variance, 25/16/9/4 components)
accuracy_full, accuracy_95p, accuracy_25 = [], [], []
accuracy_16, accuracy_9, accuracy_4 = [], [], []

# Empty score lists for the class-weight / resampling variants (bal, balsub, rus, ros, comb)
accuracy_bal, accuracy_balsub = [], []
accuracy_rus, accuracy_ros, accuracy_comb = [], [], []

In [37]:
# Load the data set and split it into two variables
# (x - every remaining measurement column, y - the 'Class' label column)
dataset = pd.read_csv('breast_cancer_genomic.csv')

# CLID is removed before the feature matrix is built (presumably a sample identifier - confirm)
dataset.pop('CLID')

# pop() hands back the label column and removes it from the frame in one step
y = dataset.pop('Class').values
x = dataset.values

In [38]:
# PCA reducers for the next steps: one configured with n_components = 0.95 (the
# '95% variance set') and four that keep a fixed number of components
pca_95p = PCA(n_components = 0.95)
pca_25, pca_16, pca_9, pca_4 = (PCA(n_components = count) for count in (25, 16, 9, 4))

In [39]:
# Base random forest classifier and the hyper-parameter candidates handed to the grid search
# NOTE(review): the grid is a list of two dicts; GridSearchCV presumably explores each dict as
# its own grid, so n_estimators and criterion would not be varied jointly - confirm intended
rf = RandomForestClassifier()
rf_params = [
    {'n_estimators': [10, 50, 100, 200, 400, 800, 1000]},
    {'criterion': ['gini', 'entropy']},
]

In [40]:
# Grid search over rf_params for the plain forest; candidates are ranked by accuracy
grid_search = GridSearchCV(
    estimator = rf,
    param_grid = rf_params,
    scoring = 'accuracy',
)

In [41]:
# Empty the part-one score lists in place so the runs below start from scratch
for run_scores in (accuracy_full, accuracy_95p, accuracy_25, accuracy_16, accuracy_9, accuracy_4):
    run_scores.clear()

In [None]:
# Executing the 10 runs that are needed in order to calculate the model accuracy

def _best_forest_accuracy(search, features_train, features_test, labels_train, labels_test):
    """Tune a forest on the training split and return its accuracy on the test split.

    Parameters
    ----------
    search : GridSearchCV
        The search to fit; it is (re)fitted on every call.
    features_train, features_test : array-like
        Feature matrices of the training and test split.
    labels_train, labels_test : array-like
        Labels of the training and test split.

    Returns
    -------
    float
        Accuracy of the best estimator on the test split.
    """
    # fit() returns the search object itself. grid_search is built without a `refit`
    # argument, and with GridSearchCV's default the winning configuration has already been
    # retrained on the whole training split, so best_estimator_ is scored directly instead
    # of being fitted a second time.
    fitted_search = search.fit(features_train, labels_train)
    return fitted_search.best_estimator_.score(features_test, labels_test)


# Each score list is paired with the PCA reducer that produces its feature set;
# None marks the unreduced ("full") feature set.
feature_sets = [
    (accuracy_full, None),
    (accuracy_95p, pca_95p),
    (accuracy_25, pca_25),
    (accuracy_16, pca_16),
    (accuracy_9, pca_9),
    (accuracy_4, pca_4),
]

for i in range(0, 10):

    # A fresh stratified 80/20 split for every run
    x_train, x_test, y_train, y_test = train_test_split(x, y, stratify = y, test_size = 0.2)

    for run_scores, reducer in feature_sets:
        if reducer is None:
            x_train_set, x_test_set = x_train, x_test
        else:
            # The reducer is fitted on the training split only and then applied to the
            # test split, so the projection never sees the test samples
            x_train_set = reducer.fit_transform(x_train)
            x_test_set = reducer.transform(x_test)

        run_scores.append(_best_forest_accuracy(grid_search, x_train_set, x_test_set, y_train, y_test))

In [None]:
# Score lists collected in the row order of set_labels_1
accuracies = [
    accuracy_full,
    accuracy_95p,
    accuracy_25,
    accuracy_16,
    accuracy_9,
    accuracy_4,
]

In [None]:
# Print the part-one accuracy summary, one row per feature set
ReportConstructor(accuracies = accuracies, labels = set_labels_1, columns = set_columns_1)

<div class="alert alert-block alert-success">
<b>Part two</b> </div>

In [None]:
# Transforming the data: class 4 becomes the positive class (1), every other class becomes 0.
# Binarising a second time would turn every label into 0 (no 4s are left after the first
# pass), so labels that are already binary are left untouched; this keeps the cell safe to
# re-run. The mapping is vectorised and keeps the dtype of the original label array.
if not np.isin(y, (0, 1)).all():
    y = np.where(y == 4, 1, 0).astype(y.dtype)

In [None]:
# Forests configured with class_weight = 'balanced' and 'balanced_subsample',
# each wrapped in its own accuracy-scored grid search over rf_params

rf_balanced = RandomForestClassifier(class_weight = 'balanced')
rf_balanced_subsample = RandomForestClassifier(class_weight = 'balanced_subsample')

grid_search_bal = GridSearchCV(
    estimator = rf_balanced,
    param_grid = rf_params,
    scoring = 'accuracy',
)
grid_search_balsub = GridSearchCV(
    estimator = rf_balanced_subsample,
    param_grid = rf_params,
    scoring = 'accuracy',
)

In [None]:
# Initializing the resamplers: random under-sampling of the majority class, SMOTE
# over-sampling of the minority class, and a pipeline that applies the first and then the second
# NOTE(review): after 'majority' under-sampling the classes are presumably already balanced,
# which would leave SMOTE nothing to add in the combined pipeline - confirm intended ratios
over_sampler = SMOTE(sampling_strategy = 'minority', random_state = 100)
under_sampler = RandomUnderSampler(sampling_strategy = 'majority', random_state = 50)

combined_steps = [('u', under_sampler), ('o', over_sampler)]
under_over = Pipeline(steps = combined_steps)

In [None]:
# Going through 10 turns

def _best_forest_confusion_matrix(search, features_train, labels_train, features_test, labels_test):
    """Tune a forest on the training data and return its confusion matrix on the test data.

    Parameters
    ----------
    search : GridSearchCV
        The search to fit; it is (re)fitted on every call.
    features_train, labels_train : array-like
        Training data, possibly resampled by the caller.
    features_test, labels_test : array-like
        Untouched test split used for the predictions.

    Returns
    -------
    numpy.ndarray
        ``confusion_matrix(y_true = labels_test, y_pred = predictions)``.
    """
    # fit() returns the search object itself. The searches are built without a `refit`
    # argument, and with GridSearchCV's default the winning configuration has already been
    # retrained on all the training data, so best_estimator_ predicts directly instead of
    # being fitted a second time.
    best_forest = search.fit(features_train, labels_train).best_estimator_
    return confusion_matrix(y_true = labels_test, y_pred = best_forest.predict(features_test))


confusion_matrices = []
matrices_full = []
matrices_bal = []
matrices_balsub = []
matrices_rus = []
matrices_ros = []
matrices_comb = []

# (result list, grid search to run, resampler applied to the training split or None)
strategies = [
    (matrices_full, grid_search, None),            # Full data set
    (matrices_bal, grid_search_bal, None),         # class_weight = balanced
    (matrices_balsub, grid_search_balsub, None),   # class_weight = balanced_subsample
    (matrices_rus, grid_search, under_sampler),    # random under-sampling
    (matrices_ros, grid_search, over_sampler),     # SMOTE over-sampling
    (matrices_comb, grid_search, under_over),      # under-sampling followed by over-sampling
]

for i in range(0, 10):
    # A fresh stratified 80/20 split for every turn
    x_train, x_test, y_train, y_test = train_test_split(x, y, stratify = y, test_size = 0.2)

    for matrices, search, resampler in strategies:
        if resampler is None:
            x_fit, y_fit = x_train, y_train
        else:
            # Only the training split is resampled; the test split is left as drawn
            x_fit, y_fit = resampler.fit_resample(x_train, y_train)

        matrices.append(_best_forest_confusion_matrix(search, x_fit, y_fit, x_test, y_test))

In [None]:
# Confusion-matrix lists collected in the row order of set_labels_2.
# The list is rebuilt from scratch (instead of appended to) so that re-running this cell
# cannot duplicate rows and end up with more rows than labels.
confusion_matrices = [
    matrices_full,
    matrices_bal,
    matrices_balsub,
    matrices_rus,
    matrices_ros,
    matrices_comb,
]

In [None]:
# Print the part-two precision/recall summary, one row per imbalance strategy
PrecisionAndRecallTableConstuctor(matricies = confusion_matrices, labels = set_labels_2, columns = set_columns_2)

> Answer

> Imbalanced classification is particularly hard because the class distribution is severely skewed and the misclassification costs are unequal.
These two properties cause poor performance with traditional machine learning models and evaluation metrics, which assume a balanced class distribution.

> Classification algorithms struggle to build reliable models
on very large datasets. Such datasets contain many
irrelevant and redundant features that mislead the classifiers.
Furthermore, many large datasets have an imbalanced class
distribution, which biases the classification process
toward the majority class.