In [None]:
from imports import *

In [None]:
from utils import *

In [None]:
# Upgrade scikit-learn in the user site-packages via a shell escape.
# NOTE(review): no version is pinned, while the cells below unpickle pre-trained
# models — presumably these need a compatible scikit-learn version; confirm.
!pip install -U scikit-learn --user

In [None]:
import os

# Create every output folder used later in the notebook: one folder per
# learning type / threshold for the confusion matrices, plus one for the bar
# graphs. os.makedirs also creates missing parents (e.g. "./results/" itself),
# and exist_ok=True keeps the cell safe to re-run.
for result_dir in (
    "./results/SL/thresh_20.0/",
    "./results/SL/thresh_60.0/",
    "./results/SSL/thresh_20.0/",
    "./results/SSL/thresh_60.0/",
    "./results/bargraph/",
):
  os.makedirs(result_dir, exist_ok=True)

### Load Test Data

In [None]:
# Load four test DataFrames through the project helper
# (presumably one per diagnosis test — confirm in utils).
test1, test2, test3, test4 = get_test_dataframe()

In [None]:
# Split the four test frames into features/labels. `data_tuple` is later iterated
# as (X_test, y_test) pairs and `targets` is used as the column names of the
# accuracy tables. normalize=True is forwarded to the helper
# (presumably enables feature normalization — confirm in utils).
data_tuple, targets, X1_test, y1_test, X2_test, y2_test, X3_test, y3_test, X4_test, y4_test = get_X_and_y(test1, test2, test3, test4,normalize=True)

### Get Classifiers (KNN, LogReg, SVM, Decision Tree, Random Forest, AdaBoost)

In [None]:
# `clfs` supplies the classifier objects whose class names locate the pickled
# models; `model_log` supplies the classifier labels used in printouts and plots.
clfs, model_log = get_classifiers()

### Load Pre-Trained Classifier Models for SL and SSL

In [None]:
def get_pickle_for_each_classifier(clf_name,type_learning="SL",percentage=None):
  """Build the paths of the four pickled models of one classifier.

  Args:
    clf_name: class name of the classifier, embedded in the file name.
    type_learning: "SL" selects the SL model folder; any other value selects
      the SSL model folder.
    percentage: name of the threshold sub-folder (e.g. "thresh_20.0").

  Returns:
    A list of four path strings, one per test index 0..3.
  """
  if type_learning == "SL":
    path_template = './SL_models/{}/SL_Test_{}_{}_model.pkl'
  else: # SSL
    path_template = './SSL_models/{}/SSL_Test_{}_{}_model.pkl'
  return [path_template.format(percentage, test_idx, clf_name) for test_idx in range(4)]

### Compute Classifier Model Performance for each of the 4 Diagnosis Tests (classes) for Cervical Cancer

In [None]:
def model_performance(data, clfs, targets, type_learning="SL", percentage=None):
  """Evaluate the pre-trained model of every classifier on every diagnosis test.

  For each classifier in ``clfs`` and each ``(X_test, y_test)`` pair in
  ``data``, the matching pickled model is loaded and used to predict. The
  accuracy is recorded, the confusion matrix is saved as a PNG, and a styled
  classification report is collected.

  Args:
    data: iterable of ``(X_test, y_test)`` pairs, in the same order as
      ``targets`` (only four pickle paths exist per classifier).
    clfs: classifier objects; only their class name is used, to locate the
      pickled models.
    targets: diagnosis test names; used as the columns of the accuracy table
      and in plot titles / file names.
    type_learning: "SL" selects the SL models and the SL results folder; any
      other value selects SSL (same rule as get_pickle_for_each_classifier).
    percentage: name of the threshold sub-folder, e.g. "thresh_20.0".

  Returns:
    A ``(performance, reports)`` tuple: ``performance`` is a DataFrame with one
    row per classifier and one column per target holding the accuracy rounded
    to 4 decimals; ``reports`` is a list of styled classification reports, one
    per classifier/test pair.
  """
  import os  # local import so the function does not depend on an earlier cell

  accuracyscore = {} # Dictionary to store accuracy scores
  performance = pd.DataFrame(columns=targets) # DataFrame to store classifier performance
  reports = []

  # Save into the folder of the learning type being evaluated. The folder used
  # to be hard-coded to SL, so SSL runs overwrote the SL confusion matrices.
  learning_dir = "SL" if type_learning == "SL" else "SSL"
  output_dir = './results/{}/{}'.format(learning_dir, percentage)
  os.makedirs(output_dir, exist_ok=True)

  for clf in clfs:
    clf_name = clf.__class__.__name__
    pickle_files = get_pickle_for_each_classifier(clf_name,type_learning,percentage)
    key = clf_name
    accuracyscore[key] = []
    for i, (X_test, y_test) in enumerate(data):
      # NOTE(review): pickle.load can execute arbitrary code; only load model
      # files that come from a trusted source.
      with open(pickle_files[i], 'rb') as f:
        classifier = pickle.load(f)

      # The file is only needed for loading, so predict after it is closed
      y_pred = classifier.predict(X_test)

      # Calculate accuracy score to select our best classifier in each test
      accuracy = round(accuracy_score(y_test, y_pred), 4)
      accuracyscore[key].append(accuracy)

      cm = confusion_matrix(y_test, y_pred)
      cm_display = ConfusionMatrixDisplay(cm).plot()
      # Address the display's own axes/figure instead of pyplot's "current" ones
      cm_display.ax_.set_title("{}_Test_{}_{}".format(percentage,targets[i],clf_name))
      cm_display.figure_.savefig(
          os.path.join(output_dir, 'Test{}_{}.png'.format(targets[i],clf_name)),
          bbox_inches = 'tight')
      # One figure is created per classifier/test pair; close it once saved so
      # they do not accumulate in memory.
      plt.close(cm_display.figure_)

      report = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).transpose()
      report = report.style.set_caption("Clf = {} Test = {} ".format(clf_name, targets[i]))
      reports.append(report)

    performance.loc[key, targets] = accuracyscore[key]

  return performance, reports

## SL Testing - 20% unlabelled data

In [None]:
# Evaluate the SL models stored under "thresh_20.0": returns the accuracy table
# and the list of styled classification reports.
performance_SL_20, reports_20 = model_performance(data_tuple, clfs, targets ,type_learning="SL", percentage="thresh_20.0")

In [None]:
# Show the SL (thresh_20.0) accuracy table (rows: classifiers, columns: diagnosis
# tests), then print the same table as LaTeX source.
display(performance_SL_20)
print(performance_SL_20.to_latex())
# Disabled: display every classification report (and its LaTeX source).
# for i in range(len(reports_20)):
#   display(reports_20[i])
  # print(reports_20[i].to_latex())

In [None]:
# For every classifier (one row of the SL thresh_20.0 accuracy table), report the
# diagnosis test (column) on which it reached its highest accuracy.
# Assumes model_log lists the classifiers in the same order as the table rows — confirm.
tests = list(performance_SL_20.columns)  # loop-invariant, so built once
best = []  # best diagnosis test per classifier, in model_log order
for i in range(len(model_log)):
  clf_acc = list(performance_SL_20.iloc[i])
  # Position of the highest accuracy: an index into `tests`, not into model_log
  index = np.argmax(clf_acc)
  best.append(tests[index])
  print("For " + str(model_log[i]) + ", the best test is: " + str(tests[index]))

In [None]:
# Plotting the test accuracy bar graph of all the SL classifiers (20 % unlabelled data)
# for Hinselmann (the best diagnosis test)
height = [clf_acc*100 for clf_acc in performance_SL_20.Hinselmann]
bars = model_log
x_pos = np.arange(len(bars))
plt.figure(figsize=(10, 6))
# Create bars and choose color
plt.barh(x_pos, height, color="green")
for index, value in enumerate(height):
  # Accuracies are rounded to 4 decimals, so 2 decimals are exact in percent;
  # formatting avoids float noise from value*100 in the label.
  plt.text(value, index, "{:.2f}".format(value), color="blue", fontsize="xx-large")
# Add title and axis names (the title names the threshold so that the 20 % and
# 60 % SL graphs can be told apart)
plt.title('Test Accuracy Bar Graph of all the SL classifiers (20 % unlabelled data) for Hinselmann (the best diagnosis test)', fontsize="xx-large")
plt.xlabel('Test Accuracy', fontsize="xx-large")
plt.ylabel('Classifiers', fontsize="xx-large")
# Create names on the y and x axes
plt.yticks(x_pos, bars, fontsize="xx-large")
plt.xticks(fontsize="xx-large")
# Show graph
plt.grid()

plt.savefig('./results/bargraph/SL_20.png', bbox_inches = 'tight')
plt.show()

## SSL Testing (20 % Unlabelled Data)

In [None]:
# Evaluate the SSL models stored under "thresh_20.0": returns the accuracy table
# and the list of styled classification reports.
performance_SSL_20, reports_SSL_20 = model_performance(data_tuple, clfs, targets, type_learning="SSL", percentage="thresh_20.0")

In [None]:
# Show the SSL (thresh_20.0) accuracy table (rows: classifiers, columns: diagnosis tests).
display(performance_SSL_20)
# Disabled: display every classification report (and its LaTeX source).
# for i in range(len(reports_SSL_20)):
#   display(reports_SSL_20[i])
  # print(reports_SSL_20[i].to_latex())

In [None]:
# For every classifier (one row of the SSL thresh_20.0 accuracy table), report the
# diagnosis test (column) on which it reached its highest accuracy.
# Assumes model_log lists the classifiers in the same order as the table rows — confirm.
tests = list(performance_SSL_20.columns)  # loop-invariant, so built once
best = []  # best diagnosis test per classifier, in model_log order
for i in range(len(model_log)):
  clf_acc = list(performance_SSL_20.iloc[i])
  # Position of the highest accuracy: an index into `tests`, not into model_log
  index = np.argmax(clf_acc)
  # Record the winner, as the other "best test" cells do (the list was left empty here)
  best.append(tests[index])
  print("For " + str(model_log[i]) + ", the best test is: " + str(tests[index]))

In [None]:
# Plotting the test accuracy bar graph of all the SSL classifiers (20 % unlabelled data) for Hinselmann (the best diagnosis test)
height = [clf_acc*100 for clf_acc in performance_SSL_20.Hinselmann]
bars = model_log
x_pos = np.arange(len(bars))
plt.figure(figsize=(10, 6))
# Create bars and choose color
plt.barh(x_pos, height, color="green")
for index, value in enumerate(height):
  # Accuracies are rounded to 4 decimals, so 2 decimals are exact in percent;
  # formatting avoids float noise from value*100 in the label.
  plt.text(value, index, "{:.2f}".format(value), color="blue", fontsize="xx-large")
# Add title and axis names
plt.title('Test Accuracy Bar Graph of all the SSL classifiers (20 % unlabelled data) for Hinselmann (the best diagnosis test)', fontsize="xx-large")
# Horizontal bars: accuracy runs along x, classifiers along y (labels were swapped)
plt.xlabel('Test Accuracy', fontsize="xx-large")
plt.ylabel('Classifiers', fontsize="xx-large")
# Create names on the y and x axes
plt.yticks(x_pos, bars, fontsize="xx-large")
plt.xticks(fontsize="xx-large")
# Show graph
plt.grid()

plt.savefig('./results/bargraph/SSL_20.png', bbox_inches = 'tight')
plt.show()

## SL Testing (60% Unlabelled Data)

In [None]:
# Evaluate the SL models stored under "thresh_60.0": returns the accuracy table
# and the list of styled classification reports.
performance_SL_60, reports_60 = model_performance(data_tuple, clfs, targets ,type_learning="SL", percentage="thresh_60.0")

In [None]:
# Show the SL (thresh_60.0) accuracy table (rows: classifiers, columns: diagnosis tests).
display(performance_SL_60)

In [None]:
# For every classifier (one row of the SL thresh_60.0 accuracy table), report the
# diagnosis test (column) on which it reached its highest accuracy.
# Assumes model_log lists the classifiers in the same order as the table rows — confirm.
tests = list(performance_SL_60.columns)  # loop-invariant, so built once
best = []  # best diagnosis test per classifier, in model_log order
for i in range(len(model_log)):
  clf_acc = list(performance_SL_60.iloc[i])
  # Position of the highest accuracy: an index into `tests`, not into model_log
  index = np.argmax(clf_acc)
  best.append(tests[index])
  print("For " + str(model_log[i]) + ", the best test is: " + str(tests[index]))

In [None]:
# Plotting the test accuracy bar graph of all the SL classifiers (60 % unlabelled data)
# for Hinselmann (the best diagnosis test)
height = [clf_acc*100 for clf_acc in performance_SL_60.Hinselmann]
bars = model_log
x_pos = np.arange(len(bars))
plt.figure(figsize=(10, 6))
# Create bars and choose color
plt.barh(x_pos, height, color="green")
for index, value in enumerate(height):
  # Accuracies are rounded to 4 decimals, so 2 decimals are exact in percent;
  # formatting avoids float noise from value*100 in the label.
  plt.text(value, index, "{:.2f}".format(value), color="blue", fontsize="xx-large")
# Add title and axis names (the title names the threshold so that the 20 % and
# 60 % SL graphs can be told apart)
plt.title('Test Accuracy Bar Graph of all the SL classifiers (60 % unlabelled data) for Hinselmann (the best diagnosis test)', fontsize="xx-large")
plt.xlabel('Test Accuracy', fontsize="xx-large")
plt.ylabel('Classifiers', fontsize="xx-large")
# Create names on the y and x axes
plt.yticks(x_pos, bars, fontsize="xx-large")
plt.xticks(fontsize="xx-large")
# Show graph
plt.grid()

plt.savefig('./results/bargraph/SL_60.png', bbox_inches = 'tight')
plt.show()

## SSL Testing (60 % Unlabelled Data)

In [None]:
# Evaluate the SSL models stored under "thresh_60.0": returns the accuracy table
# and the list of styled classification reports.
performance_SSL_60, reports_SSL_60 = model_performance(data_tuple, clfs, targets,type_learning="SSL",percentage="thresh_60.0")

In [None]:
# Show the SSL (thresh_60.0) accuracy table (rows: classifiers, columns: diagnosis tests).
display(performance_SSL_60)

In [None]:
# For every classifier (one row of the SSL thresh_60.0 accuracy table), report the
# diagnosis test (column) on which it reached its highest accuracy.
# Assumes model_log lists the classifiers in the same order as the table rows — confirm.
display(performance_SSL_60)
tests = list(performance_SSL_60.columns)  # loop-invariant, so built once
best = []  # best diagnosis test per classifier, in model_log order
for i in range(len(model_log)):
  clf_acc = list(performance_SSL_60.iloc[i])
  # Position of the highest accuracy: an index into `tests`, not into model_log
  index = np.argmax(clf_acc)
  best.append(tests[index])
  print("For " + str(model_log[i]) + ", the best test is: " + str(tests[index]))

In [None]:
# Plotting the test accuracy bar graph of all the SSL classifiers (60 % unlabelled data) for Hinselmann (the best diagnosis test)
height = [clf_acc*100 for clf_acc in performance_SSL_60.Hinselmann]
bars = model_log
x_pos = np.arange(len(bars))
plt.figure(figsize=(10, 6))
# Create bars and choose color
plt.barh(x_pos, height, color="green")
for index, value in enumerate(height):
  # Accuracies are rounded to 4 decimals, so 2 decimals are exact in percent;
  # formatting avoids float noise from value*100 in the label.
  plt.text(value, index, "{:.2f}".format(value), color="blue", fontsize="xx-large")
# Add title and axis names
plt.title('Test Accuracy Bar Graph of all the SSL classifiers (60 % unlabelled data) for Hinselmann (the best diagnosis test)', fontsize="xx-large")
# Horizontal bars: accuracy runs along x, classifiers along y (labels were swapped)
plt.xlabel('Test Accuracy', fontsize="xx-large")
plt.ylabel('Classifiers', fontsize="xx-large")
# Create names on the y and x axes
plt.yticks(x_pos, bars, fontsize="xx-large")
plt.xticks(fontsize="xx-large")
# Show graph
plt.grid()

plt.savefig('./results/bargraph/SSL_60.png', bbox_inches = 'tight')
plt.show()

In [None]:
from matplotlib.pyplot import plot

# Line plot comparing SL and SSL accuracy against the share of labelled points.
# The accuracies are hard-coded (presumably copied from the runs above — confirm).
labelled_percentages = [80, 40]
accuracy_by_method = {
    "SL": [86.22,  82.67],
    "SSL": [90.55,  85.44],
}
for method, accuracies in accuracy_by_method.items():
  plt.plot(labelled_percentages, accuracies, label=method)
plt.ylabel("Accuracy")
plt.xlabel("Percentage of labelled points ")
plt.grid()
plt.legend()

From the above plot we can see that when 20% of the points are masked as unlabelled, 80% are labelled. In this case,
SL performs better than SSL because SL training is done on labelled points only, and therefore more information is available, as only 20% of the points are masked.

When 60% of the points are masked as unlabelled,
SSL performs better than SL because the SL model overfits the training data (only limited labelled points are available), whereas for SSL the possibility of the model overfitting the training data is reduced.

References:

*   Kaggle
1.   https://www.kaggle.com/code/saqibsarwarkhan/cervical-cancer-risk-analysis
2.   https://www.kaggle.com/datasets/loveall/cervical-cancer-risk-classification


*   GitHub
1.   https://github.com/topics/cervical-cancer
2.   https://github.com/avivace/cervical-cancer/blob/master/report.pdf


*   Datahub.io: https://datahub.io/machine-learning/cervical-cancer
