## Install Toolkit, Import Libraries, Access Google Drive

In [None]:
# Imports

# General Libraries
import numpy as np
import pandas as pd
import time
import random

# For Training
from sklearn.model_selection import GroupKFold

# Measurements
from LAMDA_SSL.Evaluation.Classifier.Accuracy import Accuracy
from LAMDA_SSL.Evaluation.Classifier.Precision import Precision
from LAMDA_SSL.Evaluation.Classifier.Confusion_Matrix import Confusion_Matrix

# Supervised Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

# Semi-Supervised Models
from LAMDA_SSL.Algorithm.Classification.Co_Training import Co_Training
from LAMDA_SSL.Algorithm.Classification.Tri_Training import Tri_Training
from LAMDA_SSL.Algorithm.Classification.SSGMM import SSGMM
from LAMDA_SSL.Algorithm.Classification.Assemble import Assemble
from LAMDA_SSL.Algorithm.Classification.SemiBoost import SemiBoost

## Co-Training (with Group k-fold)

In [None]:
def coTraining(estimatorType, feature, label, folds, groups, labelledCount, fileName, saveLoc):
  """Cross-validate a LAMDA_SSL ``Co_Training`` classifier with group k-fold and save the results.

  For every ``GroupKFold`` split, a random ``labelledCount/16`` share of the
  training rows keeps its labels and the remaining rows are passed to the
  model as unlabelled data. The fitted model is then evaluated on the whole
  training fold and on the test fold (accuracy, macro precision, confusion
  matrix).

  Args:
    estimatorType: Base learner used for both co-training estimators:
      ``"BLR"`` (logistic regression), ``"RF"`` (random forest) or
      ``"NN"`` (MLP).
    feature: Feature table; must support ``.iloc[...]`` and ``.values``
      (e.g. a pandas DataFrame).
    label: Labels aligned with ``feature``; must support ``.iloc[...]`` and
      ``.values`` (e.g. a pandas Series).
    folds: Number of splits given to ``GroupKFold``.
    groups: Group identifier per row, forwarded to ``GroupKFold.split``.
    labelledCount: Numerator of the labelled share ``labelledCount/16``;
      must lie between 0 and 16.
    fileName: Name (without extension) of the per-fold CSV; also the first
      field of the row appended to ``ConfusionMatrices.csv``.
    saveLoc: Path prefix. It is concatenated directly with
      ``"Co-Training/..."``, so it has to end with a path separator. The
      target directories are not created here.

  Raises:
    ValueError: If ``estimatorType`` is unknown or ``labelledCount`` is
      outside 0..16. Both are checked before any fold is trained.
  """
  # Fail fast: a bad argument should not surface only after a fold has started.
  if estimatorType not in ("BLR", "RF", "NN"):
    raise ValueError(f"Invalid estimatorType {estimatorType!r}; expected 'BLR', 'RF' or 'NN'")
  # Outside 0..16 the mask below no longer has len(trainLabels) entries.
  if not 0 <= labelledCount <= 16:
    raise ValueError(f"Invalid labelledCount {labelledCount!r}; expected a value between 0 and 16")

  def makeEstimator():
    # A fresh, unfitted base learner per call (estimatorType validated above).
    if estimatorType == "BLR":
      return LogisticRegression(solver = "lbfgs", max_iter = 2000)
    if estimatorType == "RF":
      return RandomForestClassifier(n_estimators = 50, random_state = 42)
    return MLPClassifier(hidden_layer_sizes = (32,), max_iter = 10000, random_state = 42)

  # Lists to store results
  foldNumber = []
  trainAccuracy = []
  trainPrecision = []
  testAccuracy = []
  testPrecision = []
  confusionMatrixList = []

  group_kfold = GroupKFold(n_splits = folds)
  for train_index, test_index in group_kfold.split(feature, label, groups):
    # Getting Train and Test Sets
    trainFeatures, testFeatures = feature.iloc[train_index].values, feature.iloc[test_index].values
    trainLabels, testLabels = label.iloc[train_index].values, label.iloc[test_index].values

    # Randomly pick which training rows keep their labels.
    # Uses the global `random` state, so call random.seed(...) beforehand for reproducible masks.
    numOfLabeled = int(len(trainLabels) * (labelledCount/16))
    labeled_mask = np.array([True] * numOfLabeled + [False] * (len(trainLabels) - numOfLabeled))
    random.shuffle(labeled_mask)

    # Train Set Labelled Data
    labeledFeatures = trainFeatures[labeled_mask]
    labeledLabels = trainLabels[labeled_mask]

    # Train Set Unlabelled Data (its true labels are deliberately not given to the model)
    unlabeledFeatures = trainFeatures[~labeled_mask]

    # For Evaluation
    evaluation = {
      'Accuracy': Accuracy(),
      'Precision': Precision(average='macro'),
      'ConfusionMatrix': Confusion_Matrix()
    }

    # Creating and fitting Model
    model = Co_Training(base_estimator = makeEstimator(), base_estimator_2 = makeEstimator(), evaluation = evaluation)
    model.fit(X = labeledFeatures, y = labeledLabels, unlabeled_X = unlabeledFeatures)

    # Getting Performance Results (train score uses every training row with its true label)
    performanceTrain = model.evaluate(X = trainFeatures, y = trainLabels)
    performanceTest = model.evaluate(X = testFeatures, y = testLabels)

    # Append results
    foldNumber.append(len(foldNumber) + 1)
    trainAccuracy.append(performanceTrain["Accuracy"])
    trainPrecision.append(performanceTrain["Precision"])
    testAccuracy.append(performanceTest["Accuracy"])
    testPrecision.append(performanceTest["Precision"])
    confusionMatrixList.append(performanceTest["ConfusionMatrix"])

  # Save per fold results
  csvFile = pd.DataFrame({"foldNumber": foldNumber, "trainAccuracy": trainAccuracy, "trainPrecision": trainPrecision, "testAccuracy": testAccuracy, "testPrecision": testPrecision, "confusionMatrix": confusionMatrixList})
  csvFile.to_csv(saveLoc + "Co-Training/Per Fold Results/" + fileName + ".csv", index=False)

  # Append the test confusion matrix averaged over folds; `with` closes the file.
  with open(saveLoc + "Co-Training/ConfusionMatrices.csv", "a") as file:
    file.write(fileName + "," + "".join(f"{row}" for row in np.mean(confusionMatrixList, axis=0)) + "\n")

## Tri-Training (with Group k-fold)

In [None]:
def triTraining(estimatorType, feature, label, folds, groups, labelledCount, fileName, saveLoc):
  """Cross-validate a LAMDA_SSL ``Tri_Training`` classifier with group k-fold and save the results.

  For every ``GroupKFold`` split, a random ``labelledCount/16`` share of the
  training rows keeps its labels and the remaining rows are passed to the
  model as unlabelled data. The fitted model is then evaluated on the whole
  training fold and on the test fold (accuracy, macro precision, confusion
  matrix).

  Args:
    estimatorType: Base learner used for all three tri-training estimators:
      ``"BLR"`` (logistic regression), ``"RF"`` (random forest) or
      ``"NN"`` (MLP).
    feature: Feature table; must support ``.iloc[...]`` and ``.values``
      (e.g. a pandas DataFrame).
    label: Labels aligned with ``feature``; must support ``.iloc[...]`` and
      ``.values`` (e.g. a pandas Series).
    folds: Number of splits given to ``GroupKFold``.
    groups: Group identifier per row, forwarded to ``GroupKFold.split``.
    labelledCount: Numerator of the labelled share ``labelledCount/16``;
      must lie between 0 and 16.
    fileName: Name (without extension) of the per-fold CSV; also the first
      field of the row appended to ``ConfusionMatrices.csv``.
    saveLoc: Path prefix. It is concatenated directly with
      ``"Tri-Training/..."``, so it has to end with a path separator. The
      target directories are not created here.

  Raises:
    ValueError: If ``estimatorType`` is unknown or ``labelledCount`` is
      outside 0..16. Both are checked before any fold is trained.
  """
  # Fail fast: a bad argument should not surface only after a fold has started.
  if estimatorType not in ("BLR", "RF", "NN"):
    raise ValueError(f"Invalid estimatorType {estimatorType!r}; expected 'BLR', 'RF' or 'NN'")
  # Outside 0..16 the mask below no longer has len(trainLabels) entries.
  if not 0 <= labelledCount <= 16:
    raise ValueError(f"Invalid labelledCount {labelledCount!r}; expected a value between 0 and 16")

  def makeEstimator():
    # A fresh, unfitted base learner per call (estimatorType validated above).
    if estimatorType == "BLR":
      return LogisticRegression(solver = "lbfgs", max_iter = 2000)
    if estimatorType == "RF":
      return RandomForestClassifier(n_estimators = 50, random_state = 42)
    return MLPClassifier(hidden_layer_sizes = (32,), max_iter = 10000, random_state = 42)

  # Lists to store results
  foldNumber = []
  trainAccuracy = []
  trainPrecision = []
  testAccuracy = []
  testPrecision = []
  confusionMatrixList = []

  group_kfold = GroupKFold(n_splits=folds)
  for train_index, test_index in group_kfold.split(feature, label, groups):
    # Getting Train and Test Sets
    trainFeatures, testFeatures = feature.iloc[train_index].values, feature.iloc[test_index].values
    trainLabels, testLabels = label.iloc[train_index].values, label.iloc[test_index].values

    # Randomly pick which training rows keep their labels.
    # Uses the global `random` state, so call random.seed(...) beforehand for reproducible masks.
    numOfLabeled = int(len(trainLabels) * (labelledCount/16))
    labeled_mask = np.array([True] * numOfLabeled + [False] * (len(trainLabels) - numOfLabeled))
    random.shuffle(labeled_mask)

    # Train Set Labelled Data
    labeledFeatures = trainFeatures[labeled_mask]
    labeledLabels = trainLabels[labeled_mask]

    # Train Set Unlabelled Data (its true labels are deliberately not given to the model)
    unlabeledFeatures = trainFeatures[~labeled_mask]

    # For Evaluation
    evaluation={
      'Accuracy':Accuracy(),
      'Precision':Precision(average='macro'),
      'ConfusionMatrix':Confusion_Matrix()
    }

    # Creating and fitting model
    model = Tri_Training(base_estimator = makeEstimator(), base_estimator_2 = makeEstimator(), base_estimator_3 = makeEstimator(), evaluation = evaluation)
    model.fit(X = labeledFeatures, y = labeledLabels, unlabeled_X = unlabeledFeatures)

    # Getting Performance Results (train score uses every training row with its true label)
    performanceTrain = model.evaluate(X = trainFeatures, y = trainLabels)
    performanceTest = model.evaluate(X = testFeatures, y = testLabels)

    # Append results
    foldNumber.append(len(foldNumber) + 1)
    trainAccuracy.append(performanceTrain["Accuracy"])
    trainPrecision.append(performanceTrain["Precision"])
    testAccuracy.append(performanceTest["Accuracy"])
    testPrecision.append(performanceTest["Precision"])
    confusionMatrixList.append(performanceTest["ConfusionMatrix"])

  # Save per fold results
  csvFile = pd.DataFrame({"foldNumber": foldNumber, "trainAccuracy": trainAccuracy, "trainPrecision": trainPrecision, "testAccuracy": testAccuracy, "testPrecision": testPrecision, "confusionMatrix": confusionMatrixList})
  csvFile.to_csv(saveLoc + "Tri-Training/Per Fold Results/" + fileName + ".csv", index=False)

  # Append the test confusion matrix averaged over folds; `with` closes the file.
  with open(saveLoc + "Tri-Training/ConfusionMatrices.csv", "a") as file:
    file.write(fileName + "," + "".join(f"{row}" for row in np.mean(confusionMatrixList, axis=0)) + "\n")

## SSGMM (with Group k-fold)

In [None]:
def SSGMMModel(feature, label, folds, groups, labelledCount, fileName, saveLoc):
  """Cross-validate a LAMDA_SSL ``SSGMM`` classifier with group k-fold and save the results.

  For every ``GroupKFold`` split, a random ``labelledCount/16`` share of the
  training rows keeps its labels and the remaining rows are passed to the
  model as unlabelled data. The model is built with ``num_classes=2``,
  ``tolerance=0.000001`` and ``max_iterations=5``, then evaluated on the
  whole training fold and on the test fold (accuracy, macro precision,
  confusion matrix).

  Args:
    feature: Feature table; must support ``.iloc[...]`` and ``.values``
      (e.g. a pandas DataFrame).
    label: Labels aligned with ``feature``; must support ``.iloc[...]`` and
      ``.values`` (e.g. a pandas Series).
    folds: Number of splits given to ``GroupKFold``.
    groups: Group identifier per row, forwarded to ``GroupKFold.split``.
    labelledCount: Numerator of the labelled share ``labelledCount/16``;
      must lie between 0 and 16.
    fileName: Name (without extension) of the per-fold CSV; also the first
      field of the row appended to ``ConfusionMatrices.csv``.
    saveLoc: Path prefix. It is concatenated directly with ``"SSGMM/..."``,
      so it has to end with a path separator. The target directories are
      not created here.

  Raises:
    ValueError: If ``labelledCount`` is outside 0..16 (checked before any
      fold is trained).
  """
  # Outside 0..16 the mask below no longer has len(trainLabels) entries.
  if not 0 <= labelledCount <= 16:
    raise ValueError(f"Invalid labelledCount {labelledCount!r}; expected a value between 0 and 16")

  # Lists to store results
  foldNumber = []
  trainAccuracy = []
  trainPrecision = []
  testAccuracy = []
  testPrecision = []
  confusionMatrixList = []

  group_kfold = GroupKFold(n_splits = folds)
  for train_index, test_index in group_kfold.split(feature, label, groups):
    # Getting Train and Test Sets
    trainFeatures, testFeatures = feature.iloc[train_index].values, feature.iloc[test_index].values
    trainLabels, testLabels = label.iloc[train_index].values, label.iloc[test_index].values

    # Randomly pick which training rows keep their labels.
    # Uses the global `random` state, so call random.seed(...) beforehand for reproducible masks.
    numOfLabeled = int(len(trainLabels) * (labelledCount/16))
    labeled_mask = np.array([True] * numOfLabeled + [False] * (len(trainLabels) - numOfLabeled))
    random.shuffle(labeled_mask)

    # Train Set Labelled Data
    labeledFeatures = trainFeatures[labeled_mask]
    labeledLabels = trainLabels[labeled_mask]

    # Train Set Unlabelled Data
    unlabeledFeatures = trainFeatures[~labeled_mask]

    # For Evaluation
    evaluation={
      'Accuracy':Accuracy(),
      'Precision':Precision(average='macro'),
      'ConfusionMatrix':Confusion_Matrix()
    }

    # Creating and fitting model
    model = SSGMM(num_classes=2, tolerance=0.000001, max_iterations = 5, evaluation = evaluation)
    model.fit(X = labeledFeatures, y = labeledLabels, unlabeled_X = unlabeledFeatures)

    # Getting Performance Results (train score uses every training row with its true label)
    performanceTrain = model.evaluate(X = trainFeatures, y = trainLabels)
    performanceTest = model.evaluate(X = testFeatures, y = testLabels)

    # Append results
    foldNumber.append(len(foldNumber) + 1)
    trainAccuracy.append(performanceTrain["Accuracy"])
    trainPrecision.append(performanceTrain["Precision"])
    testAccuracy.append(performanceTest["Accuracy"])
    testPrecision.append(performanceTest["Precision"])
    confusionMatrixList.append(performanceTest["ConfusionMatrix"])

  # Save per fold results
  csvFile = pd.DataFrame({"foldNumber": foldNumber, "trainAccuracy": trainAccuracy, "trainPrecision": trainPrecision, "testAccuracy": testAccuracy, "testPrecision": testPrecision, "confusionMatrix": confusionMatrixList})
  csvFile.to_csv(saveLoc + "SSGMM/Per Fold Results/" + fileName + ".csv", index=False)

  # Append the test confusion matrix averaged over folds; `with` closes the file.
  with open(saveLoc + "SSGMM/ConfusionMatrices.csv", "a") as file:
    file.write(fileName + "," + "".join(f"{row}" for row in np.mean(confusionMatrixList, axis=0)) + "\n")

## Assemble (with Group k-fold)

In [None]:
def assemble(estimatorType, feature, label, folds, groups, labelledCount, fileName, saveLoc):
  """Cross-validate a LAMDA_SSL ``Assemble`` classifier with group k-fold and save the results.

  For every ``GroupKFold`` split, a random ``labelledCount/16`` share of the
  training rows keeps its labels and the remaining rows are passed to the
  model as unlabelled data. The fitted model is then evaluated on the whole
  training fold and on the test fold (accuracy, macro precision, confusion
  matrix).

  Args:
    estimatorType: Base learner handed to ``Assemble``: ``"BLR"`` (logistic
      regression), ``"RF"`` (random forest) or ``"NN"`` (MLP).
    feature: Feature table; must support ``.iloc[...]`` and ``.values``
      (e.g. a pandas DataFrame).
    label: Labels aligned with ``feature``; must support ``.iloc[...]`` and
      ``.values`` (e.g. a pandas Series).
    folds: Number of splits given to ``GroupKFold``.
    groups: Group identifier per row, forwarded to ``GroupKFold.split``.
    labelledCount: Numerator of the labelled share ``labelledCount/16``;
      must lie between 0 and 16.
    fileName: Name (without extension) of the per-fold CSV; also the first
      field of the row appended to ``ConfusionMatrices.csv``.
    saveLoc: Path prefix. It is concatenated directly with
      ``"Assemble/..."``, so it has to end with a path separator. The target
      directories are not created here.

  Raises:
    ValueError: If ``estimatorType`` is unknown or ``labelledCount`` is
      outside 0..16. Both are checked before any fold is trained.
  """
  # Fail fast: a bad argument should not surface only after a fold has started.
  if estimatorType not in ("BLR", "RF", "NN"):
    raise ValueError(f"Invalid estimatorType {estimatorType!r}; expected 'BLR', 'RF' or 'NN'")
  # Outside 0..16 the mask below no longer has len(trainLabels) entries.
  if not 0 <= labelledCount <= 16:
    raise ValueError(f"Invalid labelledCount {labelledCount!r}; expected a value between 0 and 16")

  def makeEstimator():
    # A fresh, unfitted base learner per call (estimatorType validated above).
    if estimatorType == "BLR":
      return LogisticRegression(solver = "lbfgs", max_iter = 2000)
    if estimatorType == "RF":
      return RandomForestClassifier(n_estimators = 50, random_state = 42)
    return MLPClassifier(hidden_layer_sizes = (32,), max_iter = 10000, random_state = 42)

  # Lists to store results
  foldNumber = []
  trainAccuracy = []
  trainPrecision = []
  testAccuracy = []
  testPrecision = []
  confusionMatrixList = []

  group_kfold = GroupKFold(n_splits=folds)
  for train_index, test_index in group_kfold.split(feature, label, groups):
    # Getting Train and Test Sets
    trainFeatures, testFeatures = feature.iloc[train_index].values, feature.iloc[test_index].values
    trainLabels, testLabels = label.iloc[train_index].values, label.iloc[test_index].values

    # Randomly pick which training rows keep their labels.
    # Uses the global `random` state, so call random.seed(...) beforehand for reproducible masks.
    numOfLabeled = int(len(trainLabels) * (labelledCount/16))
    labeled_mask = np.array([True] * numOfLabeled + [False] * (len(trainLabels) - numOfLabeled))
    random.shuffle(labeled_mask)

    # Train Set Labelled Data
    labeledFeatures = trainFeatures[labeled_mask]
    labeledLabels = trainLabels[labeled_mask]

    # Train Set Unlabelled Data
    unlabeledFeatures = trainFeatures[~labeled_mask]

    # For Evaluation
    evaluation={
      'Accuracy':Accuracy(),
      'Precision':Precision(average='macro'),
      'ConfusionMatrix':Confusion_Matrix()
    }

    # Creating Model
    model = Assemble(base_estimator = makeEstimator(), evaluation = evaluation)

    # Fitting Model
    model.fit(X = labeledFeatures, y = labeledLabels, unlabeled_X = unlabeledFeatures)

    # Getting Performance Results (train score uses every training row with its true label)
    performanceTrain = model.evaluate(X = trainFeatures, y = trainLabels)
    performanceTest = model.evaluate(X = testFeatures, y = testLabels)

    # Append results
    foldNumber.append(len(foldNumber) + 1)
    trainAccuracy.append(performanceTrain["Accuracy"])
    trainPrecision.append(performanceTrain["Precision"])
    testAccuracy.append(performanceTest["Accuracy"])
    testPrecision.append(performanceTest["Precision"])
    confusionMatrixList.append(performanceTest["ConfusionMatrix"])

  # Save per fold results
  csvFile = pd.DataFrame({"foldNumber": foldNumber, "trainAccuracy": trainAccuracy, "trainPrecision": trainPrecision, "testAccuracy": testAccuracy, "testPrecision": testPrecision, "confusionMatrix": confusionMatrixList})
  csvFile.to_csv(saveLoc + "Assemble/Per Fold Results/" + fileName + ".csv", index=False)

  # Append the test confusion matrix averaged over folds; `with` closes the file.
  with open(saveLoc + "Assemble/ConfusionMatrices.csv", "a") as file:
    file.write(fileName + "," + "".join(f"{row}" for row in np.mean(confusionMatrixList, axis=0)) + "\n")

## SemiBoost (with Group k-fold)

In [None]:
def semiBoost(estimatorType, feature, label, folds, groups, labelledCount, fileName, saveLoc):
  """Cross-validate a LAMDA_SSL ``SemiBoost`` classifier with group k-fold and save the results.

  For every ``GroupKFold`` split, a random ``labelledCount/16`` share of the
  training rows keeps its labels and the remaining rows are passed to the
  model as unlabelled data. The fitted model is then evaluated on the whole
  training fold and on the test fold (accuracy, macro precision, confusion
  matrix).

  Args:
    estimatorType: Base learner handed to ``SemiBoost``: ``"BLR"`` (logistic
      regression), ``"RF"`` (random forest) or ``"NN"`` (MLP).
    feature: Feature table; must support ``.iloc[...]`` and ``.values``
      (e.g. a pandas DataFrame).
    label: Labels aligned with ``feature``; must support ``.iloc[...]`` and
      ``.values`` (e.g. a pandas Series).
    folds: Number of splits given to ``GroupKFold``.
    groups: Group identifier per row, forwarded to ``GroupKFold.split``.
    labelledCount: Numerator of the labelled share ``labelledCount/16``;
      must lie between 0 and 16.
    fileName: Name (without extension) of the per-fold CSV; also the first
      field of the row appended to ``ConfusionMatrices.csv``.
    saveLoc: Path prefix. It is concatenated directly with
      ``"SemiBoost/..."``, so it has to end with a path separator. The
      target directories are not created here.

  Raises:
    ValueError: If ``estimatorType`` is unknown or ``labelledCount`` is
      outside 0..16. Both are checked before any fold is trained.
  """
  # Fail fast: a bad argument should not surface only after a fold has started.
  if estimatorType not in ("BLR", "RF", "NN"):
    raise ValueError(f"Invalid estimatorType {estimatorType!r}; expected 'BLR', 'RF' or 'NN'")
  # Outside 0..16 the mask below no longer has len(trainLabels) entries.
  if not 0 <= labelledCount <= 16:
    raise ValueError(f"Invalid labelledCount {labelledCount!r}; expected a value between 0 and 16")

  def makeEstimator():
    # A fresh, unfitted base learner per call (estimatorType validated above).
    if estimatorType == "BLR":
      return LogisticRegression(solver = "lbfgs", max_iter = 2000)
    if estimatorType == "RF":
      return RandomForestClassifier(n_estimators = 50, random_state = 42)
    return MLPClassifier(hidden_layer_sizes = (32,), max_iter = 10000, random_state = 42)

  # Lists to store results
  foldNumber = []
  trainAccuracy = []
  trainPrecision = []
  testAccuracy = []
  testPrecision = []
  confusionMatrixList = []

  group_kfold = GroupKFold(n_splits=folds)
  for train_index, test_index in group_kfold.split(feature, label, groups):
    # Getting Train and Test Sets
    trainFeatures, testFeatures = feature.iloc[train_index].values, feature.iloc[test_index].values
    trainLabels, testLabels = label.iloc[train_index].values, label.iloc[test_index].values

    # Randomly pick which training rows keep their labels.
    # Uses the global `random` state, so call random.seed(...) beforehand for reproducible masks.
    numOfLabeled = int(len(trainLabels) * (labelledCount/16))
    labeled_mask = np.array([True] * numOfLabeled + [False] * (len(trainLabels) - numOfLabeled))
    random.shuffle(labeled_mask)

    # Train Set Labelled Data
    labeledFeatures = trainFeatures[labeled_mask]
    labeledLabels = trainLabels[labeled_mask]

    # Train Set Unlabelled Data (its true labels are deliberately not given to the model)
    unlabeledFeatures = trainFeatures[~labeled_mask]

    # For Evaluation
    evaluation={
      'Accuracy':Accuracy(),
      'Precision':Precision(average='macro'),
      'ConfusionMatrix':Confusion_Matrix()
    }

    # Creating and fitting model
    model = SemiBoost(base_estimator = makeEstimator(), evaluation = evaluation)
    model.fit(X = labeledFeatures, y = labeledLabels, unlabeled_X = unlabeledFeatures)

    # Getting Performance Results (train score uses every training row with its true label)
    performanceTrain = model.evaluate(X = trainFeatures, y = trainLabels)
    performanceTest = model.evaluate(X = testFeatures, y = testLabels)

    # Append results
    foldNumber.append(len(foldNumber) + 1)
    trainAccuracy.append(performanceTrain["Accuracy"])
    trainPrecision.append(performanceTrain["Precision"])
    testAccuracy.append(performanceTest["Accuracy"])
    testPrecision.append(performanceTest["Precision"])
    confusionMatrixList.append(performanceTest["ConfusionMatrix"])

  # Save per fold results
  csvFile = pd.DataFrame({"foldNumber": foldNumber, "trainAccuracy": trainAccuracy, "trainPrecision": trainPrecision, "testAccuracy": testAccuracy, "testPrecision": testPrecision, "confusionMatrix": confusionMatrixList})
  csvFile.to_csv(saveLoc + "SemiBoost/Per Fold Results/" + fileName + ".csv", index=False)

  # Append the test confusion matrix averaged over folds; `with` closes the file.
  with open(saveLoc + "SemiBoost/ConfusionMatrices.csv", "a") as file:
    file.write(fileName + "," + "".join(f"{row}" for row in np.mean(confusionMatrixList, axis=0)) + "\n")