In [None]:
from sklearn.ensemble import RandomForestClassifier  # type: ignore
from sklearn.svm import SVC, LinearSVC  # type: ignore
from evaluator import ModelEvaluator
import sys, warnings
import pandas as pd

sys.path.append("../")
from datasets.setCreator import SetCreator  # type: ignore
from datasets.setModifier import SetModifier  # type: ignore

In [None]:
# Random Forest grid: every (n_estimators, max_depth) pair is fitted by the
# Random Forest test functions in this notebook.
N_ESTIMATORS = [100, 200, 500]
MAX_DEPTH = [5, 7, 10, 15]

# SVM grids: C is passed to all three SVM variants, GAMMA only to the RBF
# kernel, and DEGREE only to the polynomial kernel.
C = [0.01, 0.25, 0.5, 0.75, 1]
GAMMA = [0.1, 0.5, 2, 5, 10]
DEGREE = [1, 2, 3]

In [None]:
# Suppress every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# Shared helpers used by all of the model-testing functions below.
setModifier = SetModifier()
evaluator = ModelEvaluator()
setCreator = SetCreator()

# Load the five collections of train/test sets, in the same order as before.
dataset1, dataset2, dataset3, dataset4, dataset5 = (
    setCreator.getSetList1(),
    setCreator.getSetList2(),
    setCreator.getSetList3(),
    setCreator.getSetList4(),
    setCreator.getSetList5(),
)

# Random Forest Classifiers

In [None]:
def testWithErgotFeatures(predictor: str, datasets: list):
    """Grid-search a Random Forest over every set in ``datasets`` and print the best result.

    For each set, the train and test records are loaded into DataFrames, the
    ``predictor`` column is taken as the target, and the feature matrix is
    whatever ``setModifier.rmErgotPredictors`` returns for that frame. One
    forest is fitted per (n_estimators, max_depth) pair from ``N_ESTIMATORS``
    and ``MAX_DEPTH`` and scored with ``evaluator.evaluateClassification``;
    the result dict with the highest ``"auc"`` is kept and annotated with the
    hyperparameters that produced it.

    A set counts as excluded only when *no* hyperparameter combination could
    be fitted and evaluated on it, so the printed "[tested/total]" figure is
    always between 0 and ``len(datasets)``. Individual failed fits are tallied
    separately and reported instead of being dropped silently.

    Args:
        predictor: Name of the target column present in both the train and
            test data of every set.
        datasets: Sets to evaluate; each item is indexed with the keys
            ``"train"``, ``"test"`` and ``"desc"``.

    Returns:
        None. All results are printed.
    """
    print("Running the Random Forest Classifier:")
    print(f"Predicting: {predictor}\n\n")
    numExcludedSets = 0
    numFailedFits = 0
    bestModel = {"auc": 0}

    numCombinations = len(datasets)

    for currData in datasets:
        trainingData = pd.DataFrame(currData["train"])
        trainY = trainingData[predictor]
        trainX = setModifier.rmErgotPredictors(trainingData)

        testData = pd.DataFrame(currData["test"])
        testY = testData[predictor]
        testX = setModifier.rmErgotPredictors(testData)

        # Track success per set so the summary counts sets, not individual fits.
        setWasTested = False
        lastError = None

        for estimator in N_ESTIMATORS:
            for depth in MAX_DEPTH:
                try:
                    rfc = RandomForestClassifier(
                        random_state=5, n_estimators=estimator, max_depth=depth
                    )
                    rfc.fit(trainX, trainY)
                    results = evaluator.evaluateClassification(
                        rfc, currData["desc"], trainX, trainY, testX, testY
                    )

                    if results["auc"] > bestModel["auc"]:
                        bestModel = results

                        bestModel["n_estimators"] = estimator
                        bestModel["max_depth"] = depth

                    setWasTested = True

                except Exception as e:
                    # Best effort: keep searching, but remember what went wrong.
                    numFailedFits += 1
                    lastError = e

        if not setWasTested:
            numExcludedSets += 1
            print(
                f"Excluded a set: no hyperparameter combination succeeded (last error: {lastError!r})"
            )

    print(
        f"[{numCombinations - numExcludedSets}/{numCombinations}] sets were tested on the Random Forest Classifier"
    )
    if numFailedFits:
        print(f"{numFailedFits} individual fits/evaluations raised and were skipped")
    print(f"The best model, as per roc, was:\n {bestModel}")

In [None]:
def testWithoutErgotFeatures(predictor: str, datasets: list):
    """Grid-search a Random Forest over every set in ``datasets`` and print the best result.

    Same procedure as the "with ergot features" variant, except that the
    feature matrix is whatever ``setModifier.rmErgotFeatures`` returns for the
    train/test frame. One forest is fitted per (n_estimators, max_depth) pair
    from ``N_ESTIMATORS`` and ``MAX_DEPTH`` and scored with
    ``evaluator.evaluateClassification``; the result dict with the highest
    ``"auc"`` is kept and annotated with the hyperparameters that produced it.

    A set counts as excluded only when *no* hyperparameter combination could
    be fitted and evaluated on it, so the printed "[tested/total]" figure is
    always between 0 and ``len(datasets)``. Individual failed fits are tallied
    separately and reported instead of being dropped silently.

    Args:
        predictor: Name of the target column present in both the train and
            test data of every set.
        datasets: Sets to evaluate; each item is indexed with the keys
            ``"train"``, ``"test"`` and ``"desc"``.

    Returns:
        None. All results are printed.
    """
    print("Running the Random Forest Classifier:")
    print(f"Predicting: {predictor}\n\n")
    numExcludedSets = 0
    numFailedFits = 0
    bestModel = {"auc": 0}

    numCombinations = len(datasets)

    for currData in datasets:
        trainingData = pd.DataFrame(currData["train"])
        trainY = trainingData[predictor]
        trainX = setModifier.rmErgotFeatures(trainingData)

        testData = pd.DataFrame(currData["test"])
        testY = testData[predictor]
        testX = setModifier.rmErgotFeatures(testData)

        # Track success per set so the summary counts sets, not individual fits.
        setWasTested = False
        lastError = None

        for estimator in N_ESTIMATORS:
            for depth in MAX_DEPTH:
                try:
                    rfc = RandomForestClassifier(
                        random_state=5, n_estimators=estimator, max_depth=depth
                    )
                    rfc.fit(trainX, trainY)
                    results = evaluator.evaluateClassification(
                        rfc, currData["desc"], trainX, trainY, testX, testY
                    )

                    if results["auc"] > bestModel["auc"]:
                        bestModel = results

                        bestModel["n_estimators"] = estimator
                        bestModel["max_depth"] = depth

                    setWasTested = True

                except Exception as e:
                    # Best effort: keep searching, but remember what went wrong.
                    numFailedFits += 1
                    lastError = e

        if not setWasTested:
            numExcludedSets += 1
            print(
                f"Excluded a set: no hyperparameter combination succeeded (last error: {lastError!r})"
            )

    print(
        f"[{numCombinations - numExcludedSets}/{numCombinations}] sets were tested on the Random Forest Classifier"
    )
    if numFailedFits:
        print(f"{numFailedFits} individual fits/evaluations raised and were skipped")
    print(f"The best model, as per roc, was:\n {bestModel}")

In [None]:
# Random Forest runs for the "ergot_present_in_q3" target: each collection is
# evaluated first with and then without the ergot features.
for ergotSets in (dataset1, dataset2, dataset3, dataset4, dataset5):
    testWithErgotFeatures("ergot_present_in_q3", ergotSets)
    testWithoutErgotFeatures("ergot_present_in_q3", ergotSets)

In [None]:
# Random Forest runs for the "ergot_present_in_q4" target: each collection is
# evaluated first with and then without the ergot features.
for ergotSets in (dataset1, dataset2, dataset3, dataset4, dataset5):
    testWithErgotFeatures("ergot_present_in_q4", ergotSets)
    testWithoutErgotFeatures("ergot_present_in_q4", ergotSets)

In [None]:
# Random Forest runs for the "sum_severity_in_q3" target: each collection is
# evaluated first with and then without the ergot features.
# NOTE(review): the column name suggests a summed severity rather than a class
# label - confirm a classifier is the intended model for this target.
for ergotSets in (dataset1, dataset2, dataset3, dataset4, dataset5):
    testWithErgotFeatures("sum_severity_in_q3", ergotSets)
    testWithoutErgotFeatures("sum_severity_in_q3", ergotSets)

In [None]:
# Random Forest runs for the "sum_severity_in_q4" target: each collection is
# evaluated first with and then without the ergot features.
# NOTE(review): the column name suggests a summed severity rather than a class
# label - confirm a classifier is the intended model for this target.
for ergotSets in (dataset1, dataset2, dataset3, dataset4, dataset5):
    testWithErgotFeatures("sum_severity_in_q4", ergotSets)
    testWithoutErgotFeatures("sum_severity_in_q4", ergotSets)

# Support Vector Machine

In [None]:
def SVMLinearWithErgotFeatures(predictor: str, datasets: list):
    """Grid-search a linear SVM over every set in ``datasets`` and print the best result.

    For each set, the ``predictor`` column of the train and test data is taken
    as the target and the feature matrix is whatever
    ``setModifier.rmErgotPredictors`` returns. One ``LinearSVC`` is fitted per
    value in ``C`` and scored with ``evaluator.evaluateClassification``
    (called with ``hasFeatImportance=False``); the result dict with the
    highest ``"auc"`` is kept and annotated with the ``c`` that produced it.

    A set counts as excluded only when *no* value of ``C`` could be fitted and
    evaluated on it, so the printed "[tested/total]" figure is always between
    0 and ``len(datasets)``. Individual failed fits are tallied separately and
    reported instead of being dropped silently.

    Args:
        predictor: Name of the target column present in both the train and
            test data of every set.
        datasets: Sets to evaluate; each item is indexed with the keys
            ``"train"``, ``"test"`` and ``"desc"``.

    Returns:
        None. All results are printed.
    """
    print("Running linear SVM Classifier:")
    print(f"Predicting: {predictor}\n\n")
    numExcludedSets = 0
    numFailedFits = 0
    bestModel = {"auc": 0.0}

    numCombinations = len(datasets)

    for currData in datasets:
        # NOTE(review): unlike the Random Forest functions, the data is used
        # as-is here (no pd.DataFrame(...) wrapper) - presumably it is already
        # a DataFrame; confirm against SetCreator.
        trainingData = currData["train"]
        trainY = trainingData[predictor]
        trainX = setModifier.rmErgotPredictors(trainingData)

        testData = currData["test"]
        testY = testData[predictor]
        testX = setModifier.rmErgotPredictors(testData)

        # Track success per set so the summary counts sets, not individual fits.
        setWasTested = False
        lastError = None

        for c in C:
            try:
                svm = LinearSVC(C=c, random_state=0)
                svm.fit(trainX, trainY)
                results = evaluator.evaluateClassification(
                    svm,
                    currData["desc"],
                    trainX,
                    trainY,
                    testX,
                    testY,
                    hasFeatImportance=False,
                )

                if results["auc"] > bestModel["auc"]:
                    bestModel = results

                    bestModel["c"] = c

                setWasTested = True

            except Exception as e:
                # Best effort: keep searching, but remember what went wrong.
                numFailedFits += 1
                lastError = e

        if not setWasTested:
            numExcludedSets += 1
            print(
                f"Excluded a set: no hyperparameter combination succeeded (last error: {lastError!r})"
            )

    print(
        f"[{numCombinations - numExcludedSets}/{numCombinations}] sets were tested on the linear SVM Classifier"
    )
    if numFailedFits:
        print(f"{numFailedFits} individual fits/evaluations raised and were skipped")
    print(f"The best model, as per roc, was:\n {bestModel}")

In [None]:
def SVMPolyWithErgotFeatures(predictor: str, datasets: list):
    """Grid-search a polynomial-kernel SVM over every set in ``datasets`` and print the best result.

    For each set, the ``predictor`` column of the train and test data is taken
    as the target and the feature matrix is whatever
    ``setModifier.rmErgotPredictors`` returns. One ``SVC(kernel="poly",
    coef0=1)`` is fitted per (C, degree) pair from ``C`` and ``DEGREE`` and
    scored with ``evaluator.evaluateClassification`` (called with
    ``hasFeatImportance=False``); the result dict with the highest ``"auc"``
    is kept and annotated with the ``c`` and ``degree`` that produced it.

    A set counts as excluded only when *no* hyperparameter combination could
    be fitted and evaluated on it, so the printed "[tested/total]" figure is
    always between 0 and ``len(datasets)``. Individual failed fits are tallied
    separately and reported instead of being dropped silently.

    Args:
        predictor: Name of the target column present in both the train and
            test data of every set.
        datasets: Sets to evaluate; each item is indexed with the keys
            ``"train"``, ``"test"`` and ``"desc"``.

    Returns:
        None. All results are printed.
    """
    print("Running poly kernal SVM Classifier:")
    print(f"Predicting: {predictor}\n\n")
    numExcludedSets = 0
    numFailedFits = 0
    bestModel = {"auc": 0.0}

    numCombinations = len(datasets)

    for currData in datasets:
        # NOTE(review): unlike the Random Forest functions, the data is used
        # as-is here (no pd.DataFrame(...) wrapper) - presumably it is already
        # a DataFrame; confirm against SetCreator.
        trainingData = currData["train"]
        trainY = trainingData[predictor]
        trainX = setModifier.rmErgotPredictors(trainingData)

        testData = currData["test"]
        testY = testData[predictor]
        testX = setModifier.rmErgotPredictors(testData)

        # Track success per set so the summary counts sets, not individual fits.
        setWasTested = False
        lastError = None

        for c in C:
            for deg in DEGREE:
                try:
                    svm = SVC(kernel="poly", degree=deg, coef0=1, C=c, random_state=0)
                    svm.fit(trainX, trainY)
                    results = evaluator.evaluateClassification(
                        svm,
                        currData["desc"],
                        trainX,
                        trainY,
                        testX,
                        testY,
                        hasFeatImportance=False,
                    )

                    if results["auc"] > bestModel["auc"]:
                        bestModel = results

                        bestModel["c"] = c
                        bestModel["degree"] = deg

                    setWasTested = True

                except Exception as e:
                    # Best effort: keep searching, but remember what went wrong.
                    numFailedFits += 1
                    lastError = e

        if not setWasTested:
            numExcludedSets += 1
            print(
                f"Excluded a set: no hyperparameter combination succeeded (last error: {lastError!r})"
            )

    print(
        f"[{numCombinations - numExcludedSets}/{numCombinations}] sets were tested on the poly kernal SVM Classifier"
    )
    if numFailedFits:
        print(f"{numFailedFits} individual fits/evaluations raised and were skipped")
    print(f"The best model, as per roc, was:\n {bestModel}")

In [None]:
def SVMRBFWithErgotFeatures(predictor: str, datasets: list):
    """Grid-search an RBF-kernel SVM over every set in ``datasets`` and print the best result.

    For each set, the ``predictor`` column of the train and test data is taken
    as the target and the feature matrix is whatever
    ``setModifier.rmErgotPredictors`` returns. One ``SVC(kernel="rbf")`` is
    fitted per (C, gamma) pair from ``C`` and ``GAMMA`` and scored with
    ``evaluator.evaluateClassification`` (called with
    ``hasFeatImportance=False``); the result dict with the highest ``"auc"``
    is kept and annotated with the ``c`` and ``gamma`` that produced it.

    A set counts as excluded only when *no* hyperparameter combination could
    be fitted and evaluated on it, so the printed "[tested/total]" figure is
    always between 0 and ``len(datasets)``. Individual failed fits are tallied
    separately and reported instead of being dropped silently.

    Args:
        predictor: Name of the target column present in both the train and
            test data of every set.
        datasets: Sets to evaluate; each item is indexed with the keys
            ``"train"``, ``"test"`` and ``"desc"``.

    Returns:
        None. All results are printed.
    """
    print("Running RBF SVM Classifier:")
    print(f"Predicting: {predictor}\n\n")
    numExcludedSets = 0
    numFailedFits = 0
    bestModel = {"auc": 0.0}

    numCombinations = len(datasets)

    for currData in datasets:
        # NOTE(review): unlike the Random Forest functions, the data is used
        # as-is here (no pd.DataFrame(...) wrapper) - presumably it is already
        # a DataFrame; confirm against SetCreator.
        trainingData = currData["train"]
        trainY = trainingData[predictor]
        trainX = setModifier.rmErgotPredictors(trainingData)

        testData = currData["test"]
        testY = testData[predictor]
        testX = setModifier.rmErgotPredictors(testData)

        # Track success per set so the summary counts sets, not individual fits.
        setWasTested = False
        lastError = None

        for c in C:
            for gam in GAMMA:
                try:
                    svm = SVC(kernel="rbf", gamma=gam, C=c, random_state=0)
                    svm.fit(trainX, trainY)
                    results = evaluator.evaluateClassification(
                        svm,
                        currData["desc"],
                        trainX,
                        trainY,
                        testX,
                        testY,
                        hasFeatImportance=False,
                    )

                    if results["auc"] > bestModel["auc"]:
                        bestModel = results

                        bestModel["c"] = c
                        bestModel["gamma"] = gam

                    setWasTested = True

                except Exception as e:
                    # Best effort: keep searching, but remember what went wrong.
                    numFailedFits += 1
                    lastError = e

        if not setWasTested:
            numExcludedSets += 1
            print(
                f"Excluded a set: no hyperparameter combination succeeded (last error: {lastError!r})"
            )

    print(
        f"[{numCombinations - numExcludedSets}/{numCombinations}] sets were tested on the RBF SVM Classifier"
    )
    if numFailedFits:
        print(f"{numFailedFits} individual fits/evaluations raised and were skipped")
    print(f"The best model, as per roc, was:\n {bestModel}")

In [None]:
# SVM runs for the "ergot_present_in_q3" target: linear, polynomial and RBF
# kernels, in that order, on each collection (dataset1 is not part of this run).
for ergotSets in (dataset2, dataset3, dataset4, dataset5):
    SVMLinearWithErgotFeatures("ergot_present_in_q3", ergotSets)
    SVMPolyWithErgotFeatures("ergot_present_in_q3", ergotSets)
    SVMRBFWithErgotFeatures("ergot_present_in_q3", ergotSets)

In [None]:
# SVM runs for the "ergot_present_in_q4" target: linear, polynomial and RBF
# kernels, in that order, on each collection (dataset1 is not part of this run).
for ergotSets in (dataset2, dataset3, dataset4, dataset5):
    SVMLinearWithErgotFeatures("ergot_present_in_q4", ergotSets)
    SVMPolyWithErgotFeatures("ergot_present_in_q4", ergotSets)
    SVMRBFWithErgotFeatures("ergot_present_in_q4", ergotSets)

In [None]:
# SVM runs for the "sum_severity_in_q3" target: linear, polynomial and RBF
# kernels, in that order, on each collection (dataset1 is not part of this run).
# NOTE(review): the column name suggests a summed severity rather than a class
# label - confirm a classifier is the intended model for this target.
for ergotSets in (dataset2, dataset3, dataset4, dataset5):
    SVMLinearWithErgotFeatures("sum_severity_in_q3", ergotSets)
    SVMPolyWithErgotFeatures("sum_severity_in_q3", ergotSets)
    SVMRBFWithErgotFeatures("sum_severity_in_q3", ergotSets)

In [None]:
# SVM runs for the "sum_severity_in_q4" target: linear, polynomial and RBF
# kernels, in that order, on each collection (dataset1 is not part of this run).
# NOTE(review): the column name suggests a summed severity rather than a class
# label - confirm a classifier is the intended model for this target.
for ergotSets in (dataset2, dataset3, dataset4, dataset5):
    SVMLinearWithErgotFeatures("sum_severity_in_q4", ergotSets)
    SVMPolyWithErgotFeatures("sum_severity_in_q4", ergotSets)
    SVMRBFWithErgotFeatures("sum_severity_in_q4", ergotSets)