In [2]:
"""
Uses scikit-learn Bagging Ensemble Classifier
"""

from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, accuracy_score
import numpy as np


class BaggingClsfr:
    """
    Uses scikit-learn Bagging Ensemble Classifier to run experiments
    """

    def __init__(self, dataset, compute_f1=True):
        self.classifier = ""
        self.dtree = DecisionTreeClassifier(random_state=0)

        self.train(dataset)
        self.tune_parameters(dataset, compute_f1)
        self.train_after_tuning(dataset)
        self.test(dataset, compute_f1)

    def train(self, dataset):
        """
        Trains the classifier
        """
        x_train = dataset["train"][:, :-1]
        y_train = dataset["train"][:, -1]

        self.classifier = BaggingClassifier(
            random_state=0,
            estimator=self.dtree,
        )
        self.classifier.fit(x_train, y_train)

    def tune_parameters(self, dataset, compute_f1=True):
        """
        Tries multiple parameters and chooses the best set of parameters for the classifier
        """
        classifiers = [
            # change n_estimators
            BaggingClassifier(random_state=0, estimator=self.dtree, n_estimators=15),
            BaggingClassifier(random_state=0, estimator=self.dtree, n_estimators=20),
            BaggingClassifier(random_state=0, estimator=self.dtree, n_estimators=30),
            # change max_samples
            BaggingClassifier(
                random_state=0, estimator=self.dtree, n_estimators=15, max_samples=5
            ),
            BaggingClassifier(
                random_state=0, estimator=self.dtree, n_estimators=20, max_samples=5
            ),
            BaggingClassifier(
                random_state=0, estimator=self.dtree, n_estimators=20, max_samples=10
            ),
            BaggingClassifier(
                random_state=0, estimator=self.dtree, n_estimators=30, max_samples=10
            ),
            # change max_features
            BaggingClassifier(
                random_state=0,
                estimator=self.dtree,
                n_estimators=15,
                max_samples=5,
                max_features=5,
            ),
            BaggingClassifier(
                random_state=0,
                estimator=self.dtree,
                n_estimators=20,
                max_samples=5,
                max_features=5,
            ),
            BaggingClassifier(
                random_state=0,
                estimator=self.dtree,
                n_estimators=20,
                max_samples=10,
                max_features=10,
            ),
            BaggingClassifier(
                random_state=0,
                estimator=self.dtree,
                n_estimators=30,
                max_samples=10,
                max_features=10,
            ),
            # change oob_score
            BaggingClassifier(
                random_state=0,
                estimator=self.dtree,
                n_estimators=15,
                max_samples=5,
                max_features=5,
                oob_score=True,
            ),
            BaggingClassifier(
                random_state=0,
                estimator=self.dtree,
                n_estimators=20,
                max_samples=5,
                max_features=5,
                oob_score=True,
            ),
            BaggingClassifier(
                random_state=0,
                estimator=self.dtree,
                n_estimators=20,
                max_samples=10,
                max_features=10,
                oob_score=True,
            ),
            BaggingClassifier(
                random_state=0,
                estimator=self.dtree,
                n_estimators=30,
                max_samples=10,
                max_features=10,
                oob_score=True,
            ),
            # change bootstrap_features
            BaggingClassifier(
                random_state=0,
                estimator=self.dtree,
                n_estimators=15,
                max_samples=5,
                max_features=5,
                oob_score=True,
                bootstrap_features=True,
            ),
            BaggingClassifier(
                random_state=0,
                estimator=self.dtree,
                n_estimators=20,
                max_samples=5,
                max_features=5,
                oob_score=True,
                bootstrap_features=True,
            ),
            BaggingClassifier(
                random_state=0,
                estimator=self.dtree,
                n_estimators=20,
                max_samples=10,
                max_features=10,
                oob_score=True,
                bootstrap_features=True,
            ),
            BaggingClassifier(
                random_state=0,
                estimator=self.dtree,
                n_estimators=30,
                max_samples=10,
                max_features=10,
                oob_score=True,
                bootstrap_features=True,
            ),
            BaggingClassifier(
                random_state=0,
                estimator=self.dtree,
                n_estimators=15,
                max_samples=5,
                max_features=5,
                oob_score=False,
                bootstrap_features=True,
            ),
            BaggingClassifier(
                random_state=0,
                estimator=self.dtree,
                n_estimators=20,
                max_samples=5,
                max_features=5,
                oob_score=False,
                bootstrap_features=True,
            ),
            BaggingClassifier(
                random_state=0,
                estimator=self.dtree,
                n_estimators=20,
                max_samples=10,
                max_features=10,
                oob_score=False,
                bootstrap_features=True,
            ),
            BaggingClassifier(
                random_state=0,
                estimator=self.dtree,
                n_estimators=30,
                max_samples=10,
                max_features=10,
                oob_score=False,
                bootstrap_features=True,
            ),
        ]

        x_train = dataset["train"][:, :-1]
        y_train = dataset["train"][:, -1]

        x_valid = dataset["valid"][:, :-1]
        y_valid = dataset["valid"][:, -1]

        # Predict using the default classifier
        y_pred = self.classifier.predict(x_valid)
        best_accuracy = accuracy_score(y_valid, y_pred)
        print("Tuning default accuracy: " + str(best_accuracy))

        if compute_f1:
            best_f1 = f1_score(y_valid, y_pred)
            print("Tuning default f1 score: " + str(best_f1))

        for clsf in classifiers:
            clsf.fit(x_train, y_train)
            y_pred = clsf.predict(x_valid)

            accuracy = accuracy_score(y_valid, y_pred)

            if compute_f1:
                f1_scr = f1_score(y_valid, y_pred)

            # If this classifier config is better, then choose it
            if accuracy > best_accuracy:
                if compute_f1:
                    if f1_scr > best_f1:
                        best_accuracy = accuracy
                        best_f1 = f1_scr
                        self.classifier = clsf
                else:
                    best_accuracy = accuracy
                    self.classifier = clsf

        print("Tuning best accuracy: " + str(best_accuracy))
        if compute_f1:
            print("Tuning best F1 score: " + str(best_f1))

        params = self.classifier.get_params()
        print("\nBest Baggging Classifier Parameters:")
        for key, value in params.items():
            print(f"{key}: {value}")

    def train_after_tuning(self, dataset):
        """
        Merge Train and Validation data and train the classifier
        """
        x_train = dataset["train"][:, :-1]
        y_train = dataset["train"][:, -1]
        x_valid = dataset["valid"][:, :-1]
        y_valid = dataset["valid"][:, -1]

        x_train = np.concatenate((x_train, x_valid), axis=0)
        y_train = np.append(y_train, y_valid)
        self.classifier.fit(x_train, y_train)

    def test(self, dataset, compute_f1=True):
        """
        Report Accuracy & F1 score on Test Data
        """
        x_test = dataset["test"][:, :-1]
        y_test = dataset["test"][:, -1]

        y_pred = self.classifier.predict(x_test)
        accuracy = accuracy_score(y_test, y_pred)
        print("\nTest accuracy: " + str(accuracy))

        if compute_f1:
            f1_scr = f1_score(y_test, y_pred)
            print("Test F1 score: " + str(f1_scr))


In [3]:
"""
Uses scikit-learn Decision Tree Classifier
"""

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score
import numpy as np


class DecisionTreeClsfr:
    """
    Uses scikit-learn Decision Tree Classifier to run experiments
    """

    def __init__(self, dataset, compute_f1=True):
        self.classifier = ""

        self.train(dataset)
        self.tune_parameters(dataset, compute_f1)
        self.train_after_tuning(dataset)
        self.test(dataset, compute_f1)

    def train(self, dataset):
        """
        Trains the classifier on training data
        """
        x_train = dataset["train"][:, :-1]
        y_train = dataset["train"][:, -1]

        self.classifier = DecisionTreeClassifier(random_state=0)
        self.classifier.fit(x_train, y_train)

    def tune_parameters(self, dataset, compute_f1=True):
        """
        Tries multiple parameters and chooses the best set of parameters for the classifier
        """
        classifiers = [
            # change criterion
            DecisionTreeClassifier(
                random_state=0, criterion="log_loss", splitter="best"
            ),
            DecisionTreeClassifier(
                random_state=0, criterion="entropy", splitter="best"
            ),
            # change splitter
            DecisionTreeClassifier(
                random_state=0, criterion="log_loss", splitter="random"
            ),
            DecisionTreeClassifier(
                random_state=0, criterion="entropy", splitter="random"
            ),
            # change min_samples_split & min_samples_leaf
            DecisionTreeClassifier(
                random_state=0,
                criterion="log_loss",
                splitter="best",
                min_samples_split=5,
                min_samples_leaf=2,
            ),
            DecisionTreeClassifier(
                random_state=0,
                criterion="entropy",
                splitter="best",
                min_samples_split=10,
                min_samples_leaf=5,
            ),
            DecisionTreeClassifier(
                random_state=0,
                criterion="log_loss",
                splitter="random",
                min_samples_split=5,
                min_samples_leaf=2,
            ),
            DecisionTreeClassifier(
                random_state=0,
                criterion="entropy",
                splitter="random",
                min_samples_split=10,
                min_samples_leaf=5,
            ),
            DecisionTreeClassifier(
                random_state=0,
                criterion="log_loss",
                splitter="best",
                min_samples_split=5,
                max_features="sqrt",
                min_samples_leaf=2,
            ),
            DecisionTreeClassifier(
                random_state=0,
                criterion="entropy",
                splitter="best",
                min_samples_split=10,
                max_features="log2",
                min_samples_leaf=5,
            ),
            DecisionTreeClassifier(
                random_state=0,
                criterion="log_loss",
                splitter="random",
                min_samples_split=5,
                max_features="sqrt",
                min_samples_leaf=2,
            ),
            DecisionTreeClassifier(
                random_state=0,
                criterion="entropy",
                splitter="random",
                min_samples_split=10,
                max_features="log2",
                min_samples_leaf=3,
            ),
        ]

        x_train = dataset["train"][:, :-1]
        y_train = dataset["train"][:, -1]

        x_valid = dataset["valid"][:, :-1]
        y_valid = dataset["valid"][:, -1]

        # Predict using the default classifier
        y_pred = self.classifier.predict(x_valid)
        best_accuracy = accuracy_score(y_valid, y_pred)
        print("Tuning default accuracy: " + str(best_accuracy))

        if compute_f1:
            best_f1 = f1_score(y_valid, y_pred)
            print("Tuning default f1 score: " + str(best_f1))

        for clsf in classifiers:
            clsf.fit(x_train, y_train)
            y_pred = clsf.predict(x_valid)

            accuracy = accuracy_score(y_valid, y_pred)

            if compute_f1:
                f1_scr = f1_score(y_valid, y_pred)

            # If this classifier config is better, then choose it
            if accuracy > best_accuracy:
                if compute_f1:
                    if f1_scr > best_f1:
                        best_accuracy = accuracy
                        best_f1 = f1_scr
                        self.classifier = clsf
                else:
                    best_accuracy = accuracy
                    self.classifier = clsf

        print("Tuning best accuracy: " + str(best_accuracy))
        if compute_f1:
            print("Tuning best F1 score: " + str(best_f1))

        params = self.classifier.get_params()
        print("\nBest Decision Tree Classifier Parameters:")
        for key, value in params.items():
            print(f"{key}: {value}")

    def train_after_tuning(self, dataset):
        """
        Merge Train and Validation data and Train the classifier
        """
        x_train = dataset["train"][:, :-1]
        y_train = dataset["train"][:, -1]
        x_valid = dataset["valid"][:, :-1]
        y_valid = dataset["valid"][:, -1]

        x_train = np.concatenate((x_train, x_valid), axis=0)
        y_train = np.append(y_train, y_valid)
        self.classifier.fit(x_train, y_train)

    def test(self, dataset, compute_f1=True):
        """
        Report Accuracy & F1 score on Test Data
        """
        x_test = dataset["test"][:, :-1]
        y_test = dataset["test"][:, -1]

        y_pred = self.classifier.predict(x_test)
        accuracy = accuracy_score(y_test, y_pred)
        print("\nTest accuracy: " + str(accuracy))

        if compute_f1:
            f1_scr = f1_score(y_test, y_pred)
            print("Test F1 score: " + str(f1_scr))


In [4]:
"""
Uses scikit-learn Gradient Boosting Classifier
"""

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score, accuracy_score
import numpy as np


class GradientBoostingClsfr:
    """
    Uses scikit-learn Gradient Boosting Classifier to run experiments
    """

    def __init__(self, dataset, compute_f1=True, multi_class=False):
        self.classifier = ""

        self.train(dataset)
        self.tune_parameters(dataset, compute_f1, multi_class)
        self.train_after_tuning(dataset)
        self.test(dataset, compute_f1)

    def train(self, dataset):
        """
        Trains all classifier
        """
        x_train = dataset["train"][:, :-1]
        y_train = dataset["train"][:, -1]

        self.classifier = GradientBoostingClassifier(random_state=0)
        self.classifier.fit(x_train, y_train)

    def tune_parameters(self, dataset, compute_f1=True, multi_class=False):
        """
        Tries multiple parameters and chooses the best set of parameters for all classifier
        """
        classifiers = [
            # change n_estimators
            GradientBoostingClassifier(
                random_state=0, loss="log_loss", n_estimators=200
            ),
            # change criterion
            GradientBoostingClassifier(
                random_state=0, loss="log_loss", criterion="squared_error"
            ),
            GradientBoostingClassifier(
                random_state=0,
                loss="log_loss",
                n_estimators=200,
                criterion="squared_error",
            ),
            # change learning rate
            GradientBoostingClassifier(
                random_state=0,
                loss="log_loss",
                n_estimators=200,
                criterion="squared_error",
                learning_rate=0.05,
            ),
            # change subsample
            GradientBoostingClassifier(
                random_state=0,
                loss="log_loss",
                criterion="squared_error",
                subsample=0.75,
            ),
            GradientBoostingClassifier(
                random_state=0,
                loss="log_loss",
                n_estimators=200,
                criterion="squared_error",
                subsample=0.5,
            ),
            # change min_samples_split
            GradientBoostingClassifier(
                random_state=0,
                loss="log_loss",
                criterion="squared_error",
                min_samples_split=5,
                min_samples_leaf=10,
            ),
            GradientBoostingClassifier(
                random_state=0,
                loss="log_loss",
                n_estimators=200,
                criterion="squared_error",
                min_samples_split=10,
                min_samples_leaf=10,
            ),
        ]

        multi_class_classifiers = [
            # change loss
            GradientBoostingClassifier(random_state=0, loss="exponential"),
            GradientBoostingClassifier(
                random_state=0, loss="exponential", n_estimators=200
            ),
            GradientBoostingClassifier(
                random_state=0, loss="exponential", criterion="squared_error"
            ),
            GradientBoostingClassifier(
                random_state=0,
                loss="exponential",
                n_estimators=200,
                criterion="squared_error",
            ),
            GradientBoostingClassifier(
                random_state=0,
                loss="exponential",
                criterion="squared_error",
                subsample=0.75,
            ),
            GradientBoostingClassifier(
                random_state=0,
                loss="exponential",
                n_estimators=200,
                criterion="squared_error",
                learning_rate=0.05,
            ),
            GradientBoostingClassifier(
                random_state=0,
                loss="exponential",
                n_estimators=200,
                criterion="squared_error",
                min_samples_split=10,
                min_samples_leaf=5,
            ),
            GradientBoostingClassifier(
                random_state=0,
                loss="exponential",
                criterion="squared_error",
                min_samples_split=5,
                min_samples_leaf=5,
            ),
            GradientBoostingClassifier(
                random_state=0,
                loss="exponential",
                n_estimators=200,
                criterion="squared_error",
                subsample=0.5,
            ),
        ]

        if not multi_class:
            classifiers = classifiers + multi_class_classifiers

        x_train = dataset["train"][:, :-1]
        y_train = dataset["train"][:, -1]

        x_valid = dataset["valid"][:, :-1]
        y_valid = dataset["valid"][:, -1]

        # Predict using the default classifier
        y_pred = self.classifier.predict(x_valid)
        best_accuracy = accuracy_score(y_valid, y_pred)
        print("Tuning default accuracy: " + str(best_accuracy))

        if compute_f1:
            best_f1 = f1_score(y_valid, y_pred)
            print("Tuning default f1 score: " + str(best_f1))

        for clsf in classifiers:
            clsf.fit(x_train, y_train)
            y_pred = clsf.predict(x_valid)

            accuracy = accuracy_score(y_valid, y_pred)

            if compute_f1:
                f1_scr = f1_score(y_valid, y_pred)

            # If this classifier config is better, then choose it
            if accuracy > best_accuracy:
                if compute_f1:
                    if f1_scr > best_f1:
                        best_accuracy = accuracy
                        best_f1 = f1_scr
                        self.classifier = clsf
                else:
                    best_accuracy = accuracy
                    self.classifier = clsf

        print("Tuning best accuracy: " + str(best_accuracy))
        if compute_f1:
            print("Tuning best F1 score: " + str(best_f1))

        params = self.classifier.get_params()
        print("\nBest Gradient Boosting Classifier Parameters:")
        for key, value in params.items():
            print(f"{key}: {value}")

    def train_after_tuning(self, dataset):
        """
        Merge Train and Validation data and Train the classifier
        """
        x_train = dataset["train"][:, :-1]
        y_train = dataset["train"][:, -1]
        x_valid = dataset["valid"][:, :-1]
        y_valid = dataset["valid"][:, -1]

        x_train = np.concatenate((x_train, x_valid), axis=0)
        y_train = np.append(y_train, y_valid)
        self.classifier.fit(x_train, y_train)

    def test(self, dataset, compute_f1=True):
        """
        Report Accuracy & F1 score on Test Data
        """
        x_test = dataset["test"][:, :-1]
        y_test = dataset["test"][:, -1]

        y_pred = self.classifier.predict(x_test)
        accuracy = accuracy_score(y_test, y_pred)
        print("\nTest accuracy: " + str(accuracy))

        if compute_f1:
            f1_scr = f1_score(y_test, y_pred)
            print("Test F1 score: " + str(f1_scr))


In [5]:
"""
Uses scikit-learn Random Forest Classifier
"""

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score
import numpy as np


class RandomForestClsfr:
    """
    Uses scikit-learn Random Forest Classifier to run experiments
    """

    def __init__(self, dataset, compute_f1=True):
        self.classifier = ""

        self.train(dataset)
        self.tune_parameters(dataset, compute_f1)
        self.train_after_tuning(dataset)
        self.test(dataset, compute_f1)

    def train(self, dataset):
        """
        Trains the classifier
        """
        x_train = dataset["train"][:, :-1]
        y_train = dataset["train"][:, -1]

        self.classifier = RandomForestClassifier(random_state=0)
        self.classifier.fit(x_train, y_train)

    def tune_parameters(self, dataset, compute_f1=True):
        """
        Tries multiple parameters and chooses the best set of parameters for the classifier
        """
        classifiers = [
            # change ecriterion
            RandomForestClassifier(random_state=0, criterion="entropy"),
            RandomForestClassifier(random_state=0, criterion="log_loss"),
            # change min_samples_split
            RandomForestClassifier(
                random_state=0, criterion="entropy", min_samples_split=5
            ),
            RandomForestClassifier(
                random_state=0, criterion="log_loss", min_samples_split=5
            ),
            RandomForestClassifier(
                random_state=0, criterion="entropy", min_samples_split=10
            ),
            RandomForestClassifier(
                random_state=0, criterion="log_loss", min_samples_split=10
            ),
            # change min_samples_leaf
            RandomForestClassifier(
                random_state=0,
                criterion="entropy",
                min_samples_split=5,
                min_samples_leaf=5,
            ),
            RandomForestClassifier(
                random_state=0,
                criterion="log_loss",
                min_samples_split=5,
                min_samples_leaf=5,
            ),
            RandomForestClassifier(
                random_state=0,
                criterion="entropy",
                min_samples_split=10,
                min_samples_leaf=10,
            ),
            RandomForestClassifier(
                random_state=0,
                criterion="log_loss",
                min_samples_split=10,
                min_samples_leaf=10,
            ),
            # change max_features
            RandomForestClassifier(
                random_state=0,
                criterion="entropy",
                min_samples_split=5,
                min_samples_leaf=5,
                max_features="log2",
            ),
            RandomForestClassifier(
                random_state=0,
                criterion="log_loss",
                min_samples_split=5,
                min_samples_leaf=5,
                max_features="log2",
            ),
            RandomForestClassifier(
                random_state=0,
                criterion="entropy",
                min_samples_split=10,
                min_samples_leaf=10,
                max_features="log2",
            ),
            RandomForestClassifier(
                random_state=0,
                criterion="log_loss",
                min_samples_split=10,
                min_samples_leaf=10,
                max_features="log2",
            ),
        ]

        x_train = dataset["train"][:, :-1]
        y_train = dataset["train"][:, -1]

        x_valid = dataset["valid"][:, :-1]
        y_valid = dataset["valid"][:, -1]

                # Predict using the default classifier
        y_pred = self.classifier.predict(x_valid)
        best_accuracy = accuracy_score(y_valid, y_pred)
        print("Tuning default accuracy: " + str(best_accuracy))

        if compute_f1:
            best_f1 = f1_score(y_valid, y_pred)
            print("Tuning default f1 score: " + str(best_f1))

        for clsf in classifiers:
            clsf.fit(x_train, y_train)
            y_pred = clsf.predict(x_valid)

            accuracy = accuracy_score(y_valid, y_pred)

            if compute_f1:
                f1_scr = f1_score(y_valid, y_pred)

            # If this classifier config is better, then choose it
            if accuracy > best_accuracy:
                if compute_f1:
                    if f1_scr > best_f1:
                        best_accuracy = accuracy
                        best_f1 = f1_scr
                        self.classifier = clsf
                else:
                    best_accuracy = accuracy
                    self.classifier = clsf

        print("Tuning best accuracy: " + str(best_accuracy))
        if compute_f1:
            print("Tuning best F1 score: " + str(best_f1))

        params = self.classifier.get_params()
        print("\nBest Random Forest Classifier Parameters:")
        for key, value in params.items():
            print(f"{key}: {value}")

    def train_after_tuning(self, dataset):
        """
        Merge Train and Validation data and train the classifier
        """
        x_train = dataset["train"][:, :-1]
        y_train = dataset["train"][:, -1]
        x_valid = dataset["valid"][:, :-1]
        y_valid = dataset["valid"][:, -1]

        x_train = np.concatenate((x_train, x_valid), axis=0)
        y_train = np.append(y_train, y_valid)
        self.classifier.fit(x_train, y_train)

    def test(self, dataset, compute_f1=True):
        """
        Report Accuracy & F1 score on Test Data
        """
        x_test = dataset["test"][:, :-1]
        y_test = dataset["test"][:, -1]

        y_pred = self.classifier.predict(x_test)
        accuracy = accuracy_score(y_test, y_pred)
        print("\nTest accuracy: " + str(accuracy))

        if compute_f1:
            f1_scr = f1_score(y_test, y_pred)
            print("Test F1 score: " + str(f1_scr))


In [6]:
from collections import defaultdict

def read_datasets(dataset_path="./all_data", zip_file = "./project2_data.zip") -> defaultdict(lambda: defaultdict(dict)):
        """
        Extracts and Reads csv files from the dataset
        """
        dataset = defaultdict(lambda: defaultdict(dict))
        if not exists(dataset_path):
            with zipfile.ZipFile(zip_file, "r") as zip_ref:
                zip_ref.extractall(".")

        dataset_csv_files = listdir(dataset_path)
        for csv_file in dataset_csv_files:
            name_parts = csv_file.split("_")
            d_type = name_parts[0]
            d_clauses = name_parts[1]
            d_examples = name_parts[2].split(".")[0]
            csv_file_path = join(dataset_path, csv_file)
            dataset[d_clauses][d_examples][d_type] = np.genfromtxt(
                csv_file_path, delimiter=","
            )

        return dataset

In [7]:
"""
Instantiates and Runs all models
"""

import zipfile
import sys
from os.path import exists, join
from os import listdir
from collections import defaultdict
import numpy as np

class AllModels:
    """
    Instantiates and Runs all models
    """

    def __init__(self, dataset, models="all"):
        self.dataset = dataset

        if models == "all" or "dtree" in models:
            print("Running Decision Tree Classifier")
            print("*************************************************\n")
            self.decision_tree_classifiers = defaultdict(
                lambda: defaultdict(lambda: DecisionTreeClsfr)
            )

            for n_clauses in self.dataset:
                for n_examples in self.dataset[n_clauses]:
                    print(n_clauses + "->" + n_examples)
                    print("---------------------------------------------")
                    self.decision_tree_classifiers[n_clauses][
                        n_examples
                    ] = DecisionTreeClsfr(self.dataset[n_clauses][n_examples])
                    print("---------------------------------------------\n")

        if models == "all" or "bagging" in models:
            print("Running Bagging Classifier")
            print("*************************************************\n")
            self.bagging_classifiers = defaultdict(
                lambda: defaultdict(lambda: BaggingClsfr)
            )

            for n_clauses in self.dataset:
                for n_examples in self.dataset[n_clauses]:
                    print(n_clauses + "->" + n_examples)
                    print("---------------------------------------------")
                    self.bagging_classifiers[n_clauses][n_examples] = BaggingClsfr(
                        self.dataset[n_clauses][n_examples]
                    )
                    print("---------------------------------------------\n")

        if models == "all" or "randomforest" in models:
            print("Running Random Forest Classifier")
            print("*************************************************\n")
            self.random_forest_classifiers = defaultdict(
                lambda: defaultdict(lambda: RandomForestClsfr)
            )

            for n_clauses in self.dataset:
                for n_examples in self.dataset[n_clauses]:
                    print(n_clauses + "->" + n_examples)
                    print("---------------------------------------------")
                    print(n_clauses + "->" + n_examples)
                    self.random_forest_classifiers[n_clauses][
                        n_examples
                    ] = RandomForestClsfr(self.dataset[n_clauses][n_examples])
                    print("---------------------------------------------\n")

        if models == "all" or "gradientboost" in models:
            print("Running Gradient Boost Classifier")
            print("*************************************************\n")
            self.gradient_boosting_classifiers = defaultdict(
                lambda: defaultdict(lambda: GradientBoostingClsfr)
            )

            for n_clauses in self.dataset:
                for n_examples in self.dataset[n_clauses]:
                    print(n_clauses + "->" + n_examples)
                    print("---------------------------------------------")
                    self.gradient_boosting_classifiers[n_clauses][
                        n_examples
                    ] = GradientBoostingClsfr(self.dataset[n_clauses][n_examples])
                    print("---------------------------------------------\n")

In [8]:
"""
Fetches MNIST dataset and runs all models on it
"""
import sys
import numpy as np
from sklearn.datasets import fetch_openml


class MNISTExperiments:
    """
    Fetches MNIST dataset and runs all models on it
    """

    def __init__(self, models="all"):
        x, y = fetch_openml("mnist_784", version=1, return_X_y=True)
        x = x / 255

        x_train, x_valid, x_test = x[:40000], x[40000:60000], x[60000:]
        y_train, y_valid, y_test = y[:40000], y[40000:60000], y[60000:]

        dataset = {
            "train": np.column_stack((x_train, y_train)),
            "valid": np.column_stack((x_valid, y_valid)),
            "test": np.column_stack((x_test, y_test)),
        }

        if models == "all" or "dtree" in models:
            print("Running Decision Tree Classifier on MNIST Dataset")
            print("*************************************************\n")
            self.decision_tree_classifier = DecisionTreeClsfr(dataset, compute_f1=False)
            print("---------------------------------------------")

        if models == "all" or "bagging" in models:
            print("Running Bagging Classifier on MNIST Dataset")
            print("*************************************************\n")
            self.bagging_classifier = BaggingClsfr(dataset, compute_f1=False)
            print("---------------------------------------------")

        if models == "all" or "randomforest" in models:
            print("Running Random Forest Classifier on MNIST Dataset")
            print("*************************************************\n")
            self.random_forest_classifier = RandomForestClsfr(dataset, compute_f1=False)
            print("---------------------------------------------")

        if models == "all" or "gradientboost" in models:
            print("Running Gradient Boost Classifier on MNIST Dataset")
            print("*************************************************\n")
            self.gradient_boosting_classifier = GradientBoostingClsfr(
                dataset, compute_f1=False, multi_class=True
            )
            print("---------------------------------------------")


Use driver classes to run models in datasets

In [None]:
project2_dataset = read_datasets()

In [None]:
dtree = AllModels(models="dtree", dataset=project2_dataset)

Running Decision Tree Classifier
*************************************************

c300->d5000
---------------------------------------------
Tuning default accuracy: 0.7224
Tuning default f1 score: 0.7209489344591878
Tuning best accuracy: 0.7258
Tuning best F1 score: 0.728191911181602

Best Decision Tree Classifier Parameters:
ccp_alpha: 0.0
class_weight: None
criterion: entropy
max_depth: None
max_features: None
max_leaf_nodes: None
min_impurity_decrease: 0.0
min_samples_leaf: 5
min_samples_split: 10
min_weight_fraction_leaf: 0.0
random_state: 0
splitter: best

Test accuracy: 0.7588
Test F1 score: 0.755771567436209
---------------------------------------------

c300->d1000
---------------------------------------------
Tuning default accuracy: 0.6145
Tuning default f1 score: 0.613920881321983
Tuning best accuracy: 0.6345
Tuning best F1 score: 0.6206538661131291

Best Decision Tree Classifier Parameters:
ccp_alpha: 0.0
class_weight: None
criterion: log_loss
max_depth: None
max_features

In [None]:
bagging = AllModels(models="bagging", dataset=project2_dataset)

Running Bagging Classifier
*************************************************

c300->d5000
---------------------------------------------
Tuning default accuracy: 0.8226
Tuning default f1 score: 0.8168868703550785
Tuning best accuracy: 0.8775
Tuning best F1 score: 0.8813099505861833

Best Baggging Classifier Parameters:
base_estimator: deprecated
bootstrap: True
bootstrap_features: False
estimator__ccp_alpha: 0.0
estimator__class_weight: None
estimator__criterion: gini
estimator__max_depth: None
estimator__max_features: None
estimator__max_leaf_nodes: None
estimator__min_impurity_decrease: 0.0
estimator__min_samples_leaf: 1
estimator__min_samples_split: 2
estimator__min_weight_fraction_leaf: 0.0
estimator__random_state: 0
estimator__splitter: best
estimator: DecisionTreeClassifier(random_state=0)
max_features: 1.0
max_samples: 1.0
n_estimators: 30
n_jobs: None
oob_score: False
random_state: 0
verbose: 0
warm_start: False

Test accuracy: 0.9081
Test F1 score: 0.9109582404805736
----------

In [None]:
randomforest = AllModels(models="randomforest", dataset=project2_dataset)

Running Random Forest Classifier
*************************************************

c300->d5000
---------------------------------------------
c300->d5000
Tuning default accuracy: 0.8711
Tuning default f1 score: 0.8715751718641027
Tuning best accuracy: 0.8999
Tuning best F1 score: 0.9017953497498282

Best Random Forest Classifier Parameters:
bootstrap: True
ccp_alpha: 0.0
class_weight: None
criterion: entropy
max_depth: None
max_features: sqrt
max_leaf_nodes: None
max_samples: None
min_impurity_decrease: 0.0
min_samples_leaf: 5
min_samples_split: 5
min_weight_fraction_leaf: 0.0
n_estimators: 100
n_jobs: None
oob_score: False
random_state: 0
verbose: 0
warm_start: False

Test accuracy: 0.9087
Test F1 score: 0.9105866222701009
---------------------------------------------

c300->d1000
---------------------------------------------
c300->d1000
Tuning default accuracy: 0.8465
Tuning default f1 score: 0.8492881688757977
Tuning best accuracy: 0.8625
Tuning best F1 score: 0.8661800486618004

Be

In [None]:
gradientboost = AllModels(models="gradientboost", dataset=project2_dataset)

Running Gradient Boost Classifier
*************************************************

c300->d5000
---------------------------------------------
Tuning default accuracy: 0.9793
Tuning default f1 score: 0.9796919454527617
Tuning best accuracy: 0.9927
Tuning best F1 score: 0.9927529038022436

Best Gradient Boosting Classifier Parameters:
ccp_alpha: 0.0
criterion: friedman_mse
init: None
learning_rate: 0.1
loss: log_loss
max_depth: 3
max_features: None
max_leaf_nodes: None
min_impurity_decrease: 0.0
min_samples_leaf: 1
min_samples_split: 2
min_weight_fraction_leaf: 0.0
n_estimators: 200
n_iter_no_change: None
random_state: 0
subsample: 1.0
tol: 0.0001
validation_fraction: 0.1
verbose: 0
warm_start: False

Test accuracy: 0.994
Test F1 score: 0.9940357852882703
---------------------------------------------

c300->d1000
---------------------------------------------
Tuning default accuracy: 0.9525
Tuning default f1 score: 0.954039671020803
Tuning best accuracy: 0.9745
Tuning best F1 score: 0.97

In [None]:
mnist_dtree = MNISTExperiments("dtree")

  warn(


Running Decision Tree Classifier on MNIST Dataset
*************************************************

Tuning default accuracy: 0.86375
Tuning best accuracy: 0.87645

Best Decision Tree Classifier Parameters:
ccp_alpha: 0.0
class_weight: None
criterion: entropy
max_depth: None
max_features: None
max_leaf_nodes: None
min_impurity_decrease: 0.0
min_samples_leaf: 5
min_samples_split: 10
min_weight_fraction_leaf: 0.0
random_state: 0
splitter: best

Test accuracy: 0.8875
---------------------------------------------


In [None]:
mnist_bagging = MNISTExperiments("bagging")

  warn(


Running Bagging Classifier on MNIST Dataset
*************************************************

Tuning default accuracy: 0.93575
Tuning best accuracy: 0.94955

Best Baggging Classifier Parameters:
base_estimator: deprecated
bootstrap: True
bootstrap_features: False
estimator__ccp_alpha: 0.0
estimator__class_weight: None
estimator__criterion: gini
estimator__max_depth: None
estimator__max_features: None
estimator__max_leaf_nodes: None
estimator__min_impurity_decrease: 0.0
estimator__min_samples_leaf: 1
estimator__min_samples_split: 2
estimator__min_weight_fraction_leaf: 0.0
estimator__random_state: 0
estimator__splitter: best
estimator: DecisionTreeClassifier(random_state=0)
max_features: 1.0
max_samples: 1.0
n_estimators: 30
n_jobs: None
oob_score: False
random_state: 0
verbose: 0
warm_start: False

Test accuracy: 0.9554
---------------------------------------------


In [None]:
mnist_randomforest = MNISTExperiments("randomforest")

  warn(


Running Random Forest Classifier on MNIST Dataset
*************************************************

Tuning default accuracy: 0.96475
Tuning best accuracy: 0.9655

Best Random Forest Classifier Parameters:
bootstrap: True
ccp_alpha: 0.0
class_weight: None
criterion: entropy
max_depth: None
max_features: sqrt
max_leaf_nodes: None
max_samples: None
min_impurity_decrease: 0.0
min_samples_leaf: 1
min_samples_split: 2
min_weight_fraction_leaf: 0.0
n_estimators: 100
n_jobs: None
oob_score: False
random_state: 0
verbose: 0
warm_start: False

Test accuracy: 0.9692
---------------------------------------------


In [9]:
mnist_gradientboost = MNISTExperiments("gradientboost")

  warn(


Running Gradient Boost Classifier on MNIST Dataset
*************************************************

Tuning default accuracy: 0.94665
Tuning best accuracy: 0.96045

Best Gradient Boosting Classifier Parameters:
ccp_alpha: 0.0
criterion: squared_error
init: None
learning_rate: 0.1
loss: log_loss
max_depth: 3
max_features: None
max_leaf_nodes: None
min_impurity_decrease: 0.0
min_samples_leaf: 10
min_samples_split: 10
min_weight_fraction_leaf: 0.0
n_estimators: 200
n_iter_no_change: None
random_state: 0
subsample: 1.0
tol: 0.0001
validation_fraction: 0.1
verbose: 0
warm_start: False

Test accuracy: 0.9619
---------------------------------------------
