# Overview

This notebook trains scikit-learn models incrementally (batch by batch, via `partial_fit`) to predict an article's bias class from its token IDs.

# Setup

In [None]:
# Mount Google Drive at /content/gdrive (Colab only).
# NOTE: shared folders only show up in the mount after you add them to your own
# Drive: open Drive and pick "Add shortcut to Drive" from the shared folder's options.
from google.colab import drive

drive.mount("/content/gdrive")

In [None]:
# Root of the mounted Google Drive; used as the notebook's working directory.
# Changing into it makes the relative paths used later in the notebook
# (e.g. the `sklearn/<name>` checkpoint folders) resolve inside Drive.
# The final %pwd displays the resulting working directory as the cell output.
path = "/content/gdrive/MyDrive/"
%cd {path}
%pwd

In [None]:
# Install (or upgrade, because of -U) scikit-learn and the Hugging Face `datasets` package.
# NOTE(review): versions are not pinned, so a re-run may pick up different releases;
# consider pinning once a known-good combination is identified.
%pip install -U scikit-learn datasets

In [None]:
import os
import os.path
import torch
from torch.utils.data import DataLoader
import numpy as np
from datasets import load_dataset
from tqdm.notebook import tqdm

from sklearn.linear_model import SGDClassifier, PassiveAggressiveClassifier, Perceptron, SGDRegressor, PassiveAggressiveRegressor
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score
import joblib
import json

# CUDA check: report the installed torch version and which device is available.
# (The availability check is done once, where its result is actually used;
# a bare call in the middle of a cell is evaluated but never displayed.)
print(torch.__version__)

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

# Load Data

In [None]:
# Fixed seed for the train/validation split so that the partition is identical
# across kernel restarts (otherwise checkpoints from different sessions are
# evaluated on different validation sets).
SPLIT_SEED = 42

# Download the dataset from the Hugging Face Hub, drop the columns that are not
# used for training, expose the "bias" column under the name "label", and have
# rows returned as torch tensors.
tokens = load_dataset("bzhao18/hyperpartisan-news-distilbert")
tokens = (
    tokens
    .remove_columns(["text", "title", "hyperpartisan", "url", "published_at"])
    .rename_column("bias", "label")
    .with_format("torch")
)

# 70/30 split of the original "train" split; the resulting "test" part is what
# the training loop uses as its validation set.
dataset = tokens["train"].train_test_split(test_size=0.3, shuffle=True, seed=SPLIT_SEED)
print(dataset)

# Train Sklearn Models

In [None]:
# Classifiers that can be trained batch by batch, keyed by run name.
partial_fit_classifiers = dict(
    SGDClassifier=SGDClassifier(),
    Perceptron=Perceptron(),
    MultinomialNB=MultinomialNB(),
    BernoulliNB=BernoulliNB(),
    PassiveAggressive=PassiveAggressiveClassifier(),
)
# Label set handed to the classifiers' partial_fit(classes=...).
# NOTE(review): assumes the dataset's labels are exactly 0..4 - confirm.
all_classes = np.arange(5)

# Regressors that can be trained batch by batch, keyed by their class name.
partial_fit_regressors = {
    estimator.__name__: estimator()
    for estimator in (SGDRegressor, PassiveAggressiveRegressor)
}

In [None]:
def log_classification_metrics(log, labels, predictions):
    """Compute accuracy, RMSE and MAE for a set of predictions and append them to ``log``.

    Args:
        log: dict with list values under the keys ``'accuracy'``, ``'rmse'``
            and ``'mae'``; one value is appended to each list.
        labels: true labels, in any form accepted by the sklearn metric functions.
        predictions: predicted labels, aligned with ``labels``.

    Returns:
        Tuple ``(accuracy, rmse, mae)`` of the values that were appended.
    """
    accuracy = accuracy_score(labels, predictions)
    # mean_squared_error returns the *squared* error; take the root so the
    # value really is the RMSE it is stored and printed as.
    rmse = float(np.sqrt(mean_squared_error(labels, predictions)))
    mae = mean_absolute_error(labels, predictions)
    log['accuracy'].append(accuracy)
    log['rmse'].append(rmse)
    log['mae'].append(mae)
    return accuracy, rmse, mae

def log_regression_metrics(log, labels, predictions):
    """Compute RMSE and MAE for a set of predictions and append them to ``log``.

    Args:
        log: dict with list values under the keys ``'rmse'`` and ``'mae'``;
            one value is appended to each list.
        labels: true target values, in any form accepted by the sklearn metric functions.
        predictions: predicted values, aligned with ``labels``.

    Returns:
        Tuple ``(rmse, mae)`` of the values that were appended.
    """
    # mean_squared_error returns the *squared* error; take the root so the
    # value really is the RMSE it is stored and printed as.
    rmse = float(np.sqrt(mean_squared_error(labels, predictions)))
    mae = mean_absolute_error(labels, predictions)
    log['rmse'].append(rmse)
    log['mae'].append(mae)
    return rmse, mae

In [None]:
###############################################################################
# Main: run configuration read by train()
###############################################################################
# DataLoader settings: per-epoch batch caps (train, validation), batch size,
# and whether the training loader shuffles
train_limit, valid_limit = 500, 100
batch_size = 256
shuffle = True
# Hyperparameters
num_epochs = 10

def train(name, model, mode, scale=False, all_classes=[]):
    """Train ``model`` incrementally with ``partial_fit`` and checkpoint it after every epoch.

    Reads the notebook-level settings ``dataset``, ``batch_size``, ``shuffle``,
    ``num_epochs``, ``train_limit`` and ``valid_limit``. In each epoch at most
    ``train_limit`` training batches and ``valid_limit`` validation batches are
    processed. For every training batch the scaler (if enabled) is updated, the
    model is fitted on the batch and then asked to predict that same batch, so
    training metrics are measured on data the model has just seen.

    Files written (relative to the current working directory):
        sklearn/<name>/logs.json
            epoch-level metrics for all epochs so far (rewritten every epoch)
        sklearn/<name>/<name>_chkpt_<epoch>/model.joblib
        sklearn/<name>/<name>_chkpt_<epoch>/scaler.joblib   (only if ``scale``)
        sklearn/<name>/<name>_chkpt_<epoch>/chkpt_logs.json
            per-batch metrics of that epoch

    Args:
        name: run name; used for the output folder and in the printed summary.
        model: estimator providing ``partial_fit`` and ``predict``. It is
            updated in place.
        mode: ``"cls"`` for classifiers (accuracy, RMSE, MAE are logged) or
            ``"reg"`` for regressors (RMSE, MAE are logged; the ``'accuracy'``
            lists in the log files stay empty).
        scale: if true, inputs are standardized with a ``StandardScaler`` that
            is itself fitted incrementally on the training batches.
        all_classes: every label value, forwarded as ``classes=`` to
            ``partial_fit``; required when ``mode == "cls"``. The default list
            is never mutated.

    Raises:
        ValueError: if ``mode`` is not ``"cls"``/``"reg"``, or if ``mode`` is
            ``"cls"`` and ``all_classes`` is empty.
    """
    # Fail fast: with an unknown mode the model would never be fitted and the
    # run would only break later inside predict().
    if mode not in ("cls", "reg"):
        raise ValueError(f"mode must be 'cls' or 'reg', got {mode!r}")
    if mode == "cls" and len(all_classes) == 0:
        raise ValueError("all_classes must list every label when mode='cls'")

    root = os.path.join('sklearn', name)
    os.makedirs(root, exist_ok=True)
    logs = {'train':{'accuracy': [], 'rmse': [], 'mae': []},
            'valid':{'accuracy': [], 'rmse': [], 'mae': []}}

    training_set = DataLoader(dataset["train"], batch_size=batch_size, shuffle=shuffle)
    validation_set = DataLoader(dataset["test"], batch_size=batch_size, shuffle=False)
    # Progress-bar totals: never claim more batches than the loader can deliver.
    n_train_batches = min(train_limit, len(training_set))
    n_valid_batches = min(valid_limit, len(validation_set))
    # Metric logger matching the mode (mode was validated above).
    log_metrics = log_classification_metrics if mode == "cls" else log_regression_metrics

    if scale:
        scaler = StandardScaler()
    for epoch in tqdm(range(num_epochs), total=num_epochs, desc="Total progress"):
        epoch_logs = {'train':{'accuracy': [], 'rmse': [], 'mae': []},
                'valid':{'accuracy': [], 'rmse': [], 'mae': []}}

        # train
        y_label_train = []
        y_predict_train = []
        for i,batch in tqdm(enumerate(training_set), total=n_train_batches, desc=f"     Training epoch {epoch}"):
            if i==train_limit: break

            # get data
            data = batch["input_ids"].squeeze(1)
            if scale:
                # the scaler statistics keep accumulating over batches and epochs
                scaler.partial_fit(data)
                data = scaler.transform(data)
            labels = batch["label"]
            y_label_train += labels.tolist()

            # fit
            if mode=="cls":
                model.partial_fit(data, labels, classes=all_classes)
            else:
                model.partial_fit(data, labels)

            # predict (on the batch that was just used for fitting)
            prediction = model.predict(data)
            y_predict_train += prediction.tolist()

            # get batch metrics
            log_metrics(epoch_logs['train'], labels.tolist(), prediction.tolist())

        # validate
        y_label_valid = []
        y_predict_valid = []
        for i,batch in tqdm(enumerate(validation_set), total=n_valid_batches, desc=f"     Validating epoch {epoch}"):
            if i==valid_limit: break

            # get data
            data = batch["input_ids"].squeeze(1)
            if scale:
                # transform only: the scaler is never fitted on validation data
                data = scaler.transform(data)
            labels = batch["label"]
            y_label_valid += labels.tolist()

            # predict
            prediction = model.predict(data)
            y_predict_valid += prediction.tolist()

            # get batch metrics
            log_metrics(epoch_logs['valid'], labels.tolist(), prediction.tolist())

        # get epoch metrics (computed over all batches seen in this epoch)
        if mode=="cls":
            accuracy, rmse, mae = log_metrics(logs['train'], y_label_train, y_predict_train)
            print(f"\t{name} \tTraining \tAccuracy: {accuracy:.3f} \tRMSE: {rmse:.3f} \tMAE: {mae: .3f}")
            accuracy, rmse, mae = log_metrics(logs['valid'], y_label_valid, y_predict_valid)
            print(f"\t{name} \tValidation \tAccuracy: {accuracy:.3f} \tRMSE: {rmse:.3f} \tMAE: {mae: .3f}")
        else:
            rmse, mae = log_metrics(logs['train'], y_label_train, y_predict_train)
            print(f"\t{name} \tTraining \tRMSE: {rmse:.3f} \tMAE: {mae: .3f}")
            rmse, mae = log_metrics(logs['valid'], y_label_valid, y_predict_valid)
            print(f"\t{name} \tValidation \tRMSE: {rmse:.3f} \tMAE: {mae: .3f}")

        # save checkpoints
        checkpoint_folder = os.path.join(root, f"{name}_chkpt_{epoch}")
        os.makedirs(checkpoint_folder, exist_ok=True)
        joblib.dump(model, os.path.join(checkpoint_folder, "model.joblib"))
        if scale:
            joblib.dump(scaler, os.path.join(checkpoint_folder, "scaler.joblib"))
        with open(os.path.join(checkpoint_folder, "chkpt_logs.json"), 'w') as f:
            json.dump(epoch_logs, f)
        with open(os.path.join(root, "logs.json"), 'w') as f:
            json.dump(logs, f)

In [None]:
# Classifier runs as (run name, standardize inputs?). The run name doubles as
# the key into partial_fit_classifiers.
classifier_runs = (
    ("SGDClassifier", True),
    ("Perceptron", False),
    ("MultinomialNB", False),
    ("BernoulliNB", False),
    ("PassiveAggressive", False),
)
for run_name, use_scaler in classifier_runs:
    train(run_name, partial_fit_classifiers[run_name], mode="cls", scale=use_scaler, all_classes=all_classes)

# Regressor runs, same layout; no class list is passed in regression mode.
regressor_runs = (
    ("SGDRegressor", True),
    ("PassiveAggressiveRegressor", False),
)
for run_name, use_scaler in regressor_runs:
    train(run_name, partial_fit_regressors[run_name], mode="reg", scale=use_scaler)