<a href="https://colab.research.google.com/github/ChavChavC/BT4222/blob/main/Simple_ML_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Models covered:

* Multinomial Naive Bayes
* SVM / logistic regression
* Random Forest
* kNN Classifier
* LSTM

In [None]:
!pip install datasets optuna

## Import data and split into train and validation

In [2]:
from datasets import load_dataset

dataset = load_dataset("Jean-Baptiste/financial_news_sentiment")

Downloading readme:   0%|          | 0.00/880 [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/768k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/145k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/1512 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/267 [00:00<?, ? examples/s]

In [3]:
import pandas as pd

train_data = pd.DataFrame(dataset["train"])

train_data.head()

Unnamed: 0,summary_detail,title,summary_detail_with_title,topic,labels,__index_level_0__
0,"TORONTO, Oct. 05, 2022 (GLOBE NEWSWIRE) -- W...",Wesdome Exploration Defines up Plunge Extensio...,Wesdome Exploration Defines up Plunge Extensio...,6,2,884
1,"Greenville, South Carolina, Nov. 15, 2022 (G...",ARCPOINT INC. TO HOLD Q3 FINANCIAL RESULTS CON...,ARCPOINT INC. TO HOLD Q3 FINANCIAL RESULTS CON...,2,1,711
2,"TORONTO, Nov. 09, 2022 (GLOBE NEWSWIRE) -- O...",Existing Multi-Unit Freshii Franchisee Signs A...,Existing Multi-Unit Freshii Franchisee Signs A...,0,2,268
3,"VANCOUVER, British Columbia, Oct. 06, 2022 (...","Fortuna reports production of 101,840 gold equ...","Fortuna reports production of 101,840 gold equ...",2,2,1504
4,"VANCOUVER, British Columbia, Oct. 17, 2022 (...",HYTN Awarded Controlled Drugs and Substances D...,HYTN Awarded Controlled Drugs and Substances D...,1,2,66


In [4]:
len(train_data)

1512

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(train_data["summary_detail_with_title"], train_data["labels"], test_size=0.2, shuffle=True, random_state=4222)

using cleaned data:

In [5]:
from google.colab import drive
drive.mount('/content/gdrive')

data = pd.read_csv("/content/gdrive/My Drive/BT4222/data.csv")

print(len(data))
data.head()

Mounted at /content/gdrive
13710


Unnamed: 0.1,Unnamed: 0,title,labels
0,0,Gildan Activewear Reports Strong Results for t...,2
1,1,TRILLION ENERGY ANNOUNCES FLOW TEST RESULTS FO...,2
2,2,CAPREIT Announces October Distribution,1
3,3,Unigold Inc Delivers Positive Feasibility Stud...,2
4,4,Wallbridge Provides Update on Archer Explorati...,1


In [6]:
from sklearn.model_selection import train_test_split

# create train : val : test split of 6 : 2 : 2

X_train, X_test, y_train, y_test = train_test_split(data["title"], data["labels"], test_size=0.2, shuffle=True, random_state=4222)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, shuffle=True, random_state=4222)

In [20]:
y_train = y_train[~X_train.isnull()]
X_train = X_train[~X_train.isnull()]
y_val = y_val[~X_val.isnull()]
X_val = X_val[~X_val.isnull()]

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

vectorizer = CountVectorizer().fit(X_train)
X_train_count = vectorizer.transform(X_train)
X_val_count = vectorizer.transform(X_val)

transformer = TfidfTransformer().fit(X_train_count)
X_train_feature = transformer.transform(X_train_count)
X_val_feature = transformer.transform(X_val_count)

## Test out different sklearn classifiers

The main idea here is that we are using `optuna` library to help us finetune some of the hyperparameters for each of the models we have chosen. It will experiment with a range of hyperparameter values, as specified, and choose the set of hyperparameters that result in the greatest validation accuracy.

In [33]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression  #, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
# from sklearn.preprocessing import StandardScaler


def train_model(X_train, X_val, y_train, y_val, model_fn, params=None):
    if params:
        model = model_fn(**params)
    else:
        model = model_fn()

    # if model_fn == LinearSVC:
    #     scaler = StandardScaler()
    #     X_train = scaler.fit_transform(X_train)
    #     X_val = scaler.transform(X_val)

    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)

    return y_pred


def objective_MNB(trial, X_train, X_val, y_train, y_val):
    params = {
        "alpha": trial.suggest_float("alpha", 0.01, 10.0),
    }

    y_pred = train_model(X_train, X_val, y_train, y_val, MultinomialNB, params)

    return accuracy_score(y_val, y_pred)


def objective_SVM(trial, X_train, X_val, y_train, y_val):
    params = {
        # "loss": trial.suggest_categorical("loss", ["hinge", "squared_hinge"]),
        "C": trial.suggest_float("C", 0.1, 10.0),  # regularisation parameter
        "intercept_scaling": trial.suggest_float("intercept_scaling", 1.0, 10.0),  # allows intercept to have different regularisation behaviour from other features
        "max_iter": 10000,
        "dual": False
    }

    y_pred = train_model(X_train, X_val, y_train, y_val, LinearSVC, params)

    return accuracy_score(y_val, y_pred)


def objective_LR(trial, X_train, X_val, y_train, y_val):
    params = {
        "C": trial.suggest_float("C", 0.01, 1.0),  # regularisation parameter
        "max_iter": trial.suggest_int("max_iter", 500, 1000),
        # "solver": trial.suggest_categorical("solver", ["lbfgs", "liblinear", "newton-cg", "newton-cholesky", "sag", "saga"])
    }

    y_pred = train_model(X_train, X_val, y_train, y_val, LogisticRegression, params)

    return accuracy_score(y_val, y_pred)


def objective_RF(trial, X_train, X_val, y_train, y_val):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 5, 300),
        "max_depth": trial.suggest_int("max_depth", 5, 100),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 30),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
        # "max_leaf_nodes": trial.suggest_int("max_leaf_nodes", 5, 1000),
        # "ccp_alpha": trial.suggest_float("ccp_alpha", 0.0, 10.0)
    }

    y_pred = train_model(X_train, X_val, y_train, y_val, RandomForestClassifier, params)

    return accuracy_score(y_val, y_pred)


def objective_KN(trial, X_train, X_val, y_train, y_val):
    params = {
        "n_neighbors": trial.suggest_int("n_neighbors", 1, 100),
        "leaf_size": trial.suggest_int("leaf_size", 1, 100)
    }

    y_pred = train_model(X_train, X_val, y_train, y_val, KNeighborsClassifier, params)

    return accuracy_score(y_val, y_pred)

In [115]:
import optuna
optuna.logging.set_verbosity(optuna.logging.ERROR)

objectives = [
    [objective_MNB, MultinomialNB],
    [objective_SVM, LinearSVC],
    [objective_LR, LogisticRegression],
    [objective_RF, RandomForestClassifier],
    [objective_KN, KNeighborsClassifier]
]

best_params_dct = {}

for objective, model in objectives:
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, X_train_feature, X_val_feature, y_train, y_val), n_trials=30)
    params = study.best_params
    best_params_dct[str(model)] = params

    print("Classifier:", model)
    y_preds = train_model(X_train_feature, X_val_feature, y_train, y_val, model)
    print("Default params acc:", accuracy_score(y_val, y_preds))
    print("Best params:", params)
    y_preds_2 = train_model(X_train_feature, X_val_feature, y_train, y_val, model, params)
    print("Best params acc:", accuracy_score(y_val, y_preds_2))
    print()


Classifier: <class 'sklearn.naive_bayes.MultinomialNB'>
Default params acc: 0.8226924480116746
Best params: {'alpha': 0.07811930460001588}
Best params acc: 0.8540678584458227

Classifier: <class 'sklearn.svm._classes.LinearSVC'>
Default params acc: 0.874133527909522
Best params: {'C': 0.8560958573054762, 'intercept_scaling': 9.231320985461904}




Best params acc: 0.8766873403867201

Classifier: <class 'sklearn.linear_model._logistic.LogisticRegression'>
Default params acc: 0.8602699744618753
Best params: {'C': 0.99389298832784, 'max_iter': 997}
Best params acc: 0.8602699744618753

Classifier: <class 'sklearn.ensemble._forest.RandomForestClassifier'>
Default params acc: 0.8642831083546151
Best params: {'n_estimators': 22, 'max_depth': 100, 'min_samples_split': 12, 'min_samples_leaf': 1}
Best params acc: 0.8453119299525721

Classifier: <class 'sklearn.neighbors._classification.KNeighborsClassifier'>
Default params acc: 0.842758117475374
Best params: {'n_neighbors': 10, 'leaf_size': 82}
Best params acc: 0.8515140459686246



### Test out different model aggregation algorithms

Ideas:

1. Have 2 different classifiers, the first one classifies everything, then whatever it classifies wrongly or un-confidently (based on a certain probability threshold value) gets passed on to the second classifier. The second classifier is only fitted on these "difficult" cases, to try to classify them correctly.



In [119]:
import numpy as np

def get_repred_ind(clf, X, y, threshold=0.6):
    pred_probs = clf.predict_proba(X)
    threshold_col = np.expand_dims(np.array([threshold]*pred_probs.shape[0]), axis=1)
    pred_probs_threshed = np.append(pred_probs, threshold_col, axis=1)
    repredict_ind = np.where(np.argmax(pred_probs_threshed, axis=1) != y)[0]
    return repredict_ind

def train_ensem_model(clf1, clf2, X_train, y_train):
    clf1.fit(X_train_feature, y_train)
    repredict_ind = get_repred_ind(clf1, X_train, y_train)
    clf2.fit(X_train[repredict_ind], np.array(y_train)[repredict_ind])
    return clf1, clf2

def eval_ensem_model(clf1, clf2, X_val, y_val, threshold=0.6):
    pred_probs = clf1.predict_proba(X_val)
    repredict_ind = get_repred_ind(clf1, X_val, y_val)
    repredict_preds = clf2.predict(X_val[repredict_ind])
    preds = clf1.predict(X_val)
    preds[repredict_ind] = repredict_preds
    acc = accuracy_score(y_val, preds)
    print("Validation acc:", acc)
    return preds, acc


Models with `predict_proba`: MultinomialNB, LogisticRegression, RandomForest, KNeighbors
(Basically all except LinearSVC)


In [120]:
first = [MultinomialNB, LogisticRegression, RandomForestClassifier, KNeighborsClassifier]
second = [MultinomialNB, LinearSVC, LogisticRegression, RandomForestClassifier, KNeighborsClassifier]
classifier_pairs = []
for clf1 in first:
    for clf2 in second:
        classifier_pairs.append([clf1, clf2])

max_val_acc = 0
best_clf1 = None
best_clf2 = None
for clf1, clf2 in classifier_pairs:
    params1 = best_params_dct[str(clf1)]
    params2 = best_params_dct[str(clf2)]
    clf1, clf2 = train_ensem_model(clf1(**params1), clf2(**params2), X_train_feature, y_train)
    print("clf1:", clf1)
    print("clf2:", clf2)
    preds, acc = eval_ensem_model(clf1, clf2, X_val_feature, y_val)
    print()
    if acc > max_val_acc:
        max_val_acc = acc
        best_clf1 = clf1
        best_clf2 = clf2

clf1: MultinomialNB(alpha=0.07811930460001588)
clf2: MultinomialNB(alpha=0.07811930460001588)
Validation acc: 0.9452754469171835

clf1: MultinomialNB(alpha=0.07811930460001588)
clf2: LinearSVC(C=0.8560958573054762, intercept_scaling=9.231320985461904)
Validation acc: 0.9456402772710689





clf1: MultinomialNB(alpha=0.07811930460001588)
clf2: LogisticRegression(C=0.99389298832784, max_iter=997)
Validation acc: 0.9346953666545057

clf1: MultinomialNB(alpha=0.07811930460001588)
clf2: RandomForestClassifier(max_depth=100, min_samples_split=12, n_estimators=22)
Validation acc: 0.9387085005472455

clf1: MultinomialNB(alpha=0.07811930460001588)
clf2: KNeighborsClassifier(leaf_size=82, n_neighbors=10)
Validation acc: 0.9463699379788398

clf1: LogisticRegression(C=0.99389298832784, max_iter=997)
clf2: MultinomialNB(alpha=0.07811930460001588)
Validation acc: 0.9730025538124772





clf1: LogisticRegression(C=0.99389298832784, max_iter=997)
clf2: LinearSVC(C=0.8560958573054762, intercept_scaling=9.231320985461904)
Validation acc: 0.9726377234585918

clf1: LogisticRegression(C=0.99389298832784, max_iter=997)
clf2: LogisticRegression(C=0.99389298832784, max_iter=997)
Validation acc: 0.9620576431959139

clf1: LogisticRegression(C=0.99389298832784, max_iter=997)
clf2: RandomForestClassifier(max_depth=100, min_samples_split=12, n_estimators=22)
Validation acc: 0.9591390003648304

clf1: LogisticRegression(C=0.99389298832784, max_iter=997)
clf2: KNeighborsClassifier(leaf_size=82, n_neighbors=10)
Validation acc: 0.9700839109813937

clf1: RandomForestClassifier(max_depth=100, min_samples_split=12, n_estimators=22)
clf2: MultinomialNB(alpha=0.07811930460001588)
Validation acc: 0.9799343305363006





clf1: RandomForestClassifier(max_depth=100, min_samples_split=12, n_estimators=22)
clf2: LinearSVC(C=0.8560958573054762, intercept_scaling=9.231320985461904)
Validation acc: 0.9784750091207588

clf1: RandomForestClassifier(max_depth=100, min_samples_split=12, n_estimators=22)
clf2: LogisticRegression(C=0.99389298832784, max_iter=997)
Validation acc: 0.9722728931047063

clf1: RandomForestClassifier(max_depth=100, min_samples_split=12, n_estimators=22)
clf2: RandomForestClassifier(max_depth=100, min_samples_split=12, n_estimators=22)
Validation acc: 0.9627873039036848

clf1: RandomForestClassifier(max_depth=100, min_samples_split=12, n_estimators=22)
clf2: KNeighborsClassifier(leaf_size=82, n_neighbors=10)
Validation acc: 0.9759211966435607

clf1: KNeighborsClassifier(leaf_size=82, n_neighbors=10)
clf2: MultinomialNB(alpha=0.07811930460001588)
Validation acc: 0.9824881430134987





clf1: KNeighborsClassifier(leaf_size=82, n_neighbors=10)
clf2: LinearSVC(C=0.8560958573054762, intercept_scaling=9.231320985461904)
Validation acc: 0.9813936519518424

clf1: KNeighborsClassifier(leaf_size=82, n_neighbors=10)
clf2: LogisticRegression(C=0.99389298832784, max_iter=997)
Validation acc: 0.9715432323969354

clf1: KNeighborsClassifier(leaf_size=82, n_neighbors=10)
clf2: RandomForestClassifier(max_depth=100, min_samples_split=12, n_estimators=22)
Validation acc: 0.967894928858081

clf1: KNeighborsClassifier(leaf_size=82, n_neighbors=10)
clf2: KNeighborsClassifier(leaf_size=82, n_neighbors=10)
Validation acc: 0.9788398394746443



In [124]:
print("Best ensemble model:")
print("clf1:", best_clf1)
print("clf2:", best_clf2)
print("val acc:", max_val_acc)

y_test = y_test[~X_test.isnull()]
X_test = X_test[~X_test.isnull()]
X_test_count = vectorizer.transform(X_test)
X_test_feature = transformer.transform(X_test_count)
preds, acc = eval_ensem_model(best_clf1, best_clf2, X_test_feature, y_test)
print("test acc:", acc)

Best ensemble model:
clf1: KNeighborsClassifier(leaf_size=82, n_neighbors=10)
clf2: MultinomialNB(alpha=0.07811930460001588)
val acc: 0.9824881430134987
Validation acc: 0.9802919708029197
test acc: 0.9802919708029197


## Train LSTM model

The code below has not been tested yet, requires tokenization performed during feature engineering.

In [None]:
import torch
import torch.nn as nn

class SentimentLSTM(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_dim, vocab_size, embedding_dim, pad=0, dropout=0):

        super().__init__()

        self.input_dim = input_dim
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.embedding_dim

        self.embedding = nn.Embedding(vocab_size, embedding_dim, pad)

        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.hidden_dim,
            batch_first=True)

        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.softmax = nn.Softmax(0)


    def forward(self, x: torch.Tensor) -> torch.Tensor:
        output = self.embedding(x)
        output = self.dropout(output)
        output = self.lstm(x)[0]
        output = self.relu(output)
        output = self.fc(output)
        output = self.softmax(output)

        return output


In [None]:
from torch.utils.data import TensorDataset, DataLoader
import numpy as np

def get_dataloader(X, y, batch_size):
    y = np.array(y)
    data = TensorDataset(X, torch.FloatTensor(y))
    loader = DataLoader(data, batch_size=batch_size, shuffle=True)

    return loader

In [None]:
from google.colab import drive
drive.mount('/content/drive')
dir = "/content/gdrive/My Drive/BT4222"

Mounted at /content/drive


In [None]:
import torch
import torch.nn as nn
from torch.optim import Adam
import pickle

def trainLSTM(X_train, X_val, y_train, y_val, batch_size, epochs, input_dim, vocab_size, pad, dir, dropout=0.05, lr=1e-4):

    train_loader = get_dataloader(X_train, y_train, batch_size)
    val_loader = get_dataloader(X_val, y_val, batch_size)

    # hyperparams
    output_dim = 3
    hidden_dim = 512
    embedding_dim = 64

    if torch.cuda.is_available():
        device = torch.device("cuda")
        print("GPU is available")
    else:
        device = torch.device("cpu")
        print("GPU not available, CPU used")

    # initialise LSTM model, loss function and optimizer
    sentimentModel = SentimentLSTM(input_dim, output_dim, hidden_dim, vocab_size, embedding_dim, pad, dropout)
    sentimentModel.to(device)
    loss_fn = nn.CrossEntropyLoss()
    optimizer = Adam(sentimentModel.parameters(), lr = lr)

    epoch_train_loss = []
    epoch_train_acc = []
    epoch_val_loss = []
    epoch_val_acc = []
    best_epoch_sd = {}
    best_epoch = 0
    best_val_loss = 0
    best_val_acc = 0

    # train and evaluate LSTM model
    for epoch in range(1, epochs+1):
        print(f"Training Epoch {epoch} / {epochs}")

        train_losses = []
        train_acc = []

        sentimentModel.train()

        # train loop
        for batch_count, (inputs, labels) in enumerate(train_loader, 1):

            if batch_count == 1 or batch_count % 50 == 0:
                print(f"Training Batch {batch_count} / {len(train_loader)}")

            inputs, labels = inputs.to(device), labels.to(device)
            sentimentModel.zero_grad()

            # model training
            output = sentimentModel(inputs)
            preds = output.max(dim=0)[0]
            loss = loss_fn(preds, labels.float())
            loss.backward()
            optimizer.step()
            nn.utils.clip_grad_norm_(sentimentModel.parameters(), 1.0)

            # recording training stats
            train_losses.append(loss.item())
            train_acc.append(acc_fn(output, labels).item())


        avg_train_loss = sum(train_losses) / len(train_losses)
        avg_train_acc = sum(train_acc) / len(train_acc)
        epoch_train_loss.append(avg_train_loss)
        epoch_train_acc.append(avg_train_acc)

        avg_val_loss, avg_val_acc, preds_lst = eval_loop(sentimentModel, val_loader, device, loss_fn)
        epoch_val_loss.append(avg_val_loss)
        epoch_val_acc.append(avg_val_acc)

        print("Epoch training and validation stats")
        print("Average train loss:", avg_train_loss)
        print("Average train accuracy:", avg_train_acc)
        print("Average val loss:", avg_val_loss)
        print("Average val accuracy:", avg_val_acc)
        print()

        # record best model based on average validation loss
        if avg_val_loss == min(epoch_val_loss):
            best_epoch_sd = sentimentModel.state_dict()
            best_epoch = epoch
            best_val_loss = avg_val_loss
            best_val_acc = avg_val_acc

    # save the best model
    torch.save(best_epoch_sd, f"{dir}/LSTM.pth")
    print(f"Saved model from epoch {best_epoch}.")
    print(f"Lowest validation loss of {best_val_loss} had validation accuracy {best_val_acc}")

    # save training and validation stats
    file_stats = open(f"{dir}/LSTMstats", "ab")
    stats = {"train_loss": epoch_train_loss,
             "train_acc": epoch_train_acc,
             "val_loss": epoch_val_loss,
             "val_acc": epoch_val_acc
             }
    pickle.dump(stats, file_stats)
    file_stats.close()

    return stats


def acc_fn(labels, preds):
    preds = torch.argmax(preds, dim=1)
    corr = (preds == labels).float()
    return corr.sum() / len(corr)


def eval_loop(model, val_loader, device, loss_fn):
    val_losses = 0
    val_acc = 0
    preds_lst = []
    gt_lst = []

    model.eval()
    for inputs, labels in val_loader:

        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        preds = outputs.max(dim=0)[0]
        loss = loss_fn(preds, labels.float())

        val_losses += loss.item()
        val_acc += acc_fn(outputs, labels).item()
        preds_lst.extend(preds.tolist())
        gt_lst.extend(labels.tolist())

    # returns average val loss and acc
    return val_losses / len(val_loader), val_acc / len(val_loader), preds_lst


In [None]:
X_train_tokens, X_val_tokens, vocab_size, pad, max_token_len = NotImplemented
stats = trainLSTM(X_train_tokens, X_val_tokens, y_train, y_val, batch_size=2, epochs=2, input_dim=max_token_len, vocab_size=vocab_size, pad=pad, dir=dir, dropout=0.05, lr=1e-4)