<a href="https://colab.research.google.com/github/ChavChavC/BT4222/blob/main/Simple_ML_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Models covered:

* Multinomial Naive Bayes
* SVM / logistic regression
* Random Forest
* kNN Classifier
* LSTM

In [None]:
!pip install datasets optuna

## Import data and split into train and validation sets

In [3]:
from datasets import load_dataset

# Fetch the financial-news sentiment corpus; the captured output shows it
# ships with a train split and a test split.
dataset = load_dataset(path="Jean-Baptiste/financial_news_sentiment")

Downloading readme:   0%|          | 0.00/880 [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/768k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/145k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/1512 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/267 [00:00<?, ? examples/s]

In [10]:
import pandas as pd

# Materialise the train split as a DataFrame for inspection and splitting.
train_data = pd.DataFrame(data=dataset["train"])

# Preview the first five rows.
train_data.head(n=5)

Unnamed: 0,summary_detail,title,summary_detail_with_title,topic,labels,__index_level_0__
0,"TORONTO, Oct. 05, 2022 (GLOBE NEWSWIRE) -- W...",Wesdome Exploration Defines up Plunge Extensio...,Wesdome Exploration Defines up Plunge Extensio...,6,2,884
1,"Greenville, South Carolina, Nov. 15, 2022 (G...",ARCPOINT INC. TO HOLD Q3 FINANCIAL RESULTS CON...,ARCPOINT INC. TO HOLD Q3 FINANCIAL RESULTS CON...,2,1,711
2,"TORONTO, Nov. 09, 2022 (GLOBE NEWSWIRE) -- O...",Existing Multi-Unit Freshii Franchisee Signs A...,Existing Multi-Unit Freshii Franchisee Signs A...,0,2,268
3,"VANCOUVER, British Columbia, Oct. 06, 2022 (...","Fortuna reports production of 101,840 gold equ...","Fortuna reports production of 101,840 gold equ...",2,2,1504
4,"VANCOUVER, British Columbia, Oct. 17, 2022 (...",HYTN Awarded Controlled Drugs and Substances D...,HYTN Awarded Controlled Drugs and Substances D...,1,2,66


In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training data for validation; the fixed random_state
# keeps the shuffled split identical across runs.
X_train, X_val, y_train, y_val = train_test_split(
    train_data["summary_detail_with_title"],
    train_data["labels"],
    test_size=0.2,
    shuffle=True,
    random_state=4222,
)

In [19]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Bag-of-words counts: the vocabulary is learned from the training texts only,
# then reused unchanged for the validation texts.
vectorizer = CountVectorizer()
X_train_count = vectorizer.fit_transform(X_train)
X_val_count = vectorizer.transform(X_val)

# TF-IDF re-weighting: statistics are likewise fitted on the training counts only.
transformer = TfidfTransformer()
X_train_feature = transformer.fit_transform(X_train_count)
X_val_feature = transformer.transform(X_val_count)

## Test out different sklearn classifiers

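The main idea here is that we use the `optuna` library to fine-tune some of the hyperparameters of each model we have chosen. It experiments with hyperparameter values drawn from the ranges we specify and chooses the set of hyperparameters that results in the greatest validation accuracy. Note that the same validation set is used both to select the hyperparameters and to report the accuracy, so the "best params" scores below are likely to be somewhat optimistic.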

In [48]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression  #, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score


def train_model(X_train, X_val, y_train, y_val, model_fn, params=None):
    """Fit a fresh model on the training data and predict the validation data.

    Args:
        X_train: Training features, passed to ``model.fit``.
        X_val: Validation features, passed to ``model.predict``.
        y_train: Training labels, passed to ``model.fit``.
        y_val: Validation labels. Not used here; kept so that the signature
            mirrors the objective functions.
        model_fn: Callable returning an unfitted model exposing ``fit`` and
            ``predict``.
        params: Optional dict of keyword arguments for ``model_fn``. ``None``
            or an empty dict builds the model with its defaults.

    Returns:
        Whatever ``model.predict(X_val)`` returns.
    """
    # A falsy ``params`` (None or {}) expands to no keyword arguments at all.
    estimator = model_fn(**(params or {}))
    estimator.fit(X_train, y_train)
    return estimator.predict(X_val)


def objective_MNB(trial, X_train, X_val, y_train, y_val):
    """Optuna objective for Multinomial Naive Bayes.

    Samples ``alpha`` from [0.01, 10.0], fits the classifier through
    ``train_model`` and returns the accuracy of its validation predictions.
    """
    alpha = trial.suggest_float("alpha", 0.01, 10.0)

    predictions = train_model(
        X_train, X_val, y_train, y_val, MultinomialNB, {"alpha": alpha}
    )
    return accuracy_score(y_val, predictions)


def objective_SVM(trial, X_train, X_val, y_train, y_val):
    """Optuna objective for a linear SVM (``LinearSVC``).

    Samples the loss function, ``C`` and ``intercept_scaling``, fits the
    classifier through ``train_model`` and returns the accuracy of its
    validation predictions.
    """
    # Suggestions are requested in a fixed order: loss, C, intercept_scaling.
    loss = trial.suggest_categorical("loss", ["hinge", "squared_hinge"])
    # Regularisation parameter.
    C = trial.suggest_float("C", 0.1, 10.0)
    # Lets the intercept be regularised differently from the other features.
    intercept_scaling = trial.suggest_float("intercept_scaling", 1.0, 10.0)

    predictions = train_model(
        X_train,
        X_val,
        y_train,
        y_val,
        LinearSVC,
        {"loss": loss, "C": C, "intercept_scaling": intercept_scaling},
    )
    return accuracy_score(y_val, predictions)


def objective_LR(trial, X_train, X_val, y_train, y_val):
    """Optuna objective for logistic regression.

    Samples ``C`` from [0.01, 1.0] and ``max_iter`` from [500, 1000], fits the
    classifier through ``train_model`` and returns the accuracy of its
    validation predictions. The solver is left at the estimator's default.
    """
    # Regularisation parameter.
    C = trial.suggest_float("C", 0.01, 1.0)
    max_iter = trial.suggest_int("max_iter", 500, 1000)

    predictions = train_model(
        X_train,
        X_val,
        y_train,
        y_val,
        LogisticRegression,
        {"C": C, "max_iter": max_iter},
    )
    return accuracy_score(y_val, predictions)


def objective_RF(trial, X_train, X_val, y_train, y_val):
    """Optuna objective for a random forest classifier.

    Samples the number of trees, the maximum depth and the minimum sample
    counts for splits and leaves, fits the classifier through ``train_model``
    and returns the accuracy of its validation predictions.
    """
    # (name, low, high) integer ranges, suggested in this order.
    int_ranges = (
        ("n_estimators", 5, 300),
        ("max_depth", 5, 100),
        ("min_samples_split", 2, 30),
        ("min_samples_leaf", 1, 10),
    )
    forest_params = {}
    for name, low, high in int_ranges:
        forest_params[name] = trial.suggest_int(name, low, high)

    predictions = train_model(
        X_train, X_val, y_train, y_val, RandomForestClassifier, forest_params
    )
    return accuracy_score(y_val, predictions)


def objective_KN(trial, X_train, X_val, y_train, y_val):
    """Optuna objective for a k-nearest-neighbours classifier.

    Samples ``n_neighbors`` and ``leaf_size`` from [1, 100] each, fits the
    classifier through ``train_model`` and returns the accuracy of its
    validation predictions.
    """
    n_neighbors = trial.suggest_int("n_neighbors", 1, 100)
    # NOTE(review): leaf_size presumably only matters for tree-based neighbour
    # search; with sparse TF-IDF input scikit-learn may fall back to brute
    # force, making this parameter a no-op -- confirm before relying on it.
    leaf_size = trial.suggest_int("leaf_size", 1, 100)

    predictions = train_model(
        X_train,
        X_val,
        y_train,
        y_val,
        KNeighborsClassifier,
        {"n_neighbors": n_neighbors, "leaf_size": leaf_size},
    )
    return accuracy_score(y_val, predictions)

In [49]:
import optuna
optuna.logging.set_verbosity(optuna.logging.ERROR)  # hide per-trial log lines

N_TRIALS = 25        # Optuna trials run for each classifier
SAMPLER_SEED = 4222  # same value as the random_state of the train/validation split

# (objective function, estimator class) pairs, tuned one after another.
objectives = [
    [objective_MNB, MultinomialNB],
    [objective_SVM, LinearSVC],
    [objective_LR, LogisticRegression],
    [objective_RF, RandomForestClassifier],
    [objective_KN, KNeighborsClassifier]
]

for objective, model in objectives:
    # A seeded sampler makes the sequence of suggested hyperparameters
    # repeatable across "Restart & Run All". Estimators with their own
    # internal randomness (e.g. RandomForestClassifier) are not seeded here,
    # so their scores can still vary slightly between runs.
    study = optuna.create_study(
        direction='maximize',
        sampler=optuna.samplers.TPESampler(seed=SAMPLER_SEED),
    )
    # The lambda is consumed inside this iteration, so it always sees the
    # current `objective`.
    study.optimize(lambda trial: objective(trial, X_train_feature, X_val_feature, y_train, y_val), n_trials=N_TRIALS)
    params = study.best_params

    print("Classifier:", model)
    # Baseline: the estimator with its default hyperparameters.
    y_preds = train_model(X_train_feature, X_val_feature, y_train, y_val, model)
    print("Default params acc:", accuracy_score(y_val, y_preds))
    print("Best params:", params)
    # Refit with the best hyperparameters found by the study.
    y_preds_2 = train_model(X_train_feature, X_val_feature, y_train, y_val, model, params)
    print("Best params acc:", accuracy_score(y_val, y_preds_2))
    print()


Classifier: <class 'sklearn.naive_bayes.MultinomialNB'>
Default params acc: 0.693069306930693
Best params: {'alpha': 0.1782215132772898}
Best params acc: 0.7293729372937293





Classifier: <class 'sklearn.svm._classes.LinearSVC'>
Default params acc: 0.7887788778877888
Best params: {'loss': 'squared_hinge', 'C': 0.715094645877762, 'intercept_scaling': 8.727679764317223}




Best params acc: 0.7920792079207921

Classifier: <class 'sklearn.linear_model._logistic.LogisticRegression'>
Default params acc: 0.768976897689769
Best params: {'C': 0.8600797820165759, 'max_iter': 715}
Best params acc: 0.7722772277227723

Classifier: <class 'sklearn.ensemble._forest.RandomForestClassifier'>
Default params acc: 0.7557755775577558
Best params: {'n_estimators': 256, 'max_depth': 86, 'min_samples_split': 6, 'min_samples_leaf': 1}
Best params acc: 0.768976897689769

Classifier: <class 'sklearn.neighbors._classification.KNeighborsClassifier'>
Default params acc: 0.6963696369636964
Best params: {'n_neighbors': 70, 'leaf_size': 36}
Best params acc: 0.7260726072607261

