<a href="https://colab.research.google.com/github/ChavChavC/BT4222/blob/main/Simple_ML_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Models covered:

* Multinomial Naive Bayes
* SVM
* Logistic Regression
* Random Forest
* K-Neighbors Classifier


In [None]:
!pip install datasets optuna

# Data cleaning and pre-processing

## Reading in clean data

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import pandas as pd

data = pd.read_csv("/content/gdrive/My Drive/BT4222/new_data.csv")
data = data[~data["title"].isnull()]

print(len(data))
data.head()

13702


Unnamed: 0.1,Unnamed: 0,title,labels
0,0,Gildan Activewear Reports Strong Results for t...,2
1,1,TRILLION ENERGY ANNOUNCES FLOW TEST RESULTS FO...,2
2,2,CAPREIT Announces October Distribution,1
3,3,Unigold Inc Delivers Positive Feasibility Stud...,2
4,4,Wallbridge Provides Update on Archer Explorati...,1


## Lemmatization

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('wordnet')

In [None]:
lemmatizer = WordNetLemmatizer()

def lemmatize_sentence(text):
    # Tokenize the sentence into words
    words = word_tokenize(text)
    lemmatized_words = [lemmatizer.lemmatize(word.lower()) for word in words]
    lemmatized_sentence = ' '.join(lemmatized_words)
    return lemmatized_sentence

def lemmatize_dataset(sentences):
    lemmatized = []
    for sentence in sentences:
        lemmatized.append(lemmatize_sentence(sentence))
    return lemmatized

data["title"] = pd.Series(lemmatize_dataset(data["title"]))

data.head()

Unnamed: 0.1,Unnamed: 0,title,labels
0,0,gildan activewear report strong result for the...,2
1,1,trillion energy announces flow test result for...,2
2,2,capreit announces october distribution,1
3,3,unigold inc delivers positive feasibility stud...,2
4,4,wallbridge provides update on archer explorati...,1


## Creating train-validation-test split

In [None]:
from sklearn.model_selection import train_test_split

data = data[~data["title"].isnull()]

# create train : val : test split of 6 : 2 : 2
X_train, X_test, y_train, y_test = train_test_split(
    data["title"], data["labels"], test_size=0.2, shuffle=True, random_state=4222)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, shuffle=True, random_state=4222)

# Feature engineering

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import SelectKBest, chi2
import numpy as np

# TFIDF for unigrams and bigrams
ngram_vectorizer = TfidfVectorizer(ngram_range=(1, 2))
ngram_matrix_train = ngram_vectorizer.fit_transform(X_train)
ngram_matrix_val = ngram_vectorizer.transform(X_val)
ngram_dense_matrix_train = ngram_matrix_train.todense()
ngram_dense_matrix_val = ngram_matrix_val.todense()

In [None]:
# TFIDF for unigrams only
vectorizer = CountVectorizer().fit(X_train)
X_train_count = vectorizer.transform(X_train)
X_val_count = vectorizer.transform(X_val)

transformer = TfidfTransformer().fit(X_train_count)
X_train_feature = transformer.transform(X_train_count)
X_val_feature = transformer.transform(X_val_count)

# Testing out different sklearn classifiers

The main idea here is that we are using `optuna` library to help us finetune some of the hyperparameters for each of the models we have chosen. It will experiment with a range of hyperparameter values, as specified, and choose the set of hyperparameters that result in the greatest validation accuracy.

## Hyperparameter fine-tuning for all classifiers, using TFIDF unigram data only

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import optuna
optuna.logging.set_verbosity(optuna.logging.ERROR)


def train_model(X_train, X_val, y_train, y_val, model):
    """Used to train the model and return predictions on validation data"""

    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    return y_pred

# the following functions are used as objective functions for optuna to perform
# hyperparameter fine-tuning
# the return value will be maximised in the optimisation algorithm

def objective_MNB(trial, X_train, X_val, y_train, y_val):
    params = {
        "alpha": trial.suggest_float("alpha", 0.01, 1.0),
    }

    y_pred = train_model(X_train, X_val, y_train, y_val, MultinomialNB(**params))

    return accuracy_score(y_val, y_pred)


def objective_SVM(trial, X_train, X_val, y_train, y_val):
    params = {
        # "loss": trial.suggest_categorical("loss", ["hinge", "squared_hinge"]),
        "C": trial.suggest_float("C", 0.1, 1.0),  # regularisation parameter
        "intercept_scaling": trial.suggest_float("intercept_scaling", 1.0, 10.0),  # allows intercept to have different regularisation behaviour from other features
        "max_iter": trial.suggest_int("max_iter", 5000, 10000),
        "dual": False
    }

    y_pred = train_model(X_train, X_val, y_train, y_val, LinearSVC(**params))

    return accuracy_score(y_val, y_pred)


def objective_LR(trial, X_train, X_val, y_train, y_val):
    params = {
        "C": trial.suggest_float("C", 0.01, 1.0),  # regularisation parameter
        "max_iter": trial.suggest_int("max_iter", 5000, 10000)
    }

    y_pred = train_model(X_train, X_val, y_train, y_val, LogisticRegression(**params))

    return accuracy_score(y_val, y_pred)


def objective_RF(trial, X_train, X_val, y_train, y_val):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 5, 300),
        "max_depth": trial.suggest_int("max_depth", 5, 100),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 30),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
        "max_leaf_nodes": trial.suggest_int("max_leaf_nodes", 5, 1000),
        # "ccp_alpha": trial.suggest_float("ccp_alpha", 0.0, 10.0)
    }

    y_pred = train_model(X_train, X_val, y_train, y_val, RandomForestClassifier(**params))

    return accuracy_score(y_val, y_pred)


def objective_KN(trial, X_train, X_val, y_train, y_val):
    params = {
        "n_neighbors": trial.suggest_int("n_neighbors", 1, 100),
        "leaf_size": trial.suggest_int("leaf_size", 1, 100)
    }

    y_pred = train_model(X_train, X_val, y_train, y_val, KNeighborsClassifier(**params))

    return accuracy_score(y_val, y_pred)

In [None]:
# this cell basically carries out model training multiple times with different hyperpameters
# then chooses then ones that give the best validation accuracy

objectives = [
    [objective_MNB, MultinomialNB],
    [objective_SVM, LinearSVC],
    [objective_LR, LogisticRegression],
    [objective_RF, RandomForestClassifier],
    [objective_KN, KNeighborsClassifier]
]

best_params_dct = {}

for objective, model in objectives:
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, X_train_feature, X_val_feature, y_train, y_val), n_trials=30)
    params = study.best_params
    best_params_dct[str(model)] = params

    print("Classifier:", model)
    y_preds = train_model(X_train_feature, X_val_feature, y_train, y_val, model())
    print("Default params acc:", accuracy_score(y_val, y_preds))
    print("Best params:", params)
    y_preds_2 = train_model(X_train_feature, X_val_feature, y_train, y_val, model(**params))
    print("Best params acc:", accuracy_score(y_val, y_preds_2))
    print()


Classifier: <class 'sklearn.naive_bayes.MultinomialNB'>
Default params acc: 0.6995253742241694
Best params: {'alpha': 0.15223417247258386}
Best params acc: 0.7619569185834246

Classifier: <class 'sklearn.svm._classes.LinearSVC'>
Default params acc: 0.7900693683826214
Best params: {'C': 0.7397751893387812, 'intercept_scaling': 2.9888751603831105, 'max_iter': 9347}
Best params acc: 0.7962760131434831

Classifier: <class 'sklearn.linear_model._logistic.LogisticRegression'>


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Default params acc: 0.7721796276013143
Best params: {'C': 0.9907313081009161, 'max_iter': 9982}
Best params acc: 0.7721796276013143

Classifier: <class 'sklearn.ensemble._forest.RandomForestClassifier'>
Default params acc: 0.7860533041255933
Best params: {'n_estimators': 83, 'max_depth': 98, 'min_samples_split': 18, 'min_samples_leaf': 1, 'max_leaf_nodes': 821}
Best params acc: 0.7488134355604235

Classifier: <class 'sklearn.neighbors._classification.KNeighborsClassifier'>
Default params acc: 0.7440671778021175
Best params: {'n_neighbors': 11, 'leaf_size': 1}
Best params acc: 0.7612267250821467



Since there is risk of failure to convergence for LogisticRegression and LinearSVC, we try using SGDClassifier instead


In [None]:
# the hyperparameters for the SGDClassifier are finetuned in a similar way as above

def objective_SGD(trial, X_train, X_val, y_train, y_val):
    params = {
        "loss": trial.suggest_categorical("loss", ["modified_huber", "squared_hinge", "perceptron"]),
        "alpha": trial.suggest_float("alpha", 0.0001, 0.1),  # regularisation parameter
        "max_iter": trial.suggest_int("max_iter", 1000, 10000),
        "penalty": trial.suggest_categorical("penalty", ["l1", "l2", "elasticnet"]),
        "warm_start": trial.suggest_categorical("warm_start", [True, False]),
        "learning_rate": trial.suggest_categorical("learning_rate", ["optimal", "adaptive"]),
        "eta0": trial.suggest_float("eta0", 0.0, 1.0)
    }

    y_pred = train_model(X_train, X_val, y_train, y_val, SGDClassifier(**params))

    return accuracy_score(y_val, y_pred)

def objective_SVM_SGD(trial, X_train, X_val, y_train, y_val):
    params = {
        # "loss": trial.suggest_categorical("loss", ["hinge", "log_loss", "modified_huber", "squared_hinge", "perceptron"]),
        "loss": "hinge",
        "alpha": trial.suggest_float("alpha", 0.0001, 0.1),  # regularisation parameter
        "max_iter": trial.suggest_int("max_iter", 1000, 10000),
        "penalty": trial.suggest_categorical("penalty", ["l1", "l2", "elasticnet"]),
        "warm_start": trial.suggest_categorical("warm_start", [True, False]),
        "learning_rate": trial.suggest_categorical("learning_rate", ["optimal", "adaptive"]),
        "eta0": trial.suggest_float("eta0", 0.0, 1.0)
    }

    y_pred = train_model(X_train, X_val, y_train, y_val, SGDClassifier(**params))

    return accuracy_score(y_val, y_pred)

def objective_LR_SGD(trial, X_train, X_val, y_train, y_val):
    params = {
        # "loss": trial.suggest_categorical("loss", ["hinge", "log_loss", "modified_huber", "squared_hinge", "perceptron"]),
        "loss": "log_loss",
        "alpha": trial.suggest_float("alpha", 0.0001, 0.1),  # regularisation parameter
        "max_iter": trial.suggest_int("max_iter", 1000, 10000),
        "penalty": trial.suggest_categorical("penalty", ["l1", "l2", "elasticnet"]),
        "warm_start": trial.suggest_categorical("warm_start", [True, False]),
        "learning_rate": trial.suggest_categorical("learning_rate", ["optimal", "adaptive"]),
        "eta0": trial.suggest_float("eta0", 0.0, 1.0)
    }

    y_pred = train_model(X_train, X_val, y_train, y_val, SGDClassifier(**params))

    return accuracy_score(y_val, y_pred)

SGD_objectives = [
    [objective_SVM_SGD, "SVM"],
    [objective_LR_SGD, "LR"],
    [objective_SGD, "Any"],
]

best_params_dct_1 = {}

for objective, model in SGD_objectives:
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, X_train_feature, X_val_feature, y_train, y_val), n_trials=200)
    params = study.best_params
    best_params_dct_1[model] = params

    print("Classifier:", model)
    if model == "LR":
        y_preds = train_model(X_train_feature, X_val_feature, y_train, y_val, SGDClassifier(loss="log_loss"))
    else:
        y_preds = train_model(X_train_feature, X_val_feature, y_train, y_val, SGDClassifier())
    print("Default params acc:", accuracy_score(y_val, y_preds))
    print("Best params:", params)
    y_preds_2 = train_model(X_train_feature, X_val_feature, y_train, y_val, SGDClassifier(**params))
    print("Best params acc:", accuracy_score(y_val, y_preds_2))
    print()


Classifier: SVM
Default params acc: 0.7984665936473165
Best params: {'alpha': 0.00010132788989789356, 'max_iter': 7079, 'penalty': 'elasticnet', 'warm_start': True, 'learning_rate': 'adaptive', 'eta0': 0.4325638808470886}
Best params acc: 0.796641109894122

Classifier: LR
Default params acc: 0.764877692588536
Best params: {'alpha': 0.00010561665784748327, 'max_iter': 1681, 'penalty': 'l1', 'warm_start': False, 'learning_rate': 'adaptive', 'eta0': 0.40050212305690175}
Best params acc: 0.7882438846294268

Classifier: Any
Default params acc: 0.796641109894122
Best params: {'loss': 'perceptron', 'alpha': 0.007482187295449014, 'max_iter': 7109, 'penalty': 'l2', 'warm_start': True, 'learning_rate': 'adaptive', 'eta0': 0.9426725754999974}
Best params acc: 0.7517342095655348



Hence, from here, we will proceed with using SGDClassifier for both logistic regression and SVM classifiers.

### Hyperparameter fine-tuning for all classifiers using TFIDF with unigrams and bigrams




In [None]:
X_train_all_ngrams = np.array(ngram_dense_matrix_train)
X_val_all_ngrams = np.array(ngram_dense_matrix_val)

In [None]:
print(X_train_feature.shape)
print(X_train_all_ngrams.shape)

(8216, 13199)
(8216, 65895)


#### Can try using optuna to choose the best k

In [None]:
classifiers = [
    MultinomialNB,
    SGDClassifier,
    RandomForestClassifier,
    KNeighborsClassifier
]

def objective_k(trial, X_train, X_val, y_train, y_val, clf):

    selector = SelectKBest(chi2, k=trial.suggest_int("k", 500, 5000))
    X_train = selector.fit_transform(X_train, y_train)
    X_val = selector.transform(X_val)

    y_pred = train_model(X_train, X_val, y_train, y_val, clf())

    return accuracy_score(y_val, y_pred)

best_params_dct_2 = {}

for model in classifiers:
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective_k(
        trial, X_train_all_ngrams, X_val_all_ngrams, y_train, y_val, model),
                   n_trials=30)
    params = study.best_params
    best_params_dct_2[str(model)] = params

    print("Classifier:", model)
    y_preds = train_model(X_train_all_ngrams, X_val_all_ngrams, y_train, y_val, model())
    print("All ngrams acc:", accuracy_score(y_val, y_preds))
    print("Best params:", params)
    k = params["k"]
    selector = SelectKBest(chi2, k=k)
    X_train_selected = selector.fit_transform(X_train_all_ngrams, y_train)
    X_val_selected = selector.transform(X_val_all_ngrams)
    y_preds_2 = train_model(X_train_selected, X_val_selected, y_train, y_val, model())
    print("Selected params acc:", accuracy_score(y_val, y_preds_2))
    print()

#### Hyperpameter fine-tuning using only 10,000 features for training, selected using SelectKBest


In [None]:
selector = SelectKBest(chi2, k=10000)
X_train_selected = selector.fit_transform(X_train_all_ngrams, y_train)
X_val_selected = selector.transform(X_val_all_ngrams)

In [None]:
# the hyperparameters for the classifiers are finetuned to train well to a subset
# of all the TFIDF of unigrams and bigrams

objectives = [
    [objective_MNB, MultinomialNB],
    [objective_SVM_SGD, SGDClassifier],
    [objective_LR_SGD, SGDClassifier],
    [objective_RF, RandomForestClassifier],
    [objective_KN, KNeighborsClassifier]
]

best_params_dct_3 = {}

for objective, model in objectives:
    # k = best_params_dct_2[str(model)]["k"]
    # selector = SelectKBest(chi2, k=k)
    # X_train_selected = selector.fit_transform(X_train_all_ngrams, y_train)
    # X_val_selected = selector.transform(X_val_all_ngrams)

    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, X_train_selected, X_val_selected, y_train, y_val), n_trials=30)
    params = study.best_params
    best_params_dct_3[str(model)] = params

    print("Classifier:", model)
    y_preds = train_model(X_train_selected, X_val_selected, y_train, y_val, model())
    print("Default params acc:", accuracy_score(y_val, y_preds))
    print("Best params:", params)
    y_preds_2 = train_model(X_train_selected, X_val_selected, y_train, y_val, model(**params))
    print("Best params acc:", accuracy_score(y_val, y_preds_2))
    print()

Classifier: <class 'sklearn.naive_bayes.MultinomialNB'>
Default params acc: 0.6856516976998904
Best params: {'alpha': 0.017985757775145533}
Best params acc: 0.7776560788608982

Classifier: <class 'sklearn.linear_model._stochastic_gradient.SGDClassifier'>
Default params acc: 0.7940854326396495
Best params: {'alpha': 0.000501578634830783, 'max_iter': 5851, 'penalty': 'elasticnet', 'warm_start': False, 'learning_rate': 'adaptive', 'eta0': 0.06828026948459881}
Best params acc: 0.6973347937203359

Classifier: <class 'sklearn.linear_model._stochastic_gradient.SGDClassifier'>
Default params acc: 0.7820372398685652
Best params: {'alpha': 0.00029271480817876655, 'max_iter': 5766, 'penalty': 'l2', 'warm_start': True, 'learning_rate': 'optimal', 'eta0': 0.3074761528089284}
Best params acc: 0.7458926615553122

Classifier: <class 'sklearn.ensemble._forest.RandomForestClassifier'>
Default params acc: 0.7663380795910917
Best params: {'n_estimators': 171, 'max_depth': 100, 'min_samples_split': 27, 'mi

##### Re-initialise `best_params_dct_3` in case of runtime disconnection

In [None]:
best_params_dct_3 = {
    "<class 'sklearn.naive_bayes.MultinomialNB'>": {'alpha': 0.017985757775145533},
    "<class 'sklearn.linear_model._stochastic_gradient.SGDClassifier'>": {'alpha': 0.00029271480817876655,
                                                                          'max_iter': 5766,
                                                                          'penalty': 'l2',
                                                                          'warm_start': True,
                                                                          'learning_rate': 'optimal',
                                                                          'eta0': 0.3074761528089284},
    "<class 'sklearn.ensemble._forest.RandomForestClassifier'>": {'n_estimators': 171,
                                                                  'max_depth': 100,
                                                                  'min_samples_split': 27,
                                                                  'min_samples_leaf': 1,
                                                                  'max_leaf_nodes': 740},
    "<class 'sklearn.neighbors._classification.KNeighborsClassifier'>": {'n_neighbors': 1,
                                                                         'leaf_size': 38}
}

#### Benchmark accuracies for all classifiers trained using all TFIDF unigrams and bigrams


The hyperparameters obtained from fine-tuning above (using only the top 10,000 features) are used below, for models where an improvement in validation accuracy was observed. Else, the default model hyperparameters are used.

In [None]:
finetuned_clfs = [
    MultinomialNB(**best_params_dct_3[str(MultinomialNB)]),
    SGDClassifier(loss="hinge"),
    SGDClassifier(loss="log_loss"),
    RandomForestClassifier(),
    KNeighborsClassifier(**best_params_dct_3[str(KNeighborsClassifier)])
]

for clf in finetuned_clfs:
    y_preds = train_model(X_train_all_ngrams, X_val_all_ngrams, y_train, y_val, clf)
    print(clf)
    print("val acc:", accuracy_score(y_val, y_preds))
    print()

MultinomialNB(alpha=0.017985757775145533)
val acc: 0.7926250456370938

SGDClassifier()
val acc: 0.8185469149324571

SGDClassifier(loss='log_loss')
val acc: 0.7656078860898138

RandomForestClassifier()
val acc: 0.7772909821102593

KNeighborsClassifier(leaf_size=38, n_neighbors=1)
val acc: 0.7144943410003651



## Test out different model aggregation algorithms

Brainstorming for ideas:

1. Use all classifiers that predict using probabilities, then classify based on highest average probability across all classifiers
2. Have 2 different classifiers, the first one classifies everything, then whatever it classifies wrongly or un-confidently (based on a certain probability threshold value) gets passed on to the second classifier. The second classifier is only fitted on these "difficult" cases, to try to classify them correctly.
    - can try integrating feature selection from model as well
3. Use all classifiers that predict using probabilities, then use a second model to output the final class based on the concatenated probabilities (stacking)



### Idea 1: Weighted Average

The average of the predictions from various classifiers is taken to be the final prediction. The predictions are weighted based on the prediction confidence. For SVM without prediction probabilities available, a customisable `prob` value is assigned to the class it predicts to, and `0` otherwise.

In [None]:
from sklearn.preprocessing import OneHotEncoder

def weighted_avg_preds(clfs, X_val, y_val, prob=0.9):
    preds_probs = []
    for clf in clfs:
        if hasattr(clf, "predict_proba"):  # all classifiers except SVM
            preds_probs.append(clf.predict_proba(X_val))
        else:  # SVM
            enc = OneHotEncoder()
            enc.fit(np.expand_dims(np.array(y_val), axis=1))
            preds_probs.append(enc.transform(np.expand_dims(clf.predict(X_val), axis=1)).toarray() * prob)
    summed_preds_probs = preds_probs[0]
    for pred in preds_probs[1:]:
        summed_preds_probs += pred

    preds = np.argmax(summed_preds_probs, axis=1)
    return accuracy_score(y_val, preds)

#### Aggregation of all models trained on previously selected (10,000) features

In [None]:
clfs = []
for clf in second:
    clf.fit(X_train_selected, y_train)
    clfs.append(clf)

weighted_avg_preds(clfs, X_val_selected, y_val)

0.7802117561153705

#### Aggregation of top 3 performing models (SVM, Logistic Regression, Multinomial NB) trained on all features

In [None]:
clf1 = SGDClassifier("log_loss")
clf2 = SGDClassifier()
clf3 = MultinomialNB(alpha=0.017985757775145533)

clf1.fit(X_train_all_ngrams, y_train)
clf2.fit(X_train_all_ngrams, y_train)
clf3.fit(X_train_all_ngrams, y_train)

clfs = [clf1, clf2, clf3]

In [None]:
for clf in clfs:
    print(accuracy_score(clf.predict(X_val_all_ngrams), y_val))

0.7714494341000365
0.8181818181818182
0.7926250456370938


In [None]:
weighted_avg_preds(clfs, X_val_all_ngrams, y_val, prob=0.7)

0.8159912376779846

### Idea 2: Classifier Pairs



The basic idea here is that Classifier 1 will be trained on all data points. Tthose points classified wrongly, or under a certain confidence threshold will be used to train Classifier 2. The predictions from Classifier 2 will replace those made by Classifier 1.

During evaluation, similar to above, Classifier 1 will be used to obtain the prediction probabilities for all data points, and those below a certain threshold will be classified by Classifier 2 instead.

One point to note for this algorithm is that since SVM classifier does not have prediction probabilities, it will be omitted from the choice of Classifier 1.

In [None]:
def get_repred_ind(clf, X, y=None, threshold=0.7):
    """Obtains the indices of data points to be classified by Classifier 2"""

    pred_probs = clf.predict_proba(X)
    if y is not None:
        threshold_col = np.expand_dims(np.array([threshold]*pred_probs.shape[0]), axis=1)
        pred_probs_threshed = np.append(pred_probs, threshold_col, axis=1)
        repredict_ind = np.where(np.argmax(pred_probs_threshed, axis=1) != y)[0]
    else:
        repredict_ind = np.where(np.max(pred_probs, axis=1) < threshold)[0]
    return repredict_ind

def train_ensem_model(clf1, clf2, X_train, y_train, threshold=0.7):
    """Trains both Classifier 1 and 2"""

    clf1.fit(X_train, y_train)
    repredict_ind = get_repred_ind(clf1, X_train, y_train, threshold)
    if len(repredict_ind) == 0:
        print("second classifier trained on all data")
        clf2.fit(X_train, y_train)
    else:
        clf2.fit(X_train[repredict_ind], np.array(y_train)[repredict_ind])
    return clf1, clf2

def eval_ensem_model(clf1, clf2, X_val, y_val, threshold=0.7):
    """Used to make predictions on validation data, using both Classifier 1 and 2"""

    pred_probs = clf1.predict_proba(X_val)
    repredict_ind = get_repred_ind(clf1, X_val, threshold)
    if len(repredict_ind) > 0:
        repredict_preds = clf2.predict(X_val[repredict_ind])
        preds = clf1.predict(X_val)
        preds[repredict_ind] = repredict_preds
    else:
        print("second classifier not used for evaluation")
        preds = clf1.predict(X_val)
    acc = accuracy_score(y_val, preds)
    return preds, acc


#### All possible combinations of classifier pairs are tested, using previously selected (10,000) features

In [None]:
first = [
    MultinomialNB(**best_params_dct_3[str(MultinomialNB)]),
    SGDClassifier(loss="log_loss"),
    RandomForestClassifier(),
    KNeighborsClassifier(**best_params_dct_3[str(KNeighborsClassifier)])
]
second = [
    MultinomialNB(**best_params_dct_3[str(MultinomialNB)]),
    SGDClassifier(loss="hinge"),
    SGDClassifier(loss="log_loss"),
    RandomForestClassifier(),
    KNeighborsClassifier(**best_params_dct_3[str(KNeighborsClassifier)])
]

In [None]:
classifier_pairs = []
for clf1 in first:
    for clf2 in second:
        classifier_pairs.append([clf1, clf2])

max_val_acc = 0
best_clf1 = None
best_clf2 = None
for clf1, clf2 in classifier_pairs:
    clf1, clf2 = train_ensem_model(clf1, clf2, X_train_selected, y_train)
    print("clf1:", clf1)
    print("clf2:", clf2)
    preds, acc = eval_ensem_model(clf1, clf2, X_val_selected, y_val)
    print("val acc:", acc)
    print()
    if acc > max_val_acc:
        max_val_acc = acc
        best_clf1 = clf1
        best_clf2 = clf2

clf1: MultinomialNB(alpha=0.017985757775145533)
clf2: MultinomialNB(alpha=0.017985757775145533)
val acc: 0.7776560788608982

clf1: MultinomialNB(alpha=0.017985757775145533)
clf2: SGDClassifier()
val acc: 0.7944505293902884

clf1: MultinomialNB(alpha=0.017985757775145533)
clf2: SGDClassifier(loss='log_loss')
val acc: 0.7824023366192041

clf1: MultinomialNB(alpha=0.017985757775145533)
clf2: RandomForestClassifier()
val acc: 0.7867834976268712

clf1: MultinomialNB(alpha=0.017985757775145533)
clf2: KNeighborsClassifier(leaf_size=38, n_neighbors=1)
val acc: 0.764512595837897

clf1: SGDClassifier(loss='log_loss')
clf2: MultinomialNB(alpha=0.017985757775145533)
val acc: 0.7772909821102593

clf1: SGDClassifier(loss='log_loss')
clf2: SGDClassifier()
val acc: 0.7940854326396495

clf1: SGDClassifier(loss='log_loss')
clf2: SGDClassifier(loss='log_loss')
val acc: 0.7579408543263965

clf1: SGDClassifier(loss='log_loss')
clf2: RandomForestClassifier()
val acc: 0.7637824023366192

clf1: SGDClassifier(

In [None]:
print("Best ensemble model:")
print("clf1:", best_clf1)
print("clf2:", best_clf2)
print("val acc:", max_val_acc)

Best ensemble model:
clf1: MultinomialNB(alpha=0.017985757775145533)
clf2: SGDClassifier()
val acc: 0.7944505293902884


The top 2 classifier pairs are Multinomial NB + SVM and Logistic Regression + SVM.

The focus of subsequent efforts to improve model performance using this aggregation technique will work with these 2 pairs only.

#### Experimenting with different threshold values

Here, we experiment with different threshold values in range [0.6, 0.9] to find the best validation accuracy.

In [None]:
clf1 = MultinomialNB(alpha=0.017985757775145533)
clf2 = SGDClassifier()

clf1, clf2 = train_ensem_model(clf1, clf2, X_train_selected, y_train, threshold=0.8)
preds, acc = eval_ensem_model(clf1, clf2, X_val_selected, y_val, threshold=0.7)

Using the previous selected (10,000) features as training data, the best threshold values for training and evaluation are 0.85 and 0.7 respectively.

This gives rise to a 0.80066 accuracy.

In [None]:
clf1 = MultinomialNB(alpha=0.017985757775145533)
clf2 = SGDClassifier()

clf1, clf2 = train_ensem_model(clf1, clf2, X_train_all_ngrams, y_train, threshold=0.85)
preds, acc = eval_ensem_model(clf1, clf2, X_val_all_ngrams, y_val, threshold=0.75)
acc

0.8174516246805403

Using all the n-gram data for training, the best threshold values for training and evaluation are 0.85 and 0.75 respectively.

This gives rise to a 0.81745 accuracy.

In [None]:
clf1 = SGDClassifier("log_loss")
clf2 = SGDClassifier()

clf1, clf2 = train_ensem_model(clf1, clf2, X_train_all_ngrams, y_train, threshold=0.85)
preds, acc = eval_ensem_model(clf1, clf2, X_val_all_ngrams, y_val, threshold=0.75)
acc

0.8170865279299014

Similar results can be observed for the 2nd best classifier pair above.

##### Trial and error results

threshold values for train, eval: classification accuracy using selected, all ngrams

0.6, 0.6: 0.7849580138736765, 0.805403431909456

0.7, 0.6: 0.7882438846294268, 0.8108798831690398

0.7, 0.7: 0.7933552391383717, 0.8116100766703176

0.8, 0.7: 0.7988316903979554, 0.8167214311792625

0.85, 0.7: 0.80065717415115, 0.8163563344286235

0.85, 0.75: 0.7995618838992333, 0.8174516246805403

#### Trying out weighted average for evaluation

Instead of using the threshold algorithm during evaluation, weighted average is used instead.

The models are still trained using the threshold algorithm though.

In [None]:
clf1a = MultinomialNB(alpha=0.017985757775145533)
clf2a = SGDClassifier()

clf1a, clf2a = train_ensem_model(clf1a, clf2a, X_train_all_ngrams, y_train)

clf1b = SGDClassifier(loss="log_loss")
clf2b = SGDClassifier()

clf1b, clf2b = train_ensem_model(clf1b, clf2b, X_train_all_ngrams, y_train)

In [None]:
clfs_a = [clf1a, clf2a]

clfs_b = [clf1b, clf2b]

print(weighted_avg_preds(clfs_a, X_val_all_ngrams, y_val, prob=0.85))
print(weighted_avg_preds(clfs_b, X_val_all_ngrams, y_val, prob=0.85))

0.8170865279299014
0.8159912376779846


By experimenting with different values for the `prob` value used to weight the predictions made by SVM, the best `prob` for classifier pair a and b are 0.85 and 0.9 respectively, resulting in accuracies 0.81709 and 0.81599 respectively.

##### Trial and error results

training, pred threshold: weighted acc

0.6, 0.9: 0.8152610441767069, 0.8178167214311792

0.8, 0.85: 0.818912011683096, 0.8127053669222344

0.85, 0.85: 0.818912011683096, 0.8127053669222344

0.9, 0.9: 0.8170865279299014, 0.8159912376779846

#### Trying out using feature selection for Classifier 2 inputs

The idea here is that the data points identified for Classifier 2 remains the same, but instead of using all the features to train Classifier 2, we use a selector to select the most important 10,000 features first, and train Classifier 2 only on those features.

Similarly, during evaluation, feature selection precedes prediction by Classifier 2.

Classifier 1 is not affected.

##### Testing different selector functions

In [None]:
from sklearn.feature_selection import SelectFromModel

In [None]:
selector1 = SelectFromModel(estimator=SGDClassifier(), max_features=10000, threshold=-np.inf).fit(X_train_all_ngrams, y_train)
X_train_selected_2 = selector1.transform(X_train_all_ngrams)
X_val_selected_2 = selector1.transform(X_val_all_ngrams)

In [None]:
for clf in second:
    y_preds = train_model(X_train_selected_2, X_val_selected_2, y_train, y_val, clf)
    print(clf)
    print("val acc:", accuracy_score(y_val, y_preds))
    print()

MultinomialNB(alpha=0.017985757775145533)
val acc: 0.7849580138736765

SGDClassifier()
val acc: 0.80065717415115

SGDClassifier(loss='log_loss')
val acc: 0.768893756845564

RandomForestClassifier()
val acc: 0.7791164658634538

KNeighborsClassifier(leaf_size=38, n_neighbors=1)
val acc: 0.7458926615553122



In [None]:
selector2 = SelectFromModel(estimator=SGDClassifier("log_loss"), max_features=10000, threshold=-np.inf).fit(X_train_all_ngrams, y_train)
X_train_selected_3 = selector2.transform(X_train_all_ngrams)
X_val_selected_3 = selector2.transform(X_val_all_ngrams)
for clf in second:
    y_preds = train_model(X_train_selected_3, X_val_selected_3, y_train, y_val, clf)
    print(clf)
    print("val acc:", accuracy_score(y_val, y_preds))
    print()

MultinomialNB(alpha=0.017985757775145533)
val acc: 0.7864184008762322

SGDClassifier()
val acc: 0.796641109894122

SGDClassifier(loss='log_loss')
val acc: 0.7579408543263965

RandomForestClassifier()
val acc: 0.7805768528660095

KNeighborsClassifier(leaf_size=38, n_neighbors=1)
val acc: 0.7437020810514786



The best selector function is SelectFromModel, using SVM as the model.

##### Carrying out the model training with inbuilt feature selection

In [1]:
def get_repred_ind(clf, X, y=None, threshold=0.85):
    """Obtains the indices of data points to be classified by Classifier 2"""

    pred_probs = clf.predict_proba(X)
    if y is not None:
        threshold_col = np.expand_dims(np.array([threshold]*pred_probs.shape[0]), axis=1)
        pred_probs_threshed = np.append(pred_probs, threshold_col, axis=1)
        repredict_ind = np.where(np.argmax(pred_probs_threshed, axis=1) != y)[0]
    else:
        repredict_ind = np.where(np.max(pred_probs, axis=1) < threshold)[0]
    return repredict_ind

def train_ensem_model(clf1, clf2, X_train, y_train, threshold=0.85):
    """Trains both Classifier 1 and 2"""

    clf1.fit(X_train, y_train)
    repredict_ind = get_repred_ind(clf1, X_train, y_train, threshold)
    if len(repredict_ind) == 0:
        print("second classifier trained on all data")
        clf2.fit(selector1.transform(X_train), np.array(y_train))  # selector used
    else:
        clf2.fit(selector1.transform(X_train[repredict_ind]), np.array(y_train)[repredict_ind])  # selector used
    return clf1, clf2

def eval_ensem_model(clf1, clf2, X_val, y_val, threshold=0.7):
    """Used to make predictions on validation data, using both Classifier 1 and 2"""

    pred_probs = clf1.predict_proba(X_val)
    repredict_ind = get_repred_ind(clf1, X_val)
    if len(repredict_ind) > 0:
        repredict_preds = clf2.predict(selector1.transform(X_val[repredict_ind]))  # selector used before prediction
        preds = clf1.predict(X_val)
        preds[repredict_ind] = repredict_preds
    else:
        print("second classifier not used for evaluation")
        preds = clf1.predict(X_val)
    acc = accuracy_score(y_val, preds)
    return preds, acc

In [None]:
# testing out the performance of the top 2 classifier pairs

clf1a = MultinomialNB(alpha=0.017985757775145533)
clf2a = SGDClassifier()

clf1a, clf2a = train_ensem_model(clf1a, clf2a, X_train_all_ngrams, y_train, threshold=0.85)
preds_a, acc_a = eval_ensem_model(clf1a, clf2a, X_val_all_ngrams, y_val, threshold=0.85)
print(acc_a)

clf1b = SGDClassifier(loss="log_loss")
clf2b = SGDClassifier()

clf1b, clf2b = train_ensem_model(clf1b, clf2b, X_train_all_ngrams, y_train, threshold=0.85)
preds_b, acc_b = eval_ensem_model(clf1b, clf2b, X_val_all_ngrams, y_val, threshold=0.85)
print(acc_b)

0.7652427893391749
0.8032128514056225


In [None]:
def weighted_avg_pred_ensem(clfs, X_val, y_val, selector, prob=0.9):
    """Used to get average prediction, weighted by confidence of each classifier"""

    preds_probs = []
    clf1, clf2 = clfs
    preds_probs.append(clf1.predict_proba(X_val))
    X_val_2 = selector.transform(X_val)
    if hasattr(clf2, "predict_proba"):
        preds_probs.append(clf2.predict_proba(X_val_2))
    else:
        enc = OneHotEncoder()
        enc.fit(np.expand_dims(np.array(y_val), axis=1))
        preds_probs.append(enc.transform(np.expand_dims(clf2.predict(X_val_2), axis=1)).toarray() * prob)
    summed_preds_probs = preds_probs[0] + preds_probs[1]

    preds = np.argmax(summed_preds_probs, axis=1)
    return accuracy_score(y_val, preds)

In [None]:
clfs_a = [clf1a, clf2a]
clfs_b = [clf1b, clf2b]

print(weighted_avg_pred_ensem(clfs_a, X_val_all_ngrams, y_val, selector1, prob=0.9))
print(weighted_avg_pred_ensem(clfs_b, X_val_all_ngrams, y_val, selector1, prob=0.9))

0.7433369843008397
0.8032128514056225


Both the threshold evaluation and weighted average evaluation methods are tested, but both do not seem to improve the results of the classifier pair algorithm much.

### Idea 3: Stacking

Each base learner is trained using the optimal hyperparameters obtained in previous parts. All the prediction confidences are used as inputs into the meta-learner to aggregate and output the final classification.

Similar to above, the confidence value for SVM predictions is set manually.

In [None]:
def get_proba(clfs, X, y, prob=0.9):
    """Get concatenated prediction probabilities for all classes from all classifiers"""

    preds_probs = []
    for clf in clfs:
        if hasattr(clf, "predict_proba"):
            preds_probs.append(clf.predict_proba(X))
        else:
            enc = OneHotEncoder()
            enc.fit(np.expand_dims(np.array(y), axis=1))
            preds_probs.append(enc.transform(np.expand_dims(clf.predict(X), axis=1)).toarray() * prob)
    collated = np.concatenate(preds_probs, axis=1)
    return collated

#### Use all classifiers on all features in aggregation

In [None]:
trained_all_ngrams = [
    MultinomialNB(**best_params_dct_3[str(MultinomialNB)]),
    SGDClassifier(loss="hinge"),
    SGDClassifier(loss="log_loss"),
    RandomForestClassifier(),
    KNeighborsClassifier(**best_params_dct_3[str(KNeighborsClassifier)])
]

for clf in trained_all_ngrams:
    clf.fit(X_train_all_ngrams, y_train)

In [None]:
finals = [
    MultinomialNB(),
    SGDClassifier(loss="hinge"),
    SGDClassifier(loss="log_loss"),
    RandomForestClassifier(),
    KNeighborsClassifier()
]

train_probs = get_proba(trained_all_ngrams, X_train_all_ngrams, y_train, prob=1.7)
val_probs = get_proba(trained_all_ngrams, X_val_all_ngrams, y_train, prob=1.7)

for final in finals:
    final.fit(train_probs, y_train)
    preds = final.predict(val_probs)
    print(clf)  # typo: shld be print(final)
    print(accuracy_score(y_val, preds))
    print()

KNeighborsClassifier(leaf_size=38, n_neighbors=1)
0.8174516246805403

KNeighborsClassifier(leaf_size=38, n_neighbors=1)
0.7155896312522818

KNeighborsClassifier(leaf_size=38, n_neighbors=1)
0.7721796276013143

KNeighborsClassifier(leaf_size=38, n_neighbors=1)
0.7758305951077036

KNeighborsClassifier(leaf_size=38, n_neighbors=1)
0.7152245345016429



#### Use top 4 classifiers on all features in aggregation

In [None]:
# use all models except knn

finals = [
    MultinomialNB(),
    SGDClassifier(loss="hinge"),
    SGDClassifier(loss="log_loss"),
    RandomForestClassifier(),
    KNeighborsClassifier()
]

train_probs = get_proba(trained_all_ngrams[:4], X_train_all_ngrams, y_train, prob=1.3)
val_probs = get_proba(trained_all_ngrams[:4], X_val_all_ngrams, y_train, prob=1.3)

for final in finals:
    final.fit(train_probs, y_train)
    preds = final.predict(val_probs)
    print(final)
    print(accuracy_score(y_val, preds))
    print()

MultinomialNB()
0.8185469149324571

SGDClassifier()
0.8218327856882074

SGDClassifier(loss='log_loss')
0.8181818181818182

RandomForestClassifier()
0.7838627236217598

KNeighborsClassifier()
0.8039430449069004



##### Trial and error results

0.9

MultinomialNB()
0.8174516246805403

SGDClassifier()
0.809784592917123

SGDClassifier(loss='log_loss')
0.8203723986856517

RandomForestClassifier()
0.7893391748813435

KNeighborsClassifier()
0.8039430449069004


1.4

MultinomialNB()
0.8185469149324571

SGDClassifier()
0.8218327856882074

SGDClassifier(loss='log_loss')
0.8185469149324571

RandomForestClassifier()
0.7926250456370938

KNeighborsClassifier()
0.8039430449069004

1.7

MultinomialNB()
0.8185469149324571

SGDClassifier()
0.8141657539247901

SGDClassifier(loss='log_loss')
0.8185469149324571

RandomForestClassifier()
0.7907995618838992

KNeighborsClassifier()
0.8039430449069004

#### Use top 3 classifiers on all features in aggregation

In [None]:
# use all models except rf and knn

finals = [
    MultinomialNB(),
    SGDClassifier(loss="hinge"),
    SGDClassifier(loss="log_loss"),
    RandomForestClassifier(),
    KNeighborsClassifier()
]

train_probs = get_proba(trained_all_ngrams[:3], X_train_all_ngrams, y_train, prob=1.2)
val_probs = get_proba(trained_all_ngrams[:3], X_val_all_ngrams, y_train, prob=1.2)

for final in finals:
    final.fit(train_probs, y_train)
    preds = final.predict(val_probs)
    print(final)
    print(accuracy_score(y_val, preds))
    print()

MultinomialNB()
0.8185469149324571

SGDClassifier()
0.7981014968966776

SGDClassifier(loss='log_loss')
0.8108798831690398

RandomForestClassifier()
0.7838627236217598

KNeighborsClassifier()
0.7944505293902884



#### Use top 4 classifiers on selected features in aggregation

In [None]:
# using seleted features only

trained_selected = [
    MultinomialNB(**best_params_dct_3[str(MultinomialNB)]),
    SGDClassifier(loss="hinge"),
    SGDClassifier(loss="log_loss"),
    RandomForestClassifier(),
    KNeighborsClassifier(**best_params_dct_3[str(KNeighborsClassifier)])
]

for clf in trained_selected:
    clf.fit(X_train_selected_2, y_train)

In [None]:
# use all models except knn

finals = [
    MultinomialNB(),
    SGDClassifier(loss="hinge"),
    SGDClassifier(loss="log_loss"),
    RandomForestClassifier(),
    KNeighborsClassifier()
]

train_probs = get_proba(trained_selected[:4], X_train_selected_2, y_train, prob=1.3)
val_probs = get_proba(trained_selected[:4], X_val_selected_2, y_train, prob=1.3)

for final in finals:
    final.fit(train_probs, y_train)
    preds = final.predict(val_probs)
    print(final)
    print(accuracy_score(y_val, preds))
    print()

MultinomialNB()
0.80065717415115

SGDClassifier()
0.7316538882803943

SGDClassifier(loss='log_loss')
0.8046732384081782

RandomForestClassifier()
0.7864184008762322

KNeighborsClassifier()
0.8061336254107339



All the trials above consistently show that the accuracy score is highest when only the top 4 classifiers are used as the base-models, and all features are used for training. Our final model shall thus follow this.

#### Fine-tuning the hyperparameters of the meta-learner

In [None]:
def objective_meta(trial, X_train, X_val, y_train, y_val, clf, clf_name):
    """Objective function to be used to optimize the hyperparameters of meta-learner"""

    if clf_name == "MNB":
        params = {
            "alpha": trial.suggest_float("alpha", 0.001, 1.0),
        }
    elif clf_name == "SVM":
        params = {
            "loss": "hinge",
            "alpha": trial.suggest_float("alpha", 0.0001, 0.1),  # regularisation parameter
            "max_iter": trial.suggest_int("max_iter", 1000, 10000),
            "penalty": trial.suggest_categorical("penalty", ["l1", "l2", "elasticnet"]),
            # "warm_start": trial.suggest_categorical("warm_start", [True, False]),
            # "learning_rate": trial.suggest_categorical("learning_rate", ["optimal", "adaptive"]),
            # "eta0": trial.suggest_float("eta0", 0.0, 1.0)
        }
    elif clf_name == "LR":
        params = {
            "loss": "log_loss",
            "alpha": trial.suggest_float("alpha", 0.0001, 0.1),  # regularisation parameter
            "max_iter": trial.suggest_int("max_iter", 1000, 10000),
            "penalty": trial.suggest_categorical("penalty", ["l1", "l2", "elasticnet"]),
            # "warm_start": trial.suggest_categorical("warm_start", [True, False]),
            # "learning_rate": trial.suggest_categorical("learning_rate", ["optimal", "adaptive"]),
            # "eta0": trial.suggest_float("eta0", 0.0, 1.0)
        }
    elif clf_name == "RF":
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 5, 300),
            "max_depth": trial.suggest_int("max_depth", 5, 100),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 30),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
            "max_leaf_nodes": trial.suggest_int("max_leaf_nodes", 5, 1000),
        }
    elif clf_name == "KN":
            params = {
            "n_neighbors": trial.suggest_int("n_neighbors", 1, 100),
            "leaf_size": trial.suggest_int("leaf_size", 1, 100)
        }

    model = clf(**params)
    model.fit(X_train, y_train)
    preds = model.predict(X_val)

    return accuracy_score(y_val, preds)

In [None]:
train_probs = get_proba(trained_all_ngrams[:4], X_train_all_ngrams, y_train, prob=1.3)
val_probs = get_proba(trained_all_ngrams[:4], X_val_all_ngrams, y_train, prob=5)

In [None]:
clf_clfname = [
    [MultinomialNB, "MNB"],
    [SGDClassifier, "SVM"],
    [SGDClassifier, "LR"],
    [RandomForestClassifier, "RF"],
    [KNeighborsClassifier, "KN"]
]

for clf, clf_name in clf_clfname:
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective_meta(trial, train_probs, val_probs, y_train, y_val, clf, clf_name), n_trials=100)
    params = study.best_params

    print("Meta-learner:", clf_name)
    if clf_name == "LR":
        model = clf(loss="log_loss")
    else:
        model = clf()
    model.fit(train_probs, y_train)
    y_preds = model.predict(val_probs)
    print("Default params acc:", accuracy_score(y_val, y_preds))
    print("Best params:", params)
    model = clf(**params)
    model.fit(train_probs, y_train)
    y_preds_2 = model.predict(val_probs)
    print("Best params acc:", accuracy_score(y_val, y_preds_2))
    print()

Meta-learner: MNB
Default params acc: 0.8185469149324571
Best params: {'alpha': 0.5750016598974882}
Best params acc: 0.8185469149324571

Meta-learner: SVM
Default params acc: 0.8130704636728733
Best params: {'alpha': 0.0003982622356968902, 'max_iter': 6869, 'penalty': 'elasticnet'}
Best params acc: 0.8254837531945965

Meta-learner: LR
Default params acc: 0.8181818181818182
Best params: {'alpha': 0.08216126312129639, 'max_iter': 2738, 'penalty': 'elasticnet'}
Best params acc: 0.8185469149324571

Meta-learner: RF
Default params acc: 0.7893391748813435
Best params: {'n_estimators': 10, 'max_depth': 64, 'min_samples_split': 13, 'min_samples_leaf': 9, 'max_leaf_nodes': 379}
Best params acc: 0.796641109894122

Meta-learner: KN
Default params acc: 0.8039430449069004
Best params: {'n_neighbors': 79, 'leaf_size': 68}
Best params acc: 0.8207374954362906



The best performing stacking model consists of Multinomial NB, SVM, Logistic Regression and Random Forest base-learners, with SVM meta-learner. After fine-tuning, the validation accuracy is 0.82548.