# Test and compare different classification models, and store the trained models for downstream use

In [1]:
import pandas as pd
import numpy as np

# NLP Imports
from collections import Counter

from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from tqdm import tqdm # Used to show a progress bar
import spacy
import pickle

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB

from sklearn.model_selection import GridSearchCV

In [2]:
# Default values for the notebook parameters. Both names are reassigned by the
# "# Parameters" cell that follows, so these only act as placeholders.
# NOTE(review): presumably 'vectorizer' is the name of the upstream pipeline task — confirm.
upstream = ['vectorizer']
# No seed by default; the following cell sets the concrete value.
random_seed = None

In [3]:
# Parameters
# NOTE(review): this cell looks like it was injected by the pipeline runner — confirm
# before editing by hand. All paths are absolute and specific to one machine.
# Seed intended for the stochastic estimators trained in this notebook.
random_seed = 42
# Artifacts of the upstream "vectorizer" step: its executed notebook, the pickled
# vectorizer that is loaded below, and a pickled vocabulary.
upstream = {
    "vectorizer": {
        "nb": "/Users/mboussarov/_umsi/Capstone/umads_697_data_medics/pipeline/output/vectorizer.ipynb",
        "vectorizer": "/Users/mboussarov/_umsi/Capstone/umads_697_data_medics/pipeline/output/vectorizer.pkl",
        "vocab": "/Users/mboussarov/_umsi/Capstone/umads_697_data_medics/pipeline/output/vocab.pkl",
    }
}
# Output locations for this notebook. Every "model_*" entry is written with
# pickle.dump further down, including "model_nb" and "model_votingc" whose paths
# end in ".csv" — the suffix does not reflect the file format.
product = {
    "nb": "/Users/mboussarov/_umsi/Capstone/umads_697_data_medics/pipeline/output/category_classification_models.ipynb",
    "model_lr": "/Users/mboussarov/_umsi/Capstone/umads_697_data_medics/pipeline/output/model_lr.pkl",
    "model_rf": "/Users/mboussarov/_umsi/Capstone/umads_697_data_medics/pipeline/output/model_rf.pkl",
    "model_nb": "/Users/mboussarov/_umsi/Capstone/umads_697_data_medics/pipeline/output/model_nb.csv",
    "model_votingc": "/Users/mboussarov/_umsi/Capstone/umads_697_data_medics/pipeline/output/model_votingc.csv",
}


## Read the train and test data

In [4]:
# Load the tab-separated train/test splits and discard every row that has a missing value
df_train = pd.read_csv('../data/HumAID_data_v1.0/all_combined/all_train.tsv', sep='\t').dropna()
df_test = pd.read_csv('../data/HumAID_data_v1.0/all_combined/all_test.tsv', sep='\t').dropna()

# Peek at five randomly chosen training rows
df_train.sample(5)

Unnamed: 0,tweet_id,tweet_text,class_label
46069,732346289652928512,Happening now: mandatory evacuation of oilfiel...,displaced_people_and_evacuations
33284,1167531580845281280,Whats reckless is your unsubstantiated attacks...,not_humanitarian
7684,870100606421147648,ἰF Please donate funds for #SriLanka ἟1἟0Worst...,rescue_volunteering_or_donation_effort
48186,1021738373663387648,"#PrayForAthens A 13 year old girl, being burne...",injured_or_dead_people
27763,905602234007920642,"VU Athletics Flood Relief Drive!2 trucks full,...",other_relevant_information


## Read the already trained vectorizer

In [5]:
# Unpickle the vectorizer produced by the upstream "vectorizer" step.
# pickle.load can execute arbitrary code, so only pipeline-generated files belong here.
vectorizer_path = upstream["vectorizer"]["vectorizer"]
with open(vectorizer_path, 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

In [6]:
%%time
X_train = vectorizer.fit_transform(df_train['tweet_text'])
y_train = list(df_train['class_label'])

X_test = vectorizer.transform(df_test['tweet_text'])
y_test = list(df_test['class_label'])

X_train.shape

CPU times: user 13.1 s, sys: 134 ms, total: 13.2 s
Wall time: 13.3 s


(53516, 10000)

## Logistic regression test

In [7]:
%%time
# Prepate the logistic regression classifier
#clf_lr = LogisticRegression(solver='lbfgs', multi_class='auto', random_state=random_seed, max_iter=1000)
clf_lr = LogisticRegression()

# parameters_lr = {'solver':['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
#                  'multi_class': ['auto'], 'max_iter': [500, 1000, 2000]}

parameters_lr = {'solver':['liblinear'],
                 'multi_class': ['auto'], 'max_iter': [500]}

clf = GridSearchCV(clf_lr, parameters_lr)

clf.fit(X_train, y_train)

CPU times: user 2min 21s, sys: 30.7 s, total: 2min 52s
Wall time: 22.1 s


In [8]:
# Show the parameter combination selected by the grid search fitted above
clf.best_params_

{'max_iter': 500, 'multi_class': 'auto', 'solver': 'liblinear'}

In [9]:
%%time
# Predict on test
lr_test_preds = clf.predict(X_test)
# Score on the test data
lr_f1 = f1_score(y_test, lr_test_preds, average='macro')
print(lr_f1)

0.7101213779285164
CPU times: user 420 ms, sys: 80.7 ms, total: 501 ms
Wall time: 105 ms


In [10]:
# Persist the fitted grid-search object (the whole wrapper, not only its best estimator)
lr_model_path = str(product['model_lr'])
with open(lr_model_path, 'wb') as model_file:
    pickle.dump(clf, model_file)

## Random Forest test

In [11]:
%%time
clf_rf = RandomForestClassifier(n_estimators=100, max_depth=10)
clf_rf.fit(X_train, y_train)

CPU times: user 2.23 s, sys: 27 ms, total: 2.26 s
Wall time: 2.27 s


In [12]:
%%time
# Predict on test
rf_test_preds = clf_rf.predict(X_test)
# Score on the test data
rf_f1 = f1_score(y_test, rf_test_preds, average='macro')
print(rf_f1)

0.20735819588359936
CPU times: user 253 ms, sys: 4.81 ms, total: 258 ms
Wall time: 258 ms


In [13]:
# Persist the fitted random forest to its product path
rf_model_path = str(product['model_rf'])
with open(rf_model_path, 'wb') as model_file:
    pickle.dump(clf_rf, model_file)

## Gradient Boosting Classifier - takes too long

In [14]:
# Disabled: fitting this gradient boosting model takes too long (see the section heading).
# %%time
# clf_gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=10, random_state=random_seed)
# clf_gbc.fit(X_train, y_train)

In [15]:
# Disabled together with the gradient boosting fit: clf_gbc is never created in this notebook.
# %%time
# # Predict on test
# gbc_test_preds = clf_gbc.predict(X_test)
# # Score on the test data
# gbc_f1 = f1_score(y_test, gbc_test_preds, average='macro')
# print(gbc_f1)

## MultinomialNB test

In [16]:
%%time
clf_mnb = MultinomialNB()
clf_mnb.fit(X_train, y_train)

CPU times: user 122 ms, sys: 9.99 ms, total: 132 ms
Wall time: 132 ms


In [17]:
%%time
# Predict on test
mnb_test_preds = clf_mnb.predict(X_test)
# Score on the test data
mnb_f1 = f1_score(y_test, mnb_test_preds, average='macro')
print(mnb_f1)

0.5686163106453006
CPU times: user 86.5 ms, sys: 3.33 ms, total: 89.9 ms
Wall time: 89.3 ms


In [18]:
# Persist the fitted naive Bayes model. The file is a pickle even though the
# configured product path ends in ".csv".
nb_model_path = str(product['model_nb'])
with open(nb_model_path, 'wb') as model_file:
    pickle.dump(clf_mnb, model_file)

## Voting Classifier test

In [19]:
# Base estimators for the ensemble. The seeded ones take the pipeline parameter
# random_seed instead of a hard-coded literal, so one setting controls the whole notebook.
clf1 = LogisticRegression(solver='lbfgs', multi_class='multinomial', random_state=random_seed, max_iter=1000)
clf2 = RandomForestClassifier(n_estimators=50, random_state=random_seed)
clf3 = MultinomialNB()

# Soft voting combines the class probabilities predicted by the three models
eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('mnb', clf3)], voting='soft')
eclf1.fit(X_train, y_train)

In [20]:
%%time
# Predict on test
ec_test_preds = eclf1.predict(X_test)
# Score on the test data
ec_f1 = f1_score(y_test, ec_test_preds, average='macro')
print(ec_f1)

0.6894118557773756
CPU times: user 473 ms, sys: 32 ms, total: 505 ms
Wall time: 505 ms


In [21]:
# Persist the fitted voting ensemble. The file is a pickle even though the
# configured product path ends in ".csv".
votingc_model_path = str(product['model_votingc'])
with open(votingc_model_path, 'wb') as model_file:
    pickle.dump(eclf1, model_file)