In [1]:
import pandas as pd 
import numpy as np

train = pd.read_csv('../datasets/train.csv')
test = pd.read_csv('../datasets/test.csv')

## Utility Functions

Before we start exploring embeddings lets write a couple of helper functions to run Logistic Regression and calculate evaluation metrics

Since we want to optimize our model for F1-Scores, for all models we'll first predict the probability of the positive class. We'll then use these probabilities to get the Precision-Recall curve and from here we can select a threshold value that has the highest F1-score. To predict the labels we can simply use this threshold value.

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve, f1_score, accuracy_score, roc_auc_score, confusion_matrix
import seaborn as sns
from sklearn import metrics
sns.set_palette("muted")
    

def calc_f1(p_and_r):
    p, r = p_and_r
    return (2*p*r)/(p+r)


# Print the F1, Precision, Recall, ROC-AUC, and Accuracy Metrics 
# Since we are optimizing for F1 score - we will first calculate precision and recall and 
# then find the probability threshold value that gives us the best F1 score

def print_model_metrics(y_test, y_test_prob,y_pred,label_list,name):
    print("*"*5)
    print(name)
    print("roc")
    print(roc_auc_score(y_test,y_test_prob,labels = label_list,multi_class='ovr',average="weighted"))
    print("f1")
    print(metrics.f1_score(y_test,y_pred,labels=label_list,average="weighted"))
    print("acc")
    print(metrics.accuracy_score(y_test,y_pred))

In [3]:
# Run Simple Log Reg Model and Print metrics
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
import random
from sklearn_genetic import GASearchCV
from sklearn_genetic.space import Continuous, Categorical, Integer
import ast
# Run log reg 10 times and average the result to reduce predction variance
def run_log_reg(train_features, test_features, y_train, y_test, lbl_to_idx, idx_to_lbl,label_list,feature_name, apply=True):
    y_train_idx = [lbl_to_idx[label] for label in y_train]
    y_test_idx = [lbl_to_idx[label] for label in y_test]
    label_idx = [i for i in range(len(label_list))]
    models = [
       MultinomialNB(),
       SVC(probability=True),
       RandomForestClassifier(),
#       XGBClassifier(),
       
    ]
    model_names=[
        "Naive bayes",
        "SVM",
        "RF",
#        "XGBoost"

    ]
    model_prams=[
        {'alpha' : Continuous(0.01,0.5,distribution="log-uniform")},
        {"kernel" : Categorical(['linear', 'poly', 'rbf', 'sigmoid']),
         "C" : Continuous(0.1,50,distribution="uniform")
        },
        {
            "n_estimators" : Integer(10,100),
            "max_depth" : Integer(5,50),
            "min_samples_split" : Integer(2,11),
            "min_samples_leaf" : Integer(1,11),
            "criterion" : Categorical(["gini", "entropy", "log_loss"]),
            "max_features" : Integer(1,13)
        }
    ]
    non_negs = ["BOW", "TF-IDF"]
    if apply == True:
        print(feature_name)
        computed_params = pd.read_csv(f"../result/all_params/{feature_name}.csv", encoding='utf-8')
        computed_params = computed_params.drop(columns=["Unnamed: 0"])
        compute_dict = computed_params.to_dict('dict')

        result_dict ={}
        for model,name in zip(models,model_names):
            if feature_name not in non_negs and name =="Naive bayes":
                continue
            clf = model
            if name in compute_dict:
                clf = model.set_params(**ast.literal_eval(compute_dict[name][0]))
            clf.fit(train_features,y_train_idx)
            y_test_prob = clf.predict_proba(test_features)
            y_pred = clf.predict(test_features)
            print_model_metrics(y_test_idx, y_test_prob, y_pred, label_idx,name)



    pre_comp_best =[]
    
    random.seed(1)
    cv = cv = StratifiedKFold(n_splits=2, shuffle=True)

    if apply:
        return
    for model, model_name, model_param in zip(models, model_names, model_prams):
        try:
            evolved_estimator = GASearchCV(estimator=model,
                               cv=cv,
                               scoring='accuracy',
                               population_size=10,
                               generations=35,
                               param_grid=model_param,
                               n_jobs=-1,
                               verbose=True,
                               keep_top_k=4)        
            evolved_estimator.fit(train_features,y_train_idx)
            print(evolved_estimator.best_params_)
            pre_comp_best.append({model_name :evolved_estimator.best_params_})
        except ValueError:
            print("hi")
    print(pre_comp_best)
    #y_test_prob = model.predict_proba(test_features)
    #y_pred = model.predict(test_features)
    #print_model_metrics(y_test_idx,y_test_prob,y_pred,label_idx)
    df = pd.DataFrame(pre_comp_best)
    df.to_csv(f"../result/all_search/{feature_name}.csv", encoding='utf-8')


# Bag-of-Words, TF-IDF and Word Embeddings

In [4]:
label_list= sorted(list(set(test.label.values)))
lbl_to_idx = {item:i for i,item in enumerate(label_list)}
idx_to_lbl = {i:item for i,item in enumerate(label_list)}
print(lbl_to_idx)
print(idx_to_lbl)
y_train = train.label
y_test = test.label
print(y_train.shape)
print(y_test.shape)
len(set(y_train.values))


{0: 0, 2: 1, 5: 2, 6: 3, 8: 4, 11: 5, 13: 6, 14: 7, 15: 8, 16: 9, 18: 10, 19: 11, 21: 12, 27: 13, 28: 14, 32: 15, 33: 16, 39: 17, 42: 18, 44: 19, 46: 20, 50: 21, 53: 22}
{0: 0, 1: 2, 2: 5, 3: 6, 4: 8, 5: 11, 6: 13, 7: 14, 8: 15, 9: 16, 10: 18, 11: 19, 12: 21, 13: 27, 14: 28, 15: 32, 16: 33, 17: 39, 18: 42, 19: 44, 20: 46, 21: 50, 22: 53}
(325,)
(169,)


23

## Bag of Words
Let's start with simple Bag-Of-Words

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

bow = CountVectorizer()
x_train = bow.fit_transform(train.title.values)
x_test = bow.transform(test.title.values)

run_log_reg(x_train, x_test, y_train, y_test, lbl_to_idx=lbl_to_idx, idx_to_lbl=idx_to_lbl, label_list=label_list,feature_name="BOW")

BOW
*****
Naive bayes
roc
0.9971063268788243
f1
0.8832843612233645
acc
0.893491124260355
*****
SVM
roc
0.9929615063098315
f1
0.8833631041211363
acc
0.893491124260355
*****
RF
roc
0.9954018640580207
f1
0.9243200614492016
acc
0.9349112426035503


## TF-IDF

TFIDF should perform better than BoW since it uses document frequencies to normalize

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
x_train = tfidf.fit_transform(train.title.values)
x_test = tfidf.transform(test.title.values)

run_log_reg(x_train, x_test, y_train, y_test, lbl_to_idx=lbl_to_idx, idx_to_lbl=idx_to_lbl, label_list=label_list,feature_name="TF-IDF")

TF-IDF
*****
Naive bayes
roc
0.9972862406171837
f1
0.8956672832927013
acc
0.9053254437869822
*****
SVM
roc
0.9955755050701972
f1
0.939688255470134
acc
0.9408284023668639
*****
RF
roc
0.9938325162740105
f1
0.905940364311405
acc
0.9171597633136095


## TF-IDF(Normalize)

TFIDF should perform better than BoW since it uses document frequencies to normalize

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

def things_to_unit(a):
    "if 0.5km kind of that appears, convert to unitLength etc"
    doc_units = pd.read_excel("./normalizer/units.xlsx")
    doc_dict = dict(zip(doc_units["from"],doc_units["to"])) 
    for from_ in doc_dict:
        idx = np.where(
                 np.char.count(a,from_)==1
              )
        a[idx] = doc_dict[from_] 
    return a

class LemmaPlaceTokenizer:
    ignore_tokens = [',', '.', ';', ':', '"', '``', "''", '`','(',')']
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        val = []
        for t in word_tokenize(doc):
            if t.isdigit():
                val.append("unitN")
            elif (t not in self.ignore_tokens):
                val.append(
                    self.wnl.lemmatize(t,get_wordnet_pos(t))
                )
        new_val = np.array(val)
        new_val = np.apply_along_axis(things_to_unit, 0, new_val)
        return new_val

def preprocess(document):
    'changes document to lower case and removes stopwords'

    # change sentence to lower case
    document = document.lower()

    # tokenize into words
    words = word_tokenize(document)

    # remove stop words & numbrs
    words = [word for word in words if word not in stopwords.words("english") or not word.isdigit()]


    # join words to make sentence
    document = " ".join(words)
    
    return document

TFIDF performs marginally better than BoW. Although whats impressive here is the fact that we're getting an F1 score of 0.826 with just 50 datapoints. This is why Log Reg + TFIDF is a great baseline for NLP classification tasks.

Next we'll try 100D glove vectors. 

## GloVe

In [7]:
# Load the glove vectors with PyMagnitude
# PyMagnitude is a fantastic library that handles a lot of word vectorization tasks. 

from pymagnitude import *
from collections.abc import MutableMapping
glove = Magnitude("../vectors/glove.6B.100d.magnitude")

In [8]:
# We'll use Average Glove here 
from tqdm import tqdm_notebook
from nltk import word_tokenize


def avg_glove(df):
    vectors = []
    for title in tqdm_notebook(df.title.values):
        vectors.append(np.average(glove.query(word_tokenize(title)), axis = 0))
    return np.array(vectors)

x_train = avg_glove(train)
x_test = avg_glove(test)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for title in tqdm_notebook(df.title.values):


  0%|          | 0/325 [00:00<?, ?it/s]

  0%|          | 0/169 [00:00<?, ?it/s]

In [None]:
run_log_reg(x_train, x_test, y_train, y_test, lbl_to_idx=lbl_to_idx, idx_to_lbl=idx_to_lbl, label_list=label_list,feature_name="GLOVE")

# BERT

In [None]:
x_train = np.genfromtxt("../datasets/train_feature_bert-base-uncased.csv",delimiter=",")
x_test = np.genfromtxt("../datasets/test_feature_bert-base-uncased.csv",delimiter=",")
y_train = np.genfromtxt("../datasets/train_label_bert-base-uncased.csv",delimiter=",")
y_test = np.genfromtxt("../datasets/test_label_bert-base-uncased.csv",delimiter=",")

run_log_reg(x_train, x_test, y_train, y_test, lbl_to_idx=lbl_to_idx, idx_to_lbl=idx_to_lbl, label_list=label_list,feature_name="BERT")

BERT
*****
SVM
roc
0.9727600248077499
f1
0.7352760009476428
acc
0.746268656716418
*****
RF
roc
0.951664896784059
f1
0.5728609698758953
acc
0.6119402985074627


# SenBERT

In [None]:
x_train = np.genfromtxt("../datasets/train_feature_sbert.csv",delimiter=",")
x_test = np.genfromtxt("../datasets/test_feature_sbert.csv",delimiter=",")
y_train = np.genfromtxt("../datasets/train_label_sbert.csv",delimiter=",")
y_test = np.genfromtxt("../datasets/test_label_sbert.csv",delimiter=",")

run_log_reg(x_train, x_test, y_train, y_test, lbl_to_idx=lbl_to_idx, idx_to_lbl=idx_to_lbl, label_list=label_list,feature_name="SENBERT")

SENBERT
*****
SVM
roc
0.9491645780328376
f1
0.701820981671728
acc
0.7164179104477612
*****
RF
roc
0.9495676228027575
f1
0.666571902392798
acc
0.6865671641791045


# Electra

In [None]:
x_train = np.genfromtxt("../datasets/train_feature_electra-small-discriminator.csv",delimiter=",")
x_test = np.genfromtxt("../datasets/test_feature_electra-small-discriminator.csv",delimiter=",")
y_train = np.genfromtxt("../datasets/train_label_electra-small-discriminator.csv",delimiter=",")
y_test = np.genfromtxt("../datasets/test_label_electra-small-discriminator.csv",delimiter=",")

run_log_reg(x_train, x_test, y_train, y_test, lbl_to_idx=lbl_to_idx, idx_to_lbl=idx_to_lbl, label_list=label_list,feature_name="Electra")

Electra
*****
SVM
roc
0.9477035652920343
f1
0.6754975124378111
acc
0.7014925373134329
*****
RF
roc
0.9367162980321985
f1
0.6191624296101909
acc
0.6567164179104478


# FNet

In [None]:
x_train = np.genfromtxt("../datasets/train_feature_fnet-base.csv",delimiter=",")
x_test = np.genfromtxt("../datasets/test_feature_fnet-base.csv",delimiter=",")
y_train = np.genfromtxt("../datasets/train_label_fnet-base.csv",delimiter=",")
y_test = np.genfromtxt("../datasets/test_label_fnet-base.csv",delimiter=",")

run_log_reg(x_train, x_test, y_train, y_test, lbl_to_idx=lbl_to_idx, idx_to_lbl=idx_to_lbl, label_list=label_list, feature_name="FNET")

FNET
*****
SVM
roc
0.9827484215017316
f1
0.7934536110655515
acc
0.7910447761194029
*****
RF
roc
0.9511822368571887
f1
0.6960927960927961
acc
0.7164179104477612


# Roberta

In [None]:
x_train = np.genfromtxt("../datasets/train_feature_roberta-base.csv",delimiter=",")
x_test = np.genfromtxt("../datasets/test_feature_roberta-base.csv",delimiter=",")
y_train = np.genfromtxt("../datasets/train_label_roberta-base.csv",delimiter=",")
y_test = np.genfromtxt("../datasets/test_label_roberta-base.csv",delimiter=",")

run_log_reg(x_train, x_test, y_train, y_test, lbl_to_idx=lbl_to_idx, idx_to_lbl=idx_to_lbl, label_list=label_list, feature_name="FNET")

FNET
*****
SVM
roc
0.8891042563933151
f1
0.6358589172022007
acc
0.6567164179104478
*****
RF
roc
0.8689497313925576
f1
0.5444013697745042
acc
0.5522388059701493
