In [35]:
!pip install allennlp;

In [28]:
import pandas as pd
import numpy as np
import torch
import math
import pickle
from tqdm import tqdm

import nltk
nltk.download("punkt")
from nltk import RegexpTokenizer
from nltk.lm import Vocabulary

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from allennlp.modules.token_embedders import ElmoTokenEmbedder
from allennlp.modules.elmo import batch_to_ids

[nltk_data] Downloading package punkt to /home/ft/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [23]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load ELMo

In [26]:
# options_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
# weight_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"

options_file = './downloads/elmo_2x4096_512_2048cnn_2xhighway_options.json'
weight_file = './downloads/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5'

In [27]:
elmo_embedder = ElmoTokenEmbedder(options_file, weight_file=weight_file, dropout=0).to('cuda')

# Common functions

In [5]:
def tokenize(sentences):

    tokenized_senteces = list()
    for s, sent in enumerate(sentences):

        remove_punctuation = RegexpTokenizer(r"\w+")
        tokenized_sent = remove_punctuation.tokenize(sent)

        for i, t in enumerate(tokenized_sent):
            if t.isnumeric():
                tokenized_sent[i] = "<num>"

        tokenized_senteces.append(tokenized_sent)

    return tokenized_senteces

In [6]:
def token_filter(tokenized_sentences, thresh=5):
    
    words = list()
    for t_sent in tokenized_sentences:
        for word in t_sent:
            words.append(word)

    vocab = Vocabulary(words, unk_cutoff=thresh)

    filtered_sentences = list()

    for t, tokenized_sent in enumerate(tokenized_sentences):
        filtered_sent = list()
        for word in tokenized_sent:
            if vocab.lookup(word) == '<UNK>':
                filtered_sent.append('<unk>')
            else: 
                filtered_sent.append(word)
        filtered_sentences.append(filtered_sent)

    return filtered_sentences

In [13]:
def get_elmo_embedding(filtered_sentences, device=device):
    vecs = list()
    batch_size = 128
    batches = math.ceil( len(filtered_sentences) / batch_size )
    for i in tqdm(range(batches)):
        character_ids = batch_to_ids(filtered_sentences[i * batch_size : (i + 1) * batch_size])
        batch_embedding = elmo_embedder(character_ids.to(device))
        batch_embedding = torch.mean(batch_embedding, dim=1)
        vecs.extend(batch_embedding.cpu().detach().numpy())

    return np.array(vecs)

# Load data

In [8]:
trn = pd.read_csv('./preprocessed_data/trn.csv', delimiter = ',', names=['title','label'])
tst = pd.read_csv('./preprocessed_data/tst.csv', delimiter = ',', names=['title','label'])

In [9]:
trn_sentences = tokenize(list(trn.title))
trn_filtered_sentences = token_filter(trn_sentences,5)

tst_sentences = tokenize(list(tst.title))
tst_filtered_sentences = token_filter(tst_sentences,5)

# Embedding

In [14]:
trn_embedings = get_elmo_embedding(trn_filtered_sentences, device)

f = open("./preprocessed_embeddings/elmo_trn.pkl","wb")
pickle.dump(trn_embedings,f)
f.close()

100%|██████████| 246/246 [05:39<00:00,  1.38s/it]


In [19]:
tst_embedings = get_elmo_embedding(tst_filtered_sentences, device)

f = open("./preprocessed_embeddings/elmo_tst.pkl","wb")
pickle.dump(tst_embedings,f)
f.close()

100%|██████████| 36/36 [00:45<00:00,  1.25s/it]


# Linear Classifier

In [17]:
classifier = LogisticRegression(fit_intercept=True, penalty="l2", C=1, max_iter=200)
classifier.fit(trn_embedings, trn.label)

LogisticRegression(C=1, max_iter=200)

In [20]:
print("Training accuracy = {}".format(classifier.score(trn_embedings, trn.label)))
print("Test accuracy = {}".format(classifier.score(tst_embedings, tst.label)))

Training accuracy = 0.9984408311324657
Test accuracy = 0.9944345503116652
