In [26]:
!pip install allennlp;
!pip install googledrivedownloader;

In [5]:
import math
import os
import pickle
import urllib.request
from os.path import isfile

import nltk
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm

nltk.download("punkt")
from nltk import RegexpTokenizer
from nltk.lm import Vocabulary

from allennlp.modules.token_embedders import ElmoTokenEmbedder
from allennlp.modules.elmo import batch_to_ids

from google_drive_downloader import GoogleDriveDownloader as gdd

[nltk_data] Downloading package punkt to /home/ft/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Load ELMo

In [14]:
## Download weights and config file (each file is skipped when already present locally)
elmo_base_url = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/"
elmo_download_dir = "../downloads"
elmo_file_names = [
    "elmo_2x4096_512_2048cnn_2xhighway_options.json",
    "elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5",
]

# urlretrieve does not create missing directories, so make sure the target exists.
os.makedirs(elmo_download_dir, exist_ok=True)

for elmo_file_name in elmo_file_names:
    elmo_local_path = os.path.join(elmo_download_dir, elmo_file_name)
    if isfile(elmo_local_path):
        continue

    # Download under a temporary name and rename only on success, so an interrupted
    # transfer never leaves a truncated file that the isfile() check would accept.
    elmo_partial_path = elmo_local_path + ".part"
    urllib.request.urlretrieve(elmo_base_url + elmo_file_name, filename=elmo_partial_path)
    os.replace(elmo_partial_path, elmo_local_path)

In [15]:
# Local paths of the ELMo options (JSON) file and weights (HDF5) file.
options_file, weight_file = (
    '../downloads/elmo_2x4096_512_2048cnn_2xhighway_options.json',
    '../downloads/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5',
)

In [16]:
## Load ELMo Token Embedder
# Build the embedder from the options and weight files with dropout set to 0,
# then move the module to the selected device.
elmo_embedder = ElmoTokenEmbedder(
    options_file,
    weight_file=weight_file,
    dropout=0,
)
elmo_embedder = elmo_embedder.to(device)

# Common functions

In [17]:
def tokenize(sentences):
    r"""Split each sentence into word tokens and mask purely numeric tokens.

    Tokens are produced by ``RegexpTokenizer(r"\w+")``, so punctuation and
    whitespace are dropped. Every token for which ``str.isnumeric()`` is true is
    replaced by the placeholder ``"<num>"``.

    Args:
        sentences: Iterable of strings.

    Returns:
        A list with one list of token strings per input sentence, in input order.
    """
    # The tokenizer does not depend on the sentence, so build it once up front
    # instead of once per loop iteration.
    word_tokenizer = RegexpTokenizer(r"\w+")

    tokenized_sentences = []
    for sentence in sentences:
        tokens = word_tokenizer.tokenize(sentence)
        tokenized_sentences.append(
            ["<num>" if token.isnumeric() else token for token in tokens]
        )

    return tokenized_sentences

In [18]:
def token_filter(tokenized_sentences, thresh=5):
    """Replace rare tokens with the marker ``'<unk>'``.

    A ``Vocabulary`` is built from every token in ``tokenized_sentences`` with
    ``unk_cutoff=thresh``. Each token that the vocabulary looks up as ``'<UNK>'``
    is replaced by the lowercase marker ``'<unk>'``; all other tokens are kept.

    Args:
        tokenized_sentences: List of token lists.
        thresh: Value passed to ``Vocabulary`` as ``unk_cutoff``.

    Returns:
        A new list of token lists, one per input sentence, in input order.
    """
    # Flatten all sentences into a single token stream to count frequencies.
    all_tokens = [token for sentence in tokenized_sentences for token in sentence]
    vocabulary = Vocabulary(all_tokens, unk_cutoff=thresh)

    def mask_if_rare(token):
        # The vocabulary reports out-of-vocabulary tokens with its '<UNK>' label.
        return '<unk>' if vocabulary.lookup(token) == '<UNK>' else token

    return [
        [mask_if_rare(token) for token in sentence]
        for sentence in tokenized_sentences
    ]

In [19]:
def get_elmo_embedding(filtered_sentences, device=device, batch_size=128):
    """Embed tokenized sentences with ELMo and average over dimension 1.

    Sentences are processed in consecutive batches of ``batch_size``. Each batch
    is converted to character ids with ``batch_to_ids``, passed through the
    module-level ``elmo_embedder``, and reduced with a mean over dimension 1.

    Args:
        filtered_sentences: List of token lists.
        device: Device the character-id tensor is moved to before the forward
            pass; it should match the device ``elmo_embedder`` was moved to.
        batch_size: Number of sentences per forward pass (default 128).

    Returns:
        ``np.ndarray`` with one row per input sentence; an empty array when
        ``filtered_sentences`` is empty.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    vecs = list()
    batches = math.ceil(len(filtered_sentences) / batch_size)

    # Inference only: no_grad() avoids recording an autograd graph for any
    # parameter that requires gradients, which keeps memory use down.
    with torch.no_grad():
        for i in tqdm(range(batches)):
            batch = filtered_sentences[i * batch_size : (i + 1) * batch_size]
            character_ids = batch_to_ids(batch)
            batch_embedding = elmo_embedder(character_ids.to(device))
            # NOTE(review): the mean runs over the full (padded) length of the
            # batch, so padded positions of shorter sentences are included in the
            # average — confirm this is intended.
            batch_embedding = torch.mean(batch_embedding, dim=1)
            vecs.extend(batch_embedding.cpu().numpy())

    return np.array(vecs)

# Load data

In [20]:
# Column names are supplied explicitly via `names`: first column is the title,
# second the label.
title_columns = ['title', 'label']
trn = pd.read_csv(
    '../preprocessed_data/trn_title.csv',
    delimiter=',',
    names=title_columns,
)
tst = pd.read_csv(
    '../preprocessed_data/tst_title.csv',
    delimiter=',',
    names=title_columns,
)

In [21]:
# Tokenize the titles of each split, then mask rare tokens (cutoff 5).
# Each split is filtered against a vocabulary built from that split alone.
trn_sentences = tokenize(list(trn['title']))
tst_sentences = tokenize(list(tst['title']))

trn_filtered_sentences = token_filter(trn_sentences, thresh=5)
tst_filtered_sentences = token_filter(tst_sentences, thresh=5)

# Embedding

In [24]:
# Embed the training titles and cache the resulting array on disk.
trn_embedings = get_elmo_embedding(trn_filtered_sentences, device)

# The context manager closes the file even if pickling fails part-way.
with open("../preprocessed_embeddings/elmo_trn.pkl", "wb") as f:
    pickle.dump(trn_embedings, f)

In [25]:
# Embed the test titles and cache the resulting array on disk.
tst_embedings = get_elmo_embedding(tst_filtered_sentences, device)

# The context manager closes the file even if pickling fails part-way.
with open("../preprocessed_embeddings/elmo_tst.pkl", "wb") as f:
    pickle.dump(tst_embedings, f)

100%|██████████| 36/36 [00:45<00:00,  1.28s/it]


# Download preprocessed embeddings

In [15]:
## Download pre-processed embeddings
gdd.download_file_from_google_drive(file_id='1esvWZDtDMe-TUG7sR_U5N9QytebN4Cjy', dest_path='../preprocessed_embeddings/elmo_embeddings.zip', unzip=True)
!rm ./preprocessed_embeddings/elmo_embeddings.zip

Downloading 1esvWZDtDMe-TUG7sR_U5N9QytebN4Cjy into ./preprocessed_embeddings/asd.zip... Done.
Unzipping...Done.
