In [26]:
!pip install allennlp;
!pip install googledrivedownloader;

In [1]:
import math
import os
import pickle
import urllib.request
from os.path import isfile

import numpy as np
import pandas as pd
import torch
from tqdm import tqdm

from allennlp.modules.token_embedders import ElmoTokenEmbedder
from allennlp.modules.elmo import batch_to_ids

from google_drive_downloader import GoogleDriveDownloader as gdd

In [2]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Load ELMo

In [3]:
## Download weights and config file
# Both files share the same remote prefix and local directory, so the
# download logic is written once and applied to each file name.
elmo_base_url = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/"
elmo_download_dir = "../downloads"

# urlretrieve does not create missing directories, so make sure the target exists.
os.makedirs(elmo_download_dir, exist_ok=True)

for elmo_file_name in (
    "elmo_2x4096_512_2048cnn_2xhighway_options.json",
    "elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5",
):
    elmo_local_path = elmo_download_dir + "/" + elmo_file_name
    if isfile(elmo_local_path):
        continue  # already fetched on a previous run

    # Download under a temporary name and rename on success, so an interrupted
    # transfer is never mistaken for a complete file by the isfile() check.
    elmo_partial_path = elmo_local_path + ".part"
    urllib.request.urlretrieve(elmo_base_url + elmo_file_name, filename=elmo_partial_path)
    os.replace(elmo_partial_path, elmo_local_path)

In [4]:
# Local paths of the ELMo options (JSON) and weights (HDF5) files.
options_file, weight_file = (
    '../downloads/elmo_2x4096_512_2048cnn_2xhighway_options.json',
    '../downloads/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5',
)

In [5]:
## Load ELMo Token Embedder
# Build the embedder from the options/weights files with dropout disabled,
# then move it to the selected device.
elmo_embedder = ElmoTokenEmbedder(
    options_file,
    weight_file=weight_file,
    dropout=0,
)
elmo_embedder = elmo_embedder.to(device)

# Common functions

In [6]:
def get_elmo_embedding(filtered_sentences, device=device, batch_size=128):
    """Embed tokenized sentences with ELMo and average over dimension 1.

    Sentences are processed in batches through the module-level
    ``elmo_embedder``; each batch output is averaged over ``dim=1`` and
    collected on the CPU as numpy rows.

    Args:
        filtered_sentences: sequence of sentences, each a list of token strings
            (the form accepted by ``batch_to_ids``).
        device: torch device the character ids are moved to before the
            forward pass.
        batch_size: number of sentences per forward pass. Defaults to 128.

    Returns:
        A numpy array built from the per-sentence vectors (presumably of shape
        ``(n_sentences, embedding_dim)`` - TODO confirm); an empty array when
        ``filtered_sentences`` is empty.

    Raises:
        ValueError: if ``batch_size`` is not a positive number.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    vecs = list()
    batches = math.ceil(len(filtered_sentences) / batch_size)

    # Inference only: disabling autograd avoids building a graph for every
    # batch, which would otherwise hold activations until detach().
    with torch.no_grad():
        for i in tqdm(range(batches)):
            batch_sentences = filtered_sentences[i * batch_size : (i + 1) * batch_size]
            character_ids = batch_to_ids(batch_sentences)
            batch_embedding = elmo_embedder(character_ids.to(device))
            # NOTE(review): the mean runs over the full (padded) length of the
            # batch, so shorter sentences are presumably averaged together with
            # padding positions - confirm whether a masked mean is intended.
            batch_embedding = torch.mean(batch_embedding, dim=1)
            vecs.extend(batch_embedding.cpu().numpy())

    return np.array(vecs)

# Load data

In [7]:
### Load titles
trn = pd.read_csv('../preprocessed_data/trn_title.csv', delimiter = ',', names=['title','label'])
tst = pd.read_csv('../preprocessed_data/tst_title.csv', delimiter = ',', names=['title','label'])

trn_title_list = [x.split() for x in list(trn.title)]
tst_title_list = [x.split() for x in list(tst.title)]

In [None]:
### Load texts
# Read the train/test text CSVs, naming the two columns explicitly.
# Note: this rebinds `trn` and `tst`, replacing any frames loaded earlier.
trn = pd.read_csv(
    '../preprocessed_data/trn_text.csv',
    sep=',',
    names=['text', 'label'],
)
tst = pd.read_csv(
    '../preprocessed_data/tst_text.csv',
    sep=',',
    names=['text', 'label'],
)

# Whitespace-tokenize every text into a list of tokens.
trn_text_list = [text.split() for text in trn['text']]
tst_text_list = [text.split() for text in tst['text']]

# Embedding

In [10]:
# Embed the training titles and persist the result as a pickle.
trn_embedings = get_elmo_embedding(trn_title_list, device)

# The context manager closes the file even if pickling raises.
with open("../preprocessed_embeddings/elmo_trn_title.pkl", "wb") as f:
    pickle.dump(trn_embedings, f)

100%|██████████| 243/243 [05:01<00:00,  1.24s/it]


In [11]:
# Embed the test titles and persist the result as a pickle.
tst_embedings = get_elmo_embedding(tst_title_list, device)

# The context manager closes the file even if pickling raises.
with open("../preprocessed_embeddings/elmo_tst_title.pkl", "wb") as f:
    pickle.dump(tst_embedings, f)

100%|██████████| 35/35 [00:42<00:00,  1.23s/it]


# Download preprocessed embeddings

In [15]:
## Download pre-processed embeddings
# A single path is used for both the download and the cleanup, so the archive
# that gets removed is the one that was actually written.
embeddings_zip_path = '../preprocessed_embeddings/elmo_embeddings.zip'
gdd.download_file_from_google_drive(file_id='1esvWZDtDMe-TUG7sR_U5N9QytebN4Cjy', dest_path=embeddings_zip_path, unzip=True)

# Remove the archive once it has been extracted (pure Python instead of a
# shell `rm`, so it also works where `rm` is unavailable).
if os.path.isfile(embeddings_zip_path):
    os.remove(embeddings_zip_path)

Downloading 1esvWZDtDMe-TUG7sR_U5N9QytebN4Cjy into ./preprocessed_embeddings/asd.zip... Done.
Unzipping...Done.
