In [26]:
!pip install allennlp;
!pip install googledrivedownloader;

In [38]:
import pandas as pd
import numpy as np
import torch
import math
import os
import pickle
import urllib.request
from tqdm import tqdm
from os.path import isfile

from allennlp.modules.token_embedders import ElmoTokenEmbedder
from allennlp.modules.elmo import batch_to_ids

from google_drive_downloader import GoogleDriveDownloader as gdd

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load ELMo

In [3]:
if not os.path.exists("../downloads/"):
    !mkdir ../downloads

## Download weights and config file
if not isfile("../downloads/elmo_2x4096_512_2048cnn_2xhighway_options.json"):
    url_options_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
    filename, headers = urllib.request.urlretrieve(url_options_file, filename="../downloads/elmo_2x4096_512_2048cnn_2xhighway_options.json")

if not isfile("../downloads/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"):
    url_weight_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"
    filename, headers = urllib.request.urlretrieve(url_weight_file, filename="../downloads/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5")

In [4]:
options_file = '../downloads/elmo_2x4096_512_2048cnn_2xhighway_options.json'
weight_file = '../downloads/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5'

In [5]:
## Load ELMo Token Embedder
elmo_embedder = ElmoTokenEmbedder(options_file, weight_file=weight_file, dropout=0).to(device)

# Common functions

In [6]:
def get_elmo_embedding(dataiter, batch_size=128, device=device):
    embeddings = list()
    labels = list()
    with tqdm(total=len(dataiter)) as pbar:
        for batch, label in dataiter:
            character_ids = batch_to_ids(batch)
            batch_embedding = elmo_embedder(character_ids.to(device))
            embeddings.extend(batch_embedding.cpu().detach().numpy())
            labels.extend(label)
            pbar.update(1)

    return np.array(embeddings, dtype=object), labels

In [23]:
class FakeNews():
    def __init__(self, split):
        if split == 'trn-title':
            data = pd.read_csv('../preprocessed_data/trn_title.csv', delimiter = ',', names=['title','label'])
            self.data = [x.split() for x in list(data.title)]
        elif split == 'trn-text':
            data = pd.read_csv('../preprocessed_data/trn_text.csv', delimiter = ',', names=['text','label'])
            self.data = [x.split() for x in list(data.text)]
        elif split == 'val-title':
            data = pd.read_csv('../preprocessed_data/val_title.csv', delimiter = ',', names=['title','label'])
            self.data = [x.split() for x in list(data.title)]
        elif split == 'val-text':
            data = pd.read_csv('../preprocessed_data/val_text.csv', delimiter = ',', names=['text','label'])
            self.data = [x.split() for x in list(data.text)]
        elif split == 'tst-title':
            data = pd.read_csv('../preprocessed_data/tst_title.csv', delimiter = ',', names=['title','label'])
            self.data = [x.split() for x in list(data.title)]
        elif split == 'tst-text':
            data = pd.read_csv('../preprocessed_data/tst_text.csv', delimiter = ',', names=['text','label'])
            self.data = [x.split() for x in list(data.text)]
        else:
            raise ValueError("Unrecognized dataset type. Try 'trn-title', 'trn-text', 'tst-title' or 'tst-text'")
        
        self.label = data.label
        self.n_sentences = len(data)

    def __setitem__(self, key, value):
        self.data[key] = value[0]
        self.label[key] = value[1]
        
    def __getitem__(self, index):
        return self.data[index] , self.label[index]

    def __len__(self):
        return self.n_sentences

In [8]:
class BucketIterator():
    def __init__(self, dataset, batch_size=8):
        self.dataset = dataset[:][0]
        self.labels = dataset[:][1]
        self.dataset_len = len(dataset)
        self.batch_size = batch_size
        self.length = math.ceil( len(dataset) / batch_size )

        indices = [(i, len(s)) for i, s in enumerate(self.dataset)]
        pooled_indices = []
        pooled_indices.extend(sorted(indices, key=lambda x: x[1]))

        self.pooled_indices = [x[0] for x in pooled_indices]
        self.pointer = 0

    def __len__(self):
        return self.length

    def __iter__(self):
        self.pointer = 0
        return self

    def __next__(self):
        if self.pointer > self.dataset_len:
            self.pointer = 0
            raise StopIteration
        else:
            start = self.pointer
            end = self.pointer + self.batch_size
            self.pointer += self.batch_size
        
            return [ self.dataset[i] for i in self.pooled_indices[start:end] ], [ self.labels[i] for i in self.pooled_indices[start:end] ]

# Titles Embedding

In [9]:
### Load titles
trn_dataset = FakeNews('trn-title')
trn_title_iterator = BucketIterator(trn_dataset,128)

val_dataset = FakeNews('val-title')
val_title_iterator = BucketIterator(val_dataset,128)

tst_dataset = FakeNews('tst-title')
tst_title_iterator = BucketIterator(tst_dataset,128)

In [None]:
if not os.path.exists("../preprocessed_embeddings/"):
    !mkdir ../preprocessed_embeddings

In [49]:
trn_embedings, trn_labels = get_elmo_embedding(trn_title_iterator, 128, device)

f = open("../preprocessed_embeddings/elmo_trn_title.pkl","wb")
pickle.dump(trn_embedings,f)
f.close()

f = open("../preprocessed_embeddings/elmo_trn_title_labels.pkl","wb")
pickle.dump(trn_labels,f)
f.close()

100%|██████████| 243/243 [03:16<00:00,  1.24it/s]


In [15]:
val_embedings, val_labels = get_elmo_embedding(val_title_iterator, 128, device)

f = open("../preprocessed_embeddings/elmo_val_title.pkl","wb")
pickle.dump(val_embedings,f)
f.close()

f = open("../preprocessed_embeddings/elmo_val_title_labels.pkl","wb")
pickle.dump(val_labels,f)
f.close()

100%|██████████| 70/70 [00:53<00:00,  1.30it/s]


In [48]:
tst_embedings, tst_labels = get_elmo_embedding(tst_title_iterator, 128, device)

f = open("../preprocessed_embeddings/elmo_tst_title.pkl","wb")
pickle.dump(tst_embedings,f)
f.close()

f = open("../preprocessed_embeddings/elmo_tst_title_labels.pkl","wb")
pickle.dump(tst_labels,f)
f.close()

100%|██████████| 35/35 [00:28<00:00,  1.22it/s]


# Texts Embedding

In [32]:
### Load texts
trn_dataset = FakeNews('trn-text')
val_dataset = FakeNews('val-title')
tst_dataset = FakeNews('tst-title')

In [None]:
if not os.path.exists("../preprocessed_embeddings/"):
    !mkdir ../preprocessed_embeddings

## Explore the dataset

In [12]:
print(f"The length of the dataset is: {len(trn_dataset)}")

The length of the dataset is: 30986


In [11]:
max_length = 0
sentence_id = 0
for i in range(len(trn_dataset)):
    if len(trn_dataset[i][0]) > max_length:
        max_length = len(trn_dataset[i][0])
        sentence_id = i
print(f"Sentence #{sentence_id} has the highest amount of tokens: {max_length}")

Sentence #29277 has the highest amount of tokens: 8375


In [13]:
count = 0
threshold = 100
saved_idx = 0
for i in range(len(trn_dataset)):
    if len(trn_dataset[i][0]) > threshold:
        count += 1
        if len(trn_dataset[i][0]) < threshold + 10:
            saved_idx = i
print(f"There are #{count} sentences with more than {threshold} tokens. One example is sentence #{saved_idx}")

There are #27120 sentences with more than 100 tokens. One example is sentence #30973


## Create the embeddings

In [33]:
### Truncate text to 100 tokens

### Trn dataset
for i in range(len(trn_dataset)):
    trn_dataset[i] = (trn_dataset[i][0][:100], trn_dataset[i][1])
    
### Val dataset
for i in range(len(val_dataset)):
    val_dataset[i] = (val_dataset[i][0][:100], val_dataset[i][1])
    
### Tst dataset
for i in range(len(tst_dataset)):
    tst_dataset[i] = (tst_dataset[i][0][:100], tst_dataset[i][1])

In [35]:
trn_text_iterator = BucketIterator(trn_dataset,8)
val_text_iterator = BucketIterator(val_dataset,8)
tst_text_iterator = BucketIterator(tst_dataset,8)

In [42]:
trn_embedings, trn_labels = get_elmo_embedding(trn_text_iterator, 8, device)

f = open("../preprocessed_embeddings/elmo_trn_text.pkl","wb")
pickle.dump(trn_embedings,f)
f.close()

f = open("../preprocessed_embeddings/elmo_trn_text_labels.pkl","wb")
pickle.dump(trn_labels,f)
f.close()

100%|██████████| 3874/3874 [12:21<00:00,  5.22it/s]


In [43]:
val_embedings, val_labels = get_elmo_embedding(val_text_iterator, 8, device)

f = open("../preprocessed_embeddings/elmo_val_text.pkl","wb")
pickle.dump(val_embedings,f)
f.close()

f = open("../preprocessed_embeddings/elmo_val_text_labels.pkl","wb")
pickle.dump(val_labels,f)
f.close()

100%|██████████| 1107/1107 [00:36<00:00, 30.63it/s]


In [41]:
tst_embedings, tst_labels = get_elmo_embedding(tst_text_iterator, 8, device)

f = open("../preprocessed_embeddings/elmo_tst_text.pkl","wb")
pickle.dump(tst_embedings,f)
f.close()

f = open("../preprocessed_embeddings/elmo_tst_text_labels.pkl","wb")
pickle.dump(tst_labels,f)
f.close()

100%|██████████| 554/554 [00:17<00:00, 31.29it/s]


# Download preprocessed embeddings

In [40]:
if not os.path.exists("../preprocessed_embeddings/"):
    !mkdir ../preprocessed_embeddings

## Download pre-processed embeddings
gdd.download_file_from_google_drive(file_id='1esvWZDtDMe-TUG7sR_U5N9QytebN4Cjy', dest_path='../preprocessed_embeddings/elmo_embeddings.zip', unzip=True)
!rm ../preprocessed_embeddings/elmo_embeddings.zip

Downloading 1esvWZDtDMe-TUG7sR_U5N9QytebN4Cjy into ../preprocessed_embeddings/elmo_embeddings.zip... Done.
Unzipping...Done.
