In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
import numpy as np
import os
import time
import random
import tensorflow as tf

%env TF_FORCE_GPU_ALLOW_GROWTH=true
# Making sure we cache the models and are not downloaded all the time
%env TFHUB_CACHE_DIR=./tfhub_modules

env: TF_FORCE_GPU_ALLOW_GROWTH=true
env: TFHUB_CACHE_DIR=./tfhub_modules


## Using pre-trained ELMo Model

### Downloading the ELMo Model from TFHub

In [2]:
import tensorflow_hub as hub
import tensorflow.keras.backend as K

K.clear_session()
elmo_layer = hub.KerasLayer("https://tfhub.dev/google/elmo/3", signature="tokens",signature_outputs_as_dict=True)

### Formatting the input for ELMo

In [45]:
def format_text_for_elmo(texts, lower=True, split=" ", max_len=None):
    
    token_inputs = []
    token_lengths = []
        
    max_len_inferred = 0
        
    for text in texts:        
        tokens = tf.keras.preprocessing.text.text_to_word_sequence(text, lower=lower, split=split)
        
        token_inputs.append(tokens)                   
        
        if len(tokens)>max_len_inferred:
            max_len_inferred = len(tokens)
    
    if max_len and max_len_inferred < max_len:
        max_len = max_len_inferred
    if not max_len:
        max_len = max_len_inferred
        
    for i, inp in enumerate(token_inputs):
        
        token_lengths.append(min(len(inp), max_len))
        # If the maximum length is less than input length, truncate
        if max_len < len(inp):
            inp = inp[:max_len]            
        # If the maximum length is greater than or equal to input length, add padding as needed
        else:            
            inp = inp+[""]*(max_len-len(inp))
        
        
        assert len(inp)==max_len
        
        token_inputs[i] = inp
    
    return {
        "tokens": tf.constant(token_inputs), 
        "sequence_len": tf.constant(token_lengths)
    }


print(format_text_for_elmo(["the cat sat on the mat", "the mat sat"], max_len=10))

{'tokens': <tf.Tensor: shape=(2, 6), dtype=string, numpy=
array([[b'the', b'cat', b'sat', b'on', b'the', b'mat'],
       [b'the', b'mat', b'sat', b'', b'', b'']], dtype=object)>, 'sequence_len': <tf.Tensor: shape=(2,), dtype=int32, numpy=array([6, 3], dtype=int32)>}


In [4]:
# 001.txt - 005.txt in bbc/business
elmo_inputs = format_text_for_elmo([
    "Ad sales boost Time Warner profit",
    "Dollar gains on Greenspan speech",
    "Yukos unit buyer faces loan claim",
    "High fuel prices hit BA's profits",
    "Pernod takeover talk lifts Domecq"
])


elmo_result = elmo_layer(elmo_inputs)

for k,v in elmo_result.items():
    
    print("Tensor under key={} is a {} shaped Tensor".format(k, v.shape))

Tensor under key=sequence_len is a (5,) shaped Tensor
Tensor under key=elmo is a (5, 6, 1024) shaped Tensor
Tensor under key=default is a (5, 1024) shaped Tensor
Tensor under key=lstm_outputs1 is a (5, 6, 1024) shaped Tensor
Tensor under key=lstm_outputs2 is a (5, 6, 1024) shaped Tensor
Tensor under key=word_emb is a (5, 6, 512) shaped Tensor


## Generating Document Embeddings with ELMo

In [6]:
url = 'http://mlg.ucd.ie/files/datasets/bbc-fulltext.zip'


def download_data(url, data_dir):
    """Download a file if not present, and make sure it's the right size."""
    
    # Create the data directory if not exist
    os.makedirs(data_dir, exist_ok=True)

    file_path = os.path.join(data_dir, 'bbc-fulltext.zip')
    
    # If file doesnt exist, download
    if not os.path.exists(file_path):
        print('Downloading file...')
        filename, _ = urlretrieve(url, file_path)
    else:
        print("File already exists")
  
    extract_path = os.path.join(data_dir, 'bbc')
    
    # If data has not been extracted already, extract data
    if not os.path.exists(extract_path):        
        with zipfile.ZipFile(os.path.join(data_dir, 'bbc-fulltext.zip'), 'r') as zipf:
            zipf.extractall(data_dir)
    else:
        print("bbc-fulltext.zip has already been extracted")
    
download_data(url, 'data')

File already exists
bbc-fulltext.zip has already been extracted


In [8]:
def read_data(data_dir):
    
    # This will contain the full list of stories
    news_stories = []    
    filenames = []
    print("Reading files")
    
    i = 0 # Just used for printing progress
    for root, dirs, files in os.walk(data_dir):
        
        for fi, f in enumerate(files):
            
            # We don't read the readme file
            if 'README' in f:
                continue
            
            # Printing progress
            i += 1
            print("."*i, f, end='\r')
            
            # Open the file
            with open(os.path.join(root, f), encoding='latin-1') as text_file:
                
                story = []
                # Read all the lines
                for row in text_file:
                                        
                    story.append(row.strip())
                    
                # Create a single string with all the rows in the doc
                story = ' '.join(story)                        
                # Add that to the list
                news_stories.append(story)  
                filenames.append(os.path.join(root, f))
                
        print('', end='\r')
        
    print("\nDetected {} stories".format(len(news_stories)))
    return news_stories, filenames
                
  
news_stories, filenames = read_data(os.path.join('data', 'bbc'))

# Printing some stats and sample data
print('{} words found in the total news set'.format(sum([len(story.split(' ')) for story in news_stories])))
print('Example words (start): ',news_stories[0][:50])
print('Example words (end): ',news_stories[-1][-50:])

Reading files


. 272.txt.. 127.txt... 370.txt.... 329.txt..... 240.txt...... 379.txt....... 339.txt........ 046.txt......... 140.txt.......... 349.txt........... 010.txt............ 352.txt............. 245.txt.............. 362.txt............... 166.txt................ 005.txt................. 092.txt.................. 354.txt................... 187.txt.................... 332.txt..................... 192.txt...................... 037.txt....................... 214.txt........................ 002.txt......................... 072.txt.......................... 202.txt........................... 269.txt............................ 322.txt............................. 144.txt.............................. 397.txt............................... 089.txt................................ 360.txt................................. 128.txt.................................. 225.txt................................... 337.txt.................................... 260.txt.........

865163 words found in the total news set
Example words (start):  Windows worm travels with Tetris  Users are being 
Example words (end):  is years at Stradey as "the best time of my life."


In [11]:
import pandas as pd

pd.Series([len(x.split(' ')) for x in news_stories]).describe(percentiles=[0.05, 0.95])

count    2225.000000
mean      388.837303
std       241.484273
min        91.000000
5%        164.200000
50%       336.000000
95%       736.800000
max      4489.000000
dtype: float64

In [46]:
batch_size = 4

news_elmo_embeddings = []

#InvalidArgumentError:  Incompatible shapes: [2,6,1] vs. [2,10,1024]
#	 [[node mul (defined at /home/thushv89/anaconda3/envs/packt.nlp.tf2/lib/python3.6/site-packages/tensorflow_hub/module_v2.py:106) ]] [Op:__inference_pruned_3391]

for i in range(0, len(news_stories), batch_size):
    print('.', end='')
    elmo_inputs = format_text_for_elmo(news_stories[i: min(i+batch_size, len(news_stories))], max_len=768)    
    elmo_result = elmo_layer(elmo_inputs)["default"]
    news_elmo_embeddings.append(elmo_result)
    
news_elmo_embeddings = np.concatenate(news_elmo_embeddings, axis=0)    

.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [47]:
os.makedirs('elmo_embeddings', exist_ok=True)

pd.DataFrame(
    news_elmo_embeddings, index=filenames
).to_pickle(
    os.path.join('elmo_embeddings', 'elmo_embeddings.pkl')
)

In [48]:
pd.read_pickle(os.path.join('elmo_embeddings', 'elmo_embeddings.pkl'))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
data/bbc/tech/272.txt,0.144291,0.015962,-0.151633,0.096555,-0.015913,0.075896,-0.033353,-0.118644,-0.192030,-0.124919,...,-0.418126,0.204618,0.007118,0.146415,0.010992,-0.005937,0.377529,-0.071118,0.302366,-0.099605
data/bbc/tech/127.txt,0.022870,-0.142899,-0.017096,-0.084165,0.320108,0.424914,-0.043930,0.257134,-0.215543,-0.046845,...,-0.219130,0.264653,0.020197,0.154055,0.091697,0.081720,0.279901,-0.111844,0.448175,0.007058
data/bbc/tech/370.txt,0.207623,0.058697,-0.008874,-0.088409,0.193419,0.046109,-0.107221,0.199647,-0.167632,0.003790,...,-0.054829,0.225892,0.052450,0.157943,-0.054407,0.171159,0.299693,0.078852,0.167330,-0.113994
data/bbc/tech/329.txt,0.022106,0.060943,-0.127390,-0.100214,0.184243,-0.077529,-0.157470,-0.042993,-0.204254,-0.021419,...,-0.337353,0.153419,0.052486,0.342915,0.268746,0.212994,0.665159,0.119243,0.474922,-0.215598
data/bbc/tech/240.txt,0.259128,-0.108082,0.076262,-0.080416,0.183988,0.329807,0.156697,0.495652,-0.104913,-0.120077,...,-0.218093,0.236378,0.076534,0.162548,0.025069,0.169282,0.229194,-0.025068,0.351246,0.069058
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
data/bbc/sport/156.txt,-0.129396,-0.318909,0.051668,-0.004754,0.278222,0.212088,0.019598,0.088313,-0.207423,0.119865,...,-0.136864,0.209640,-0.043528,-0.028529,0.012553,-0.093324,0.170397,0.002566,0.681819,-0.002700
data/bbc/sport/151.txt,-0.189324,-0.460702,0.184651,0.008714,0.262374,0.318553,0.081744,-0.092602,-0.151647,-0.034163,...,-0.139452,0.205881,-0.088978,-0.145100,-0.125726,-0.159489,0.113482,-0.020790,0.695474,0.062283
data/bbc/sport/042.txt,-0.081969,-0.074043,-0.004957,0.078592,0.142221,0.236764,0.083222,-0.033401,-0.140084,-0.310319,...,0.048195,0.119734,-0.381953,0.163369,0.149823,-0.129863,-0.020217,-0.316813,0.319503,0.091818
data/bbc/sport/194.txt,-0.270698,-0.454356,0.062119,0.009040,0.250537,0.327598,-0.133687,0.171560,-0.061138,0.241108,...,-0.302931,0.165477,-0.093189,-0.233209,-0.016422,0.026764,0.151837,0.066245,0.472599,0.031884
