# Neural Topic Modelling vs LDA

In [1]:
!pip install nltk

[33mYou are using pip version 10.0.1, however version 20.0.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [45]:
import os
import io
import re
import nltk
import time
import boto3
import shutil
import sagemaker
import numpy as np
import scipy.sparse as sparse
import sagemaker.amazon.common as smac
from pprint import pprint
from sagemaker.session import s3_input
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.predictor import csv_serializer, json_deserializer
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner

nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Import datasets

In [3]:
!mkdir -p datasets

In [4]:
#Wiki datasets
!curl -O https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip
!unzip wikitext-2-v1.zip -d datasets

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 4370k  100 4370k    0     0  18.4M      0 --:--:-- --:--:-- --:--:-- 18.4M
Archive:  wikitext-2-v1.zip
replace datasets/wikitext-2/wiki.test.tokens? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


## Parse data

In [8]:
def is_document_start(line):
    if len(line) < 4:
        return False
    if line[0] is '=' and line[-1] is '=':
        if line[2] is not '=':
            return True
        else:
            return False
    else:
        return False


def token_list_per_doc(input_dir, token_file):
    lines_list = []
    line_prev = ''
    prev_line_start_doc = False
    with open(os.path.join(input_dir, token_file), 'r', encoding='utf-8') as f:
        for l in f:
            line = l.strip()
            if prev_line_start_doc and line:
                # the previous line should not have been start of a document!
                lines_list.pop()
                lines_list[-1] = lines_list[-1] + ' ' + line_prev

            if line:
                if is_document_start(line) and not line_prev:
                    lines_list.append(line)
                    prev_line_start_doc = True
                else:
                    lines_list[-1] = lines_list[-1] + ' ' + line
                    prev_line_start_doc = False
            else:
                prev_line_start_doc = False
            line_prev = line

    print("{} documents parsed!".format(len(lines_list)))
    return lines_list

In [9]:
input_dir = 'datasets/wikitext-2'
train_file = 'wiki.train.tokens'
val_file = 'wiki.valid.tokens'
test_file = 'wiki.test.tokens'

train_doc_list = token_list_per_doc(input_dir, train_file)
val_doc_list = token_list_per_doc(input_dir, val_file)
test_doc_list = token_list_per_doc(input_dir, test_file)

600 documents parsed!
60 documents parsed!
60 documents parsed!


## Data Sanitisation

In [10]:
def lemmatize(doc):
    wnl = WordNetLemmatizer()
    token_pattern = re.compile(r"(?u)\b\w\w+\b")

    return [
        wnl.lemmatize(word) 
        for word in doc.split() 
        if len(word) >= 2 
        and re.match("[a-z].*", word)         
        and re.match(token_pattern, word)
    ]

In [11]:
vectorizer = CountVectorizer(
    input='content', 
    analyzer='word', #
    stop_words='english', #remove words that add no value
    tokenizer=lemmatize, #pass function reference to lemmatizer
    
    #Consider tuning these two to your specific use case
    max_df=0.90, #Remove terms that appear in more than 90% of documents
    min_df=3 #Remove words that appear in less than 3 documents
)

vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.9, max_features=None, min_df=3,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function lemmatize at 0x7fdff59a3bf8>, vocabulary=None)

In [12]:
#Apply the lemmatisation / vectorisation

#fit_transform learns the vocab dictionary of all tokens, and returns the term-document matrix
#transform simply returns the term-document matrix
train_vectors = vectorizer.fit_transform(train_doc_list)

test_vectors = vectorizer.transform(test_doc_list)
val_vectors = vectorizer.transform(val_doc_list)

  'stop_words.' % sorted(inconsistent))


In [13]:
vocab_list = vectorizer.get_feature_names()
vocab_size = len(vocab_list)
vocab_size

16650

In [14]:
#Shuffle the matrices
def shuffle_csr_matrix(matrix):
    indices = np.arange(matrix.shape[0])
    np.random.shuffle(indices)
    matrix = matrix[indices]
    return matrix
    

train_vectors = shuffle_csr_matrix(train_vectors)
test_vectors = shuffle_csr_matrix(test_vectors)
val_vectors = shuffle_csr_matrix(val_vectors)

In [15]:
#NTM expects float32 inputs
#Convert the entries in each matrix to float32
train_vectors = sparse.csr_matrix(train_vectors, dtype=np.float32)
test_vectors = sparse.csr_matrix(test_vectors, dtype=np.float32)
val_vectors = sparse.csr_matrix(val_vectors, dtype=np.float32)

In [16]:
#convert the data to RecordIO protobuf
def convert_to_protobuf(sparray, prefix):
    
    buf = io.BytesIO()
    smac.write_spmatrix_to_sparse_tensor(array=sparray[:], file=buf, labels=None)
    buf.seek(0)

    #fname = os.path.join(prefix, fname_template.format(i))
    with open(f'data/{prefix}_data', 'wb') as f:
        f.write(buf.getvalue())
    print('Saved data to {}'.format(f'data/{prefix}_data'))

In [17]:
!mkdir -p data

In [18]:
convert_to_protobuf(train_vectors, prefix='train')
convert_to_protobuf(test_vectors, prefix='test')
convert_to_protobuf(val_vectors, prefix='validation')

Saved data to data/train_data
Saved data to data/test_data
Saved data to data/validation_data


In [19]:
!ls -lah data/

total 2.5M
drwxrwxr-x  2 ec2-user ec2-user 4.0K Feb 19 23:23 .
drwxrwxrwx 11 ec2-user ec2-user 4.0K Mar  2 01:09 ..
-rw-rw-r--  1 ec2-user ec2-user 213K Mar  2 01:10 test_data
-rw-rw-r--  1 ec2-user ec2-user 2.1M Mar  2 01:10 train_data
-rw-rw-r--  1 ec2-user ec2-user 205K Mar  2 01:10 validation_data


## Create the vocab file

In [20]:
#Create the vocab auxilliary file
!mkdir -p auxilliary
with open('auxilliary/vocab.txt', 'w', encoding='utf-8') as f:
    for item in vocab_list:
        f.write(item+'\n')

## Write the data files and vocab file to S3

In [21]:
account_id = boto3.client('sts').get_caller_identity()["Account"]
region = boto3.session.Session().region_name
bucket_name = f"neural-topic-modelling-{account_id}" #Generate a unique bucket name

boto3.client('s3', region_name=region).create_bucket(
    Bucket=bucket_name, 
    CreateBucketConfiguration={'LocationConstraint': region}
)

BucketAlreadyOwnedByYou: An error occurred (BucketAlreadyOwnedByYou) when calling the CreateBucket operation: Your previous request to create the named bucket succeeded and you already own it.

In [22]:
#Copy data to s3
s3_training_data_loc = f's3://{bucket_name}/train/train_data'
s3_testing_data_loc = f's3://{bucket_name}/test/test_data'
s3_validation_data_loc = f's3://{bucket_name}/validation/validation_data'
s3_vocab_data_loc = f's3://{bucket_name}/auxilliary/vocab.txt'


!aws s3 cp data/train_data s3://$bucket_name/train/train_data
!aws s3 cp data/test_data s3://$bucket_name/test/test_data
!aws s3 cp data/validation_data s3://$bucket_name/validation/validation_data
!aws s3 cp auxilliary/vocab.txt s3://$bucket_name/auxilliary/vocab.txt

upload: data/train_data to s3://neural-topic-modelling-817322365495/train/train_data
upload: data/test_data to s3://neural-topic-modelling-817322365495/test/test_data
upload: data/validation_data to s3://neural-topic-modelling-817322365495/validation/validation_data
upload: auxilliary/vocab.txt to s3://neural-topic-modelling-817322365495/auxilliary/vocab.txt


# Train the model

In [23]:
session = sagemaker.Session()
role = sagemaker.get_execution_role()
container = get_image_uri(boto3.Session().region_name, 'ntm')
ntm = sagemaker.estimator.Estimator(
    container, #Use the pre-built NTM container
    role,
    train_instance_count=1, 
    train_instance_type='ml.c4.xlarge', #'ml.p3.2xlarge', 
    output_path=f's3://{bucket_name}/models/model',
    sagemaker_session=session
)

In [24]:
num_topics = 20
ntm.set_hyperparameters(num_topics=num_topics, feature_dim=vocab_size, mini_batch_size=60, 
                        epochs=50, sub_sample=0.7)

In [30]:
s3_train = s3_input(s3_training_data_loc, distribution='ShardedByS3Key',
                    content_type='application/x-recordio-protobuf')
s3_test = s3_input(s3_testing_data_loc, distribution='FullyReplicated',
                  content_type='application/x-recordio-protobuf')
s3_val = s3_input(s3_validation_data_loc, distribution='FullyReplicated',
                  content_type='application/x-recordio-protobuf')
s3_vocab = s3_input(s3_vocab_data_loc, distribution='FullyReplicated', content_type='text/plain')

In [31]:
model = ntm.fit({'train': s3_train, 'validation': s3_val, 'auxiliary': s3_vocab, 'test': s3_test})

2020-03-02 01:14:53 Starting - Starting the training job...
2020-03-02 01:14:54 Starting - Launching requested ML instances......
2020-03-02 01:16:18 Starting - Preparing the instances for training......
2020-03-02 01:16:55 Downloading - Downloading input data...
2020-03-02 01:17:52 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
  from numpy.testing import nosetester[0m
[34m[03/02/2020 01:17:54 INFO 140117196126016] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/default-input.json: {u'num_patience_epochs': u'3', u'clip_gradient': u'Inf', u'encoder_layers': u'auto', u'optimizer': u'adadelta', u'_kvstore': u'auto_gpu', u'rescale_gradient': u'1.0', u'_tuning_objective_metric': u'', u'_num_gpus': u'auto', u'learning_rate': u'0.01', u'_data_format': u'record', u'sub_sample': u'1.0', u'epochs': u'50', u'weight_decay': u'0.0', u'_num_kv_servers': u'auto', u'encoder_layers_act

In [32]:
print('Training job name: {}'.format(ntm.latest_training_job.job_name))

Training job name: ntm-2020-03-02-01-14-53-592


# Deploy the model

In [33]:
predictor = ntm.deploy(
    instance_type='ml.m5.xlarge', 
    initial_instance_count=1,
    endpoint_name="neural-topic-modelling"
)

-----------!

# Run inference

In [64]:
predictor.content_type = 'text/csv'
predictor.serializer = csv_serializer
predictor.deserializer = json_deserializer

In [84]:
results = predictor.predict(
    val_vectors.getrow(0).toarray()[0]
)

In [85]:
predictions = np.array([prediction['topic_weights'] for prediction in results['predictions']])

In [86]:
#Each entry in this array represents a computed topic
#note that len(predictions) = num_topics
predictions

array([[0.04545095, 0.09137032, 0.02837488, 0.0354532 , 0.01577244,
        0.06526145, 0.08169498, 0.01276142, 0.05082734, 0.0359521 ,
        0.09134295, 0.00839241, 0.0545993 , 0.0449562 , 0.04433573,
        0.08488413, 0.10063872, 0.02832018, 0.04672164, 0.03288965]])

# Delete the endpoint

In [None]:
sagemaker.Session().delete_endpoint(predictor.endpoint)