In [11]:
# Configuration for creating embeddings of the wikipedia texts
# Note: Computing embeddings for 11,020x2 texts took 10,822 seconds on the
#       EML4U experiment server, i.e. around 3,600 (= 60*60) text-pairs per hour.

# Directory of the current script
baseDir = "/home/eml4u/EML4U/notebooks/wikipedia-embeddings"

# Title part of the file IDs; keep exactly one assignment active
#title = "american-films"
#title = "british-films"
title = "indian-films"
#title = "living-people"

# Date stamps of the two points in time (A and B)
dateA = "20100408"
dateB = "20201101"

# File IDs (for input and output) have the form <date>-<title>
idA = "-".join([dateA, title])
idB = "-".join([dateB, title])

# Input directories: one sub-directory per file ID below the corpus directory
corpusDir = "/home/eml4u/EML4U/data/corpus/2021-02-10-wikipedia-texts/"
dataDirA = corpusDir + idA + "/"
dataDirB = corpusDir + idB + "/"

# Output files: one embeddings file per file ID, plus one file named after the title
outDir = "/home/eml4u/EML4U/data/wikipedia-embeddings/"
fileEmbeddingsA = outDir + idA + ".txt"
fileEmbeddingsB = outDir + idB + ".txt"
fileIds = outDir + title + ".txt"

# Show the resolved paths, one per line
print(dataDirA, dataDirB, fileEmbeddingsA, fileEmbeddingsB, fileIds, sep="\n")

/home/eml4u/EML4U/data/corpus/2021-02-10-wikipedia-texts/20100408-british-films/
/home/eml4u/EML4U/data/corpus/2021-02-10-wikipedia-texts/20201101-british-films/
/home/eml4u/EML4U/data/wikipedia-embeddings/20100408-british-films.txt
/home/eml4u/EML4U/data/wikipedia-embeddings/20201101-british-films.txt
/home/eml4u/EML4U/data/wikipedia-embeddings/british-films.txt


In [12]:
# Get file paths
# glob.glob returns its matches in arbitrary (filesystem-dependent) order.
# Both lists are sorted so that the order is reproducible between runs and so
# that equally named files of A and B end up at the same list index. Within
# one list all paths share the same directory prefix, so sorting the full
# paths orders them by filename.
import glob
filesA = sorted(glob.glob(dataDirA + '*.txt'))
filesB = sorted(glob.glob(dataDirB + '*.txt'))

In [13]:
# Development switches (both disabled)
limitFiles = False  # True: keep only the first 20 file paths of A and of B
printFiles = False  # True: print all paths of A, an empty line, then all paths of B

# Limit number of file paths
if limitFiles:
    filesA, filesB = filesA[:20], filesB[:20]

# Print file paths, one per line
if printFiles:
    print(*filesA, sep='\n')
    print()
    print(*filesB, sep='\n')

In [14]:
# Read files
def readTexts(paths):
    """Return the complete text content of every file in paths.

    The returned list has one string per path, in the same order as paths.
    Each file is opened in a ``with`` block, so its handle is closed right
    after reading, also when reading raises an exception.
    """
    texts = []
    for path in paths:
        # NOTE(review): the platform default text encoding is used (unchanged
        # from before) - confirm that the corpus files are stored in it.
        with open(path, "r") as fileobject:
            texts.append(fileobject.read())
    return texts

textsA = readTexts(filesA)
textsB = readTexts(filesB)

In [15]:
# Print the number of texts read for A and for B
for label, count in (("len(textsA):", len(textsA)), ("len(textsB):", len(textsB))):
    print(label, count)

# Development switch (disabled): print the first text of A and of B
printFirstTexts = False
if printFirstTexts:
    print(textsA[0], textsB[0], sep="\n")

len(textsA): 2147
len(textsB): 2147


In [16]:
# Ensure similar filenames in both points of time
# ntpath.basename treats both '/' and '\' as path separators.
import ntpath

# The filenames of A identify the text pairs; they are written to disk later on.
filenames = [ntpath.basename(path) for path in filesA]

# Lists of different length cannot be paired index by index. Report this
# explicitly instead of failing with an IndexError (B shorter than A) or
# silently ignoring the surplus files (B longer than A).
if len(filesA) != len(filesB):
    print("Different number of files:", len(filesA), len(filesB))

# Report every index at which A and B contain differently named files
for x, (nameA, nameB) in enumerate(zip(filenames, map(ntpath.basename, filesB))):
    if nameA != nameB:
        print(x, nameA, nameB)
print("len(filenames):", len(filenames))

len(filenames): 2147


In [7]:
# Prepare embeddings
import os
import sys

# Extend the module search path by the script directory so that the
# `embedding` module can be imported (presumably located there - verify)
moduleDir = os.path.abspath(baseDir)
sys.path.append(moduleDir)
from embedding import BertHuggingface

# Number of classes handed to the model; according to the original note this
# value is irrelevant if you don't want to retrain
NUM_CLASSES = 8
bert = BertHuggingface(NUM_CLASSES)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [17]:
# Create embeddings for the texts of A and B and report the wall-clock runtime
import time

print(time.asctime())  # timestamp of the start of the computation
startTime = time.time()

embeddingsA = bert.embed(textsA)
embeddingsB = bert.embed(textsB)

elapsedSeconds = time.time() - startTime
print("Runtime: %s seconds" % elapsedSeconds)

# Shapes of the results
for label, embeddings in (("embeddingsA.shape:", embeddingsA), ("embeddingsB.shape:", embeddingsB)):
    print(label, embeddings.shape)

Wed Apr  7 13:31:22 2021
None
Runtime: 2056.714539527893 seconds
embeddingsA.shape: (2147, 768)
embeddingsB.shape: (2147, 768)


In [18]:
# Write embeddings/arrays to files
# Show the target paths first, one per line
print(fileEmbeddingsA, fileEmbeddingsB, fileIds, sep="\n")

import numpy

# One text file per embeddings array
for targetFile, embeddings in ((fileEmbeddingsA, embeddingsA), (fileEmbeddingsB, embeddingsB)):
    numpy.savetxt(targetFile, embeddings)

# The filenames go into a separate file: one name per line, no trailing newline
with open(fileIds, "w") as idsFile:
    idsFile.write("\n".join(filenames))

/home/eml4u/EML4U/data/wikipedia-embeddings/20100408-british-films.txt
/home/eml4u/EML4U/data/wikipedia-embeddings/20201101-british-films.txt
/home/eml4u/EML4U/data/wikipedia-embeddings/british-films.txt


In [19]:
# Check: Load arrays and filenames again and compare them with the in-memory data
checkSavedFiles = True  # set to False to skip the check
if checkSavedFiles:
    loadedA = numpy.loadtxt(fileEmbeddingsA)
    loadedB = numpy.loadtxt(fileEmbeddingsB)
    with open(fileIds) as idsFile:
        loadedFilenames = idsFile.read().splitlines()

    # Prints True for every pair whose loaded data equals the original data
    for original, loaded in ((embeddingsA, loadedA), (embeddingsB, loadedB), (filenames, loadedFilenames)):
        print(numpy.array_equal(original, loaded))

    # Types of the original and the loaded objects
    for obj in (embeddingsA, loadedA, loadedFilenames):
        print(type(obj))

True
True
True
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'list'>
