# Prep Wikimedia Data for CF GoogleNews Embeddings

In [None]:
# Import packages.
import string
from io import BytesIO
from tensorflow.python.lib.io import file_io
import msgpack
import numpy as np
import pandas as pd
from gensim.models.word2vec import Word2VecKeyedVectors

In [16]:
# Load the two msgpack-serialized lookup tables, `word2id` and `id2word`.
# Each file is read fully into memory and wrapped in a BytesIO stream so that
# msgpack can consume it like a regular file object.
with BytesIO(file_io.read_file_to_string('GoogleNews_CF_word2id.bin', binary_mode=True)) as word2id_stream:
    word2id = msgpack.unpack(word2id_stream, raw=False)

with BytesIO(file_io.read_file_to_string('GoogleNews_CF_id2word.bin', binary_mode=True)) as id2word_stream:
    id2word = msgpack.unpack(id2word_stream, raw=False)

In [17]:
# Load the word embedding matrix from its .npy file.
with BytesIO(file_io.read_file_to_string('GoogleNews-CF-embeddings.npy', binary_mode=True)) as embeddings_stream:
    vocab = np.load(embeddings_stream)

# Record the number of rows now; it serves as a later sanity check that no
# rows were added and no indexes shifted, i.e. that the embeddings are
# still intact.
print('vocab size:', len(vocab))

# Row 0 is padding, so real vocabulary indexes start at 1.
word_embeddings_df = pd.DataFrame(vocab)
word_embeddings_df.head()

vocab size: 56536


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.052956,0.06546,0.066195,0.047072,0.052221,-0.082009,-0.061414,-0.116209,0.015629,0.099293,...,-0.127242,-0.066931,-0.060679,0.048911,0.046153,-0.035672,-0.044314,-0.035856,0.010895,-0.047072
2,-0.008512,-0.034224,0.032284,0.045868,-0.013143,-0.046221,-0.000948,-0.052219,0.046574,0.062451,...,-0.016318,0.00269,-0.059628,0.058923,0.005733,0.000345,0.013319,0.051513,-0.025227,0.017465
3,-0.012361,-0.02223,0.06554,0.039477,-0.08662,0.024913,-0.011163,-0.070522,0.092369,0.092752,...,-0.008863,-0.012265,-0.026254,-0.016193,-0.015235,0.050209,0.01581,0.00539,0.047909,-0.116515
4,0.01373,-0.030795,0.08387,-0.001946,-0.065557,0.095346,0.052906,-0.065769,0.02716,0.009564,...,-0.143228,-0.0139,-0.070325,0.095192,0.011897,0.07717,-0.004127,0.01682,0.083955,0.045543


In [18]:
# Decodes encoded text using the id2word dict.
def indexes_to_text(indexes, id2word_dict=None):
    """Decode a sequence of vocabulary indexes into a space-separated string.

    Args:
        indexes: Iterable of vocabulary indexes to decode.
        id2word_dict: Optional mapping from index to word. When omitted,
            the notebook-level ``id2word`` lookup table is used.

    Returns:
        The words for every index that was found, in input order, joined by
        single spaces. Indexes that have no entry (or whose entry is
        ``None``) are skipped and reported via ``print``.
    """
    if id2word_dict is None:
        # Fall back to the lookup table loaded earlier in the notebook.
        id2word_dict = id2word

    found_words = []
    not_found_indexes_list = []

    for index in indexes:
        # Look the index up once; the original checked `id2word` but then
        # appended from an undefined `id2word_dict`, raising NameError.
        word = id2word_dict.get(index)
        if word is not None:
            found_words.append(word)
        else:
            not_found_indexes_list.append(index)

    print('Indexes not found:', not_found_indexes_list)

    return ' '.join(found_words)

In [48]:
# Load the msgpack-serialized records holding the encoded features and labels.
with BytesIO(
    file_io.read_file_to_string(
        'wikimedia-personal-attacks-min-6-votes-GN-Encoded-data.bin',
        binary_mode=True,
    )
) as data_stream:
    data = msgpack.unpack(data_stream, raw=False)

In [49]:
# Fetch every record by position so docs and labels stay index-aligned.
num_docs = len(data)
records = [data[position] for position in range(num_docs)]

# Each record's 'idx' entry is a nested sequence; flatten it into a single
# flat list per document.
docs = [
    [item for sublist in record['idx'] for item in sublist]
    for record in records
]

# Labels, stored at the same position as their document.
labels = [record['label'] for record in records]

# Number of words in the longest document (0 when there are no documents).
max_words = max(map(len, docs), default=0)

In [51]:
# Number of documents collected in `docs`.
len(docs)

115841

In [None]:
# Also create a plain-text version of the GN vectors for retrofitting, etc.
