In [1]:
__author__ = "Jon Ball"
__version__ = "Summer 2022"

Code adapted from Muhammad Haseeb Khan and Adji Dieng's sample script:
https://github.com/adjidieng/ETM/blob/master/scripts/data_nyt.py

In [2]:
!python --version

Python 3.6.7


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from string import punctuation
from scipy import sparse

import numpy as np 
import pickle
import random
import math
import sys
import os

In [4]:
# Set seed for reproducibility
# This fixes NumPy's global RNG, which np.random.permutation draws from when
# the documents are shuffled for the train/test/valid split.
np.random.seed(42)

In [5]:
# Path for saving vocab, embeddings, etc.
save_path = os.path.join(os.getcwd(), "soced_min_df_2")
# Create the output directory up front: the open(..., "wb"), np.save and
# sparse.save_npz calls that write into it do not create missing parent
# directories and would otherwise fail on a fresh checkout.
os.makedirs(save_path, exist_ok=True)

In [6]:
# Extra stop-word candidates: every punctuation character, plus the numerals
# 0-999 as strings and the zero-padded forms "00" and "000"
punct = list(punctuation)
digits = list(map(str, range(1000)))
digits.extend(["00", "000"])

Stop word list (`stops.txt`) from the ETM repository: https://github.com/adjidieng/ETM/blob/master/scripts/stops.txt

In [7]:
# Read one stop word per line, then append the punctuation characters and digits
with open("stops.txt", "r") as stop_file:
    stops = [entry.rstrip() for entry in stop_file]
    # `punctuation` is a string, so extending adds each character separately
    stops.extend(punctuation)
    stops.extend(digits)

https://github.com/adjidieng/ETM/blob/master/scripts/data_nyt.py

In [8]:
# Maximum / minimum document frequency
# Both values are handed to TfidfVectorizer to prune the vocabulary. Under
# scikit-learn's convention a float is a proportion of documents and an int is
# an absolute document count.
max_df = 0.7
min_df = 2 # min_df set low due to small sample size

In [9]:
# Load the corpus: one lower-cased, right-stripped document per input line
soced_inputs = os.path.join("eric_data", "soced_etm_inputs.txt")
with open(soced_inputs, "r") as corpus_file:
    docs = [raw.lower().rstrip() for raw in corpus_file]
print(f"""{len(docs)} docs loaded for Soc of Ed. 
Each doc is a title or sentence drawn from an article written by a sociologist of education.""")

5722 docs loaded for Soc of Ed. 
Each doc is a title or sentence drawn from an article written by a sociologist of education.


In [10]:
# Sanity check: show the first loaded document
print(docs[0])

gender differences in context: the impact of track position on study involvement in flemish secondary education.


In [11]:
# Fit a TF-IDF vectorizer on the corpus, dropping stop words and terms outside
# the document-frequency bounds
vectorizer_kwargs = {"min_df": min_df, "max_df": max_df, "stop_words": stops}
tfidf_vec = TfidfVectorizer(**vectorizer_kwargs)
tfidf_matrix = tfidf_vec.fit_transform(docs)

In [12]:
# Get vocabulary
# scikit-learn deprecated get_feature_names() in 1.0 and removed it in 1.2 in
# favour of get_feature_names_out(). Prefer the new accessor when it exists so
# the notebook runs on both old and current releases; .tolist() keeps `vocab`
# a plain list of strings either way.
if hasattr(tfidf_vec, "get_feature_names_out"):
    vocab = tfidf_vec.get_feature_names_out().tolist()
else:
    vocab = tfidf_vec.get_feature_names()
print(f"{len(vocab)} word types in the Soc of Ed text sample.")
print(f"  Initial vocabulary size: {len(vocab)}")

4364 word types in the Soc of Ed text sample.
  Initial vocabulary size: 4364


In [13]:
# Split in train/test/valid
print("Splitting documents into train/test/valid...")
n_docs = tfidf_matrix.shape[0]
# 85% train and 10% test (both rounded down); validation takes the remainder
trainSize, testSize = (int(np.floor(share * n_docs)) for share in (0.85, 0.10))
valSize = int(n_docs - (trainSize + testSize))
# Random ordering of document positions; the splits index into this permutation
idx_permute = np.random.permutation(n_docs).astype(int)

Splitting documents into train/test/valid...


In [14]:
# Positions into idx_permute for each split: consecutive, non-overlapping ranges
train_end = trainSize
test_end = train_end + testSize
val_end = test_end + valSize
train_idx = range(0, train_end)
test_idx = range(train_end, test_end)
val_idx = range(test_end, val_end)

In [15]:
# Remove word types not in train data and map vocab to indices
# Membership is tested against a set: `w in vocab` on the list would rescan the
# whole vocabulary for every token of every training document.
candidate_types = set(vocab)
vocab = sorted({
    w
    for idx in train_idx
    for w in word_tokenize(docs[idx_permute[idx]])
    if w in candidate_types
})
# Two lookup tables over the sorted vocabulary: word -> row index and back
word2id = {w: j for j, w in enumerate(vocab)}
id2word = {j: w for j, w in enumerate(vocab)}
print(f"   Vocabulary after removing words not in train data: {len(vocab)}")

   Vocabulary after removing words not in train data: 4288


In [16]:
# Persist the vocabulary and both lookup tables as pickles under save_path,
# in this order: vocab, word-to-index mapping, index-to-word mapping
for pkl_name, payload in (
    ("vocab.pkl", vocab),
    ("word2id.pkl", word2id),
    ("id2word.pkl", id2word),
):
    with open(os.path.join(save_path, pkl_name), "wb") as outfile:
        pickle.dump(payload, outfile)

In [17]:
def _encode_split(split_idx):
    """Tokenize each document of a split and map its in-vocabulary tokens to ids.

    `split_idx` holds positions into `idx_permute`; tokens missing from
    `word2id` are dropped, so a document may come back as an empty list.
    """
    encoded = []
    for idx in split_idx:
        tokens = word_tokenize(docs[idx_permute[idx]])
        encoded.append([word2id[w] for w in tokens if w in word2id])
    return encoded

docs_train = _encode_split(train_idx)
docs_test = _encode_split(test_idx)
docs_val = _encode_split(val_idx)
# The raw text is no longer needed once every split is encoded
del docs

In [18]:
# Report how many documents landed in each split next to the size computed
# for that split, so a mismatch is visible at a glance.
print(f"   Number of documents (train): {len(docs_train)} [this should be equal to {trainSize}]")
print(f"   Number of documents (test): {len(docs_test)} [this should be equal to {testSize}]")
print(f"   Number of documents (valid): {len(docs_val)} [this should be equal to {valSize}]")

   Number of documents (train): 4863 [this should be equal to 4863]
   Number of documents (test): 572 [this should be equal to 572]
   Number of documents (valid): 287 [this should be equal to 287]


In [19]:
# Remove empty documents
print("Removing empty documents...")

def remove_empty(in_docs):
    """Return, in order, the documents of `in_docs` that are not the empty list."""
    kept = []
    for doc in in_docs:
        if doc != []:
            kept.append(doc)
    return kept

# Drop documents left with no in-vocabulary tokens from every split
docs_train, docs_test, docs_val = (
    remove_empty(split) for split in (docs_train, docs_test, docs_val)
)

Removing empty documents...


In [20]:
# Remove test documents with length=1
# (keeps only test documents holding at least two tokens)
docs_test = list(filter(lambda doc: len(doc) > 1, docs_test))

In [21]:
# Split test set in 2 halves
print("Splitting test documents in 2 halves...")
# Cut each document at len // 2: for odd lengths the second half gets the extra token
docs_test_h1 = [doc[:len(doc) // 2] for doc in docs_test]
docs_test_h2 = [doc[len(doc) // 2:] for doc in docs_test]

Splitting test documents in 2 halves...


Added step: load embeddings and map to indices

In [22]:
%%time
# Load pre-trained ERIC word embeddings, map to index, and store in matrix format
# Row i holds the vector for vocab[i]; words without a pre-trained vector keep
# their all-zero row.
embeddings = np.zeros((len(vocab), 300)) # Vocab size x embedding size

with open(os.path.join("eric_data", "eric_embeds_50.txt"), "r") as infile:
    # Stream the file line by line instead of readlines() plus a dict of every
    # vector: only rows for in-vocabulary words are ever parsed or kept.
    for line in infile:
        e = line.split()
        # Skip blank lines (e[0] would raise IndexError) and words outside the
        # vocabulary. word2id has exactly the vocab words as keys, so this is
        # the same test as `in vocab` without scanning the list each time.
        if not e or e[0] not in word2id:
            continue
        # First field is the word, the remaining fields are its vector
        embeddings[word2id[e[0]]] = np.array(e[1:], dtype=float)

with open(os.path.join(save_path, "embeddings.npy"), "wb") as outfile:
    np.save(outfile, embeddings)

# Free the matrix once it is on disk
del embeddings

CPU times: user 38.4 s, sys: 2.26 s, total: 40.7 s
Wall time: 41.1 s


In [23]:
# Getting lists of words and doc_indices
print("Creating lists of words...")

def create_list_words(in_docs):
    """Flatten a list of documents into one list of their items, in document order."""
    flat = []
    for doc in in_docs:
        flat.extend(doc)
    return flat

# Flatten every split (and both test halves) into a single list of word ids
words_train, words_test, words_test_h1, words_test_h2, words_val = map(
    create_list_words,
    (docs_train, docs_test, docs_test_h1, docs_test_h2, docs_val),
)

print("  len(words_train): ", len(words_train))
print("  len(words_test): ", len(words_test))
print("  len(words_test_h1): ", len(words_test_h1))
print("  len(words_test_h2): ", len(words_test_h2))
print("  len(words_val): ", len(words_val))

Creating lists of words...
  len(words_train):  55782
  len(words_test):  6404
  len(words_test_h1):  3067
  len(words_test_h2):  3337
  len(words_val):  3252


In [24]:
# Get doc indices
print("Getting doc indices...")

def create_doc_indices(in_docs):
    """Return one document index per item: index j repeated len(in_docs[j]) times."""
    indices = []
    for doc_id, doc in enumerate(in_docs):
        indices.extend([int(doc_id)] * len(doc))
    return indices

# Build the per-token document index list for every split and both test halves
(
    doc_indices_train,
    doc_indices_test,
    doc_indices_test_h1,
    doc_indices_test_h2,
    doc_indices_val,
) = map(
    create_doc_indices,
    (docs_train, docs_test, docs_test_h1, docs_test_h2, docs_val),
)

print("  len(np.unique(doc_indices_train)): {} [this should be {}]".format(len(np.unique(doc_indices_train)), len(docs_train)))
print("  len(np.unique(doc_indices_test)): {} [this should be {}]".format(len(np.unique(doc_indices_test)), len(docs_test)))
print("  len(np.unique(doc_indices_test_h1)): {} [this should be {}]".format(len(np.unique(doc_indices_test_h1)), len(docs_test_h1)))
print("  len(np.unique(doc_indices_test_h2)): {} [this should be {}]".format(len(np.unique(doc_indices_test_h2)), len(docs_test_h2)))
print("  len(np.unique(doc_indices_val)): {} [this should be {}]".format(len(np.unique(doc_indices_val)), len(docs_val)))

Getting doc indices...
  len(np.unique(doc_indices_train)): 4854 [this should be 4854]
  len(np.unique(doc_indices_test)): 565 [this should be 565]
  len(np.unique(doc_indices_test_h1)): 565 [this should be 565]
  len(np.unique(doc_indices_test_h2)): 565 [this should be 565]
  len(np.unique(doc_indices_val)): 286 [this should be 286]


In [25]:
# Number of documents in each set (kept because the document lists are deleted next)
n_docs_train, n_docs_test, n_docs_val = len(docs_train), len(docs_test), len(docs_val)
n_docs_test_h1, n_docs_test_h2 = len(docs_test_h1), len(docs_test_h2)

In [26]:
# Remove unused variables: the flattened word / doc-index lists replace them
del docs_train, docs_test, docs_test_h1, docs_test_h2, docs_val

In [27]:
# Create bow representation
print("Creating bow representation...")

def create_bow(doc_indices, words, n_docs, vocab_size):
    """Build a sparse document-by-word count matrix.

    Parameters
    ----------
    doc_indices : sequence of int
        Row (document) index of every token.
    words : sequence of int
        Column (word id) of every token, aligned with `doc_indices`.
    n_docs : int
        Number of rows of the result.
    vocab_size : int
        Number of columns of the result.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of shape (n_docs, vocab_size). Repeated (document, word) pairs
        are summed in the COO -> CSR conversion, so each entry is a count.
    """
    # Use the vocab_size argument for the column count; it was previously
    # ignored in favour of the global `vocab`.
    return sparse.coo_matrix(
        ([1] * len(doc_indices), (doc_indices, words)),
        shape=(n_docs, vocab_size),
    ).tocsr()

# One bag-of-words matrix per split and per test half, all sharing the same
# number of columns
n_vocab = len(vocab)
bow_train = create_bow(doc_indices_train, words_train, n_docs_train, n_vocab)
bow_test = create_bow(doc_indices_test, words_test, n_docs_test, n_vocab)
bow_test_h1 = create_bow(doc_indices_test_h1, words_test_h1, n_docs_test_h1, n_vocab)
bow_test_h2 = create_bow(doc_indices_test_h2, words_test_h2, n_docs_test_h2, n_vocab)
bow_val = create_bow(doc_indices_val, words_val, n_docs_val, n_vocab)

# The flat token / document-index lists are no longer needed
del words_train, words_test, words_test_h1, words_test_h2, words_val
del doc_indices_train, doc_indices_test, doc_indices_test_h1, doc_indices_test_h2, doc_indices_val

Creating bow representation...


In [28]:
# Save bow matrices
# Each matrix is written as a compressed .npz file under save_path.
# Train
sparse.save_npz(os.path.join(save_path, "bow_train.npz"), bow_train)
# Test
sparse.save_npz(os.path.join(save_path, "bow_test.npz"), bow_test)
# Test split 1
sparse.save_npz(os.path.join(save_path, "bow_test_h1.npz"), bow_test_h1)
# Test split 2
sparse.save_npz(os.path.join(save_path, "bow_test_h2.npz"), bow_test_h2)
# Val
# Spell out the .npz suffix like the other four files. scipy appends it to a
# bare name anyway, so the file written to disk is unchanged.
sparse.save_npz(os.path.join(save_path, "bow_val.npz"), bow_val)

print("Data ready !!")
print("*************")

Data ready !!
*************
