In [2]:
import import_ipynb

import numpy as np
import dataset
import glove

In [3]:
# Dataset root folders handed to load_ftrs when this notebook runs as the main module.
root_directories = ["ADS16_Benchmark_part1", "ADS16_Benchmark_part2"]

In [7]:
def load_ftrs(root_directories, max_embedding_dim=50, words_per_ad=20, verbose=True):
    """Build a feature array with one row per ad text.

    Each row is the category one-hot vector of the ad followed by the
    flattened word embeddings of its text.

    Parameters
    ----------
    root_directories
        Passed unchanged to ``dataset.ad_category_iterator``.
    max_embedding_dim : int
        Forwarded to ``glove.load_vectors``.
    words_per_ad : int
        Forwarded to ``glove.embed`` as ``words_per_sample``.
    verbose : bool
        When true, print progress messages to stdout.

    Returns
    -------
    numpy.ndarray
        Per-category blocks stacked along axis 0, with the one-hot columns and
        the flattened embedding columns joined along the last axis.
        NOTE(review): a 2-D result assumes ``category_one_hot`` is 1-D -- confirm
        against ``dataset.ad_category_iterator``.

    Raises
    ------
    ValueError
        If ``dataset.ad_category_iterator`` yields no categories.
    """
    if verbose:
        print("Loading word embeddings...")
    emb_ftrs, emb_word_indices = glove.load_vectors(max_embedding_dim=max_embedding_dim, verbose=False)
    if verbose:
        print("Done loading word embeddings")

    # Materialise the iterator so the final category id is known for the progress display.
    ad_categories = list(dataset.ad_category_iterator(root_directories))
    if not ad_categories:
        # Fail with a clear message instead of an opaque IndexError on [-1].
        raise ValueError(f"No ad categories found under {root_directories!r}")
    last_category_id = ad_categories[-1][0]

    category_blocks = []
    embedding_blocks = []
    for category_id, category_one_hot, texts in ad_categories:
        n_texts = len(texts)

        # Repeat the category vector once per text (read-only view; copied on concatenate).
        one_hot = np.asarray(category_one_hot)
        category_blocks.append(np.broadcast_to(one_hot, (n_texts, *one_hot.shape)))

        embeddings = glove.embed(texts, emb_ftrs, emb_word_indices, words_per_sample=words_per_ad)
        # np.product was removed in NumPy 2.0; np.prod with an integer dtype also
        # keeps the width an int when there are no trailing axes.
        flat_width = int(np.prod(embeddings.shape[1:], dtype=np.int64))
        embedding_blocks.append(embeddings.reshape((n_texts, flat_width)))

        if verbose:
            # Carriage return keeps the progress counter on a single line.
            print(f"Loaded category {category_id}/{last_category_id}", end="\r")
    if verbose:
        print("")

    categories = np.concatenate(category_blocks, axis=0)
    text_vectors = np.concatenate(embedding_blocks, axis=0)

    return np.concatenate([categories, text_vectors], axis=-1)

# Only build the feature array when this code runs as the main module;
# importing it elsewhere defines load_ftrs without triggering the load.
if __name__ == "__main__":
    ftrs = load_ftrs(root_directories)

Loading word embeddings...
Done loading word embeddings
Loaded category 20/20
