# Embeddings reader

- Get mapping from original number to raw ID
- Get splits
- Get embeddings of splits

In [36]:
# Reload modules every time before executing the Python code typed
%load_ext autoreload
%autoreload 2

# Import from vm notebook dir
import sys; sys.path.insert(0, '../../')

import os
from datetime import datetime

from ExplainingDriftTextEmbeddings.access.file_storage import FileStorage
from ExplainingDriftTextEmbeddings.access.amazon_pickle_reader import AmazonPickleReader
from ExplainingDriftTextEmbeddings.access.interim_storage import InterimStorage

from BenchTest.amore.amazon_reviews_reader import AmazonReviewsReader

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Storage helper; used below to resolve a file ID to its path via get_filepath()
file_storage = FileStorage()

## Create reader

In [3]:
# Create AmazonPickleReader
amazon_raw_id = 'amazon_raw.pickle'

# Resolve the path once and reuse it for the printout and the dirname lookup
amazon_raw_filepath = file_storage.get_filepath(amazon_raw_id)
print(amazon_raw_filepath)

# The reader is given the directory that contains the raw pickle file
amazon_directory = os.path.dirname(amazon_raw_filepath)
print(amazon_directory)

amazon_reader = AmazonPickleReader(amazon_directory)

# First call: Initialize mapping
#amazon_reader.get_raw_id(1)

/home/eml4u/EML4U/notebooks/ExplainingDriftTextEmbeddings/data/clustering/amazon_raw.pickle
/home/eml4u/EML4U/notebooks/ExplainingDriftTextEmbeddings/data/clustering


## Get mapping from original number to raw ID

In [4]:
interim_storage = InterimStorage('amazon_originalno_to_rawid')

# Toggle: True rebuilds the mapping through the reader and writes it to the
# interim storage; False loads the mapping previously written there.
rebuild_mapping = False

if not rebuild_mapping:
    # Use cache
    originalno_to_rawid = interim_storage.read()

else:
    # Caching
    originalno_to_rawid = {
        number: amazon_reader.get_raw_id(number)
        for number in amazon_reader.originalno_to_rawid.keys()
    }
    interim_storage.write(originalno_to_rawid)

    interim_storage.get_filepath()
    # '/tmp/InterimStorage/amazon_originalno_to_rawid.pickle.bz2'

print(len(originalno_to_rawid))
# 7911684 -> all

7911684


## Get splits

In [27]:
# Interim-storage ID of the stored splits; passed to print_splits() below
split_file_id = 'AMORE-IDs'

In [28]:
# Swap to current root
import sys; sys.path.insert(0, '../../BenchTest')
from amore.reviews import Reviews
import pprint

In [48]:
def print_splits(file_id):
    """Read the splits stored under ``file_id``, print an overview, return them.

    Output is produced in two passes: first one line per split with its
    position, then one line per split showing its first five review IDs.

    Args:
        file_id: Identifier handed to ``InterimStorage`` to locate the splits.

    Returns:
        The object returned by ``InterimStorage(file_id).read()``.
    """
    loaded_splits = InterimStorage(file_id).read()
    # For a complete dump use: pprint.pprint(loaded_splits)

    # Pass 1: position and string representation of every split
    position = 0
    for current in loaded_splits:
        print(position, current)
        position += 1

    # Pass 2: preview of the review IDs of every split
    for current in loaded_splits:
        first_ids = current.get_review_ids()[:5]
        print(first_ids, '...')

    return loaded_splits

In [49]:
# Load the splits and print an overview; later cells index into `splits`
splits = print_splits(split_file_id)

0 Split(AMORE1,A,10000)
1 Split(AMORE1,B,9901)
2 Split(AMORE1,B,99)
3 Split(TEST-1star,A,10000)
4 Split(TEST-1star,B,10000)
[334241, 705914, 1070406, 1206648, 1229742] ...
[3569722, 3570714, 3571533, 3571568, 3577009] ...
[3116450, 3239880, 4268728, 4606075, 709244] ...
[623398, 712539, 934046, 940102, 1095175] ...
[4984171, 4986916, 4986928, 4986976, 4987253] ...


## Get embeddings

In [41]:
# Get splits by names instead of indexes
# Limitation: a name/distribution combination can contain multiple splits,
# but only the index of the first one is recorded here, e.g.:
# Split(AMORE1,A,10000),
# Split(AMORE1,B,9901),
# Split(AMORE1,B,99),
# Disabled by default; change the condition to True to try it out.
if False:
    # Maps split name -> distribution name -> index of the first matching split
    split_dict = {}
    for i, split in enumerate(splits):
        name = split.get_split_name()
        dist = split.get_distribution_name()
        # setdefault keeps the first index seen for each name/distribution pair
        split_dict.setdefault(name, {}).setdefault(dist, i)
    print(split_dict)
    # {'AMORE1': {'A': 0, 'B': 1}, 'TEST-1star': {'A': 3, 'B': 4}}

    def get_ids(split_name, distribution):
        """Return the first split recorded for a split name and distribution.

        Args:
            split_name: Split name as returned by ``get_split_name()``.
            distribution: Distribution name as returned by
                ``get_distribution_name()``.

        Returns:
            The matching element of ``splits`` (the split object itself, not
            its review IDs).

        Raises:
            KeyError: If no split was recorded for the combination.
        """
        # Index with the arguments, not with the loop variables `name`/`dist`
        # that are left over from building split_dict.
        return splits[split_dict[split_name][distribution]]

    get_ids('TEST-1star', 'A')

In [50]:
# Collected output: split index -> list with one get_bow50() result per review
results = {}

# Inclusive range of split indexes to process
first_split_index = 3
last_split_index = 4
for i in range(first_split_index, last_split_index + 1):
    split = splits[i]
    print(split)
    # Translate each original review number to its raw ID (lazily), then
    # look up the corresponding entry through the reader.
    raw_ids = (originalno_to_rawid[review_no] for review_no in split.get_review_ids())
    embeddings = [amazon_reader.get_bow50(raw_id) for raw_id in raw_ids]
    results[i] = embeddings

Split(TEST-1star,A,10000)
Split(TEST-1star,B,10000)


In [51]:
# Per split index: number of entries and length of the first entry
for i, split_embeddings in results.items():
    print(i, len(split_embeddings), len(split_embeddings[0]))

3 10000 50
4 10000 50


In [52]:
# Persist the collected results under the ID 'tmp_embeddings_list'.
# results_storage holds the storage object itself (not the return value of
# write()), matching how interim_storage is created and then written to.
results_storage = InterimStorage('tmp_embeddings_list')
results_storage.write(results)