# Embeddings reader

- Get the mapping from original review number to raw ID
- Get splits
- Get embeddings of splits

In [23]:
# Reload modules every time before executing the Python code typed
%load_ext autoreload
%autoreload 2

# Import from vm notebook dir
import sys; sys.path.insert(0, '../../')

import os
from datetime import datetime

from ExplainingDriftTextEmbeddings.access.file_storage import FileStorage
from ExplainingDriftTextEmbeddings.access.amazon_pickle_reader import AmazonPickleReader
from ExplainingDriftTextEmbeddings.access.interim_storage import InterimStorage

from BenchTest.amore.amazon_reviews_reader import AmazonReviewsReader
from BenchTest.transformation.reduction import Reduction

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Storage helper; used below to resolve a file ID to its file path via get_filepath()
file_storage = FileStorage()

## Create reader

In [5]:
# Resolve the location of the raw Amazon pickle, then build the reader
# on the directory that contains it.
amazon_raw_id = 'amazon_raw.pickle'
amazon_raw_filepath = file_storage.get_filepath(amazon_raw_id)
print(amazon_raw_filepath)

# The reader is constructed with the containing directory, not the file itself
amazon_directoy = os.path.dirname(amazon_raw_filepath)
print(amazon_directoy)

amazon_pickle_reader = AmazonPickleReader(amazon_directoy)

/home/eml4u/EML4U/notebooks/ExplainingDriftTextEmbeddings/data/clustering/amazon_raw.pickle
/home/eml4u/EML4U/notebooks/ExplainingDriftTextEmbeddings/data/clustering


## Get mapping original number to raw ID

In [8]:
# Mapping "original review number -> raw ID", cached through InterimStorage
interim_storage = InterimStorage('amazon_originalno_to_rawid')

# Set to True to rebuild the mapping from the pickle reader and overwrite the cache.
# Leave False to load the previously written cache.
REBUILD_ORIGINALNO_TO_RAWID = False

if REBUILD_ORIGINALNO_TO_RAWID:
    # Resolve every original number known to the reader to its raw ID
    originalno_to_rawid = {
        originalno: amazon_pickle_reader.get_raw_id(originalno)
        for originalno in amazon_pickle_reader.originalno_to_rawid.keys()
    }
    # Persist the mapping and show where it was written
    print(interim_storage.write(originalno_to_rawid).get_filepath())
    # '/tmp/InterimStorage/amazon_originalno_to_rawid.pickle.bz2'
else:
    # Use cache
    originalno_to_rawid = interim_storage.read()

# Sanity check: one example entry and the total number of entries
print('Example:', next(iter(originalno_to_rawid.items())))
print('Size:', len(originalno_to_rawid))
# 7,911,684 -> all

Example: (2381344, 0)
Size: 7911684


## Get embeddings

In [17]:
# e.g. print(get_embeddings(amazon_pickle_reader, [100], originalno_to_rawid))
def get_embeddings(amazon_pickle_reader, revnum_include, revnum_to_rawid):
    """Collect the bow50 embedding of each requested review.

    Args:
        amazon_pickle_reader: Object providing ``get_bow50(raw_id)``.
        revnum_include: Iterable of review numbers to look up.
        revnum_to_rawid: Mapping from review number to raw ID; every
            requested review number must be present in it.

    Returns:
        dict mapping each review number from ``revnum_include`` to the value
        returned by ``get_bow50`` for its raw ID, in iteration order.
    """
    return {
        number: amazon_pickle_reader.get_bow50(revnum_to_rawid[number])
        for number in revnum_include
    }

## Get embeddings for 2006-2012 pos/neg

In [18]:
# Load the cached review numbers stored under the ID 'reviewnumbers_2006-2012_posneg'
reviewnumbers_2006_2012_posneg = InterimStorage('reviewnumbers_2006-2012_posneg').read()
# Sanity check: number of review numbers loaded
print(len(reviewnumbers_2006_2012_posneg))
# Expected: 1,203,682

1203682


In [21]:
# Fetch the bow50 embedding for every selected review number
reviewnumbers_2006_2012_posneg_emb = get_embeddings(
    amazon_pickle_reader,
    reviewnumbers_2006_2012_posneg,
    originalno_to_rawid,
)

# Sanity check: how many embeddings were collected, plus one example entry
print(len(reviewnumbers_2006_2012_posneg_emb))
print(next(iter(reviewnumbers_2006_2012_posneg_emb.items())))

1203682
(2097152, array([-0.06299116, -0.12531292, -0.4487808 ,  1.1607794 , -0.36575422,
       -0.08572024, -0.16950664, -0.14035107,  0.60498494, -0.23605575,
        0.2969638 ,  0.25585127,  0.03586745,  0.15513225,  0.07766075,
        0.41837838,  0.58457077,  0.3820759 ,  0.19753297,  0.6963815 ,
        0.50449896,  0.4742624 , -0.09804211,  0.14021917, -0.51908594,
        0.7161566 , -0.54179746, -0.37193182, -0.64238405,  0.79483426,
       -0.58267075,  0.16197357, -0.45821607,  0.53412277, -0.60332555,
       -0.3527259 , -0.72167045, -0.7264509 ,  0.01920276,  0.66644484,
        0.11820004, -0.20682223, -0.6488411 , -0.28977698,  0.87654084,
        0.19570935, -0.8078851 , -0.44058782,  0.42540267, -0.29402822],
      dtype=float32))


In [22]:
# Persist the embeddings under the ID 'reviewnumbers_2006-2012_posneg_emb'
# and print the path of the written file
print(InterimStorage('reviewnumbers_2006-2012_posneg_emb').write(reviewnumbers_2006_2012_posneg_emb).get_filepath())

/tmp/InterimStorage/reviewnumbers_2006-2012_posneg_emb.pickle.bz2


### Reduce dimensions

In [None]:
# Fit a PCA on all embedding vectors, then map every review number to its
# reduced vector.
reduction = Reduction()
embedding_vectors = list(reviewnumbers_2006_2012_posneg_emb.values())
reduction.pca_fit(embedding_vectors)

# Transform all vectors in ONE batch call instead of once per review:
# calling pca_transform ~1.2M times with a single 1-D vector is extremely slow
# (and the wrapper may expect a 2-D batch). keys() and values() of the same
# dict iterate in the same order, so zipping pairs each review number with
# its own reduced vector.
# NOTE(review): assumes pca_transform returns one row per input in input order
# - TODO confirm against Reduction.
reviewnumbers_2006_2012_posneg_emb_pca = dict(
    zip(
        reviewnumbers_2006_2012_posneg_emb.keys(),
        reduction.pca_transform(embedding_vectors),
    )
)

In [30]:
# Batch-transform all embedding vectors with the current `reduction` object in a single call
x = reduction.pca_transform(list(reviewnumbers_2006_2012_posneg_emb.values()))

In [32]:
# Sanity check: number of transformed vectors (compare with the number of embeddings)
len(x)

1203682

In [33]:
# Inspect the first transformed vector
x[0]

array([ 0.84218181, -1.01260462])

In [34]:
# Fit UMAP on all embedding vectors and project them.
# The PCA result dict is intentionally NOT reset here: re-initialising
# `reviewnumbers_2006_2012_posneg_emb_pca` in the UMAP cell would silently
# discard the PCA results.
# Note: `reduction` is rebound to a fresh Reduction instance, so any earlier
# PCA fit on the previous instance is no longer reachable through this name.
reduction = Reduction()
embedding_vectors = list(reviewnumbers_2006_2012_posneg_emb.values())
reduction.umap_fit(embedding_vectors)
umap = reduction.umap_transform(embedding_vectors)