## Filtering: Model comparison

In [1]:
# Reload modules every time before executing the Python code typed
%load_ext autoreload
%autoreload 2

# Import from project root
import sys; sys.path.insert(0, '../')

import timeit
import numpy as np
import scipy.spatial.distance
from collections import OrderedDict

from access.file_storage import FileStorage
from access.interim_storage import InterimStorage

# Position of the review number inside each entry stored in ys_lists
# (read as tup[KEY_NUMBER] when review numbers are collected).
KEY_NUMBER = 0
# Presumably the positions of the remaining entry fields; currently unused -- TODO confirm.
#KEY_YEAR   = 1
#KEY_STAR   = 2

In [2]:
# Identifiers of the four vector sets compared in this notebook. The names
# suggest TF-IDF / count-vectorizer features reduced with truncated SVD / UMAP
# -- NOTE(review): confirm against the notebooks that produced the stored data.
key_tfidf_tsvd___ = 'tfidf_tsvd'
key_tfidf_umap___ = 'tfidf_umap'
key_countvec_tsvd = 'countvec_tsvd'
key_countvec_umap = 'countvec_umap'

# Names of the top-level sections of the `data` dict; every section is itself
# a dict keyed by one of the four identifiers above.
key_vectors         = 'vectors'
key_np_array        = 'np_array'
key_centroid        = 'centroid'
key_distances       = 'distances'

In [3]:
# All vector sets to process, in processing order. The first entry is also
# used as the reference set when the review-number -> index map is built.
types = [key_tfidf_tsvd___,
         key_tfidf_umap___,
         key_countvec_tsvd,
         key_countvec_umap]

In [4]:
# Star ratings treated as negative and positive; 3 belongs to neither group.
stars_neg = [1,2]
stars_pos = [4,5]
# Negative and positive ratings combined (1, 2, 4, 5).
stars_posneg = stars_neg + stars_pos
# Years 2006 through 2012, both inclusive.
years_2006_2012 = list(range(2006,2012+1))

## Read data

In [5]:
# Container for everything computed below, laid out as data[section][type]:
# the section is one of key_vectors / key_np_array / key_centroid /
# key_distances and the type is one of the entries of `types`.
data = {}

### Vectors

In [6]:
# Interim-storage name of every vector set, paired with the key it is stored under.
vector_sources = (
    (key_tfidf_tsvd___, 'tsvd'),
    (key_tfidf_umap___, 'umap'),
    (key_countvec_tsvd, 'countvec-tsvd'),
    (key_countvec_umap, 'countvec-umap'),
)

# Read all vector sets and report how long the whole load took.
data[key_vectors] = {}
time_begin = timeit.default_timer()
for typ, storage_name in vector_sources:
    data[key_vectors][typ] = InterimStorage(storage_name).read()
print('Runtime:', timeit.default_timer() - time_begin)
# Runtime recorded on a previous run: 20.973824976943433

Runtime: 20.973824976943433


In [7]:
def print_overview(data, title):
    """Print a short summary of a container: size, type, title and first element.

    For a dict the first value is shown on the summary line and the first
    (key, value) pair on a second line prefixed with ' e.g.'. Any other
    container is summarised on a single line using ``data[0]``.
    """
    if not isinstance(data, dict):
        print(len(data), type(data), title, data[0])
        return
    first_key, first_value = next(iter(data.items()))
    print(len(data), type(data), title, first_value)
    print(' e.g.', (first_key, first_value))

# Show size, container type and first entry of every loaded vector set.
for typ in types:
    print_overview(data[key_vectors][typ], typ)

# Output recorded on a previous run (summary line per type):
#1203682 <class 'dict'> tfidf_tsvd [0.2935860120266387, 0.06035900338941396]
#1203682 <class 'dict'> tfidf_umap [4.668518543243408, 8.601895332336426]
#1203682 <class 'dict'> countvec_tsvd [5.905195593819305, -1.8458777033241844]
#1203682 <class 'dict'> countvec_umap [7.554241180419922, -0.3533836901187897]

1203682 <class 'dict'> tfidf_tsvd [0.2935860120266387, 0.06035900338941396]
 e.g. (3, [0.2935860120266387, 0.06035900338941396])
1203682 <class 'dict'> tfidf_umap [4.668518543243408, 8.601895332336426]
 e.g. (3, [4.668518543243408, 8.601895332336426])
1203682 <class 'dict'> countvec_tsvd [5.905195593819305, -1.8458777033241844]
 e.g. (3, [5.905195593819305, -1.8458777033241844])
1203682 <class 'dict'> countvec_umap [7.554241180419922, -0.3533836901187897]
 e.g. (3, [7.554241180419922, -0.3533836901187897])


### Review number to index

In [8]:
# Map every review number to its position in the first vector set; the same
# position is later used to index the per-type arrays.
first_type = next(iter(types))
revnum_to_index = {
    revnum: position
    for position, revnum in enumerate(data[key_vectors][first_type])
}

# The positions are only valid for the other types if all of them hold the
# same keys in the same order, so report every type whose keys differ.
first_type_keylist = list(data[key_vectors][first_type])
for typ in types:
    if not np.array_equal(first_type_keylist, list(data[key_vectors][typ])):
        print('Different keys:', typ)

### Reviews grouped by year and star rating

In [9]:
def count_ysl(ysl):
    """Return the total number of entries over all years and star ratings.

    ysl: mapping year -> mapping star -> sized collection of entries.
    """
    return sum(
        len(entries)
        for by_star in ysl.values()
        for entries in by_star.values()
    )

In [10]:
# Load the reviews grouped as year -> star -> entries and report how many
# entries there are in total.
ys_lists = InterimStorage('deduplicated').read()
print('Reviews in ys_lists:', count_ysl(ys_lists))
# Count recorded on a previous run:
# Reviews in ys_lists: 1,727,821

Reviews in ys_lists: 1727821
Runtime: 3.353083639405668


## Convert to NumPy ndarrays, clean NaN

Reference for `numpy.nan_to_num`: https://numpy.org/doc/stable/reference/generated/numpy.nan_to_num.html

In [11]:
# Stack the vectors of every type into one ndarray (rows follow the dict's
# iteration order) and let nan_to_num replace NaN/inf entries with finite numbers.
data[key_np_array] = {}
for typ in types:
    vector_view = data[key_vectors][typ].values()
    data[key_np_array][typ] = np.nan_to_num(np.array(list(vector_view)))
    print_overview(data[key_np_array][typ], typ)

# Output recorded on a previous run:
# 1203682 <class 'numpy.ndarray'> tfidf_tsvd    [ 0.29358601  0.060359  ]
# 1203682 <class 'numpy.ndarray'> tfidf_umap    [ 4.66851854  8.60189533]
# 1203682 <class 'numpy.ndarray'> countvec_tsvd [ 5.90519559 -1.8458777 ]
# 1203682 <class 'numpy.ndarray'> countvec_umap [ 7.55424118 -0.35338369]

1203682 <class 'numpy.ndarray'> tfidf_tsvd [0.29358601 0.060359  ]
1203682 <class 'numpy.ndarray'> tfidf_umap [4.66851854 8.60189533]
1203682 <class 'numpy.ndarray'> countvec_tsvd [ 5.90519559 -1.8458777 ]
1203682 <class 'numpy.ndarray'> countvec_umap [ 7.55424118 -0.35338369]


## Centroids

In [12]:
def get_centroid(data):
    """Return the column-wise mean of `data` and print its value range.

    data: array offering min/max/mean along axis 0 (a NumPy ndarray with one
        row per vector in this notebook).

    Prints the per-column minimum, maximum and mean on one line, then returns
    the mean. The mean is computed once and reused for both printing and
    returning, instead of making a second pass over the whole array.
    """
    centroid = data.mean(axis=0)
    print(data.min(axis=0), data.max(axis=0), centroid)
    return centroid

In [13]:
# Compute one centroid (column-wise mean) per vector type; get_centroid also
# prints the per-column min, max and mean of each array.
data[key_centroid] = {}
for typ in types:
    data[key_centroid][typ] = get_centroid(data[key_np_array][typ])

# Output recorded on a previous run, one line per type: min, max, mean
#[ 0.          -0.40866756]   [0.65589508  0.77644091]   [0.24590463 -0.01219842]
#[-5.77501488  -1.45871496]  [13.72292328 15.23633862]   [6.34190018  6.32022436]
#[ 0.         -34.30849225] [105.43668317 78.95606391]   [3.50654809 -0.39607092]
#[-2.58597469  -8.12982941]  [14.06348991 10.52065468]   [5.02704603  0.35577971]

[ 0.         -0.40866756] [0.65589508 0.77644091] [ 0.24590463 -0.01219842]
[-5.77501488 -1.45871496] [13.72292328 15.23633862] [6.34190018 6.32022436]
[  0.         -34.30849225] [105.43668317  78.95606391] [ 3.50654809 -0.39607092]
[-2.58597469 -8.12982941] [14.06348991 10.52065468] [5.02704603 0.35577971]
Runtime: 0.4649447062984109


## Distances to centroid

Reference for `scipy.spatial.distance.cdist`: https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cdist.html

In [14]:
# Cosine distance of every vector to the centroid of its own type.
data[key_distances] = {}
for typ in types:
    # cdist needs two 2-D inputs, so the centroid becomes a single-row matrix;
    # the result therefore has one row per vector and a single column.
    centroid_row = data[key_centroid][typ].reshape(1, -1)
    data[key_distances][typ] = scipy.spatial.distance.cdist(
        data[key_np_array][typ], centroid_row, metric='cosine')
    print(len(data[key_distances][typ]))

1203682
1203682
1203682
1203682
Runtime: 0.09357141004875302


## Sort by distances to centroid

In [15]:
def filter_revnums(years, stars, number=-1):
    """Collect the review numbers of all reviews in the given years and star ratings.

    Reads the module-level ``ys_lists`` (year -> star -> entries) and takes
    ``entry[KEY_NUMBER]`` from every entry whose year is in ``years`` and
    whose star rating is in ``stars``.

    number: return early as soon as the set holds exactly this many review
        numbers. The size is only compared after an insertion, so the default
        -1 (and likewise 0) never matches and the complete set is returned.

    Returns a set of review numbers.
    """
    selected = set()
    for year, by_star in ys_lists.items():
        if year in years:
            for star, entries in by_star.items():
                if star in stars:
                    for entry in entries:
                        selected.add(entry[KEY_NUMBER])
                        if len(selected) == number:
                            return selected
    return selected

In [16]:
def get_distances(years, stars, typ):
    """Return (revnum, index, distance) for every selected review that has a vector.

    Review numbers come from ``filter_revnums(years, stars)``; those without
    an entry in the module-level ``revnum_to_index`` are skipped. The distance
    is read from ``data[key_distances][typ]`` at the review's index.

    Returns a list of 3-tuples in the iteration order of the selected set.
    """
    rows = []
    for revnum in filter_revnums(years, stars):
        position = revnum_to_index.get(revnum)
        if position is None:
            # No vector was stored for this review number.
            continue
        rows.append((revnum, position, data[key_distances][typ][position]))
    return rows

In [17]:
# Per type: list of (revnum, index, distance) triples ordered by ascending
# distance to the centroid, restricted to 2006-2012 and the 1/2/4/5-star reviews.
time_begin = timeit.default_timer()
distance_sorted = {}
for typ in types:
    triples = get_distances(years_2006_2012, stars_posneg, typ)
    # get_distances builds a new list on every call, so sorting it in place is safe.
    triples.sort(key=lambda triple: triple[2])
    distance_sorted[typ] = triples
    print(len(distance_sorted[typ]))
print('Runtime:', timeit.default_timer() - time_begin)
# Runtime recorded on a previous run: 52.87649042997509

1203682
1203682
1203682
1203682
Runtime: 52.87649042997509


In [18]:
# Preview: the ten (revnum, index, distance) triples of the first type that
# lie closest to its centroid.
distance_sorted[first_type][:10]

[(30617, 17517, array([8.67417249e-13])),
 (88042, 48374, array([5.99575944e-12])),
 (2168205, 672267, array([1.21821442e-10])),
 (5505968, 1047477, array([1.34721234e-10])),
 (247442, 128434, array([1.81096582e-10])),
 (196474, 105286, array([1.8520363e-10])),
 (2531243, 734996, array([2.74675505e-10])),
 (84144, 45808, array([3.21328741e-10])),
 (411619, 198726, array([3.52041063e-10])),
 (739897, 317424, array([3.95095623e-10]))]