## Filtering: Model comparison

In [1]:
# Reload modules every time before executing the Python code typed
%load_ext autoreload
%autoreload 2

# Import from project root
import sys; sys.path.insert(0, '../')

import timeit
import numpy as np
import scipy.spatial.distance
from collections import OrderedDict
from alibi_detect.cd import LSDDDrift
from alibi_detect.cd import KSDrift

from access.file_storage import FileStorage
from access.interim_storage import InterimStorage

# Position of the review number inside each review tuple stored in ys_lists
# (read as tup[KEY_NUMBER] in filter_revnums).
KEY_NUMBER = 0
# Presumably the remaining tuple positions; disabled because nothing in this
# notebook reads them -- TODO confirm against the producer of 'deduplicated'.
#KEY_YEAR   = 1
#KEY_STAR   = 2

In [2]:
# Names of the four vector variants that are compared in this notebook.
# They are used as second-level keys: data[<section key>][<variant name>].
key_tfidf_tsvd___ = 'tfidf_tsvd'
key_tfidf_umap___ = 'tfidf_umap'
key_countvec_tsvd = 'countvec_tsvd'
key_countvec_umap = 'countvec_umap'

# First-level (section) keys of the `data` dictionary.
key_vectors         = 'vectors'      # vectors as read from InterimStorage
key_np_array        = 'np_array'     # the same vectors as NaN-cleaned numpy arrays
key_centroid        = 'centroid'     # per-variant mean vector
key_distances       = 'distances'    # per-vector distance to the centroid

In [3]:
# All vector variants, in the order in which every later cell iterates
# (and therefore in which the recorded outputs appear).
types = [
    key_tfidf_tsvd___,
    key_tfidf_umap___,
    key_countvec_tsvd,
    key_countvec_umap,
]

In [4]:
# Star ratings treated as negative / positive; 3-star reviews are in neither group.
stars_neg = [1, 2]
stars_pos = [4, 5]
stars_posneg = [*stars_neg, *stars_pos]
# Every year from 2006 to 2012, both inclusive.
years_2006_2012 = [year for year in range(2006, 2013)]

## Read data

In [5]:
# Notebook-wide container, filled section by section as data[<section key>][<type>].
data = {}

### Vectors

In [6]:
# Read the stored vectors of every variant from interim storage and report
# the total wall-clock time of the four reads.
data[key_vectors] = {}
time_begin = timeit.default_timer()
for typ, storage_name in ((key_tfidf_tsvd___, 'tsvd'),
                          (key_tfidf_umap___, 'umap'),
                          (key_countvec_tsvd, 'countvec-tsvd'),
                          (key_countvec_umap, 'countvec-umap')):
    data[key_vectors][typ] = InterimStorage(storage_name).read()
print('Runtime:', timeit.default_timer() - time_begin)
# Runtime: 20.973824976943433

Runtime: 20.973824976943433


In [7]:
def print_overview(data, title):
    """Print a short summary of a collection: size, type, title and a sample.

    Parameters
    ----------
    data : dict or indexable sequence (list, numpy array, ...)
        The collection to summarise. Note that this parameter shadows the
        notebook-level ``data`` dictionary inside the function.
    title : str
        Label printed after the type.

    For a dict, the first value is printed on the summary line and the first
    (key, value) pair on a second line prefixed with ' e.g.'. For any other
    collection, element 0 is printed. An empty collection is reported as
    '(empty)' instead of raising StopIteration / IndexError.
    """
    if len(data) == 0:
        # Nothing to sample from; still report size and type.
        print(0, type(data), title, '(empty)')
        return
    if isinstance(data, dict):
        first_item = next(iter(data.items()))
        print(len(data), type(data), title, first_item[1])
        print(' e.g.', first_item)
    else:
        print(len(data), type(data), title, data[0])

# Summarise the loaded vectors of every type (size, container type, first entry).
for typ in types:
    print_overview(data[key_vectors][typ], typ)

# Expected output (summary line per type only; the additional ' e.g.' line
# that print_overview emits for dicts is not repeated here):
#1203682 <class 'dict'> tfidf_tsvd [0.2935860120266387, 0.06035900338941396]
#1203682 <class 'dict'> tfidf_umap [4.668518543243408, 8.601895332336426]
#1203682 <class 'dict'> countvec_tsvd [5.905195593819305, -1.8458777033241844]
#1203682 <class 'dict'> countvec_umap [7.554241180419922, -0.3533836901187897]

1203682 <class 'dict'> tfidf_tsvd [0.2935860120266387, 0.06035900338941396]
 e.g. (3, [0.2935860120266387, 0.06035900338941396])
1203682 <class 'dict'> tfidf_umap [4.668518543243408, 8.601895332336426]
 e.g. (3, [4.668518543243408, 8.601895332336426])
1203682 <class 'dict'> countvec_tsvd [5.905195593819305, -1.8458777033241844]
 e.g. (3, [5.905195593819305, -1.8458777033241844])
1203682 <class 'dict'> countvec_umap [7.554241180419922, -0.3533836901187897]
 e.g. (3, [7.554241180419922, -0.3533836901187897])


### Review-number to indexes

In [8]:
# Map every review number to its position in the key order of the first type.
# The numpy arrays built later follow the same order, so this position is
# also the row index into those arrays.
first_type = next(iter(types))
revnum_to_index = {
    revnum: position
    for position, revnum in enumerate(data[key_vectors][first_type])
}

# Check if keys of all types are equal:
# (only reports a mismatch; it does not stop the notebook)
first_type_keylist = list(data[key_vectors][first_type])
for typ in types:
    key_list = list(data[key_vectors][typ])
    if not np.array_equal(first_type_keylist, key_list):
        print('Different keys:', typ)

### Year Star

In [9]:
def count_ysl(ysl):
    """Return the total number of entries in a year -> star -> list mapping.

    `ysl` is a two-level dict; the lengths of all innermost collections are
    summed. An empty mapping yields 0.
    """
    return sum(
        len(entries)
        for per_star in ysl.values()
        for entries in per_star.values()
    )

In [10]:
# Reviews grouped as ys_lists[year][star] -> collection of review tuples
# (the review number sits at tuple position KEY_NUMBER, see filter_revnums).
ys_lists = InterimStorage('deduplicated').read()
print('Reviews in ys_lists:', count_ysl(ys_lists))
# Reviews in ys_lists: 1,727,821
# NOTE(review): this is more than the 1,203,682 vectors loaded above, which is
# presumably why get_distances skips review numbers without a vector -- confirm.

Reviews in ys_lists: 1727821
Runtime: 3.353083639405668


## Convert to Numpy ndarrays, clean NAN

https://numpy.org/doc/stable/reference/generated/numpy.nan_to_num.html

In [11]:
# One (n_reviews, n_dims) array per type, rows in the dict's key order.
# np.nan_to_num replaces NaN by 0.0 and +/-inf by very large finite numbers.
data[key_np_array] = {
    typ: np.nan_to_num(np.array(list(data[key_vectors][typ].values())))
    for typ in types
}
for typ in types:
    print_overview(data[key_np_array][typ], typ)
    
# 1203682 <class 'numpy.ndarray'> tfidf_tsvd    [ 0.29358601  0.060359  ]
# 1203682 <class 'numpy.ndarray'> tfidf_umap    [ 4.66851854  8.60189533]
# 1203682 <class 'numpy.ndarray'> countvec_tsvd [ 5.90519559 -1.8458777 ]
# 1203682 <class 'numpy.ndarray'> countvec_umap [ 7.55424118 -0.35338369]

1203682 <class 'numpy.ndarray'> tfidf_tsvd [0.29358601 0.060359  ]
1203682 <class 'numpy.ndarray'> tfidf_umap [4.66851854 8.60189533]
1203682 <class 'numpy.ndarray'> countvec_tsvd [ 5.90519559 -1.8458777 ]
1203682 <class 'numpy.ndarray'> countvec_umap [ 7.55424118 -0.35338369]


## Centroids

In [178]:
def get_centroid(data, verbose=True):
    """Return the column-wise mean (centroid) of `data`.

    Parameters
    ----------
    data : array-like supporting .min/.max/.mean with an `axis` argument
        One row per sample. Shadows the notebook-level `data` dict locally.
    verbose : bool
        If true, print the column-wise minimum, maximum and mean first.
    """
    centroid = data.mean(axis=0)
    if verbose:
        print(data.min(axis=0), data.max(axis=0), centroid)
    return centroid

In [13]:
# Centroid (mean vector) per type; get_centroid also prints min / max / mean.
data[key_centroid] = {
    typ: get_centroid(data[key_np_array][typ])
    for typ in types
}

#[ 0.          -0.40866756]   [0.65589508  0.77644091]   [0.24590463 -0.01219842]
#[-5.77501488  -1.45871496]  [13.72292328 15.23633862]   [6.34190018  6.32022436]
#[ 0.         -34.30849225] [105.43668317 78.95606391]   [3.50654809 -0.39607092]
#[-2.58597469  -8.12982941]  [14.06348991 10.52065468]   [5.02704603  0.35577971]

[ 0.         -0.40866756] [0.65589508 0.77644091] [ 0.24590463 -0.01219842]
[-5.77501488 -1.45871496] [13.72292328 15.23633862] [6.34190018 6.32022436]
[  0.         -34.30849225] [105.43668317  78.95606391] [ 3.50654809 -0.39607092]
[-2.58597469 -8.12982941] [14.06348991 10.52065468] [5.02704603 0.35577971]
Runtime: 0.4649447062984109


In [180]:
# Sanity check: every centroid should be a flat array with one entry per dimension.
for typ in types:
    centroid = data[key_centroid][typ]
    print(type(centroid), len(centroid), centroid.shape)

<class 'numpy.ndarray'> 2 (2,)
<class 'numpy.ndarray'> 2 (2,)
<class 'numpy.ndarray'> 2 (2,)
<class 'numpy.ndarray'> 2 (2,)


# CountVec test

Interim result: the distance computation in this notebook works on the 2-D reduced vectors. The approach cannot be transferred 1:1 to the 1000-dimensional CountVec matrix.

In [142]:
# Disabled experiment (the guard is never true, so nothing is read).
# It would load the 'countvec-object' storage; the type and shape recorded
# from an earlier run are kept in the comment below.
if False:
    countvec = InterimStorage('countvec-object').read()
    print(type(countvec), countvec.shape)
    # <class 'scipy.sparse.csr.csr_matrix'> (1203682, 1000)

In [200]:
# Disabled experiment (never runs): centroid of the `countvec` object.
# `countvec` is only assigned inside the equally disabled cell that reads
# 'countvec-object', so both guards have to be enabled together.
if False:
    countvec_centroid = get_centroid(countvec, verbose=False)
    print(type(countvec_centroid), len(countvec_centroid), countvec_centroid.shape)
    # <class 'numpy.matrix'> 1 (1, 1000)

## Distances to centroid

https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cdist.html

In [14]:
# Cosine distance of every vector to the centroid of its type.
# cdist expects two 2-D inputs, so the centroid is reshaped to a single row;
# the result has one column, i.e. each entry is a length-1 array.
# NOTE(review): cosine distance only compares the direction from the origin.
# The recorded "closest" vectors are (near) scalar multiples of the centroid,
# e.g. [0.4419, -0.0219] vs. centroid [0.2459, -0.0122], not points near it.
# Confirm that this is intended; metric='euclidean' would rank by proximity.
# NOTE(review): rows that nan_to_num turned into all-zero vectors have no
# direction; check how cdist reports them before relying on the sort order.
data[key_distances] = {}
for typ in types:
    centroid_row = data[key_centroid][typ].reshape(1, -1)
    data[key_distances][typ] = scipy.spatial.distance.cdist(
        data[key_np_array][typ], centroid_row, metric='cosine')
    print(len(data[key_distances][typ]))

1203682
1203682
1203682
1203682
Runtime: 0.09357141004875302


## Sort by distances to centroid

In [15]:
def filter_revnums(years, stars, number=-1):
    """Collect the review numbers of all reviews in the given years and stars.

    Parameters
    ----------
    years : container of years to keep (tested with `in`)
    stars : container of star ratings to keep (tested with `in`)
    number : int
        Stop as soon as exactly this many distinct review numbers have been
        collected. The default -1 can never equal a set size, so it means
        "no limit".

    Returns
    -------
    set of review numbers, read from position KEY_NUMBER of each tuple in
    the notebook-level `ys_lists[year][star]`.
    """
    selected = set()
    for year, reviews_by_star in ys_lists.items():
        if year in years:
            for star, reviews in reviews_by_star.items():
                if star in stars:
                    for review in reviews:
                        selected.add(review[KEY_NUMBER])
                        if len(selected) == number:
                            return selected
    return selected

In [28]:
def get_distances(years, stars, typ):
    """Return (revnum, index, distance) for the selected reviews that have a vector.

    Review numbers come from filter_revnums(years, stars). Those without an
    entry in `revnum_to_index` are skipped. `index` is the row position of the
    review and `distance` is the matching entry of data[key_distances][typ]
    (a length-1 array, because the distances are stored as a single column).
    """
    looked_up = (
        (revnum, revnum_to_index.get(revnum))
        for revnum in filter_revnums(years, stars)
    )
    # Indices are enumerate() positions, so None reliably marks "no vector".
    return [
        (revnum, index, data[key_distances][typ][index])
        for revnum, index in looked_up
        if index is not None
    ]

In [35]:
def get_sorted(years, stars, typ, number):
    """Return the `number` entries of get_distances(...) closest to the centroid.

    The (revnum, index, distance) tuples are ordered by ascending distance
    and the list is cut off after `number` entries.
    """
    def by_distance(entry):
        return entry[2]

    ranked = get_distances(years, stars, typ)
    ranked.sort(key=by_distance)
    return ranked[:number]

## Test models

- Use 10,000 original reviews as the reference ("training") data of each detector
- First run: test against reviews of the same polarity (pos/neg) from the following year

### Setup test data

In [109]:
# Sorted test data: per type a list of (revnum, index, distance),
# ordered by ascending distance to the centroid.

# Reference set A: up to 10,000 negative reviews from 2006.
years = [2006]
stars = stars_neg
number = 10 * 1000

test_a = {typ: get_sorted(years, stars, typ, number) for typ in types}
for typ in types:
    print(len(test_a[typ]), test_a[typ][0])

# Test set B: up to 1,000 negative reviews from the following year.
years = [2007]
stars = stars_neg
number = 1000

test_b = {typ: get_sorted(years, stars, typ, number) for typ in types}
for typ in types:
    print(len(test_b[typ]), test_b[typ][0])

10000 (3674589, 880405, array([1.84628435e-09]))
10000 (866185, 360158, array([4.49262183e-10]))
10000 (2737794, 763435, array([2.02624584e-11]))
10000 (771781, 328708, array([1.06212739e-09]))
1000 (1195693, 456448, array([3.72756004e-09]))
1000 (1317174, 489855, array([3.87999632e-11]))
1000 (11808, 6842, array([2.50033194e-09]))
1000 (706823, 307462, array([5.85524296e-10]))


### Get test data

In [134]:
def get_vectors(typ, tup_list):
    """Return the vectors of the given reviews as a list of lists of floats.

    Parameters
    ----------
    typ : str
        Vector variant, one of the entries of `types`.
    tup_list : iterable of tuples
        Only position 0 (the review number) of each tuple is used, e.g. the
        (revnum, index, distance) tuples produced by get_sorted.

    The vectors are taken from the NaN/inf-cleaned array data[key_np_array]
    rather than from the raw data[key_vectors]. The distances that selected
    these reviews were computed on the cleaned array, so this keeps the data
    passed to the drift detectors consistent with them and free of NaN.
    For vectors without NaN/inf the returned values are unchanged.

    Raises
    ------
    KeyError
        If a review number has no vector.
    """
    cleaned = data[key_np_array][typ]
    # revnum_to_index gives the row position used when the array was built.
    return [cleaned[revnum_to_index[tup[0]]].tolist() for tup in tup_list]

In [135]:
# Vectors of reference set A, one (n, dims) array per type.
test_a_data = {typ: np.array(get_vectors(typ, test_a[typ])) for typ in types}
for typ in types:
    print(len(test_a_data[typ]), test_a_data[typ][0])

# Vectors of test set B, in the same (distance-sorted) order as test_b.
test_b_data = {typ: np.array(get_vectors(typ, test_b[typ])) for typ in types}
for typ in types:
    print(len(test_b_data[typ]), test_b_data[typ][0])

10000 [ 0.44190483 -0.02189435]
10000 [5.14050293 5.12324047]
10000 [ 3.59826866 -0.40640773]
10000 [5.77657413 0.40855858]
1000 [ 0.26087684 -0.01291856]
1000 [6.66289902 6.64024305]
1000 [ 4.45731446 -0.50378092]
1000 [7.61741257 0.53937   ]


### Detect drift

In [136]:
# One LSDD drift detector per type, with reference set A as reference data.
# NOTE(review): the recorded p-values are multiples of 0.01, which suggests a
# permutation test; results may then vary between runs -- consider seeding.
lsdd_models = {
    typ: LSDDDrift(test_a_data[typ], backend='pytorch', p_val=.05)
    for typ in types
}

No GPU detected, fall back on CPU.
No GPU detected, fall back on CPU.
No GPU detected, fall back on CPU.
No GPU detected, fall back on CPU.


In [207]:
# Test the `items` entries of set B that are closest to the centroid
# (test_b_data is distance-sorted, so this is not a random sample).
items = 100
for typ in types:
    batch = test_b_data[typ][:items]
    print(lsdd_models[typ].predict(batch), typ)

{'data': {'is_drift': 1, 'distance': 0.2080069807353167, 'p_val': 0.0, 'threshold': 0.05, 'distance_threshold': 0.027574384038127245}, 'meta': {'name': 'LSDDDriftTorch', 'detector_type': 'offline', 'data_type': None, 'backend': 'pytorch'}} tfidf_tsvd
{'data': {'is_drift': 1, 'distance': 0.2236533694163399, 'p_val': 0.0, 'threshold': 0.05, 'distance_threshold': 0.025436377010314665}, 'meta': {'name': 'LSDDDriftTorch', 'detector_type': 'offline', 'data_type': None, 'backend': 'pytorch'}} tfidf_umap
{'data': {'is_drift': 1, 'distance': 0.09907036999165518, 'p_val': 0.0, 'threshold': 0.05, 'distance_threshold': 0.025063496910358148}, 'meta': {'name': 'LSDDDriftTorch', 'detector_type': 'offline', 'data_type': None, 'backend': 'pytorch'}} countvec_tsvd
{'data': {'is_drift': 1, 'distance': 0.5861391329899622, 'p_val': 0.0, 'threshold': 0.05, 'distance_threshold': 0.022992458360456238}, 'meta': {'name': 'LSDDDriftTorch', 'detector_type': 'offline', 'data_type': None, 'backend': 'pytorch'}} cou

In [208]:
# Same test with a much smaller batch: the 17 entries of set B closest to the centroid.
items = 17
for typ in types:
    batch = test_b_data[typ][:items]
    print(lsdd_models[typ].predict(batch), typ)

{'data': {'is_drift': 1, 'distance': 0.22643205341573092, 'p_val': 0.029999999329447746, 'threshold': 0.05, 'distance_threshold': 0.18962159716988036}, 'meta': {'name': 'LSDDDriftTorch', 'detector_type': 'offline', 'data_type': None, 'backend': 'pytorch'}} tfidf_tsvd
{'data': {'is_drift': 1, 'distance': 0.21846356382244125, 'p_val': 0.0, 'threshold': 0.05, 'distance_threshold': 0.1245077548487161}, 'meta': {'name': 'LSDDDriftTorch', 'detector_type': 'offline', 'data_type': None, 'backend': 'pytorch'}} tfidf_umap
{'data': {'is_drift': 0, 'distance': 0.11468764712753406, 'p_val': 0.11999999731779099, 'threshold': 0.05, 'distance_threshold': 0.1421287848658859}, 'meta': {'name': 'LSDDDriftTorch', 'detector_type': 'offline', 'data_type': None, 'backend': 'pytorch'}} countvec_tsvd
{'data': {'is_drift': 1, 'distance': 0.5346411084523447, 'p_val': 0.0, 'threshold': 0.05, 'distance_threshold': 0.15384079487337832}, 'meta': {'name': 'LSDDDriftTorch', 'detector_type': 'offline', 'data_type': Non

In [139]:
# One Kolmogorov-Smirnov drift detector per type, with reference set A as reference data.
ks_models = {typ: KSDrift(test_a_data[typ], p_val=.05) for typ in types}

In [209]:
# KS test on the 100 entries of set B closest to the centroid.
items = 100
for typ in types:
    batch = test_b_data[typ][:items]
    print(ks_models[typ].predict(batch), typ)

{'data': {'is_drift': 1, 'distance': array([0.0746, 0.4438], dtype=float32), 'p_val': array([6.1321586e-01, 2.7482557e-18], dtype=float32), 'threshold': 0.025}, 'meta': {'name': 'KSDrift', 'detector_type': 'offline', 'data_type': None}} tfidf_tsvd
{'data': {'is_drift': 1, 'distance': array([0.1403, 0.2685], dtype=float32), 'p_val': array([3.6637045e-02, 8.5813508e-07], dtype=float32), 'threshold': 0.025}, 'meta': {'name': 'KSDrift', 'detector_type': 'offline', 'data_type': None}} tfidf_umap
{'data': {'is_drift': 1, 'distance': array([0.1441, 0.266 ], dtype=float32), 'p_val': array([2.9460110e-02, 1.1293345e-06], dtype=float32), 'threshold': 0.025}, 'meta': {'name': 'KSDrift', 'detector_type': 'offline', 'data_type': None}} countvec_tsvd
{'data': {'is_drift': 1, 'distance': array([0.1121, 0.6289], dtype=float32), 'p_val': array([1.5385664e-01, 2.0175678e-38], dtype=float32), 'threshold': 0.025}, 'meta': {'name': 'KSDrift', 'detector_type': 'offline', 'data_type': None}} countvec_umap


In [210]:
# KS test on a small batch: the 20 entries of set B closest to the centroid.
items = 20
for typ in types:
    batch = test_b_data[typ][:items]
    print(ks_models[typ].predict(batch), typ)

{'data': {'is_drift': 1, 'distance': array([0.0898, 0.4573], dtype=float32), 'p_val': array([9.9225801e-01, 2.4129814e-04], dtype=float32), 'threshold': 0.025}, 'meta': {'name': 'KSDrift', 'detector_type': 'offline', 'data_type': None}} tfidf_tsvd
{'data': {'is_drift': 1, 'distance': array([0.1801, 0.3526], dtype=float32), 'p_val': array([0.48057827, 0.00994284], dtype=float32), 'threshold': 0.025}, 'meta': {'name': 'KSDrift', 'detector_type': 'offline', 'data_type': None}} tfidf_umap
{'data': {'is_drift': 0, 'distance': array([0.1809, 0.2584], dtype=float32), 'p_val': array([0.47501478, 0.1149784 ], dtype=float32), 'threshold': 0.025}, 'meta': {'name': 'KSDrift', 'detector_type': 'offline', 'data_type': None}} countvec_tsvd
{'data': {'is_drift': 1, 'distance': array([0.1311, 0.6397], dtype=float32), 'p_val': array([8.3855766e-01, 1.7329899e-08], dtype=float32), 'threshold': 0.025}, 'meta': {'name': 'KSDrift', 'detector_type': 'offline', 'data_type': None}} countvec_umap


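Result: Drift is reported in every run except for countvec_tsvd with the small batches (17 test items for LSDD, 20 test items for KS, each against the 10,000 reference items).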

## Test 2

In [221]:
# Sorted test data: per type a list of (revnum, index, distance),
# ordered by ascending distance to the centroid.

years = [2012]
stars = stars_pos
number = 20 * 1000

# Split the sorted entries of one year: the first 10,000 (closest to the
# centroid) become reference set C, the remaining ones test set D.
# NOTE(review): because of the sorting, D is systematically farther from the
# centroid than C; this is not a random split.
test_c = {}
test_d = {}
for typ in types:
    ranked = get_sorted(years, stars, typ, number)
    test_c[typ] = ranked[:10000]
    test_d[typ] = ranked[10000:]

    print(len(test_c[typ]), test_c[typ][0])
    print(len(test_d[typ]), test_d[typ][0])

10000 (2168205, 672267, array([1.21821442e-10]))
10000 (2120661, 663302, array([0.03133819]))
10000 (553970, 253433, array([1.2654533e-10]))
10000 (5909519, 1080056, array([0.00392179]))
10000 (6305053, 1108731, array([2.46049847e-11]))
10000 (20708, 11647, array([0.02450828]))
10000 (1124060, 436416, array([1.47770685e-13]))
10000 (4299, 2623, array([0.00039369]))


In [223]:
# Vectors of reference set C, one (n, dims) array per type.
test_c_data = {typ: np.array(get_vectors(typ, test_c[typ])) for typ in types}
for typ in types:
    print(len(test_c_data[typ]), test_c_data[typ][0])

# Vectors of test set D, in the same (distance-sorted) order as test_d.
test_d_data = {typ: np.array(get_vectors(typ, test_d[typ])) for typ in types}
for typ in types:
    print(len(test_d_data[typ]), test_d_data[typ][0])

10000 [ 0.11459675 -0.00568652]
10000 [6.41290808 6.39119291]
10000 [ 2.90560355 -0.3282138 ]
10000 [7.82497025 0.55379325]
10000 [ 0.16921694 -0.05245187]
10000 [2.23367977 2.65991354]
10000 [8.96012968 0.98395396]
10000 [8.61043835 0.36697456]


In [224]:
# One LSDD drift detector per type, with reference set C as reference data.
lsdd_models2 = {
    typ: LSDDDrift(test_c_data[typ], backend='pytorch', p_val=.05)
    for typ in types
}

No GPU detected, fall back on CPU.
No GPU detected, fall back on CPU.
No GPU detected, fall back on CPU.
No GPU detected, fall back on CPU.


In [225]:
# LSDD test on the first 100 entries of set D (those ranked right after set C).
items = 100
for typ in types:
    batch = test_d_data[typ][:items]
    print(lsdd_models2[typ].predict(batch), typ)

{'data': {'is_drift': 1, 'distance': 0.2775459685075212, 'p_val': 0.0, 'threshold': 0.05, 'distance_threshold': 0.02871767204098396}, 'meta': {'name': 'LSDDDriftTorch', 'detector_type': 'offline', 'data_type': None, 'backend': 'pytorch'}} tfidf_tsvd
{'data': {'is_drift': 1, 'distance': 0.05982610741315045, 'p_val': 0.0, 'threshold': 0.05, 'distance_threshold': 0.021381666802750204}, 'meta': {'name': 'LSDDDriftTorch', 'detector_type': 'offline', 'data_type': None, 'backend': 'pytorch'}} tfidf_umap
{'data': {'is_drift': 1, 'distance': 0.12407905523392465, 'p_val': 0.0, 'threshold': 0.05, 'distance_threshold': 0.02204285953650531}, 'meta': {'name': 'LSDDDriftTorch', 'detector_type': 'offline', 'data_type': None, 'backend': 'pytorch'}} countvec_tsvd
{'data': {'is_drift': 1, 'distance': 0.15252088426797597, 'p_val': 0.0, 'threshold': 0.05, 'distance_threshold': 0.025914991336287704}, 'meta': {'name': 'LSDDDriftTorch', 'detector_type': 'offline', 'data_type': None, 'backend': 'pytorch'}} cou

In [226]:
# LSDD test on a small batch: the first 17 entries of set D.
items = 17
for typ in types:
    batch = test_d_data[typ][:items]
    print(lsdd_models2[typ].predict(batch), typ)

{'data': {'is_drift': 1, 'distance': 0.2915562181494523, 'p_val': 0.0, 'threshold': 0.05, 'distance_threshold': 0.17668543965159264}, 'meta': {'name': 'LSDDDriftTorch', 'detector_type': 'offline', 'data_type': None, 'backend': 'pytorch'}} tfidf_tsvd
{'data': {'is_drift': 0, 'distance': 0.10156469026953054, 'p_val': 0.12999999523162842, 'threshold': 0.05, 'distance_threshold': 0.1177074757052693}, 'meta': {'name': 'LSDDDriftTorch', 'detector_type': 'offline', 'data_type': None, 'backend': 'pytorch'}} tfidf_umap
{'data': {'is_drift': 1, 'distance': 0.23933894311854342, 'p_val': 0.0, 'threshold': 0.05, 'distance_threshold': 0.17698098505196166}, 'meta': {'name': 'LSDDDriftTorch', 'detector_type': 'offline', 'data_type': None, 'backend': 'pytorch'}} countvec_tsvd
{'data': {'is_drift': 1, 'distance': 0.20328837042110426, 'p_val': 0.029999999329447746, 'threshold': 0.05, 'distance_threshold': 0.13616785922588304}, 'meta': {'name': 'LSDDDriftTorch', 'detector_type': 'offline', 'data_type': No

In [227]:
# One Kolmogorov-Smirnov drift detector per type, with reference set C as reference data.
ks_models2 = {typ: KSDrift(test_c_data[typ], p_val=.05) for typ in types}

In [228]:
# KS test on the first 100 entries of set D.
items = 100
for typ in types:
    batch = test_d_data[typ][:items]
    print(ks_models2[typ].predict(batch), typ)

{'data': {'is_drift': 1, 'distance': array([0.0759, 0.3699], dtype=float32), 'p_val': array([5.9152770e-01, 1.1633506e-12], dtype=float32), 'threshold': 0.025}, 'meta': {'name': 'KSDrift', 'detector_type': 'offline', 'data_type': None}} tfidf_tsvd
{'data': {'is_drift': 0, 'distance': array([0.1035, 0.1163], dtype=float32), 'p_val': array([0.22343378, 0.12681895], dtype=float32), 'threshold': 0.025}, 'meta': {'name': 'KSDrift', 'detector_type': 'offline', 'data_type': None}} tfidf_umap
{'data': {'is_drift': 1, 'distance': array([0.0461, 0.2959], dtype=float32), 'p_val': array([9.7816527e-01, 3.5323406e-08], dtype=float32), 'threshold': 0.025}, 'meta': {'name': 'KSDrift', 'detector_type': 'offline', 'data_type': None}} countvec_tsvd
{'data': {'is_drift': 1, 'distance': array([0.067, 0.254], dtype=float32), 'p_val': array([7.4009562e-01, 4.0630616e-06], dtype=float32), 'threshold': 0.025}, 'meta': {'name': 'KSDrift', 'detector_type': 'offline', 'data_type': None}} countvec_umap


Result: Better results for the same year, even when the data is split 10,000 | 10,000 and not compared pairwise