# AMORE datasets

- Note: Years 1997 to 1999 are not included in Doc2Vec embeddings. (2000 to 2012 included.)
- Ideas:
    - 100/0 to 0/100 neg/pos
    - 50/50 to 40/60 neg/pos
    - build on results on that: other distributions, e.g. 45/55
    - 50/50 to 40/30/30 neg/posCluster1/posCluster2
    - for token-level and document-level evaluation

In [1]:
# Reload modules every time before executing the Python code typed
%load_ext autoreload
%autoreload 2

# Import from project root
import sys; sys.path.insert(0, '../')

import bz2
import pickle
import timeit
import numpy as np
from scipy.sparse import csr_matrix

from access.file_storage import FileStorage
from access.interim_storage import InterimStorage
from amore.amazon_reviews_reader import AmazonReviewsReader
from amore.opinion_lexicon import OpinionLexicon

In [2]:
# For multiple usage afterwards

file_storage = FileStorage()

opinion_lexicon = OpinionLexicon(file_storage.get_filepath('opinion-words'))
print('negative words:', len(opinion_lexicon.get_negative_set()))
print('positive words:', len(opinion_lexicon.get_positive_set()))
# negative words: 4783
# positive words: 2006

negative words: 4783
positive words: 2006


## Read data

In [3]:
# Read deduplicated review Ids
with bz2.BZ2File(file_storage.get_filepath('deduplicated'), 'r') as file:
    year_star_ids = pickle.loads(file.read())
print('Available years:', sorted(year_star_ids.keys()))
print('Example stars:  ', sorted(year_star_ids[2007].keys()))
print('Example entry:  ', year_star_ids[2007][1][0])
count = 0
for year in year_star_ids:
    for star in year_star_ids[year]:
        count += len(year_star_ids[year][star])
print('Reviews:', count)

# Available years: Available years: [1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012]
# Example stars:   [1, 2, 3, 4, 5]
# Example entry:   [4368, 2007, 1]
# Reviews: 1727821

Available years: [1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012]
Example stars:   [1, 2, 3, 4, 5]
Example entry:   [4368, 2007, 1]
Reviews: 1727821


In [4]:
# Read review texts
min_year = 2000
max_docs = -1
start_time = timeit.default_timer()
reader = AmazonReviewsReader(file_storage.get_filepath('amazon_gz_file'), AmazonReviewsReader.MODE_TYPED, min_year=min_year, max_docs=max_docs)
revno_to_text = {}
def get_texts(item):
    return (item[AmazonReviewsReader.KEY_SUMMARY] + " " + item[AmazonReviewsReader.KEY_TEXT]).replace('<br />', ' ')
for item in reader:
    revno_to_text[item[AmazonReviewsReader.KEY_NUMBER]] = get_texts(item)
print('Texts:', len(revno_to_text))
print('Runtime:', timeit.default_timer() - start_time)

# start year: 2007
# Texts: 4662381
# Runtime: 265.2943881880492

# start year: 2000
# Texts: 7827594
# Runtime: 312.4320105519146

Texts: 7827594
Runtime: 307.6331848814152


### Read document-term matrix files

In [5]:
# Read document-term matrix
start_time = timeit.default_timer()
with bz2.BZ2File(file_storage.get_filepath('AMORE-CountVec-DocTermMatrix'), 'r') as file:
    doc_term_matrix = pickle.loads(file.read())
    print('document-term matrix:', doc_term_matrix.shape, type(doc_term_matrix))
    print('Runtime:', timeit.default_timer() - start_time)
    
#print(doc_term_matrix)
#       (0, 299799)  3
#        :       :
# (1203681, 367201)  1

# start year: 2007
# document-term matrix: (1203682, 486546) <class 'scipy.sparse.csr.csr_matrix'>
# Runtime:  26.56719038821757

# start year: 2000
# document-term matrix: (1584098, 607181) <class 'scipy.sparse.csr.csr_matrix'>
# Runtime: 54.29142002761364

document-term matrix: (1584098, 607181) <class 'scipy.sparse.csr.csr_matrix'>
Runtime: 39.86092622112483


In [6]:
# Read vocabulary of document-term matrix
# Invert it
with bz2.BZ2File(file_storage.get_filepath('AMORE-CountVec-Vocabulary'), 'r') as file:
    vocabulary = pickle.loads(file.read())
    print('vocabulary:', len(vocabulary), type(vocabulary))
    print('example:', next(iter(vocabulary.items())))

inv_vocabulary = {v: k for k, v in vocabulary.items()}
print('inv_vocabulary:', len(inv_vocabulary), type(inv_vocabulary))
print('example:', next(iter(inv_vocabulary.items())))

tokens = list(dict(sorted(inv_vocabulary.items())).values())

# start year: 2007
# vocabulary: 486546 <class 'dict'>
# example: ('movie', 299799)
# inv_vocabulary: 486546 <class 'dict'>
# example: (299799, 'movie')

# start year: 2000
# vocabulary: 607181 <class 'dict'>
# example: ('movie', 371301)
# inv_vocabulary: 607181 <class 'dict'>
# example: (371301, 'movie')

vocabulary: 607181 <class 'dict'>
example: ('movie', 371301)
inv_vocabulary: 607181 <class 'dict'>
example: (371301, 'movie')


In [7]:
# Read count-vector-ID to review-ID mapping
with bz2.BZ2File(file_storage.get_filepath('AMORE-CountVec-VecidRevno'), 'r') as file:
    vecid_revno = pickle.loads(file.read())
    print('vectorizer ID to review no:', len(vecid_revno), type(vecid_revno))
    print('example:', next(iter(vecid_revno.items())))

# start year: 2007
# vectorizer ID to review no: 1203682 <class 'dict'>
# example: (0, 3)

# start year: 2000
# vectorizer ID to review no: 1584098 <class 'dict'>
# example: (0, 3)

vectorizer ID to review no: 1584098 <class 'dict'>
example: (0, 3)


## Access data

- Docs:
    - [docs.scipy.org scipy.sparse.csr_matrix](https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html)
    - [docs.scipy.org scipy.sparse.spmatrix](https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.spmatrix.html)
    - [docs.scipy.org sparse](https://docs.scipy.org/doc/scipy/reference/sparse.html)

In [8]:
def get_text(review_id):
    return revno_to_text[review_id]

def get_review_ids(years, stars):
    ids = []
    for year in year_star_ids:
        if year in years:
            for star in year_star_ids[year]:
                if star in stars:
                    for tup in year_star_ids[year][star]:
                        ids.append(tup[0])
    return ids

In [9]:
def delete_from_csr(mat, row_indices=[], col_indices=[]):
    """
    Remove the rows (denoted by ``row_indices``) and columns (denoted by ``col_indices``) from the CSR sparse matrix ``mat``.
    WARNING: Indices of altered axes are reset in the returned matrix

    import numpy as np
    from scipy.sparse import csr_matrix
    
    https://stackoverflow.com/a/45486349
    """
    if not isinstance(mat, csr_matrix):
        raise ValueError("works only for CSR format -- use .tocsr() first")

    rows = []
    cols = []
    if row_indices:
        rows = list(row_indices)
    if col_indices:
        cols = list(col_indices)

    if len(rows) > 0 and len(cols) > 0:
        row_mask = np.ones(mat.shape[0], dtype=bool)
        row_mask[rows] = False
        col_mask = np.ones(mat.shape[1], dtype=bool)
        col_mask[cols] = False
        return mat[row_mask][:,col_mask]
    elif len(rows) > 0:
        mask = np.ones(mat.shape[0], dtype=bool)
        mask[rows] = False
        return mat[mask]
    elif len(cols) > 0:
        mask = np.ones(mat.shape[1], dtype=bool)
        mask[cols] = False
        return mat[:,mask]
    else:
        return mat

In [26]:
class Matrix:
    """
    Sparse matrix data access and filtering.
    """

    # document-term matrix, scipy.sparse.csr.csr_matrix
    doc_term_matrix = None
    
    # matrix-document-index to review-id
    review_indices = None
    reviewids_to_reviewindices = {}
    
    # matrix-term-index to token
    token_indices = None
    token_to_tokenindices = {}
        
    def __init__(self, doc_term_matrix: csr_matrix, review_indices: list, token_indices: list):
        self.doc_term_matrix = doc_term_matrix
        
        self.review_indices = review_indices
        self.reviewids_to_reviewindices = {}
        for i, item in enumerate(self.review_indices):
            self.reviewids_to_reviewindices[item] = i
        
        self.token_indices = token_indices
        for i, item in enumerate(self.token_indices):
            self.token_to_tokenindices[item] = i
        
        print('Created:', self)
        
    def __repr__(self):
        return 'Matrix ' + str(self.doc_term_matrix.shape) + ', ' + str(len(self.review_indices)) + ' reviews, ' + str(len(self.token_indices)) + ' tokens'

    def get_token_data(self, review_id=None, review_index=None):
        """
        Gets sparse matrix data for review index or ID.
        """
        if review_index is None:
            review_index = self.reviewids_to_reviewindices[review_id]
        return self.doc_term_matrix[review_index].data
    
    def get_token_indices(self, review_id=None, review_index=None):
        """
        Gets sparse matrix indices of tokens for review index or ID.
        """
        if review_index is None:
            review_index = self.reviewids_to_reviewindices[review_id]
        return self.doc_term_matrix[review_index].indices

    def get_token_counts(self, review_id=None, review_index=None):
        """
        Returns dict (token, count) for review index or ID.
        """
        if review_index is None:
            review_index = self.reviewids_to_reviewindices[review_id]
        token_counts = {}
        token_data = self.get_token_data(review_index=review_index)
        for i, token_index in enumerate(self.get_token_indices(review_index=review_index)):
            token = self.token_indices[token_index]
            token_counts[token] = token_data[i]
        return dict(sorted(token_counts.items(), key=lambda item: item[1], reverse=True))
    
    def filter_remove_reviews(self, review_ids):
        """
        Removes given review IDs.
        """
        review_indices_remove = []
        for review_id in review_ids:
            review_indices_remove.append(self.reviewids_to_reviewindices[review_id])
        review_indices_new = self.review_indices.copy()
        for review_index_remove in sorted(review_indices_remove, reverse=True):
            review_indices_new.pop(review_index_remove)
        return Matrix(delete_from_csr(csr_matrix(self.doc_term_matrix), review_indices_remove, []), review_indices_new, self.token_indices.copy())
    
    def filter_keep_reviews(self, review_ids):
        """
        Keeps only given review IDs.
        """
        doc_indices_extract = []
        new_review_indices = []
        for review_id in review_ids:
            doc_indices_extract.append(self.reviewids_to_reviewindices[review_id])
            new_review_indices.append(review_id)

        # Filter matrix
        print('Filtering. Based on', len(review_ids), 'review IDs')
        new_doc_term_matrix = self.doc_term_matrix[doc_indices_extract,:]
        
        return Matrix(new_doc_term_matrix, new_review_indices, self.token_indices)

    def get_overall_token_occurences(self):
        """
        Returns tokens and their occurences (counted max 1 time) in all documents.
        """
        # Note (TODO): Expensive. Could be improved by using matrix instead of dict.
        
        # Count non-zero values of token-indices
        tokenindex_occurences = {}
        for tokenindex in self.doc_term_matrix.nonzero()[1]:
            if tokenindex in tokenindex_occurences:
                tokenindex_occurences[tokenindex] += 1
            else:
                tokenindex_occurences[tokenindex] = 1
                
        # Sort by values/counts
        tokenindex_occurences = dict(sorted(tokenindex_occurences.items(), key=lambda item: item[1], reverse=True))
        
        # Token-indices to tokens
        token_occurences = {}
        for item in tokenindex_occurences.items():
            token_occurences[self.token_indices[item[0]]] = item[1]
        return token_occurences

    def filter_tokens(self, tokens):
        """
        Filters matrix by a given set of tokens (e.g. positive words).
        """
        # Collect available token-indices (required to filter matrix)
        tokenindices = []
        new_token_indices = {}
        for token in tokens:
            if token in self.token_to_tokenindices:
                tokenindices.append(self.token_to_tokenindices[token])
                new_token_indices[self.token_to_tokenindices[token]] = token
        new_token_indices = dict(sorted(new_token_indices.items(), reverse=False))
        new_token_indices = list(new_token_indices.values())
        
        # Filter matrix
        new_doc_term_matrix = self.doc_term_matrix[:,tokenindices]
                
        return Matrix(new_doc_term_matrix, self.review_indices, new_token_indices)

    def filter_reviews_by_tokens(self, tokens: list) -> dict:
        """
        Returns review IDs and occurences of given tokens (counted max 1 time) in all documents.
        """
        # Note (TODO): Expensive. Could maybe be improved by using matrix instead of dict.
        token_indices = []
        for token in tokens:
            token_indices.append(self.token_to_tokenindices[token])
            
            
        reviews_to_occurences = {}
        nonzero = self.doc_term_matrix.nonzero()
        i = -1
                
        for tokenindex in nonzero[1]:
            i += 1
            if tokenindex in token_indices:
                review_index = nonzero[0][i]
                if review_index in reviews_to_occurences:
                    reviews_to_occurences[review_index] += 1
                else:
                    reviews_to_occurences[review_index] = 1
        
        reviewids_to_occurences = {}
        for item in reviews_to_occurences.items():
            reviewids_to_occurences[self.review_indices[item[0]]] = item[1]
            
        return reviewids_to_occurences

## Create matrixes

In [14]:
matrix = Matrix(doc_term_matrix, vecid_revno.values(), tokens)

Created: Matrix (1584098, 607181), 1584098 reviews, 607181 tokens


In [15]:
m_2000_neg = matrix.filter_keep_reviews(get_review_ids(years=[2005], stars=[1,2]))
m_2000_pos = matrix.filter_keep_reviews(get_review_ids(years=[2005], stars=[4,5]))
m_2001_neg = matrix.filter_keep_reviews(get_review_ids(years=[2006], stars=[1,2]))
m_2001_pos = matrix.filter_keep_reviews(get_review_ids(years=[2006], stars=[4,5]))

Filtering. Based on 17466 review IDs
Created: Matrix (17466, 607181), 17466 reviews, 607181 tokens
Filtering. Based on 90403 review IDs
Created: Matrix (90403, 607181), 90403 reviews, 607181 tokens
Filtering. Based on 16993 review IDs
Created: Matrix (16993, 607181), 16993 reviews, 607181 tokens
Filtering. Based on 99536 review IDs
Created: Matrix (99536, 607181), 99536 reviews, 607181 tokens


## Filter

#### Get tokens which occur predominantly in only one dataset: 1.) Get ratio for each token.

In [16]:
def get_ratio(matrix_a, matrix_b, min_token_occurences = -1):
    ratio = {}
    only_a = {}
    rare = {}
    a_token_occurences = matrix_a.get_overall_token_occurences()
    b_token_occurences = matrix_b.get_overall_token_occurences()
    print('Input sizes:', len(a_token_occurences), '/', len(b_token_occurences))
    for tok_occ in a_token_occurences.items():

        # Filter rare
        if min_token_occurences != -1 and \
           tok_occ[0] in b_token_occurences and \
           min_token_occurences > tok_occ[1] and \
           min_token_occurences > b_token_occurences[tok_occ[0]]:
            #rare.append(tok_occ[0])
            rare[tok_occ[0]] = tok_occ[1] + b_token_occurences[tok_occ[0]]
            b_token_occurences.pop(tok_occ[0])
            continue
            
        # Only A
        if not tok_occ[0] in b_token_occurences:
            only_a[tok_occ[0]] = tok_occ[1]
            continue
        
        # Add ratio, remove from B
        ratio_a = tok_occ[1] / len(a_token_occurences)
        ratio_b = b_token_occurences.pop(tok_occ[0]) / len(b_token_occurences)
        ratio[tok_occ[0]] = round(ratio_a / ratio_b, 2)

    ratio_inverted = {}
    for item in ratio.items():
        ratio_inverted[item[0]] = round(1 / item[1], 2)

    print('Output sizes:', 'ratio', len(ratio), '; rare', len(rare), '; only a', len(only_a), '; only b', len(b_token_occurences))
    return dict(sorted(ratio.items(), key=lambda item: item[1], reverse=True)), \
    dict(sorted(ratio_inverted.items(), key=lambda item: item[1], reverse=True)), \
    rare, \
    dict(sorted(only_a.items(), key=lambda item: item[1], reverse=True)), \
    dict(sorted(b_token_occurences.items(), key=lambda item: item[1], reverse=True))

def filter_tokens_by_ratio(ratio_results, min_ratio, max_tokens, exclude_tokens):
    tokens = []
    for item in ratio_results.items():
        if item[0] in exclude_tokens:
            continue
        if item[1] >= min_ratio:
            tokens.append(item[0])
        if len(tokens) >= max_tokens:
            break
    return tokens

### Ratio 2000

In [47]:
ratio_a, ratio_b, rare, only_a, only_b = get_ratio(
    m_2000_pos,
    m_2000_neg,
    min_token_occurences=4000)
print()
print('ratio_a', list(ratio_a.items())[:50])
print()
print('ratio_b', list(ratio_b.items())[:50])
print()
print('rare', list(rare.items())[:10])
print()
print('only_a', list(only_a.items())[:10])
print()
print('only_b', list(only_b.items())[:10])

Input sizes: 153373 / 63005
Output sizes: ratio 228 ; rare 50075 ; only a 103070 ; only b 12702

ratio_a [('excellent', 8.86), ('awesome', 8.71), ('wonderful', 8.41), ('amazing', 8.15), ('highly', 7.59), ('perfect', 7.34), ('favorite', 7.01), ('brilliant', 6.49), ('season', 5.89), ('collection', 5.38), ('beautiful', 5.22), ('enjoyed', 5.13), ('heart', 4.96), ('extras', 4.95), ('loved', 4.85), ('great', 4.62), ('episodes', 4.58), ('features', 4.57), ('best', 4.54), ('classic', 4.52), ('today', 4.49), ('episode', 4.38), ('definitely', 4.27), ('performances', 4.26), ('song', 4.24), ('family', 4.22), ('works', 4.22), ('songs', 4.19), ('shows', 4.17), ('fun', 4.08), ('love', 4.06), ('enjoy', 4.05), ('gives', 3.97), ('including', 3.85), ('drama', 3.68), ('young', 3.66), ('lives', 3.64), ('recommend', 3.58), ('father', 3.46), ('days', 3.44), ('early', 3.43), ('years', 3.36), ('job', 3.34), ('finally', 3.33), ('entertaining', 3.32), ('music', 3.3), ('performance', 3.28), ('human', 3.27), ('nic

In [66]:
# filtering top ratio words (found 16 with only some blacklisted)
filter_tokens_a = filter_tokens_by_ratio(ratio_a, 2, 16, ['highly',
                                                          'season', 'collection',
                                                          'extras', 'episodes', 'features',
                                                          'classic', 'today', 'episode', 'definitely', 'performances', 'song', 'family', 'works', 'songs', 'shows'])
print(filter_tokens_a)

# min ratio had to set down to 1.4 to get at least one word
# money seems to be a thing
filter_tokens_b = filter_tokens_by_ratio(ratio_b, 1.4, 10, ['money'])
print(filter_tokens_b)

['excellent', 'awesome', 'wonderful', 'amazing', 'perfect', 'favorite', 'brilliant', 'beautiful', 'enjoyed', 'heart', 'loved', 'great', 'best', 'fun', 'love', 'enjoy']
['bad']


### Ratio 2001

In [67]:
ratio_a, ratio_b, rare, only_a, only_b = get_ratio(
    m_2001_pos,
    m_2001_neg,
    min_token_occurences=4000)
print()
print('ratio_a', list(ratio_a.items())[:50])
print('ratio_a', list(ratio_a.keys())[:50])
print()
print('ratio_b', list(ratio_b.items())[:50])
print()
print('rare', list(rare.items())[:10])
print()
print('only_a', list(only_a.items())[:10])
print()
print('only_b', list(only_b.items())[:10])

Input sizes: 152728 / 61592
Output sizes: ratio 227 ; rare 49486 ; only a 103015 ; only b 11879

ratio_a [('awesome', 10.43), ('wonderful', 9.68), ('excellent', 9.32), ('amazing', 9.26), ('highly', 9.03), ('perfect', 7.76), ('fantastic', 7.75), ('favorite', 7.61), ('season', 7.14), ('collection', 6.17), ('loved', 6.09), ('enjoyed', 5.87), ('heart', 5.49), ('episodes', 5.44), ('great', 5.17), ('fun', 5.0), ('features', 4.96), ('classic', 4.94), ('best', 4.92), ('beautiful', 4.89), ('shows', 4.64), ('enjoy', 4.64), ('definitely', 4.56), ('episode', 4.46), ('love', 4.39), ('today', 4.38), ('songs', 4.28), ('recommend', 4.26), ('lives', 4.18), ('family', 4.14), ('works', 4.05), ('performances', 4.02), ('drama', 3.99), ('happy', 3.96), ('gives', 3.94), ('series', 3.8), ('entertaining', 3.75), ('including', 3.65), ('music', 3.64), ('performance', 3.61), ('live', 3.59), ('father', 3.59), ('early', 3.58), ('young', 3.53), ('john', 3.53), ('job', 3.46), ('nice', 3.46), ('history', 3.45), ('year

#### Get tokens which occur predominantly in only one dataset: 2.) Get top tokens.

In [74]:
filter_tokens_a = filter_tokens_by_ratio(ratio_a, 2, 16, ['highly',
                                                          'season', 'collection',
                                                          'episodes'])
print(filter_tokens_a)

filter_tokens_b = filter_tokens_by_ratio(ratio_b, 1.3, 10, ['money'])
print(filter_tokens_b)

['awesome', 'wonderful', 'excellent', 'amazing', 'perfect', 'fantastic', 'favorite', 'loved', 'enjoyed', 'heart', 'great', 'fun', 'features', 'classic', 'best', 'beautiful']
['money', 'bad']


#### Create new matrix only consisting of filtered tokens.

In [20]:
matrix_filtered = matrix.filter_tokens(filter_tokens_a + filter_tokens_b)


Created: Matrix (1584098, 14), 1584098 reviews, 14 tokens
['amazing', 'awesome', 'bad', 'beautiful', 'best', 'brilliant', 'enjoyed', 'excellent', 'favorite', 'great', 'heart', 'loved', 'perfect', 'wonderful'] 14


In [31]:
matrix_filtered_pos = matrix_filtered.filter_keep_reviews(get_review_ids(years=[2005], stars=[4,5]))

Filtering. Based on 90403 review IDs
Created: Matrix (90403, 14), 90403 reviews, 14 tokens


In [35]:
matrix_filtered_neg = matrix_filtered.filter_keep_reviews(get_review_ids(years=[2005], stars=[1,2]))

Filtering. Based on 17466 review IDs
Created: Matrix (17466, 14), 17466 reviews, 14 tokens


- make lists of words (Atok Btok) to be used for filtering for both distributions
- add review to list e.g. if at least 20% or 5 tokens of Atok are included and less than 10% or 2 tokens of Btok are included.

TODO:

-  Sum up occurences

In [32]:
# in: tokens
# out: revId to counts of all tokens
# get token indices
# iterate nonzero[0] and filter by if token_index searched
# check nonzero[1] which is revIndex and create revIndex to number of token found.

reviews_token_occurences = matrix_filtered_pos.filter_reviews_by_tokens(filter_tokens_a)
print(len(reviews_token_occurences))

69239


In [33]:
# Check dict review-id to number of occurences
if False:
    it = iter(reviews_token_occurences.items())
    print(next(it))
    print(next(it))
    print(filter_tokens_a)
    get_text(6589)

In [13]:
# Tests for matrix class

def print_matrix_info(m, t=8):
    print('Review IDs ', list(m.reviewids_to_reviewindices.keys()))
    print('Token IDs', t,  list(m.token_to_tokenindices.keys())[:t])

def get_test_matrix(print_info=False):
    r = list(vecid_revno.values())[:8]
    t = list(tokens)
    c = doc_term_matrix[:8]
    if print_info:
        print('r    ', r)
        print('t[:8]', t[:8])
    m = Matrix(c, r, t)
    return m

if False:
    # Test matrix generation
    m = get_test_matrix(True)
    print_matrix_info(m)
    m = None

if False:
    # Test filter_remove_reviews()
    m = get_test_matrix()
    revs = list(m.reviewids_to_reviewindices.keys())
    rem = revs.copy()
    print_matrix_info(m)
    print(m.get_token_data(revs[0]))
    print(m.get_token_data(revs[7]))
    print()
    rem.pop(7)
    rem.pop(5)
    rem.pop(0)
    print('Revs to remove', rem)
    m = m.filter_remove_reviews(rem)
    print_matrix_info(m)
    print(m.get_token_data(revs[0]))
    print(m.get_token_data(revs[7]))
    m = None
    revs = None
    rem = None

if False:
    # Test filter_keep_reviews()
    m = get_test_matrix()
    revs = list(m.reviewids_to_reviewindices.keys())
    print_matrix_info(m)
    print(m.get_token_data(revs[1]))
    print(m.get_token_data(revs[6]))
    print()
    revs.pop(7)
    revs.pop(5)
    revs.pop(0)
    print('Revs to keep', revs)
    m = m.filter_keep_reviews(revs)
    print_matrix_info(m)
    print(m.get_token_data(revs[0]))
    print(m.get_token_data(revs[4]))
    m = None
    revs = None
    
# print_matrix_info = None
# get_test_matrix = None