# AMORE datasets

- Note: Years 1997 to 1999 contain only a few reviews and are not included in Doc2Vec embeddings.  
  (The years 2000 to 2012 are included.)

In [1]:
# Reload modules every time before executing the Python code typed
%load_ext autoreload
%autoreload 2

# Import from project root
import sys; sys.path.insert(0, '../')

import bz2
import pickle
import timeit
import numpy as np
from scipy.sparse import csr_matrix

from access.file_storage import FileStorage
from access.interim_storage import InterimStorage
from amore.amazon_reviews_reader import AmazonReviewsReader
from amore.opinion_lexicon import OpinionLexicon

## Read data

* **opinion_lexicon**: Instance of OpinionLexicon to access negative and positive words. Not used for now.
* **year_star_ids**: Collection of IDs sorted by years and stars. Used afterwards to filter by stars and years in method **get_review_ids**.
* **reader**: Instance of AmazonReviewsReader to access review data. Used afterwards to create revno_to_text.
* **revno_to_text**: Dictionary review-number to full text. Used afterwards to access review-texts in method **get_text**.

In [2]:
# For multiple usage afterwards

file_storage = FileStorage()

opinion_lexicon = OpinionLexicon(file_storage.get_filepath('opinion-words'))
print('negative words:', len(opinion_lexicon.get_negative_set()))
print('positive words:', len(opinion_lexicon.get_positive_set()))
# negative words: 4783
# positive words: 2006

negative words: 4783
positive words: 2006


In [3]:
# Read deduplicated review Ids
with bz2.BZ2File(file_storage.get_filepath('deduplicated'), 'r') as file:
    year_star_ids = pickle.loads(file.read())
print('Available years:', sorted(year_star_ids.keys()))
print('Example stars:  ', sorted(year_star_ids[2007].keys()))
print('Example entry:  ', year_star_ids[2007][1][0])
count = 0
for year in year_star_ids:
    for star in year_star_ids[year]:
        count += len(year_star_ids[year][star])
print('Reviews:', count)

# Available years: Available years: [1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012]
# Example stars:   [1, 2, 3, 4, 5]
# Example entry:   [4368, 2007, 1]
# Reviews: 1727821

Available years: [1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012]
Example stars:   [1, 2, 3, 4, 5]
Example entry:   [4368, 2007, 1]
Reviews: 1727821


In [4]:
# Read review texts
min_year = 2000
max_docs = -1
start_time = timeit.default_timer()
reader = AmazonReviewsReader(file_storage.get_filepath('amazon_gz_file'), AmazonReviewsReader.MODE_TYPED, min_year=min_year, max_docs=max_docs)
revno_to_text = {}
def get_texts(item):
    return (item[AmazonReviewsReader.KEY_SUMMARY] + " " + item[AmazonReviewsReader.KEY_TEXT]).replace('<br />', ' ')
for item in reader:
    revno_to_text[item[AmazonReviewsReader.KEY_NUMBER]] = get_texts(item)
print('Texts:', len(revno_to_text))
print('Runtime:', timeit.default_timer() - start_time)

# start year: 2007
# Texts: 4662381
# Runtime: 265.2943881880492

# start year: 2000
# Texts: 7827594
# Runtime: 312.4320105519146

Texts: 7827594
Runtime: 291.32208580401493


### Read document-term matrix files

Data required to create **Matrix** instances afterwards.

* **doc_term_matrix**: Matrix containing document-IDs and term-IDs. Used afterwards for building Matrix.
* **vocabulary**: Dictionary containing tuples of terms and term-IDs. Used afterwards to create inv_vocabulary.
* **inv_vocabulary**: Dictionary containing tuples of term-IDs and terms.Used afterwards to create tokens.
* **tokens**: Ordered list of terms, index represents term-ID. Used afterwards as token-indices for building Matrix.
* **vecid_revno**: Values used afterwards as review-indices for building Matrix.

In [5]:
# Read document-term matrix
start_time = timeit.default_timer()
with bz2.BZ2File(file_storage.get_filepath('AMORE-CountVec-DocTermMatrix'), 'r') as file:
    doc_term_matrix = pickle.loads(file.read())
    print('document-term matrix:', doc_term_matrix.shape, type(doc_term_matrix))
    print('Runtime:', timeit.default_timer() - start_time)
    
#print(doc_term_matrix)
#       (0, 299799)  3
#        :       :
# (1203681, 367201)  1

# start year: 2007
# document-term matrix: (1203682, 486546) <class 'scipy.sparse.csr.csr_matrix'>
# Runtime:  26.56719038821757

# start year: 2000
# document-term matrix: (1584098, 607181) <class 'scipy.sparse.csr.csr_matrix'>
# Runtime: 54.29142002761364

document-term matrix: (1584098, 607181) <class 'scipy.sparse.csr.csr_matrix'>
Runtime: 37.48579945595702


In [6]:
# Read vocabulary of document-term matrix
# Invert it
with bz2.BZ2File(file_storage.get_filepath('AMORE-CountVec-Vocabulary'), 'r') as file:
    vocabulary = pickle.loads(file.read())
    print('vocabulary:', len(vocabulary), type(vocabulary))
    print('example:', next(iter(vocabulary.items())))

inv_vocabulary = {v: k for k, v in vocabulary.items()}
print('inv_vocabulary:', len(inv_vocabulary), type(inv_vocabulary))
print('example:', next(iter(inv_vocabulary.items())))

tokens = list(dict(sorted(inv_vocabulary.items())).values())

# start year: 2007
# vocabulary: 486546 <class 'dict'>
# example: ('movie', 299799)
# inv_vocabulary: 486546 <class 'dict'>
# example: (299799, 'movie')

# start year: 2000
# vocabulary: 607181 <class 'dict'>
# example: ('movie', 371301)
# inv_vocabulary: 607181 <class 'dict'>
# example: (371301, 'movie')

vocabulary: 607181 <class 'dict'>
example: ('movie', 371301)
inv_vocabulary: 607181 <class 'dict'>
example: (371301, 'movie')


In [7]:
# Read count-vector-ID to review-ID mapping
with bz2.BZ2File(file_storage.get_filepath('AMORE-CountVec-VecidRevno'), 'r') as file:
    vecid_revno = pickle.loads(file.read())
    print('vectorizer ID to review no:', len(vecid_revno), type(vecid_revno))
    print('example:', next(iter(vecid_revno.items())))

# start year: 2007
# vectorizer ID to review no: 1203682 <class 'dict'>
# example: (0, 3)

# start year: 2000
# vectorizer ID to review no: 1584098 <class 'dict'>
# example: (0, 3)

vectorizer ID to review no: 1584098 <class 'dict'>
example: (0, 3)


## Data access methods and Matrix class

- Docs:
    - [docs.scipy.org scipy.sparse.csr_matrix](https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html)
    - [docs.scipy.org scipy.sparse.spmatrix](https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.spmatrix.html)
    - [docs.scipy.org sparse](https://docs.scipy.org/doc/scipy/reference/sparse.html)

In [8]:
def get_text(review_id):
    return revno_to_text[review_id]

def get_review_ids(years, stars):
    ids = []
    for year in year_star_ids:
        if year in years:
            for star in year_star_ids[year]:
                if star in stars:
                    for tup in year_star_ids[year][star]:
                        ids.append(tup[0])
    return ids

In [9]:
# Used afterwards in Matrix class.

def delete_from_csr(mat, row_indices=[], col_indices=[]):
    """
    Remove the rows (denoted by ``row_indices``) and columns (denoted by ``col_indices``) from the CSR sparse matrix ``mat``.
    WARNING: Indices of altered axes are reset in the returned matrix

    import numpy as np
    from scipy.sparse import csr_matrix
    
    https://stackoverflow.com/a/45486349
    """
    if not isinstance(mat, csr_matrix):
        raise ValueError("works only for CSR format -- use .tocsr() first")

    rows = []
    cols = []
    if row_indices:
        rows = list(row_indices)
    if col_indices:
        cols = list(col_indices)

    if len(rows) > 0 and len(cols) > 0:
        row_mask = np.ones(mat.shape[0], dtype=bool)
        row_mask[rows] = False
        col_mask = np.ones(mat.shape[1], dtype=bool)
        col_mask[cols] = False
        return mat[row_mask][:,col_mask]
    elif len(rows) > 0:
        mask = np.ones(mat.shape[0], dtype=bool)
        mask[rows] = False
        return mat[mask]
    elif len(cols) > 0:
        mask = np.ones(mat.shape[1], dtype=bool)
        mask[cols] = False
        return mat[:,mask]
    else:
        return mat

In [185]:
class Matrix:
    """
    Sparse matrix data access and filtering.
    """

    # document-term matrix, scipy.sparse.csr.csr_matrix
    doc_term_matrix = None
    
    # matrix-document-index to review-id
    review_indices = None
    reviewids_to_reviewindices = {}
    
    # matrix-term-index to token
    token_indices = None # list of tokens
    token_to_tokenindices = {}
        
    def __init__(self, doc_term_matrix: csr_matrix, review_indices: list, token_indices: list):
        self.doc_term_matrix = doc_term_matrix
        
        self.review_indices = review_indices
        self.reviewids_to_reviewindices = {}
        for i, item in enumerate(self.review_indices):
            self.reviewids_to_reviewindices[item] = i
        
        self.token_indices = token_indices
        for i, item in enumerate(self.token_indices):
            self.token_to_tokenindices[item] = i
        
        print('Created:', self)
        
    def __repr__(self):
        return 'Matrix ' + str(self.doc_term_matrix.shape) + ', ' + str(len(self.review_indices)) + ' reviews, ' + str(len(self.token_indices)) + ' tokens'

    def get_token_data(self, review_id=None, review_index=None):
        """
        Gets sparse matrix data for review index or ID.
        """
        if review_index is None:
            review_index = self.reviewids_to_reviewindices[review_id]
        return self.doc_term_matrix[review_index].data
    
    def get_token_indices(self, review_id=None, review_index=None):
        """
        Gets sparse matrix indices of tokens for review index or ID.
        """
        if review_index is None:
            review_index = self.reviewids_to_reviewindices[review_id]
        return self.doc_term_matrix[review_index].indices

    def get_token_counts(self, review_id=None, review_index=None):
        """
        Returns dict (token, count) for review index or ID.
        """
        if review_index is None:
            review_index = self.reviewids_to_reviewindices[review_id]
        token_counts = {}
        token_data = self.get_token_data(review_index=review_index)
        for i, token_index in enumerate(self.get_token_indices(review_index=review_index)):
            token = self.token_indices[token_index]
            token_counts[token] = token_data[i]
        return dict(sorted(token_counts.items(), key=lambda item: item[1], reverse=True))
    
    def filter_remove_reviews(self, review_ids):
        """
        Removes given review IDs.
        """
        review_indices_remove = []
        for review_id in review_ids:
            review_indices_remove.append(self.reviewids_to_reviewindices[review_id])
        review_indices_new = self.review_indices.copy()
        for review_index_remove in sorted(review_indices_remove, reverse=True):
            review_indices_new.pop(review_index_remove)
        return Matrix(delete_from_csr(csr_matrix(self.doc_term_matrix), review_indices_remove, []), review_indices_new, self.token_indices.copy())
    
    def filter_keep_reviews(self, review_ids):
        """
        Keeps only given review IDs.
        """
        doc_indices_extract = []
        new_review_indices = []
        for review_id in review_ids:
            doc_indices_extract.append(self.reviewids_to_reviewindices[review_id])
            new_review_indices.append(review_id)

        # Filter matrix
        print('Filtering. Based on', len(review_ids), 'review IDs')
        new_doc_term_matrix = self.doc_term_matrix[doc_indices_extract,:]
        
        return Matrix(new_doc_term_matrix, new_review_indices, self.token_indices)

    def get_overall_token_occurences(self):
        """
        Returns tokens and their occurences (counted max 1 time) in all documents.
        """
        # Count non-zero values of token-indices
        tokenindex_occurences = {}
        for tokenindex in self.doc_term_matrix.nonzero()[1]:
            if tokenindex in tokenindex_occurences:
                tokenindex_occurences[tokenindex] += 1
            else:
                tokenindex_occurences[tokenindex] = 1
                
        # Sort by values/counts
        tokenindex_occurences = dict(sorted(tokenindex_occurences.items(), key=lambda item: item[1], reverse=True))
        
        # Token-indices to tokens
        token_occurences = {}
        for item in tokenindex_occurences.items():
            token_occurences[self.token_indices[item[0]]] = item[1]
        return token_occurences

    def filter_tokens(self, tokens):
        """
        Filters matrix by a given set of tokens (e.g. positive words).
        """
        # Collect available token-indices (required to filter matrix)
        tokenindices = []
        new_token_indices = {}
        for token in tokens:
            if token in self.token_to_tokenindices:
                tokenindices.append(self.token_to_tokenindices[token])
                new_token_indices[self.token_to_tokenindices[token]] = token
        new_token_indices = dict(sorted(new_token_indices.items(), reverse=False))
        new_token_indices = list(new_token_indices.values())
        
        # Filter matrix
        new_doc_term_matrix = self.doc_term_matrix[:,tokenindices]
                
        return Matrix(new_doc_term_matrix, self.review_indices, new_token_indices)

    def filter_reviews_by_tokens(self, tokens: list) -> dict:
        """
        Returns review IDs and occurences of given tokens (counted max 1 time) in all documents.
        """
        token_indices = []
        for token in tokens:
            token_indices.append(self.token_to_tokenindices[token])
            
        reviews_to_occurences = {}
        nonzero = self.doc_term_matrix.nonzero()
        i = -1  
        for tokenindex in nonzero[1]:
            i += 1
            if tokenindex in token_indices:
                review_index = nonzero[0][i]
                if review_index in reviews_to_occurences:
                    reviews_to_occurences[review_index] += 1
                else:
                    reviews_to_occurences[review_index] = 1
        
        reviewids_to_occurences = {}
        for item in reviews_to_occurences.items():
            reviewids_to_occurences[self.review_indices[item[0]]] = item[1]
            
        return reviewids_to_occurences
    
    def count_token_occurences(self, review_ids, tokens):
        """
        Sums up the occurencies of given tokens in given reviews.
        """
        review_to_tokenoccurences = {}
        token_indices = []
        for token in tokens:
            token_indices.append(self.token_to_tokenindices[token])
        for review_id in review_ids:
            if not review_id in review_ids:
                continue
            review_to_tokenoccurences[review_id] = 0
            for counts in self.get_token_data(review_id=review_id):
                review_to_tokenoccurences[review_id] += 1
        return review_to_tokenoccurences

## Create base matrixes

In [193]:
# Main matrix containing all data
matrix = Matrix(doc_term_matrix, vecid_revno.values(), tokens)

Created: Matrix (1584098, 607181), 1584098 reviews, 607181 tokens


In [194]:
ids_neg = get_review_ids(years=[2005, 2006], stars=[1,2])
ids_pos = get_review_ids(years=[2005, 2006], stars=[4,5])
m_neg = matrix.filter_keep_reviews(ids_neg)
m_pos = matrix.filter_keep_reviews(ids_pos)
m_both = matrix.filter_keep_reviews(ids_neg + ids_pos)

Filtering. Based on 34459 review IDs
Created: Matrix (34459, 607181), 34459 reviews, 607181 tokens
Filtering. Based on 189939 review IDs
Created: Matrix (189939, 607181), 189939 reviews, 607181 tokens
Filtering. Based on 224398 review IDs
Created: Matrix (224398, 607181), 224398 reviews, 607181 tokens


## Filter

#### Get tokens which occur predominantly in only one dataset: Get ratio for each token.

* **get_ratio**
  * In: matrix-A, matrix-B
  * In: min_token_occurences (token has to occure at least x times in each of both matrixes)
  * Out: tokens sorted by ratio
  * Out: tokens sorted by inverted ratio
  * Out: tokens not reached minimum
  * Out: tokens only in A
  * Out: tokens only in B
* **filter_tokens_by_ratio**
  * In: ratio (generated by method get_ratio)
  * In: min_ratio (minimum threshold)
  * In: max_tokens (maximum number of tokens to return)
  * In: exclude_tokens (tokens not to return)

In [195]:
def get_ratio(matrix_a, matrix_b, min_token_occurences = -1):
    ratio = {}
    only_a = {}
    rare = {}
    # Pairs of tokens and occurences
    a_token_occurences = matrix_a.get_overall_token_occurences()
    b_token_occurences = matrix_b.get_overall_token_occurences()
    print('Input sizes:', len(a_token_occurences), '/', len(b_token_occurences))
    for tok_occ in a_token_occurences.items():

        # Filter/remove rare:
        # minimum is set AND
        # token has to be in both matrixes AND
        # threshold reached in A AND
        # threshod reached in B AND
        if min_token_occurences != -1 and \
           tok_occ[0] in b_token_occurences and \
           min_token_occurences > tok_occ[1] and \
           min_token_occurences > b_token_occurences[tok_occ[0]]:
            rare[tok_occ[0]] = str(tok_occ[1]) + " / " + str(b_token_occurences[tok_occ[0]])
            b_token_occurences.pop(tok_occ[0])
            continue
            
        # Only A
        if not tok_occ[0] in b_token_occurences:
            only_a[tok_occ[0]] = tok_occ[1]
            continue
        
        # Add ratio, remove from B
        ratio_a = tok_occ[1] / len(a_token_occurences)
        ratio_b = b_token_occurences.pop(tok_occ[0]) / len(b_token_occurences)
        ratio[tok_occ[0]] = round(ratio_a / ratio_b, 2)

    ratio_inverted = {}
    for item in ratio.items():
        ratio_inverted[item[0]] = round(1 / item[1], 2)

    print('Output sizes:', 'ratio', len(ratio), '; rare', len(rare), '; only a', len(only_a), '; only b', len(b_token_occurences))
    return dict(sorted(ratio.items(), key=lambda item: item[1], reverse=True)), \
    dict(sorted(ratio_inverted.items(), key=lambda item: item[1], reverse=True)), \
    rare, \
    dict(sorted(only_a.items(), key=lambda item: item[1], reverse=True)), \
    dict(sorted(b_token_occurences.items(), key=lambda item: item[1], reverse=True))

def filter_tokens_by_ratio(ratio_results, min_ratio, max_tokens, exclude_tokens):
    tokens = []
    for item in ratio_results.items():
        if item[0] in exclude_tokens:
            continue
        if item[1] >= min_ratio:
            tokens.append(item[0])
        if len(tokens) >= max_tokens:
            break
    return tokens

# Get tokens to use

In [196]:
ratio_a, ratio_b, rare, only_a, only_b = get_ratio(
    m_pos,
    m_neg,
    min_token_occurences=10*1000)
print()
print('ratio_a', list(ratio_a.items())[:50])
print()
print('ratio_b', list(ratio_b.items())[:50])
print()
print('rare', list(rare.items())[:10])
print()
print('only_a', list(only_a.items())[:10])
print()
print('only_b', list(only_b.items())[:10])

Input sizes: 217041 / 86377
Output sizes: ratio 159 ; rare 68141 ; only a 148741 ; only b 18077

ratio_a [('excellent', 8.89), ('wonderful', 8.82), ('amazing', 8.47), ('highly', 8.11), ('perfect', 7.38), ('favorite', 7.14), ('season', 6.34), ('collection', 5.64), ('enjoyed', 5.39), ('loved', 5.33), ('heart', 5.1), ('beautiful', 4.94), ('episodes', 4.86), ('great', 4.79), ('features', 4.65), ('classic', 4.62), ('best', 4.61), ('fun', 4.42), ('episode', 4.33), ('definitely', 4.32), ('shows', 4.3), ('enjoy', 4.24), ('love', 4.13), ('family', 4.09), ('performances', 4.05), ('gives', 3.87), ('recommend', 3.82), ('young', 3.52), ('entertaining', 3.46), ('series', 3.41), ('music', 3.39), ('performance', 3.35), ('live', 3.33), ('years', 3.32), ('job', 3.32), ('nice', 3.29), ('john', 3.26), ('set', 3.23), ('true', 3.2), ('finally', 3.2), ('truly', 3.2), ('world', 3.19), ('different', 3.18), ('dvd', 3.16), ('especially', 3.13), ('history', 3.12), ('life', 3.1), ('fans', 3.08), ('cast', 3.06), ('

In [197]:
filter_tokens_a = filter_tokens_by_ratio(ratio_a, 2, 10,
['highly', 'season', 'collection', 'heart', 'episodes', 'features', 'classic', 'episode', 'definitely', 'shows', 'family', 'performances', 'gives', 'recommend', 'young', 'series',
 'music', 'performance', 'live', 'years', 'job', 'john', 'set', 'true', 'finally', 'truly', 'world', 'different', 'dvd', 'especially', 'history'
])
print(filter_tokens_a)

filter_tokens_b = filter_tokens_by_ratio(ratio_b, 1.25, 10, [])
print(filter_tokens_b)

['excellent', 'wonderful', 'amazing', 'perfect', 'favorite', 'enjoyed', 'loved', 'beautiful', 'great', 'best']
['bad']


#### Create new matrix only consisting of filtered tokens.

In [198]:
# Specify source data
matrix_filtered     = m_both.filter_tokens(filter_tokens_a + filter_tokens_b)
matrix_filtered_neg = matrix_filtered.filter_keep_reviews(get_review_ids(years=[2005], stars=[1,2]))
matrix_filtered_pos = matrix_filtered.filter_keep_reviews(get_review_ids(years=[2005], stars=[4,5]))
del matrix_filtered

Created: Matrix (224398, 11), 224398 reviews, 11 tokens
Filtering. Based on 17466 review IDs
Created: Matrix (17466, 11), 17466 reviews, 11 tokens
Filtering. Based on 90403 review IDs
Created: Matrix (90403, 11), 90403 reviews, 11 tokens


In [211]:
reviews_a = {}
for review_item in matrix_filtered_neg.count_token_occurences(get_review_ids(years=[2005], stars=[1,2]), filter_tokens_a).items():
    if review_item[1] == 0:
        reviews_a[review_item[0]] = review_item[1]
print(len(reviews_a))

8192


In [212]:
reviews_b = {}
for review_item in matrix_filtered_pos.count_token_occurences(get_review_ids(years=[2005], stars=[4,5]), filter_tokens_a).items():
    if review_item[1] > 0:
        reviews_b[review_item[0]] = review_item[1]
print(len(reviews_b))

68371


TODO:  
To be able to finally argue that a selection is based on a set of terms, those terms should be exactly defined.  
The ratio of word usage is already known here.  
Positive words with a high ratio should be used predominantly in positive sets.  
Positive words with a ratio higher a little larger than 1 should be omitted, i.e. not included overall.  
Positive words with a high ratio should not be used for negative reviews.

In [192]:
# Check dict review-id to number of occurences
if True:
    print('a', list(reviews_a.keys())[0:9])
    print('b', list(reviews_b)[0:9])
    print(get_text(589196))

a [13953, 31987, 58505, 58531, 74958, 76559, 88261, 88957, 93935]
b [10659, 10660, 10663, 10664, 13955, 51663, 54703, 58455, 71825]
Tundra-Intensive Blatant "Jaws" Ripoff Words can't describe how ludicrous and painful the whole "Snowbeast" experience is. Retailing at around $4.00, this is one of the most overpriced DVDs that I have ever seen.  The concept is that a Bigfoot-like monster is haunting a Colorado ski resort immediately prior to the annual "winter carnival," and is shredding the skiers. This movie, made for TV in 1977, is such a blatant "Jaws" knockoff that I imagine the only reason the makers of "Jaws" didn't sue, was because they were convulsing with laughter. Where to start? Well the music is as good as any other place...it is a wonderful tribute to the "Jaws" score; the "we have to close the beach" theme from "Jaws" transmogrifies into "we have to close the winter carnival"; instead of the shark scuttling the boat, the creature rolls a bunch of logs down a hill and knock

In [170]:
# Tests for matrix class

def print_matrix_info(m, t=8):
    print('Review IDs ', list(m.reviewids_to_reviewindices.keys()))
    print('Token IDs', t,  list(m.token_to_tokenindices.keys())[:t])

def get_test_matrix(print_info=False):
    r = list(vecid_revno.values())[:8]
    t = list(tokens)
    c = doc_term_matrix[:8]
    if print_info:
        print('r    ', r)
        print('t[:8]', t[:8])
    m = Matrix(c, r, t)
    return m

if False:
    # Test matrix generation
    m = get_test_matrix(True)
    print_matrix_info(m)
    m = None

if False:
    # Test filter_remove_reviews()
    m = get_test_matrix()
    revs = list(m.reviewids_to_reviewindices.keys())
    rem = revs.copy()
    print_matrix_info(m)
    print(m.get_token_data(revs[0]))
    print(m.get_token_data(revs[7]))
    print()
    rem.pop(7)
    rem.pop(5)
    rem.pop(0)
    print('Revs to remove', rem)
    m = m.filter_remove_reviews(rem)
    print_matrix_info(m)
    print(m.get_token_data(revs[0]))
    print(m.get_token_data(revs[7]))
    m = None
    revs = None
    rem = None

if False:
    # Test filter_keep_reviews()
    m = get_test_matrix()
    revs = list(m.reviewids_to_reviewindices.keys())
    print_matrix_info(m)
    print('1', m.get_token_data(revs[1]))
    print('6', m.get_token_data(revs[6]))
    print()
    revs.pop(7)
    revs.pop(5)
    revs.pop(0)
    print('Revs to keep', revs)
    m = m.filter_keep_reviews(revs)
    print_matrix_info(m)
    print('0', m.get_token_data(revs[0]))
    print('4', m.get_token_data(revs[4]))
    m = None
    revs = None
    
# print_matrix_info = None
# get_test_matrix = None