# AMORE datasets

- Note: Years 1997 to 1999 are not included in Doc2Vec embeddings. (2000 to 2012 included.)
- Ideas:
    - 100/0 to 0/100 neg/pos
    - 50/50 to 40/60 neg/pos
    - build on results on that: other distributions, e.g. 45/55
    - 50/50 to 40/30/30 neg/posCluster1/posCluster2
    - for token-level and document-level evaluation
- Docs:
    - [docs.scipy.org scipy.sparse.csr_matrix](https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html)
    - [docs.scipy.org scipy.sparse.spmatrix](https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.spmatrix.html)
    - [docs.scipy.org sparse](https://docs.scipy.org/doc/scipy/reference/sparse.html)

In [1]:
# Reload modules every time before executing the Python code typed
%load_ext autoreload
%autoreload 2

# Import from project root
import sys; sys.path.insert(0, '../')

import bz2
import pickle
import timeit
import numpy as np
from scipy.sparse import csr_matrix

from access.file_storage import FileStorage
from access.interim_storage import InterimStorage
from amore.amazon_reviews_reader import AmazonReviewsReader
from amore.opinion_lexicon import OpinionLexicon

In [2]:
# For multiple usage afterwards

file_storage = FileStorage()

opinion_lexicon = OpinionLexicon(file_storage.get_filepath('opinion-words'))
print('negative words:', len(opinion_lexicon.get_negative_set()))
print('positive words:', len(opinion_lexicon.get_positive_set()))
# negative words: 4783
# positive words: 2006

negative words: 4783
positive words: 2006


## Read data

In [3]:
# Read deduplicated review Ids
with bz2.BZ2File(file_storage.get_filepath('deduplicated'), 'r') as file:
    year_star_ids = pickle.loads(file.read())
print('Available years:', sorted(year_star_ids.keys()))
print('Example stars:  ', sorted(year_star_ids[2007].keys()))
print('Example entry:  ', year_star_ids[2007][1][0])
count = 0
for year in year_star_ids:
    for star in year_star_ids[year]:
        count += len(year_star_ids[year][star])
print('Reviews:', count)

# Available years: Available years: [1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012]
# Example stars:   [1, 2, 3, 4, 5]
# Example entry:   [4368, 2007, 1]
# Reviews: 1727821

Available years: [1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012]
Example stars:   [1, 2, 3, 4, 5]
Example entry:   [4368, 2007, 1]
Reviews: 1727821


In [4]:
# Read review texts
min_year = 2000
max_docs = -1
start_time = timeit.default_timer()
reader = AmazonReviewsReader(file_storage.get_filepath('amazon_gz_file'), AmazonReviewsReader.MODE_TYPED, min_year=min_year, max_docs=max_docs)
revno_to_text = {}
def get_texts(item):
    return (item[AmazonReviewsReader.KEY_SUMMARY] + " " + item[AmazonReviewsReader.KEY_TEXT]).replace('<br />', ' ')
for item in reader:
    revno_to_text[item[AmazonReviewsReader.KEY_NUMBER]] = get_texts(item)
print('Texts:', len(revno_to_text))
print('Runtime:', timeit.default_timer() - start_time)

# start year: 2007
# Texts: 4662381
# Runtime: 265.2943881880492

# start year: 2000
# Texts: 7827594
# Runtime: 312.4320105519146

Texts: 7827594
Runtime: 324.32746749557555


### Read document-term matrix files

In [5]:
# Read document-term matrix
start_time = timeit.default_timer()
with bz2.BZ2File(file_storage.get_filepath('AMORE-CountVec-DocTermMatrix'), 'r') as file:
    doc_term_matrix = pickle.loads(file.read())
    print('document-term matrix:', doc_term_matrix.shape, type(doc_term_matrix))
    print('Runtime:', timeit.default_timer() - start_time)
    
#print(doc_term_matrix)
#       (0, 299799)  3
#        :       :
# (1203681, 367201)  1

# start year: 2007
# document-term matrix: (1203682, 486546) <class 'scipy.sparse.csr.csr_matrix'>
# Runtime:  26.56719038821757

# start year: 2000
# document-term matrix: (1584098, 607181) <class 'scipy.sparse.csr.csr_matrix'>
# Runtime: 54.29142002761364

document-term matrix: (1584098, 607181) <class 'scipy.sparse.csr.csr_matrix'>
Runtime: 38.51574709266424


In [6]:
# Read vocabulary of document-term matrix
# Invert it
with bz2.BZ2File(file_storage.get_filepath('AMORE-CountVec-Vocabulary'), 'r') as file:
    vocabulary = pickle.loads(file.read())
    print('vocabulary:', len(vocabulary), type(vocabulary))
    print('example:', next(iter(vocabulary.items())))

inv_vocabulary = {v: k for k, v in vocabulary.items()}
print('inv_vocabulary:', len(inv_vocabulary), type(inv_vocabulary))
print('example:', next(iter(inv_vocabulary.items())))

# start year: 2007
# vocabulary: 486546 <class 'dict'>
# example: ('movie', 299799)
# inv_vocabulary: 486546 <class 'dict'>
# example: (299799, 'movie')

# start year: 2000
# vocabulary: 607181 <class 'dict'>
# example: ('movie', 371301)
# inv_vocabulary: 607181 <class 'dict'>
# example: (371301, 'movie')

vocabulary: 607181 <class 'dict'>
example: ('movie', 371301)
inv_vocabulary: 607181 <class 'dict'>
example: (371301, 'movie')


In [7]:
# Read count-vector-ID to review-ID mapping
with bz2.BZ2File(file_storage.get_filepath('AMORE-CountVec-VecidRevno'), 'r') as file:
    vecid_revno = pickle.loads(file.read())
    print('vectorizer ID to review no:', len(vecid_revno), type(vecid_revno))
    print('example:', next(iter(vecid_revno.items())))

# start year: 2007
# vectorizer ID to review no: 1203682 <class 'dict'>
# example: (0, 3)

# start year: 2000
# vectorizer ID to review no: 1584098 <class 'dict'>
# example: (0, 3)

vectorizer ID to review no: 1584098 <class 'dict'>
example: (0, 3)


## Access data

In [8]:
def get_text(review_id):
    return revno_to_text[review_id]

def get_review_ids(years, stars):
    ids = []
    for year in year_star_ids:
        if year in years:
            for star in year_star_ids[year]:
                if star in stars:
                    ids += year_star_ids[year][star]
    return ids

In [9]:
def delete_from_csr(mat, row_indices=[], col_indices=[]):
    """
    Remove the rows (denoted by ``row_indices``) and columns (denoted by ``col_indices``) from the CSR sparse matrix ``mat``.
    WARNING: Indices of altered axes are reset in the returned matrix

    import numpy as np
    from scipy.sparse import csr_matrix
    
    https://stackoverflow.com/a/45486349
    """
    if not isinstance(mat, csr_matrix):
        raise ValueError("works only for CSR format -- use .tocsr() first")

    rows = []
    cols = []
    if row_indices:
        rows = list(row_indices)
    if col_indices:
        cols = list(col_indices)

    if len(rows) > 0 and len(cols) > 0:
        row_mask = np.ones(mat.shape[0], dtype=bool)
        row_mask[rows] = False
        col_mask = np.ones(mat.shape[1], dtype=bool)
        col_mask[cols] = False
        return mat[row_mask][:,col_mask]
    elif len(rows) > 0:
        mask = np.ones(mat.shape[0], dtype=bool)
        mask[rows] = False
        return mat[mask]
    elif len(cols) > 0:
        mask = np.ones(mat.shape[1], dtype=bool)
        mask[cols] = False
        return mat[:,mask]
    else:
        return mat

In [10]:
class Matrix:
    # Note (TODO): Maybe a simplified version could become to replace the indices dicts by lists,
    # as indices probably simply increment starting at 0.

    # document-term matrix, scipy.sparse.csr.csr_matrix
    doc_term_matrix = None
    
    # matrix-document-index to review-id: dict
    docindex_to_reviewid = None
    reviewid_to_docindex = None
    
    # matrix-term-index to token: dict
    tokenindex_to_token = None
    token_to_tokenindex = None
        
    def __init__(self, doc_term_matrix, docindex_to_reviewid, tokenindex_to_token):
        print('Document-term matrix:       ', doc_term_matrix.shape, type(doc_term_matrix))
        print('Document-index to review-id:', len(docindex_to_reviewid), type(docindex_to_reviewid))
        print('Term-index to token:        ', len(tokenindex_to_token), type(tokenindex_to_token))
        self.doc_term_matrix      = doc_term_matrix
        self.docindex_to_reviewid = docindex_to_reviewid
        self.tokenindex_to_token  = tokenindex_to_token
        
    def __repr__(self):
        return 'Matrix ' + str(self.doc_term_matrix.shape) + ', docs ' + str(len(self.docindex_to_reviewid)) + ', tokens ' + str(len(self.tokenindex_to_token))
    
    def get_reviewid_to_docindex(self, review_id):
        if not self.reviewid_to_docindex:
            self.reviewid_to_docindex = {v: k for k, v in self.docindex_to_reviewid.items()}
        if review_id in self.reviewid_to_docindex.keys():
            return self.reviewid_to_docindex[review_id]
        else:
            return None
    
    def get_token_to_tokenindex(self, token):
        if not self.token_to_tokenindex:
            self.token_to_tokenindex = {v: k for k, v in self.tokenindex_to_token.items()}
        if token in self.token_to_tokenindex:
            return self.token_to_tokenindex[token]
        else:
            return None
        
    def get_token_indices(self, review_id=None, doc_index=None):
        """
        Returns 1-dimensional numpy.ndarray.
        """
        if not doc_index:
            doc_index = self.get_reviewid_to_docindex(review_id)
        return self.doc_term_matrix[doc_index].indices
        
    def get_token_data(self, review_id=None, doc_index=None):
        """
        Returns 1-dimensional numpy.ndarray.
        """
        if not doc_index:
            doc_index = self.get_reviewid_to_docindex(review_id)
        return self.doc_term_matrix[doc_index].data
    
    def get_token_counts(self, review_id):
        """
        Returns dict (token, count).
        """
        token_counts = {}
        doc_index = self.get_reviewid_to_docindex(review_id)
        token_data = self.get_token_data(doc_index=doc_index)
        for i, token_index in enumerate(self.get_token_indices(doc_index=doc_index)):
            token = self.get_token(token_index)
            if token:
                token_counts[token] = token_data[i]
        return dict(sorted(token_counts.items(), key=lambda item: item[1], reverse=True))
    
    def get_token(self, token_index):
        if token_index in self.tokenindex_to_token:
            return self.tokenindex_to_token[token_index]
        else:
            return None
        
    def get_overall_token_counts(self):
        """
        Returns tokens and their sums of (multiple) occurences in all documents.
        """
        token_sums = self.doc_term_matrix.sum(0)
        token_counts = {}
        for token_index in range(0, token_sums.shape[1]):
            token_counts[self.get_token(token_index)] = token_sums.item(token_index)
        return dict(sorted(token_counts.items(), key=lambda item: item[1], reverse=True))
    
    def get_overall_token_occurences(self):
        """
        Returns tokens and their occurences (counted max 1 time) in all documents.
        """
        # Note (TODO): Expensive. Could be improved by using matrix instead of dict.
        
        # Count non-zero values of token-indices
        tokenindex_occurences = {}
        for tokenindex in self.doc_term_matrix.nonzero()[1]:
            if tokenindex in tokenindex_occurences:
                tokenindex_occurences[tokenindex] += 1
            else:
                tokenindex_occurences[tokenindex] = 1
                
        # Sort by values/counts
        tokenindex_occurences = dict(sorted(tokenindex_occurences.items(), key=lambda item: item[1], reverse=True))
        
        # Token-indices to tokens
        token_occurences = {}
        for item in tokenindex_occurences.items():
            token_occurences[self.get_token(item[0])] = item[1]
        return token_occurences
        
    def filter_min_count(self, min_count):  
        """
        Filters matrix based on token minimum counts (overall word usage)
        """
        # Sum up token occurences in docs
        token_sums = self.doc_term_matrix.sum(0)
        print('Filtering. Based on', token_sums.shape[1], 'summed up tokens')
        
        # Create new inverse vocabulary
        token_indices_extract = []
        new_inv_vocabulary = {}
        new_token_index = 0
        for token_index in range(0, token_sums.shape[1]):
            if token_sums.item(token_index) >= min_count:
                token_indices_extract.append(token_index)
                new_inv_vocabulary[new_token_index] = self.get_token(token_index)
                new_token_index += 1

        # Filter matrix
        new_doc_term_matrix = self.doc_term_matrix[:,token_indices_extract]
        
        return Matrix(new_doc_term_matrix, self.docindex_to_reviewid, new_inv_vocabulary)
    
    def filter_tokens(self, tokens):
        """
        Filters matrix by a given set of tokens (e.g. positive words).
        """
        
        # Collect available token-indices
        tokenindex_to_token = {}
        for token in tokens:
            token_index = self.get_token_to_tokenindex(token)
            if token_index:
                tokenindex_to_token[token_index] = token
        
        # Create new inverse vocabulary
        new_inv_vocabulary = {}
        for new_token_index, token_index in enumerate(tokenindex_to_token.keys()):
            new_inv_vocabulary[new_token_index] = tokenindex_to_token[token_index]
            new_token_index += 1

        # Filter matrix
        new_doc_term_matrix = self.doc_term_matrix[:,list(tokenindex_to_token.keys())]
                
        return Matrix(new_doc_term_matrix, self.docindex_to_reviewid, new_inv_vocabulary)
    
    def filter_reviews(self, years=None, stars=None):
        """
        Filters matrix by years and stars of reviews.
        """

        # Get review-ids
        review_ids = []
        for review_tup in get_review_ids(years, stars):
            review_ids.append(review_tup[0])
        review_ids = sorted(review_ids)
        
        # Collect matrix-doc-indices from review-ids
        doc_indices_extract = []
        new_docindex_to_reviewid = {}
        new_docindex = 0
        for review_id in review_ids:
            doc_indices_extract.append(self.get_reviewid_to_docindex(review_id))
            new_docindex_to_reviewid[new_docindex] = review_id
            new_docindex += 1

        # Filter matrix
        print('Filtering. Based on', len(review_ids), 'review IDs')
        new_doc_term_matrix = self.doc_term_matrix[doc_indices_extract,:]
        
        return Matrix(new_doc_term_matrix, new_docindex_to_reviewid, self.tokenindex_to_token)

    def filter_remove_reviews(self, review_ids):
        """
        Removes given review IDs.
        """
        doc_indices_remove = []
        for review_id in review_ids:
            doc_indices_remove.append(self.get_reviewid_to_docindex(review_id))

        docindex_to_reviewid_keep = dict(self.docindex_to_reviewid)
        for doc_index in doc_indices_remove:
            docindex_to_reviewid_keep.pop(doc_index)
        
        new_docindex_to_reviewid = {}
        i = 0
        for item in sorted(docindex_to_reviewid_keep.items()):
            if i < 10:
                print(item[0], item[1])
            new_docindex_to_reviewid[i] = item[1]
            i += 1
            
        return Matrix(delete_from_csr(csr_matrix(self.doc_term_matrix), doc_indices_remove, []), new_docindex_to_reviewid, self.tokenindex_to_token)

## Additional matrices

#### Limit tokens

- all: 607,181
- min 2: 299,517
- min 10: 111,678
- min 100: 38,080
- min 1k: 10,842
- min 10k: 1,842
- min 100k: 124
- min 1m: 2

Top 124 occurences (counted max 1 times per doc):  
'movie', 'great', 'like', 'good', 'film', 'dvd', 'time', 'love', 'best', 'story', 'watch', 'way', 'movies', 'people', 'seen', 'better', 'think', 'know', 'little', 'life', 'watching', 'new', 'series', 'years', 'characters', 'old', 'end', 'want', 'excellent', 'character', 'set', 'bad', 'acting', 'real', 'work', 'films', 'man', 'video', 'makes', 'music', 'fun', 'worth', 'fan', 'long', 'scenes', 'times', 'got', 'recommend', 'going', 'buy', 'world', 'lot', 'look', 'enjoy', 'funny', 'plot', 'thought', 'right', 'thing', 'come', 'family', 'cast', 'things', 'loved', 'wonderful', 'feel', 'watched', 'action', 'actually', 'original', 'classic', 'quality', 'day', 'shows', 'big', 'saw', 'actors', 'season', 'bit', 'scene', 'year', 'favorite', 'stars', 'young', 'interesting', 'true', 'different', 'special', 'far', 'highly', 'pretty', 'version', 'especially', 'away', 'enjoyed', 'looking', 'sure', 'hard', 'director', 'amazing', 'job', 'fact', 'performance', 'beautiful', 'amazon', 'perfect', 'gets', 'star', 'second', 'fans', 'high', 'live', 'played', 'comedy', 'collection', 'sound', 'play', 'role', 'book', 'episodes', 'kids', 'episode', 'quot', 'horror'

Top 124 counts:  
'movie', 'film', 'great', 'like', 'dvd', 'good', 'time', 'story', 'love', 'best', 'watch', 'series', 'quot', 'movies', 'people', 'way', 'life', 'think', 'better', 'little', 'seen', 'new', 'season', 'know', 'characters', 'films', 'video', 'years', 'set', 'character', 'old', 'watching', 'man', 'music', 'bad', 'end', 'work', 'scenes', 'fun', 'real', 'excellent', 'want', 'world', 'family', 'acting', 'action', 'original', 'makes', 'fan', 'funny', 'long', 'going', 'lot', 'version', 'got', 'times', 'look', 'worth', 'plot', 'buy', 'scene', 'right', 'quality', 'things', 'thing', 'classic', 'day', 'cast', 'thought', 'feel', 'recommend', 'shows', 'enjoy', 'actually', 'book', 'wonderful', 'come', 'big', 'young', 'amazon', 'loved', 'special', 'episodes', 'bit', 'year', 'watched', 'saw', 'performance', 'actors', 'stars', 'different', 'interesting', 'episode', 'true', 'pretty', 'director', 'favorite', 'far', 'star', 'comedy', 'amazing', 'live', 'away', 'beautiful', 'especially', 'fact', 'job', 'looking', 'hard', 'horror', 'fans', 'played', 'sound', 'highly', 'sure', 'collection', 'gets', 'enjoyed', 'perfect', 'role', 'second', 'high', 'play', 'kids'

In [11]:
# all reviews starting from 2000 without 3 stars
m = Matrix(doc_term_matrix, vecid_revno, inv_vocabulary)
print()

if False:
    m2 = m.filter_min_count(2)
    print()

m100 = m.filter_min_count(100)

# Test, should be "Term-index to token: 10842"
if False:
    m1k = m.filter_min_count(1000)
    m1k = m2.filter_min_count(1000)

Document-term matrix:        (1584098, 607181) <class 'scipy.sparse.csr.csr_matrix'>
Document-index to review-id: 1584098 <class 'dict'>
Term-index to token:         607181 <class 'dict'>

Filtering. Based on 607181 summed up tokens
Document-term matrix:        (1584098, 38080) <class 'scipy.sparse.csr.csr_matrix'>
Document-index to review-id: 1584098 <class 'dict'>
Term-index to token:         38080 <class 'dict'>


#### Limit matrix to subset of reviews

In [12]:
# matrix to use afterwards
matrix = m100.filter_reviews(years=[2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012], stars=[1,2,4,5])

# Only keep matrix in memory
m = None
m2 = None
m100 = None
m1k = None

Filtering. Based on 1572849 review IDs
Document-term matrix:        (1572849, 38080) <class 'scipy.sparse.csr.csr_matrix'>
Document-index to review-id: 1572849 <class 'dict'>
Term-index to token:         38080 <class 'dict'>


#### Limit matrix tokens to pos/neg words

In [13]:
# original list size: 4,783
m_neg = matrix.filter_tokens(opinion_lexicon.get_negative_set())
#print('some negetive words:', list(m_neg.tokenindex_to_token.values())[:3])

Document-term matrix:        (1572849, 3084) <class 'scipy.sparse.csr.csr_matrix'>
Document-index to review-id: 1572849 <class 'dict'>
Term-index to token:         3084 <class 'dict'>


In [14]:
# original list size: 2,006
m_pos = matrix.filter_tokens(opinion_lexicon.get_positive_set())
#print('some positive words:', list(m_pos.tokenindex_to_token.values())[:3])

Document-term matrix:        (1572849, 1500) <class 'scipy.sparse.csr.csr_matrix'>
Document-index to review-id: 1572849 <class 'dict'>
Term-index to token:         1500 <class 'dict'>


## Example review

(e.g. review ID 6590 contains pos and neg words)

In [15]:
m_2007_pos = matrix.filter_reviews(years=[2007], stars=[4,5])
print()

review_ids = get_review_ids([2007], [5])
print("Number of review IDs:", len(review_ids))
KEY_ID = 0
review_id = review_ids[11][KEY_ID]
print("Review ID:", review_id)

Filtering. Based on 146616 review IDs
Document-term matrix:        (146616, 38080) <class 'scipy.sparse.csr.csr_matrix'>
Document-index to review-id: 146616 <class 'dict'>
Term-index to token:         38080 <class 'dict'>

Number of review IDs: 108952
Review ID: 6590


In [16]:
print()
print('Review text:')
print(get_text(review_id))

print()
print(matrix.get_token_counts(review_id))

print()
print('Based on 2007 4 and 5 star reviews')
print(m_2007_pos.get_token_counts(review_id))


Review text:
A Priceless Treasure Bobby Short was a favorite performer of mine since I moved to New York in 1955. A friend had his LP with "At the Moving Picture Ball" on it and I insisted she play it whenever I dropped by for coffee, which I did every morning when I was between jobs. I always intended to splurge and go see him at the Carlyle but I was too poor and then too populist for a night club and then too stingy. And there were always the great, great records. And suddenly, after only half a century, he was gone. Thank god for this DVD of a wonderful performance at the club. The ebullience, the superb artistry and the glow of his personal niceness make it a marvelous experience to treasure over and over. And he does "Moving Picture Ball" and other personal favorites, "On the Amazon" and "Why Shouldn't I?"  Bobby fans won't need prompting but this great treat should also be a key discovery for anyone interested in popular song styling of the civilized pre-wail-and-whine era, in 

## Compare years

- Assume 50/50 to 40/60 (neg/pos)  
  It makes sense to decrease number of negatives, as there are less available.  
  Later similar sets may also be generated, e.g. 50/50 to 45/55 (neg/pos)
- Goal: Get and explain drift based on benchmark data.  
  Should be based on tokens (words) and also documents.  
  Drift in docs: Both, Bow and also BERT (focus on semantics, not words), could have advantages
- Explaination on docs: Set of docs (prototypes) would be returned.  
  -> Each of the 60% positive docs should be valid. And none of the 40% negative docs.  
  -> Goal of algo will be to have exact docs/tokens. Design benchmark data to enable this.

Same negative words should be be included in neg reviews of both years

In [17]:
m_2000_neg = matrix.filter_reviews(years=[2005], stars=[1,2])
print()
m_2000_pos = matrix.filter_reviews(years=[2005], stars=[4,5])
print()
m_2001_neg = matrix.filter_reviews(years=[2006], stars=[1,2])
print()
m_2001_pos = matrix.filter_reviews(years=[2006], stars=[4,5])

Filtering. Based on 17466 review IDs
Document-term matrix:        (17466, 38080) <class 'scipy.sparse.csr.csr_matrix'>
Document-index to review-id: 17466 <class 'dict'>
Term-index to token:         38080 <class 'dict'>

Filtering. Based on 90403 review IDs
Document-term matrix:        (90403, 38080) <class 'scipy.sparse.csr.csr_matrix'>
Document-index to review-id: 90403 <class 'dict'>
Term-index to token:         38080 <class 'dict'>

Filtering. Based on 16993 review IDs
Document-term matrix:        (16993, 38080) <class 'scipy.sparse.csr.csr_matrix'>
Document-index to review-id: 16993 <class 'dict'>
Term-index to token:         38080 <class 'dict'>

Filtering. Based on 99536 review IDs
Document-term matrix:        (99536, 38080) <class 'scipy.sparse.csr.csr_matrix'>
Document-index to review-id: 99536 <class 'dict'>
Term-index to token:         38080 <class 'dict'>


#### Check word/token usage

In [18]:
def print_token_overview(matrix, matrix_name, occurences=True, top=20):
    print(matrix_name, matrix.doc_term_matrix.shape, 'top', top)
    if occurences:
        print('Occurences: ', list(matrix.get_overall_token_occurences().items())[:top])
    else:
        print('Counts:     ', list(matrix.get_overall_token_counts().items())[:top])
    print()

In [19]:
print_token_overview(m_2000_neg, 'm_2000_neg')

if False:
    print_token_overview(m_2000_neg, 'm_2000_neg', occurences=False)
    print_token_overview(m_2000_pos, 'm_2000_pos')
    print_token_overview(m_2001_neg, 'm_2001_neg', top=100)
    print_token_overview(m_2001_pos, 'm_2001_pos', top=100)

m_2000_neg (17466, 38080) top 20
Occurences:  [('movie', 10355), ('like', 7235), ('film', 6588), ('good', 5585), ('time', 5139), ('bad', 4852), ('dvd', 4048), ('story', 3616), ('people', 3600), ('better', 3530), ('way', 3432), ('think', 3165), ('movies', 3160), ('know', 3130), ('watch', 3129), ('great', 3110), ('acting', 2910), ('plot', 2838), ('seen', 2750), ('money', 2640)]



In [20]:
def get_ratio(matrix_a, matrix_b, min_token_occurences = -1):
    ratio = {}
    only_a = {}
    only_b = {}
    rare = []
    a_token_occurences = matrix_a.get_overall_token_occurences()
    b_token_occurences = matrix_b.get_overall_token_occurences()
    print('Input sizes:', len(a_token_occurences), '/', len(b_token_occurences))
    for tok_occ in a_token_occurences.items():

        # Filter rare
        if min_token_occurences != -1 and \
           tok_occ[0] in b_token_occurences and \
           min_token_occurences > tok_occ[1] and \
           min_token_occurences > b_token_occurences[tok_occ[0]]:
            rare.append(tok_occ[0])
            b_token_occurences.pop(tok_occ[0])
            continue
            
        # Only A
        if not tok_occ[0] in b_token_occurences:
            only_a[tok_occ[0]] = tok_occ[1]
            continue
        
        # Add ratio, remove from B
        ratio_a = tok_occ[1] / len(a_token_occurences)
        ratio_b = b_token_occurences.pop(tok_occ[0]) / len(b_token_occurences)
        ratio[tok_occ[0]] = ratio_a / ratio_b
            
    print('Output sizes:', 'ratio', len(ratio), '; only a', len(only_a), '; only b', len(b_token_occurences), '; rare', len(rare))
    return dict(sorted(ratio.items(), key=lambda item: item[1], reverse=True)), only_a, b_token_occurences, rare

In [21]:
ratio, only_a, only_b, rare = get_ratio(m_2000_pos, m_2000_neg, min_token_occurences=1000)
print('ratio', list(ratio.items())[:50])
print('only_a', list(only_a.items())[:10])
print('only_b', list(only_b.items())[:10])
print('rare', rare[:10])

Input sizes: 37702 / 32975
Output sizes: ratio 1220 ; only a 4768 ; only b 41 ; rare 31714
ratio [('outstanding', 31.15784810326214), ('superb', 28.1636127606463), ('beautifully', 24.484116207461973), ('delightful', 24.36269222381365), ('captures', 23.117242561926883), ('hooked', 23.00236420177007), ('gem', 22.980094385404445), ('wonderfully', 21.55692629818359), ('recommended', 20.57704733726798), ('magnificent', 20.384223648612807), ('finest', 19.892475339853533), ('touching', 19.43338107615865), ('terrific', 18.846184253081464), ('excellent', 18.845633666873763), ('awesome', 18.47691866651018), ('wonderful', 17.883745440248138), ('fascinating', 17.332217842778466), ('amazing', 17.318079487449495), ('favorites', 17.028657227271992), ('unique', 16.325075592806748), ('fantastic', 16.286965112032416), ('highly', 16.129475889873216), ('fabulous', 15.701277524200076), ('bonus', 15.672150364529157), ('perfect', 15.601335873347645), ('favorite', 14.903661344225771), ('packed', 14.8269911999

In [22]:
ratio, only_a, only_b, rare = get_ratio(m_2000_neg, m_2000_pos, min_token_occurences=1000)
print('ratio', list(ratio.items())[:10])
print('only_a', list(only_a.items())[:10])
print('only_b', list(only_b.items())[:10])
print('rare', rare[:10])

Input sizes: 32975 / 37702
Output sizes: ratio 1220 ; only a 41 ; only b 4768 ; rare 31714
ratio [('waste', 4.402619033110612), ('worst', 2.037940921604039), ('horrible', 1.800039257403425), ('awful', 1.7893180720319632), ('terrible', 1.5519201230298887), ('boring', 1.4451619134949723), ('stupid', 1.2091838354750541), ('worse', 1.1099394992263976), ('poor', 0.8083152489848967), ('rent', 0.7820021209234612)]
only_a [('snoozefest', 12), ('hudgens', 5), ('birthed', 4), ('powerpoint', 3), ('tortilla', 2), ('kenshiro', 2), ('subsidies', 2), ('laos', 2), ('graysmith', 2), ('bettie', 2)]
only_b [('emmys', 112), ('dreamer', 102), ('transfere', 100), ('withers', 88), ('byrd', 86), ('excelent', 84), ('gifford', 84), ('hatton', 82), ('greys', 80), ('rusesabagina', 75)]
rare ['crap', 'disappointing', 'lame', 'cheap', 'poorly', 'ridiculous', 'garbage', 'avoid', 'disappointment', 'wasted']


- Use positive words with high ratio to create drift
- Do not use words with low ratio (best: near 1.0) to create drift. Try to create equal document sets.
- Remove rare words from matrix

In [23]:
print('pos', len(opinion_lexicon.get_positive_set()), '| ratio', len(ratio.keys()))

tmp = opinion_lexicon.get_positive_set().union(ratio.keys())
print('pos UNION ratio', len(tmp))

tmp = opinion_lexicon.get_positive_set() - ratio.keys()
print('pos - ratio', len(tmp))

tmp = ratio.keys() - opinion_lexicon.get_positive_set()
print('ratio - pos', len(tmp))

print(list(tmp)[:20])

pos 2006 | ratio 1220
pos UNION ratio 3082
pos - ratio 1862
ratio - pos 1076
['section', 'complete', 'disney', 'rate', 'weeks', 'added', 'christmas', 'sense', '2004', 'hearing', 'form', 'charles', 'edge', 'seeing', 'fiction', 'mean', '100', 'baby', 'gun', 'bring']


In [24]:
tokens_ratio_high = []
tokens_ratio_low = []
ratio_threshold = 1.5
for item in ratio.items():
    if item[1] >= ratio_threshold:
        tokens_ratio_high.append(item[0])
    else:
        tokens_ratio_low.append(item[0])
        
print(len(tokens_ratio_high))
print(len(tokens_ratio_low))

5
1215


In [25]:
tmp = m_2000_pos.filter_tokens(tokens_ratio_high).doc_term_matrix
print(tmp.shape)
print(tmp.count_nonzero())
tmp.eliminate_zeros()
tmp.prune()
print(tmp.shape)
print(tmp.count_nonzero())

# does not work this way, clean matrix based on document filtering

Document-term matrix:        (90403, 5) <class 'scipy.sparse.csr.csr_matrix'>
Document-index to review-id: 90403 <class 'dict'>
Term-index to token:         5 <class 'dict'>
(90403, 5)
4318
(90403, 5)
4318


In [67]:
tmp = m_2000_pos#.filter_tokens(tokens_ratio_high) # TODO: probably buggy
print()

review_ids = list(tmp.docindex_to_reviewid.values())
print('review_ids', len(review_ids))

key = 4
review_id = review_ids[key]
doc_index = tmp.get_reviewid_to_docindex(review_id)
print('ids', review_id, doc_index)
print('data', tmp.doc_term_matrix.data[doc_index])
review_id_2 = review_ids[key+1]
doc_index = tmp.get_reviewid_to_docindex(review_id_2)
print('ids', review_id_2, doc_index)
print('data', tmp.doc_term_matrix.data[doc_index])
print()

review_ids.pop(review_id)
review_ids.pop(review_id_2)
tmp = tmp.filter_remove_reviews(review_ids)
doc_index = tmp.get_reviewid_to_docindex(review_id)
print('ids', review_id, doc_index)
print('data', tmp.doc_term_matrix.data[doc_index])
doc_index = tmp.get_reviewid_to_docindex(review_id_2)
print('ids', review_id_2, doc_index)
print('data', tmp.doc_term_matrix.data[doc_index])
print('shape', tmp.doc_term_matrix.shape)
#print('data', tmp.doc_term_matrix.data)


review_ids 90403
ids 232 4
data 1
ids 233 5
data 2

232 5625
234 5997
Document-term matrix:        (2, 38080) <class 'scipy.sparse.csr.csr_matrix'>
Document-index to review-id: 2 <class 'dict'>
Term-index to token:         38080 <class 'dict'>
ids 232 None
data [[ 2  2  4  1  1  1  2  1  1  1  3  1  2  1  2  1  1  1  1  1  1  1  1  2
   1  1  1  1  1  1  2  3  1  1  1  1  2  1  1  1  1  1  1  1  1  1  1  3
   1  1  1  1  1  1  1  1 13  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
   1  1  2  1  1  1  1  1  1  1  1  1  3  1  1  1  1  1  1 10  1  1  1  1
   1  2  1  1  1  1  1  1  2  3  1  1  1  1  1  1  1  1  1  1  1  1  1  1
   1  1  1  1  1  1  1  1  1  1  1  2  3  1  1  1  1  1  1  1  1  1  1  1
   1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  2  1  1  1
   1  2  1  1  1  1  1  1  3  1  1  1  1  1  1  1  7  1  1  2  1  3  1  1
   1  2  1  2  1  1  1  1]]
ids 233 None
data [[ 2  2  4  1  1  1  2  1  1  1  3  1  2  1  2  1  1  1  1  1  1  1  1  2
   1  1  1  1  1  1  2  3

In [64]:
print(tmp.docindex_to_reviewid)

{0: 5625, 1: 5997}


In [65]:
tmp = m_2000_pos.filter_tokens(tokens_ratio_high)

Document-term matrix:        (90403, 5) <class 'scipy.sparse.csr.csr_matrix'>
Document-index to review-id: 90403 <class 'dict'>
Term-index to token:         5 <class 'dict'>


In [66]:
print(tmp.docindex_to_reviewid[232])

5625
