# Test results: extract_CountVectorizer notebook

[docs.scipy.org scipy.sparse.csr_matrix](https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html)

In [1]:
# Reload modules every time before executing the Python code typed
%load_ext autoreload
%autoreload 2

# Import from project root
import sys; sys.path.insert(0, '../')

import bz2
import pickle
import timeit

from access.file_storage import FileStorage
from access.interim_storage import InterimStorage
from amore.amazon_reviews_reader import AmazonReviewsReader
from amore.opinion_lexicon import OpinionLexicon

In [2]:
# Objects created once here and reused by the cells below

file_storage = FileStorage()

# Opinion lexicon; print the sizes of its negative and positive word sets
lexicon_path = file_storage.get_filepath('opinion-words')
opinion_lexicon = OpinionLexicon(lexicon_path)
negative_words = opinion_lexicon.get_negative_set()
print('negative words:', len(negative_words))
positive_words = opinion_lexicon.get_positive_set()
print('positive words:', len(positive_words))
# negative words: 4783
# positive words: 2006

negative words: 4783
positive words: 2006


## Read data

In [3]:
# Load the bz2-compressed pickle with the deduplicated review IDs.
# As accessed below, the structure is: year -> star -> list of entries.
# NOTE: pickle must only be used on trusted files.
dedup_path = file_storage.get_filepath('deduplicated')
with bz2.BZ2File(dedup_path, 'r') as file:
    year_star_ids = pickle.loads(file.read())

print('Available years:', year_star_ids.keys())
print('Example stars:  ', year_star_ids[2007].keys())
print('Example entry:  ', year_star_ids[2007][1][0])

# Total number of entries over all years and stars
count = sum(
    len(entries)
    for star_to_entries in year_star_ids.values()
    for entries in star_to_entries.values()
)
print('Reviews:', count)

# Available years: dict_keys([2007, 2006, 2008, 2003, 2002, 2004, 2000, 2009, 2011, 2010, 2001, 2005, 2012, 1999, 1998, 1997])
# Example stars:   dict_keys([3, 5, 4, 1, 2])
# Example entry:   [4368, 2007, 1]
# Reviews: 1727821

Available years: dict_keys([2007, 2006, 2008, 2003, 2002, 2004, 2000, 2009, 2011, 2010, 2001, 2005, 2012, 1999, 1998, 1997])
Example stars:   dict_keys([3, 5, 4, 1, 2])
Example entry:   [4368, 2007, 1]
Reviews: 1727821


In [4]:
# Read the review texts into a dict: review number -> text
max_docs = -1  # presumably means "no limit" -- TODO confirm in AmazonReviewsReader
min_year = 2007
start_time = timeit.default_timer()
reader = AmazonReviewsReader(
    file_storage.get_filepath('amazon_gz_file'),
    AmazonReviewsReader.MODE_TYPED,
    min_year=min_year,
    max_docs=max_docs,
)


def get_texts(item):
    """
    Returns summary and text of a review item joined by a single space,
    with every '<br />' replaced by a space.
    """
    summary = item[AmazonReviewsReader.KEY_SUMMARY]
    body = item[AmazonReviewsReader.KEY_TEXT]
    return " ".join((summary, body)).replace('<br />', ' ')


revno_to_text = {}
for item in reader:
    revno_to_text[item[AmazonReviewsReader.KEY_NUMBER]] = get_texts(item)

elapsed = timeit.default_timer() - start_time
print('Texts:', len(revno_to_text))
print('Runtime:', elapsed)

# Texts: 4662381
# Runtime: 265.2943881880492

Texts: 4662381
Runtime: 264.8010023571551


### Read document-term matrix files

In [5]:
# Load the bz2-compressed pickle of the document-term matrix and time the load.
# NOTE: pickle must only be used on trusted files.
start_time = timeit.default_timer()
matrix_path = file_storage.get_filepath('AMORE-CountVec-DocTermMatrix')
with bz2.BZ2File(matrix_path, 'r') as file:
    doc_term_matrix = pickle.loads(file.read())
    matrix_shape, matrix_type = doc_term_matrix.shape, type(doc_term_matrix)
    print('document-term matrix:', matrix_shape, matrix_type)
    print('Runtime:', timeit.default_timer() - start_time)
    
# document-term matrix: (1203682, 486546) <class 'scipy.sparse.csr.csr_matrix'>
# Runtime:  26.56719038821757

#print(doc_term_matrix)
#       (0, 299799)  3
#        :       :
# (1203681, 367201)  1

document-term matrix: (1203682, 486546) <class 'scipy.sparse.csr.csr_matrix'>
Runtime: 25.682468444108963


In [6]:
# Load the vocabulary of the document-term matrix (token -> term index) ...
vocabulary_path = file_storage.get_filepath('AMORE-CountVec-Vocabulary')
with bz2.BZ2File(vocabulary_path, 'r') as file:
    vocabulary = pickle.loads(file.read())
    print('vocabulary:', len(vocabulary), type(vocabulary))
    print('example:', next(iter(vocabulary.items())))

# ... and build the inverse mapping (term index -> token)
inv_vocabulary = dict(zip(vocabulary.values(), vocabulary.keys()))
print('inv_vocabulary:', len(inv_vocabulary), type(inv_vocabulary))
print('example:', next(iter(inv_vocabulary.items())))

# vocabulary: 486546 <class 'dict'>
# example: ('movie', 299799)

# inv_vocabulary: 486546 <class 'dict'>
# example: (299799, 'movie')

vocabulary: 486546 <class 'dict'>
example: ('movie', 299799)
inv_vocabulary: 486546 <class 'dict'>
example: (299799, 'movie')


In [7]:
# Load the mapping from count-vector ID (matrix row) to review number.
# The mapping is not inverted in this cell.
vecid_revno_path = file_storage.get_filepath('AMORE-CountVec-VecidRevno')
with bz2.BZ2File(vecid_revno_path, 'r') as file:
    vecid_revno = pickle.loads(file.read())
    print('vectorizer ID to review no:', len(vecid_revno), type(vecid_revno))
    print('example:', next(iter(vecid_revno.items())))

# vectorizer ID to review no: 1203682 <class 'dict'>
# example: (0, 3)

vectorizer ID to review no: 1203682 <class 'dict'>
example: (0, 3)


## Access data

In [8]:
def get_text(review_id):
    """
    Returns the text stored in revno_to_text for the given review ID.
    Raises KeyError if the ID is not contained.
    """
    text = revno_to_text[review_id]
    return text

def get_review_ids(years=None, stars=None):
    """
    Returns the entries of year_star_ids for the given years and stars
    as one flat list.

    years: container of years to include; None includes all years.
    stars: container of stars to include; None includes all stars.

    Entries are returned in the iteration order of year_star_ids
    (year by year, star by star). An empty container selects nothing.
    """
    ids = []
    for year, star_to_entries in year_star_ids.items():
        if years is not None and year not in years:
            continue
        for star, entries in star_to_entries.items():
            if stars is None or star in stars:
                ids.extend(entries)
    return ids

In [36]:
class Matrix:
    """
    Document-term matrix bundled with the index mappings needed to read it.

    Rows of the matrix are addressed by document index, columns by token
    index. Two dicts translate these indices to review IDs and to tokens;
    their inverse dicts are built lazily on first use.

    The filter_* methods do not modify the instance, they return a new Matrix.
    """
    # TODO: The index dicts could possibly be replaced by lists,
    # as the indices probably simply increment starting at 0.

    # document-term matrix, scipy.sparse.csr.csr_matrix
    doc_term_matrix = None

    # matrix-document-index to review-id: dict (inverse is built lazily)
    docindex_to_reviewid = None
    reviewid_to_docindex = None

    # matrix-term-index to token: dict (inverse is built lazily)
    tokenindex_to_token = None
    token_to_tokenindex = None

    def __init__(self, doc_term_matrix, docindex_to_reviewid, tokenindex_to_token):
        """
        Stores the matrix and both index mappings and prints their sizes and types.
        """
        print('Document-term matrix:       ', doc_term_matrix.shape, type(doc_term_matrix))
        print('Document-index to review-id:', len(docindex_to_reviewid), type(docindex_to_reviewid))
        print('Term-index to token:        ', len(tokenindex_to_token), type(tokenindex_to_token))
        self.doc_term_matrix      = doc_term_matrix
        self.docindex_to_reviewid = docindex_to_reviewid
        self.tokenindex_to_token  = tokenindex_to_token
        self.reviewid_to_docindex = None
        self.token_to_tokenindex  = None

    def get_reviewid_to_docindex(self, review_id):
        """
        Returns the document index of a review ID, or None if unknown.
        """
        # 'is None' (not truthiness), so an empty mapping is not rebuilt on every call
        if self.reviewid_to_docindex is None:
            self.reviewid_to_docindex = {v: k for k, v in self.docindex_to_reviewid.items()}
        return self.reviewid_to_docindex.get(review_id)

    def get_token_to_tokenindex(self, token):
        """
        Returns the token index of a token, or None if unknown.
        """
        if self.token_to_tokenindex is None:
            self.token_to_tokenindex = {v: k for k, v in self.tokenindex_to_token.items()}
        return self.token_to_tokenindex.get(token)

    def _resolve_doc_index(self, review_id, doc_index):
        """
        Returns doc_index if given, else the document index of review_id.
        Raises KeyError if the review ID is not part of this matrix.
        """
        # Compare with None: document index 0 is a valid index
        if doc_index is not None:
            return doc_index
        doc_index = self.get_reviewid_to_docindex(review_id)
        if doc_index is None:
            raise KeyError('Unknown review ID: {!r}'.format(review_id))
        return doc_index

    def get_token_indices(self, review_id=None, doc_index=None):
        """
        Returns 1-dimensional numpy.ndarray:
        the token indices stored for one document.
        The document is given by doc_index or, if that is None, by review_id.
        Raises KeyError for an unknown review ID.
        """
        doc_index = self._resolve_doc_index(review_id, doc_index)
        return self.doc_term_matrix[doc_index].indices

    def get_token_data(self, review_id=None, doc_index=None):
        """
        Returns 1-dimensional numpy.ndarray:
        the counts stored for one document, aligned with get_token_indices().
        The document is given by doc_index or, if that is None, by review_id.
        Raises KeyError for an unknown review ID.
        """
        doc_index = self._resolve_doc_index(review_id, doc_index)
        return self.doc_term_matrix[doc_index].data

    def get_token_counts(self, review_id):
        """
        Returns dict (token, count) of one review, sorted by count (descending).
        Raises KeyError for an unknown review ID.
        """
        doc_index = self._resolve_doc_index(review_id, None)
        # Slice the row once; indices and data of the row are aligned
        row = self.doc_term_matrix[doc_index]
        token_counts = {}
        for token_index, count in zip(row.indices, row.data):
            token = self.get_token(token_index)
            if token is not None:
                token_counts[token] = count
        return dict(sorted(token_counts.items(), key=lambda item: item[1], reverse=True))

    def get_token(self, token_index):
        """
        Returns the token of a token index, or None if unknown.
        """
        return self.tokenindex_to_token.get(token_index)

    def get_overall_token_counts(self):
        """
        Returns tokens and their sums of occurences in all documents,
        sorted by sum (descending).
        """
        token_sums = self.doc_term_matrix.sum(0)
        token_counts = {
            self.get_token(token_index): token_sums.item(token_index)
            for token_index in range(self.doc_term_matrix.shape[1])
        }
        return dict(sorted(token_counts.items(), key=lambda item: item[1], reverse=True))

    def get_overall_token_occurences(self):
        """
        Returns tokens and the number of non-zero matrix entries they have,
        sorted by that number (descending).
        Tokens without any non-zero entry are not contained.
        """
        # TODO: Expensive. Could be improved by using matrix instead of dict.

        # Count non-zero values of token-indices
        tokenindex_occurences = {}
        for tokenindex in self.doc_term_matrix.nonzero()[1]:
            tokenindex_occurences[tokenindex] = tokenindex_occurences.get(tokenindex, 0) + 1

        # Sort by values/counts
        sorted_occurences = sorted(tokenindex_occurences.items(), key=lambda item: item[1], reverse=True)

        # Token-indices to tokens
        return {self.get_token(tokenindex): occurences for tokenindex, occurences in sorted_occurences}

    def filter_min_count(self, min_count):
        """
        Generates smaller matrix based on token minimum counts (overall word usage).
        Keeps all documents and only the tokens whose summed count over all
        documents is at least min_count.
        """
        # Sum up token occurences in docs
        token_sums = self.doc_term_matrix.sum(0)
        token_number = self.doc_term_matrix.shape[1]
        print('Filtering. Based on', token_number, 'summed up tokens')

        # Columns to keep, in their original order
        token_indices_extract = [
            token_index
            for token_index in range(token_number)
            if token_sums.item(token_index) >= min_count
        ]

        # Create new inverse vocabulary (new column index to token)
        new_inv_vocabulary = {
            new_token_index: self.get_token(token_index)
            for new_token_index, token_index in enumerate(token_indices_extract)
        }

        # Filter matrix
        new_doc_term_matrix = self.doc_term_matrix[:, token_indices_extract]

        filtered = Matrix(new_doc_term_matrix, self.docindex_to_reviewid, new_inv_vocabulary)
        # Same documents, so an already built inverse mapping can be shared
        filtered.reviewid_to_docindex = self.reviewid_to_docindex
        return filtered

    def filter_tokens(self, tokens):
        """
        Filters matrix by a given set of tokens (e.g. positive words).
        Tokens not contained in this matrix are ignored. The kept columns
        retain their relative order.
        """
        # Collect available token-indices.
        # Compare with None: token index 0 is a valid index.
        # Sorted, so the result does not depend on the iteration order of tokens.
        token_indices_extract = sorted({
            token_index
            for token_index in map(self.get_token_to_tokenindex, tokens)
            if token_index is not None
        })

        # Create new inverse vocabulary (new column index to token)
        new_inv_vocabulary = {
            new_token_index: self.tokenindex_to_token[token_index]
            for new_token_index, token_index in enumerate(token_indices_extract)
        }

        # Filter matrix
        new_doc_term_matrix = self.doc_term_matrix[:, token_indices_extract]

        filtered = Matrix(new_doc_term_matrix, self.docindex_to_reviewid, new_inv_vocabulary)
        # Same documents, so an already built inverse mapping can be shared
        filtered.reviewid_to_docindex = self.reviewid_to_docindex
        return filtered

    def filter_reviews(self, years=None, stars=None):
        """
        Filters matrix by years and stars of reviews.
        years and stars are passed on to the module-level get_review_ids().
        Review IDs which are not part of this matrix are skipped.
        Rows of the new matrix are ordered by review ID.
        """

        # Get review-ids (first element of each entry)
        review_ids = sorted(review_tup[0] for review_tup in get_review_ids(years, stars))

        # Collect matrix-doc-indices from review-ids
        doc_indices_extract = []
        new_docindex_to_reviewid = {}
        for review_id in review_ids:
            doc_index = self.get_reviewid_to_docindex(review_id)
            if doc_index is None:
                continue
            new_docindex_to_reviewid[len(doc_indices_extract)] = review_id
            doc_indices_extract.append(doc_index)

        # Filter matrix (of this instance, not of a global one)
        print('Filtering. Based on', len(doc_indices_extract), 'review IDs')
        new_doc_term_matrix = self.doc_term_matrix[doc_indices_extract, :]

        return Matrix(new_doc_term_matrix, new_docindex_to_reviewid, self.tokenindex_to_token)


## Additional matrices

#### Limit the vocabulary (486,546 tokens) to tokens appearing at least N times overall, e.g. at least 1,000 times (8,444 tokens)

In [37]:
# Full matrix together with its index mappings
m = Matrix(
    doc_term_matrix=doc_term_matrix,
    docindex_to_reviewid=vecid_revno,
    tokenindex_to_token=inv_vocabulary,
)
print()
# Only tokens with an overall count of at least 2
m2 = m.filter_min_count(min_count=2)
print()
# Only tokens with an overall count of at least 1000
m1k = m.filter_min_count(min_count=1000)

Document-term matrix:        (1203682, 486546) <class 'scipy.sparse.csr.csr_matrix'>
Document-index to review-id: 1203682 <class 'dict'>
Term-index to token:         486546 <class 'dict'>

Filtering. Based on 486546 summed up tokens
Document-term matrix:        (1203682, 244203) <class 'scipy.sparse.csr.csr_matrix'>
Document-index to review-id: 1203682 <class 'dict'>
Term-index to token:         244203 <class 'dict'>

Filtering. Based on 486546 summed up tokens
Document-term matrix:        (1203682, 8444) <class 'scipy.sparse.csr.csr_matrix'>
Document-index to review-id: 1203682 <class 'dict'>
Term-index to token:         8444 <class 'dict'>


#### Limit matrix tokens to pos/neg words

In [38]:
# Restrict the min-count-1000 matrix to the negative opinion words
negative_words = opinion_lexicon.get_negative_set()
m_neg = m1k.filter_tokens(negative_words)
#print('some negative words:', list(m_neg.tokenindex_to_token.values())[:20])

Document-term matrix:        (1203682, 873) <class 'scipy.sparse.csr.csr_matrix'>
Document-index to review-id: 1203682 <class 'dict'>
Term-index to token:         873 <class 'dict'>


In [39]:
# Restrict the min-count-1000 matrix to the positive opinion words
positive_words = opinion_lexicon.get_positive_set()
m_pos = m1k.filter_tokens(positive_words)
#print('some positive words:', list(m_pos.tokenindex_to_token.values())[:20])

Document-term matrix:        (1203682, 687) <class 'scipy.sparse.csr.csr_matrix'>
Document-index to review-id: 1203682 <class 'dict'>
Term-index to token:         687 <class 'dict'>


#### Limit matrix to subset of reviews

In [40]:
# Reviews of the year 2007 with 4 or 5 stars
m_2007_pos = m.filter_reviews(
    years=[2007],
    stars=[4, 5],
)

Filtering. Based on 146616 review IDs
Document-term matrix:        (146616, 486546) <class 'scipy.sparse.csr.csr_matrix'>
Document-index to review-id: 146616 <class 'dict'>
Term-index to token:         486546 <class 'dict'>


## Example review

(e.g. review ID 6590, which contains both positive and negative words)

In [41]:
# Pick one example review from the 5-star reviews of 2007
KEY_ID = 0  # position of the review ID inside an entry
review_ids = get_review_ids([2007], [5])
print("Number of review IDs:", len(review_ids))
example_entry = review_ids[11]
review_id = example_entry[KEY_ID]
print("Review ID:", review_id)

Number of review IDs: 108952
Review ID: 6590


In [42]:
# Text of the example review
print()
print('Review text:')
print(get_text(review_id))

# Token counts of the example review in each of the matrices
titled_matrices = (
    ('All data', m),
    ('Min word occurences: 1000', m1k),
    ('Min word occurences: 1000, only negative', m_neg),
    ('Min word occurences: 1000, only positive', m_pos),
    ('2007 positive reviews', m_2007_pos),
)
for title, matrix in titled_matrices:
    print()
    print(title)
    print(matrix.get_token_counts(review_id))


Review text:
A Priceless Treasure Bobby Short was a favorite performer of mine since I moved to New York in 1955. A friend had his LP with "At the Moving Picture Ball" on it and I insisted she play it whenever I dropped by for coffee, which I did every morning when I was between jobs. I always intended to splurge and go see him at the Carlyle but I was too poor and then too populist for a night club and then too stingy. And there were always the great, great records. And suddenly, after only half a century, he was gone. Thank god for this DVD of a wonderful performance at the club. The ebullience, the superb artistry and the glow of his personal niceness make it a marvelous experience to treasure over and over. And he does "Moving Picture Ball" and other personal favorites, "On the Amazon" and "Why Shouldn't I?"  Bobby fans won't need prompting but this great treat should also be a key discovery for anyone interested in popular song styling of the civilized pre-wail-and-whine era, in 

## Compare years

In [43]:
# Reviews of the year 2008 with 1 or 2 stars
m_2008_neg = m.filter_reviews(
    years=[2008],
    stars=[1, 2],
)

Filtering. Based on 21078 review IDs
Document-term matrix:        (21078, 486546) <class 'scipy.sparse.csr.csr_matrix'>
Document-index to review-id: 21078 <class 'dict'>
Term-index to token:         486546 <class 'dict'>


#### Check most used words

In [44]:
# First 30 tokens by summed count, for each of the two review subsets
top_counts_2007_pos = list(m_2007_pos.get_overall_token_counts().items())[:30]
print(top_counts_2007_pos)
print()
top_counts_2008_neg = list(m_2008_neg.get_overall_token_counts().items())[:30]
print(top_counts_2008_neg)

[('movie', 127842), ('film', 94332), ('great', 83451), ('dvd', 71754), ('good', 61159), ('like', 60887), ('time', 48115), ('love', 42690), ('story', 40240), ('best', 39836), ('series', 31122), ('watch', 29974), ('season', 26951), ('way', 26061), ('life', 26036), ('movies', 25888), ('people', 24800), ('little', 22186), ('set', 21521), ('new', 21511), ('think', 21144), ('better', 21081), ('seen', 21032), ('years', 20878), ('films', 19965), ('music', 19921), ('old', 19789), ('know', 19574), ('excellent', 18993), ('man', 18621)]

[('movie', 32201), ('film', 17625), ('like', 12847), ('good', 8949), ('dvd', 8424), ('time', 8088), ('bad', 7954), ('story', 6729), ('people', 5127), ('movies', 5110), ('better', 4675), ('way', 4631), ('watch', 4476), ('great', 4275), ('know', 4166), ('think', 4043), ('plot', 3959), ('characters', 3925), ('money', 3899), ('acting', 3853), ('character', 3829), ('little', 3446), ('seen', 3443), ('watching', 3372), ('end', 3246), ('love', 3138), ('waste', 3091), ('fi

In [45]:
# First 30 tokens by number of non-zero matrix entries, for each of the two review subsets
top_occurences_2007_pos = list(m_2007_pos.get_overall_token_occurences().items())[:30]
print(top_occurences_2007_pos)
print()
top_occurences_2008_neg = list(m_2008_neg.get_overall_token_occurences().items())[:30]
print(top_occurences_2008_neg)

[('great', 55242), ('movie', 54525), ('dvd', 42545), ('good', 41218), ('like', 39614), ('film', 34640), ('time', 33760), ('love', 30286), ('best', 29574), ('story', 26231), ('watch', 24348), ('way', 20077), ('movies', 18587), ('seen', 17611), ('life', 17471), ('people', 17382), ('series', 17285), ('better', 17021), ('little', 16956), ('years', 16744), ('think', 16644), ('watching', 15827), ('know', 15754), ('excellent', 15690), ('new', 15572), ('old', 15333), ('set', 14616), ('characters', 13708), ('fun', 13532), ('music', 13324)]

[('movie', 11715), ('like', 7551), ('film', 6647), ('good', 5939), ('time', 5791), ('bad', 5080), ('dvd', 4723), ('story', 4278), ('better', 3640), ('watch', 3589), ('movies', 3572), ('way', 3571), ('people', 3460), ('money', 3259), ('great', 3256), ('know', 3195), ('think', 3177), ('acting', 3168), ('plot', 2979), ('seen', 2887), ('watching', 2801), ('characters', 2679), ('waste', 2675), ('end', 2621), ('little', 2619), ('want', 2506), ('character', 2483), 