# Test results: extract_CountVectorizer notebook

[docs.scipy.org scipy.sparse.csr_matrix](https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html)

In [1]:
# Reload modules every time before executing the Python code typed
%load_ext autoreload
%autoreload 2

# Import from project root
import sys; sys.path.insert(0, '../')

import bz2
import pickle
import timeit

from access.file_storage import FileStorage
from access.interim_storage import InterimStorage
from amore.amazon_reviews_reader import AmazonReviewsReader
from amore.opinion_lexicon import OpinionLexicon

In [2]:
# Shared objects that are reused by the cells below

file_storage = FileStorage()

opinion_words_path = file_storage.get_filepath('opinion-words')
opinion_lexicon = OpinionLexicon(opinion_words_path)

negative_words = opinion_lexicon.get_negative_set()
print('negative words:', len(negative_words))
positive_words = opinion_lexicon.get_positive_set()
print('positive words:', len(positive_words))
# Expected output:
# negative words: 4783
# positive words: 2006

negative words: 4783
positive words: 2006


## Read data

In [3]:
# Load the deduplicated review IDs (nested mapping: year -> star -> entries)
deduplicated_path = file_storage.get_filepath('deduplicated')
with bz2.BZ2File(deduplicated_path, 'r') as file:
    year_star_ids = pickle.load(file)

print('Available years:', year_star_ids.keys())
print('Example stars:  ', year_star_ids[2007].keys())
print('Example entry:  ', year_star_ids[2007][1][0])

# Total number of entries over all years and star ratings
count = sum(
    len(entries)
    for star_to_entries in year_star_ids.values()
    for entries in star_to_entries.values()
)
print('Reviews:', count)

# Expected output:
# Available years: dict_keys([2007, 2006, 2008, 2003, 2002, 2004, 2000, 2009, 2011, 2010, 2001, 2005, 2012, 1999, 1998, 1997])
# Example stars:   dict_keys([3, 5, 4, 1, 2])
# Example entry:   [4368, 2007, 1]
# Reviews: 1727821

Available years: dict_keys([2007, 2006, 2008, 2003, 2002, 2004, 2000, 2009, 2011, 2010, 2001, 2005, 2012, 1999, 1998, 1997])
Example stars:   dict_keys([3, 5, 4, 1, 2])
Example entry:   [4368, 2007, 1]
Reviews: 1727821


In [4]:
# Read the review texts into a dict keyed by review number
max_docs = -1
min_year = 2007
start_time = timeit.default_timer()
reader = AmazonReviewsReader(
    file_storage.get_filepath('amazon_gz_file'),
    AmazonReviewsReader.MODE_TYPED,
    min_year=min_year,
    max_docs=max_docs,
)

def get_texts(item):
    """Return summary and text of a review item, joined by a space, with '<br />' replaced by a space."""
    combined = item[AmazonReviewsReader.KEY_SUMMARY] + " " + item[AmazonReviewsReader.KEY_TEXT]
    return combined.replace('<br />', ' ')

revno_to_text = {
    item[AmazonReviewsReader.KEY_NUMBER]: get_texts(item)
    for item in reader
}
print('Texts:', len(revno_to_text))
print('Runtime:', timeit.default_timer() - start_time)

# Output of an earlier run:
# Texts: 4662381
# Runtime: 265.2943881880492

Texts: 4662381
Runtime: 281.05807309411466


### Read document-term matrix files

In [5]:
# Load the pickled, bz2-compressed document-term matrix and report the load time
start_time = timeit.default_timer()
doc_term_matrix_path = file_storage.get_filepath('AMORE-CountVec-DocTermMatrix')
with bz2.BZ2File(doc_term_matrix_path, 'r') as file:
    doc_term_matrix = pickle.load(file)
    print('document-term matrix:', doc_term_matrix.shape, type(doc_term_matrix))
    print('Runtime:', timeit.default_timer() - start_time)

# Output of an earlier run:
# document-term matrix: (1203682, 486546) <class 'scipy.sparse.csr.csr_matrix'>
# Runtime:  26.56719038821757

# Printing the matrix lists (row, column) count entries, e.g.:
#print(doc_term_matrix)
#       (0, 299799)  3
#        :       :
# (1203681, 367201)  1

document-term matrix: (1203682, 486546) <class 'scipy.sparse.csr.csr_matrix'>
Runtime: 26.182680692523718


In [6]:
# Load the vocabulary of the document-term matrix (token -> term index)
vocabulary_path = file_storage.get_filepath('AMORE-CountVec-Vocabulary')
with bz2.BZ2File(vocabulary_path, 'r') as file:
    vocabulary = pickle.load(file)
    print('vocabulary:', len(vocabulary), type(vocabulary))
    print('example:', next(iter(vocabulary.items())))

# Inverted vocabulary (term index -> token)
inv_vocabulary = {index: token for token, index in vocabulary.items()}
print('inv_vocabulary:', len(inv_vocabulary), type(inv_vocabulary))
print('example:', next(iter(inv_vocabulary.items())))

# Expected output:
# vocabulary: 486546 <class 'dict'>
# example: ('movie', 299799)

# inv_vocabulary: 486546 <class 'dict'>
# example: (299799, 'movie')

vocabulary: 486546 <class 'dict'>
example: ('movie', 299799)
inv_vocabulary: 486546 <class 'dict'>
example: (299799, 'movie')


In [7]:
# Load the mapping from count-vector ID (matrix row) to review number.
# It is not inverted here; Matrix.get_reviewid_to_docindex builds the inverse on demand.
vecid_revno_path = file_storage.get_filepath('AMORE-CountVec-VecidRevno')
with bz2.BZ2File(vecid_revno_path, 'r') as file:
    vecid_revno = pickle.load(file)
    print('vectorizer ID to review no:', len(vecid_revno), type(vecid_revno))
    print('example:', next(iter(vecid_revno.items())))

# Expected output:
# vectorizer ID to review no: 1203682 <class 'dict'>
# example: (0, 3)

vectorizer ID to review no: 1203682 <class 'dict'>
example: (0, 3)


## Access data

In [11]:
def get_text(review_id):
    """Return the text stored in ``revno_to_text`` for ``review_id`` (KeyError if unknown)."""
    text = revno_to_text[review_id]
    return text

def get_review_ids(years, stars):
    """
    Collect the entries of ``year_star_ids`` whose year is in ``years``
    and whose star rating is in ``stars``.

    Returns a new list; entries keep the order in which ``year_star_ids``
    yields its years and stars.
    """
    collected = []
    for year, star_to_entries in year_star_ids.items():
        if year not in years:
            continue
        for star, entries in star_to_entries.items():
            if star in stars:
                collected.extend(entries)
    return collected

In [79]:
class Matrix:
    """
    Document-term matrix bundled with its row and column lookup tables.

    Rows are addressed by document index (or, via the lookup table, by
    review ID); columns are addressed by token index (or by token).
    The filter methods return new, smaller Matrix instances and leave
    this instance untouched.
    """

    # document-term matrix, scipy.sparse.csr.csr_matrix
    doc_term_matrix = None

    # matrix-document-index to review-id: dict, and its lazily built inverse
    docindex_to_reviewid = None
    reviewid_to_docindex = None

    # matrix-term-index to token: dict, and its lazily built inverse
    tokenindex_to_token = None
    token_to_tokenindex = None

    def __init__(self, doc_term_matrix, docindex_to_reviewid, tokenindex_to_token):
        """
        Store the matrix and its lookup tables and print a short summary.

        doc_term_matrix:      sparse document-term matrix
        docindex_to_reviewid: dict, matrix row index -> review ID
        tokenindex_to_token:  dict, matrix column index -> token
        """
        print('Document-term matrix:       ', doc_term_matrix.shape, type(doc_term_matrix))
        print('Document-index to review-id:', len(docindex_to_reviewid), type(docindex_to_reviewid))
        print('Term-index to token:        ', len(tokenindex_to_token), type(tokenindex_to_token))
        self.doc_term_matrix      = doc_term_matrix
        self.docindex_to_reviewid = docindex_to_reviewid
        self.tokenindex_to_token  = tokenindex_to_token
        # Inverse lookups are built on first use
        self.reviewid_to_docindex = None
        self.token_to_tokenindex  = None

    def get_reviewid_to_docindex(self, review_id):
        """
        Returns the document index (matrix row) of a review ID.
        Raises KeyError for unknown review IDs.
        """
        # `is None` instead of truthiness: an empty inverse is still a built cache
        if self.reviewid_to_docindex is None:
            self.reviewid_to_docindex = {v: k for k, v in self.docindex_to_reviewid.items()}
        return self.reviewid_to_docindex[review_id]

    def get_token_to_tokenindex(self, token):
        """
        Returns the token index (matrix column) of a token, or None if unknown.
        """
        if self.token_to_tokenindex is None:
            self.token_to_tokenindex = {v: k for k, v in self.tokenindex_to_token.items()}
        return self.token_to_tokenindex.get(token)

    def _resolve_doc_index(self, review_id, doc_index):
        """
        Returns doc_index if given, otherwise the document index of review_id.
        """
        # Document index 0 is a valid row, so only None means "not given"
        if doc_index is None:
            return self.get_reviewid_to_docindex(review_id)
        return doc_index

    def get_token_indices(self, review_id=None, doc_index=None):
        """
        Returns 1-dimensional numpy.ndarray.
        Holds the token indices stored for the document, which is selected
        by doc_index if given, else by review_id.
        """
        doc_index = self._resolve_doc_index(review_id, doc_index)
        return self.doc_term_matrix[doc_index].indices

    def get_token_data(self, review_id=None, doc_index=None):
        """
        Returns 1-dimensional numpy.ndarray.
        Holds the counts stored for the document, aligned with get_token_indices().
        """
        doc_index = self._resolve_doc_index(review_id, doc_index)
        return self.doc_term_matrix[doc_index].data

    def get_token_counts(self, review_id):
        """
        Returns dict (token, count), sorted by count in descending order.
        Token indices without an entry in the vocabulary are skipped.
        """
        # Slice the row once; indices and data are aligned arrays of that row
        row = self.doc_term_matrix[self.get_reviewid_to_docindex(review_id)]
        token_counts = {}
        for token_index, count in zip(row.indices, row.data):
            token = self.get_token(token_index)
            if token is not None:
                token_counts[token] = count
        return dict(sorted(token_counts.items(), key=lambda item: item[1], reverse=True))

    def get_token(self, token_index):
        """
        Returns the token of a token index (matrix column), or None if unknown.
        """
        return self.tokenindex_to_token.get(token_index)

    def filter_min_count(self, min_count):
        """
        Generates smaller matrix based on token counts (overall word usage).
        Keeps the columns whose summed count over all documents is >= min_count.
        """
        # Sum up token occurrences in docs
        token_sums = self.doc_term_matrix.sum(0)
        print('Filtering. Based on', token_sums.shape[1], 'summed up tokens')

        # Columns to keep, in their original order
        token_indices_extract = [
            token_index
            for token_index in range(token_sums.shape[1])
            if token_sums.item(token_index) >= min_count
        ]

        # Create new inverse vocabulary: kept columns are renumbered from 0
        new_inv_vocabulary = {
            new_token_index: self.get_token(token_index)
            for new_token_index, token_index in enumerate(token_indices_extract)
        }

        # Filter matrix
        new_doc_term_matrix = self.doc_term_matrix[:, token_indices_extract]

        return Matrix(new_doc_term_matrix, self.docindex_to_reviewid, new_inv_vocabulary)

    def filter_tokens(self, tokens):
        """
        Generates smaller matrix that only contains the columns of the given tokens.
        Tokens that are not part of the vocabulary are ignored.
        """
        # Collect available token-indices (index 0 is valid, only None means unknown)
        tokenindex_to_token = {}
        for token in tokens:
            token_index = self.get_token_to_tokenindex(token)
            if token_index is not None:
                tokenindex_to_token[token_index] = token
        token_indices_extract = list(tokenindex_to_token)

        # Create new inverse vocabulary: kept columns are renumbered from 0
        new_inv_vocabulary = {
            new_token_index: tokenindex_to_token[token_index]
            for new_token_index, token_index in enumerate(token_indices_extract)
        }

        # Filter matrix
        new_doc_term_matrix = self.doc_term_matrix[:, token_indices_extract]

        return Matrix(new_doc_term_matrix, self.docindex_to_reviewid, new_inv_vocabulary)

#### Get an example review to test code.

(For example, review ID 6590 contains both positive and negative opinion words.)

In [101]:
# Pick one 5-star review from 2007 as the running example
review_ids = get_review_ids([2007], [5])
print("Number of review IDs:", len(review_ids))

KEY_ID = 0  # position of the review ID inside an entry
example_entry = review_ids[11]
review_id = example_entry[KEY_ID]
print("Review ID:", review_id)

review_text = get_text(review_id)
print("Review text:", review_text)

Number of review IDs: 108952
Review ID: 6590
Review text: A Priceless Treasure Bobby Short was a favorite performer of mine since I moved to New York in 1955. A friend had his LP with "At the Moving Picture Ball" on it and I insisted she play it whenever I dropped by for coffee, which I did every morning when I was between jobs. I always intended to splurge and go see him at the Carlyle but I was too poor and then too populist for a night club and then too stingy. And there were always the great, great records. And suddenly, after only half a century, he was gone. Thank god for this DVD of a wonderful performance at the club. The ebullience, the superb artistry and the glow of his personal niceness make it a marvelous experience to treasure over and over. And he does "Moving Picture Ball" and other personal favorites, "On the Amazon" and "Why Shouldn't I?"  Bobby fans won't need prompting but this great treat should also be a key discovery for anyone interested in popular song styling 

#### Limit all tokens (486,546) to those appearing at least a given number of times, e.g. 1,000 times (8,444 tokens remain)

In [102]:
# Optional additional filter experiments. Both guarded sections below belong
# together (the second one uses m100 from the first), so one flag switches both.
RUN_EXTRA_FILTERS = False

print('All data')
m = Matrix(doc_term_matrix, vecid_revno, inv_vocabulary)
print(m.get_token_counts(review_id))
print()

if RUN_EXTRA_FILTERS:
    print('Min word occurences: 100 (from all)')
    m100 = m.filter_min_count(100)
    print(m100.get_token_counts(review_id))
    print()

# This matrix is the one used by the following cells
print('Min word occurences: 1000 (from all)')
m1k = m.filter_min_count(1000)
print(m1k.get_token_counts(review_id))
print()

if RUN_EXTRA_FILTERS:
    print('Min word occurences: 1000 (from 100)')
    m1000b = m100.filter_min_count(1000)
    print(m1000b.get_token_counts(review_id))
    print()

    print('Min word occurences: 100k (from all)')
    m100k = m.filter_min_count(100000)
    print(m100k.get_token_counts(review_id))
    print()

    # Filter the chained matrix m1000b (from 100, then 1000); the previous
    # name m1000 was never defined and raised NameError when enabled
    print('Min word occurences: 100k (from 1000, from 100)')
    m100kb = m1000b.filter_min_count(100000)
    print(m100kb.get_token_counts(review_id))
    print()

All data
Document-term matrix:        (1203682, 486546) <class 'scipy.sparse.csr.csr_matrix'>
Document-index to review-id: 1203682 <class 'dict'>
Term-index to token:         486546 <class 'dict'>
{'great': 3, 'bobby': 2, 'moving': 2, 'picture': 2, 'treasure': 2, 'personal': 2, 'club': 2, 'ball': 2, 'suddenly': 1, 'high': 1, 'wonderful': 1, 'song': 1, 'performance': 1, 'dvd': 1, 'night': 1, 'amazon': 1, 'morning': 1, 'friend': 1, 'life': 1, 'favorite': 1, 'hell': 1, 'new': 1, 'fans': 1, 'favorites': 1, 'treat': 1, 'play': 1, 'god': 1, 'gone': 1, 'need': 1, 'interested': 1, 'experience': 1, 'superb': 1, 'york': 1, 'era': 1, 'marvelous': 1, 'fun': 1, 'thank': 1, 'century': 1, 'short': 1, 'poor': 1, 'half': 1, 'individual': 1, 'moved': 1, 'artistry': 1, 'coffee': 1, 'popular': 1, 'insisted': 1, 'pre': 1, 'discovery': 1, 'intended': 1, 'priceless': 1, 'paul': 1, 'civilized': 1, 'key': 1, 'dedication': 1, 'jobs': 1, 'dropped': 1, 'records': 1, 'striving': 1, 'triumph': 1, 'prompting': 1, 'p

#### Limit matrix tokens to pos/neg words

In [105]:
# Restrict the filtered matrix m1k to the negative words of the opinion lexicon
negative_words = opinion_lexicon.get_negative_set()
m_neg = m1k.filter_tokens(negative_words)
# Uncomment to list some of the remaining negative words:
#print('some negative words:', list(m_neg.tokenindex_to_token.values())[:20])

negative_counts = m_neg.get_token_counts(review_id)
print(negative_counts)

Document-term matrix:        (1203682, 873) <class 'scipy.sparse.csr.csr_matrix'>
Document-index to review-id: 1203682 <class 'dict'>
Term-index to token:         873 <class 'dict'>
some negetive words: ['sue', 'helpless', 'trapped', 'distract', 'stupidity', 'slave', 'drags', 'unbearable', 'smash', 'murderous', 'ignore', 'distress', 'warned', 'creeps', 'injury', 'melodramatic', 'obscure', 'suspicious', 'blatant', 'hostage']
{'hell': 1, 'poor': 1}


In [107]:
# Restrict the filtered matrix m1k to the positive words of the opinion lexicon
positive_words = opinion_lexicon.get_positive_set()
m_pos = m1k.filter_tokens(positive_words)
# Uncomment to list some of the remaining positive words:
#print('some positive words:', list(m_pos.tokenindex_to_token.values())[:20])

positive_counts = m_pos.get_token_counts(review_id)
print(positive_counts)

Document-term matrix:        (1203682, 687) <class 'scipy.sparse.csr.csr_matrix'>
Document-index to review-id: 1203682 <class 'dict'>
Term-index to token:         687 <class 'dict'>
{'great': 3, 'treasure': 2, 'wonderful': 1, 'favorite': 1, 'fans': 1, 'superb': 1, 'marvelous': 1, 'fun': 1, 'thank': 1, 'popular': 1, 'priceless': 1, 'triumph': 1}
