# Filtering: CountVectorizer

Reads `deduplicated.pickle.bz2`.

Data format: a nested dictionary `{year: {star: [(number, year, star), ...]}}`, i.e. for every year and star rating a list of review entries.

In [12]:
# Reload modules every time before executing the Python code typed
%load_ext autoreload
%autoreload 2

# Import from project root
import sys; sys.path.insert(0, '../')

import bz2
import pickle

from access.file_storage import FileStorage
from amore.printer import Printer
from amore.amazon_reviews_reader import AmazonReviewsReader

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [17]:
# Shared helper instances, created once and reused by the cells below
file_storage = FileStorage()
printer = Printer()


# Positions inside one review entry (number, year, star).
# Only the review number is read in this notebook; the other two are kept for reference.
KEY_NUMBER = 0
#KEY_YEAR   = 1
#KEY_STAR   = 2

def count_ysl(ysl):
    """Return the total number of entries in a nested ``{year: {star: [entries]}}`` mapping."""
    return sum(
        len(ysl[year][star])
        for year in ysl.keys()
        for star in ysl[year].keys()
    )

## Read deduplicated Year/star/review-IDs

In [6]:
# Read deduplicated review IDs
# NOTE(review): unpickling can execute arbitrary code; only load files that
# were produced by this project.
file_duplicates = file_storage.get_filepath('deduplicated')
with bz2.BZ2File(file_duplicates, 'r') as file:
    # Unpickle directly from the decompressing stream instead of first
    # reading the whole decompressed payload into a bytes object
    dup_ids = pickle.load(file)

# Print overview
print_year_star_sum = False  # set to True to print one line per (year, star)
count = 0
first = None
for year in dup_ids:
    for star in dup_ids[year]:
        reviews = dup_ids[year][star]
        size = len(reviews)
        if print_year_star_sum:
            print(year, star, size)
        count += size
        # Remember the first available entry as an example; empty lists are
        # skipped so that indexing cannot raise an IndexError
        if first is None and size > 0:
            first = reviews[0]
print('size: ' + str(count)) # size: 1727821
print('first item:', first)  # first item: [16505, 2007, 3]

size: 1727821
first item: [16505, 2007, 3]


In [10]:
# Print the deduplicated IDs as a table (stars as rows, years as columns, with sums)
# Toggle: rich display inside the notebook
if True:
    printer.ipython_display(printer.get_dataframe_with_sums(dup_ids))
# Toggle: print the same table as Markdown (pipe format)
if False:
    print(printer.get_dataframe_markdown(printer.get_dataframe_with_sums(dup_ids), float_as_integer=True, tablefmt="pipe"))
print('Reviews in dup_ids:', count_ysl(dup_ids))
# Reviews in dup_ids: 1,727,821

Unnamed: 0,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,Sum
1,2.0,26,597,2512,3015,3597,3689,6643,10413,9943,11125,12661,14150,15822,19132,21570,134897.0
2,,30,437,2162,2541,3048,3364,4880,7053,7050,8067,8417,8846,9536,11363,12041,88835.0
3,1.0,65,880,3932,4562,5064,5860,8592,11420,11322,13932,13944,14835,14925,16796,17593,143723.0
4,4.0,146,2166,9832,11216,12257,13466,19364,25958,27917,37664,36838,37089,36408,40392,40528,351245.0
5,14.0,561,7266,25204,26294,29576,32416,46222,64445,71619,108952,104455,112998,113957,130571,134571,1009121.0
Sum,21.0,828,11346,43642,47628,53542,58795,85701,119289,127851,179740,176315,187918,190648,218254,226303,1727821.0


Reviews in dup_ids: 1727821


### Read texts

In [18]:
# Collect the review numbers of all entries from 2006 onwards,
# leaving out the 3-star reviews
review_numbers = set()
for year in dup_ids.keys():
    if year >= 2006:
        for star in dup_ids[year].keys():
            if star != 3:
                review_numbers.update(
                    entry[KEY_NUMBER] for entry in dup_ids[year][star]
                )

In [None]:
# Number of distinct review numbers collected in review_numbers
print(len(review_numbers))

In [None]:
def get_texts(item):
    """Return summary and text of a review, joined by a space, with '<br />' replaced by spaces."""
    combined = item[AmazonReviewsReader.KEY_SUMMARY] + " " + item[AmazonReviewsReader.KEY_TEXT]
    return combined.replace('<br />', ' ')

# Maps review number -> text (summary + body) for every selected review
revno_to_text = {}

# NOTE(review): max_docs=-1 presumably means "no document limit" - confirm in AmazonReviewsReader
reader = AmazonReviewsReader(
    file_storage.get_filepath('amazon_gz_file'),
    AmazonReviewsReader.MODE_TYPED,
    max_docs=-1,
)
for item in reader:
    number = item[AmazonReviewsReader.KEY_NUMBER]
    if number in review_numbers:
        revno_to_text[number] = get_texts(item)
print(len(revno_to_text))

## Stopwords

In [None]:
# Combined English stopword set from gensim, NLTK, scikit-learn and spaCy.
# The set is built once (switch the condition to True) and stored via
# InterimStorage; afterwards it is only read back.
# NOTE(review): InterimStorage is not imported in any cell visible here -
# confirm it is in scope before running this cell on a fresh kernel.
if False:
    from gensim.parsing.preprocessing import STOPWORDS as stopwords_gensim
    print('stopwords_gensim', len(stopwords_gensim))

    from nltk.corpus import stopwords
    #import nltk
    #nltk.download('stopwords')
    stopwords_nltk = set(stopwords.words('english'))
    print('stopwords_nltk', len(stopwords_nltk))

    # Public import path; the private module sklearn.feature_extraction._stop_words
    # is an implementation detail and may move between releases
    from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as stopwords_sklearn
    print('stopwords_sklearn', len(stopwords_sklearn))

    import spacy
    # python -m spacy download en_core_web_sm
    stopwords_spacy = spacy.load("en_core_web_sm").Defaults.stop_words
    print('stopwords_spacy', len(stopwords_spacy))

    stopwords_all = stopwords_nltk.union(stopwords_sklearn).union(stopwords_spacy).union(stopwords_gensim)
    # stopwords_gensim 337
    # stopwords_nltk 179
    # stopwords_sklearn 318
    # stopwords_spacy 326
    # stopwords_all 412

    print(InterimStorage('stopwords').write(stopwords_all).get_filepath())
else:
    stopwords_all = InterimStorage('stopwords').read()
print('stopwords_all', len(stopwords_all))
# stopwords_all 412

```python
{'other', 'all', 'did', 'whereupon', 'whole', 'its', 'con', 'wasn', 'four', 'wherein', 'o', 'be', "weren't", 'because', "you're", 'again', 'nine', 'am', 'twenty', 'de', 'however', 'doesn', 'could', 'whose', 'un', 'may', 'toward', 'hundred', 'would', 'out', '‘ve', 'if', 'some', 'anywhere', 'perhaps', '‘m', "didn't", 'becoming', 'five', 'system', 'on', 'me', 'mine', 'least', 'meanwhile', "mustn't", 'two', 'mostly', '’d', 'fifteen', 'having', 'per', 'further', 'whatever', 'top', 'shan', 'himself', 'several', 'thereafter', 'part', 'about', 'cannot', 'whoever', '’s', 'seem', 'too', 'rather', 'latterly', 'sometime', 'where', 'off', 'anyway', 'really', 'last', 'many', 'when', 'whither', 'unless', 'hereby', 'though', 'become', 'will', 'own', 'front', "should've", 'above', 'along', 'another', 'an', 'what', 'cant', "haven't", 'm', 'for', 'doing', "wouldn't", 'using', 'hereafter', 'eg', 'mightn', 'beforehand', 'anyone', 'none', 'are', 'why', 'empty', 'which', '’ll', 'everywhere', 'co', 'same', 'cry', 'seems', 'afterwards', 'that', 'via', 'something', 'quite', 'find', '’re', 'upon', 'everything', 'around', 've', 'under', 'the', 'km', 'very', "you'd", 'take', 'my', 'we', 'otherwise', 'get', 'former', 'once', 'kg', 'then', 'should', 'various', 'name', 'became', 'ours', 'everyone', 'yourself', "isn't", 'yours', 'first', 'either', 'bill', 'it', 'haven', 'these', 'call', 'until', 'couldn', 'whereas', 'while', 'hence', 'those', 'hadn', "n't", 'whereby', 'put', 'seemed', 'found', 'thru', 'amount', 'whether', 'without', 'won', 'six', '‘ll', 'besides', 'there', 'within', '’m', 'ourselves', 'were', 'throughout', 'except', 'hasnt', "doesn't", 'beside', 'down', 'than', 'sixty', 'beyond', '‘s', 'also', 'in', 'formerly', 'fill', 'among', 'nowhere', "won't", 'sometimes', 'ie', 'being', 'make', 'else', 'anything', "needn't", 'her', 'therefore', 'seeming', 'computer', "couldn't", 'herein', 'enough', 'therein', 'but', 'd', 'becomes', 'ma', "you'll", 'twelve', 'three', 'shouldn', 'he', 
'wherever', 'our', 'eleven', 'have', 'already', "hasn't", 'show', 'yet', 'wouldn', 'few', 'herself', 'every', 'whenever', 'isn', 'give', "don't", 'even', 'amongst', 'always', "'s", "hadn't", 'before', 'weren', 'now', 'do', 'nevertheless', 'much', 'most', 'might', 'how', 'needn', 'anyhow', 'your', 'any', 'itself', 'others', "aren't", 'whence', 'hereupon', 'interest', 'one', 'thus', 'side', 'this', 'used', 'next', 'since', 'you', 'n’t', 'serious', 'his', 'behind', 'must', 'somewhere', 'at', 'fifty', 'go', 'and', 'mustn', 'she', 'ever', 'nobody', 'still', 'neither', 'no', 'moreover', "it's", 'nor', 'during', 'across', "'m", "wasn't", 'bottom', '‘d', 'more', 'back', "she's", 'thick', 'theirs', '’ve', "shouldn't", 'a', 'eight', 'couldnt', 'yourselves', 'noone', 'thin', "'d", 'hers', 'of', "'ll", 'thereby', 'is', 'due', 'over', 'has', "you've", 'whereafter', 'keep', 'so', 'to', 'with', 'through', 'amoungst', 'done', 'together', 'never', 'well', 's', 'him', 'someone', 'don', 'third', 'onto', 'mill', 'thereupon', 'move', 'somehow', 'n‘t', 't', 'such', 'i', 'into', 'each', 'ca', 'sincere', 'often', 'fire', 'made', 'them', "shan't", 're', 'below', 'here', 'between', 'only', 'ten', "'ve", 'from', 'myself', 'almost', 'by', 'as', 'describe', 'alone', 'against', "mightn't", 'elsewhere', 'been', 'aren', 'inc', 'had', 'latter', 'full', 'please', 'forty', 'thence', 'ltd', 'does', 'etc', 'after', 'say', 'can', 'detail', 'was', 'towards', 'll', 'whom', 'themselves', 'or', "that'll", 'both', 'although', 'up', "'re", 'who', 'nothing', 'namely', 'not', '‘re', 'ain', 'us', 'less', 'they', 'see', 'their', 'didn', 'just', 'indeed', 'hasn', 'y', 'regarding'}
```

## CountVectorizer

Alternatives: https://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_extraction.text

In [None]:
# Texts in dictionary order, plus the review number belonging to each corpus index
corpus = list(revno_to_text.values())
index_to_num = dict(enumerate(revno_to_text.keys()))

# The count matrix is computed once (switch the condition to True) and stored
# via InterimStorage; afterwards it is only read back.
# NOTE(review): InterimStorage is not imported in any cell visible here -
# confirm it is in scope before running this cell on a fresh kernel.
if False:
    # https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

    # Installation:
    #import nltk
    #nltk.download('punkt')

    from nltk.stem.porter import PorterStemmer
    from nltk import word_tokenize
    import string

    stemmer = PorterStemmer()
    # Translation table that deletes every ASCII punctuation character
    remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)

    def stem_tokens(tokens):
        """Return the stems of all tokens with at least three characters."""
        return [stemmer.stem(item) for item in tokens if len(item)>=3]

    def normalize(text):
        """Lowercase the text, remove punctuation, tokenize it and stem the tokens."""
        return stem_tokens(word_tokenize(text.lower().translate(remove_punctuation_map)))

    #stop_words = "english"
    # CountVectorizer documents stop_words as 'english', a list or None; newer
    # scikit-learn releases validate the parameter type, so pass a list, not a set
    stop_words = sorted(stopwords_all)

    from sklearn.feature_extraction.text import CountVectorizer
    vectorizer = CountVectorizer(tokenizer=normalize, stop_words=stop_words, max_features=1000)
    countvec = vectorizer.fit_transform(corpus)

    print(InterimStorage('countvec-object').write(countvec).get_filepath())
    print(InterimStorage('countvec-vectorizer').write(vectorizer.vocabulary_).get_filepath())

    # get_feature_names() was replaced by get_feature_names_out() in newer
    # scikit-learn releases; fall back to the old name on older installations
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out().tolist()
    else:
        feature_names = vectorizer.get_feature_names()
    print(len(feature_names))
    print(feature_names)
    print(len(vectorizer.get_stop_words()))

else:
    countvec = InterimStorage('countvec-object').read()

print(countvec.shape, type(countvec))
# (1,203,682, 1000) <class 'scipy.sparse.csr.csr_matrix'>