# Test results: extract_CountVectorizer_stemmer notebook

In [38]:
# Reload modules every time before executing the Python code typed
%load_ext autoreload
%autoreload 2

# Import from project root
import sys; sys.path.insert(0, '../')

import bz2
import pickle

from access.file_storage import FileStorage
from access.interim_storage import InterimStorage
from amore.amazon_reviews_reader import AmazonReviewsReader

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Shared storage helper: created once and reused by the cells below,
# which resolve their input paths via file_storage.get_filepath(...).

file_storage = FileStorage()

## Read data

In [9]:
# The document-term matrix is stored as a bz2-compressed pickle.
# NOTE(review): pickle must only be used on files from a trusted source.
with bz2.BZ2File(file_storage.get_filepath('AMORE-CountVec-DocTermMatrix'), 'r') as file:
    doc_term_matrix = pickle.load(file)

print('document-term matrix:', doc_term_matrix.shape, type(doc_term_matrix))
# Output of a previous run:
# document-term matrix: (1203682, 918065) <class 'scipy.sparse.csr.csr_matrix'>

# print(doc_term_matrix) lists the stored entries, e.g.:
# (0,       816096)  5
# :         :
# (1203681, 686709)  1

document-term matrix: (1203682, 918065) <class 'scipy.sparse.csr.csr_matrix'>


In [61]:
# The vocabulary (term -> index) is stored as a bz2-compressed pickle.
with bz2.BZ2File(file_storage.get_filepath('AMORE-CountVec-Vocabulary'), 'r') as file:
    vocabulary = pickle.load(file)

print('vocabulary:', len(vocabulary), type(vocabulary))
print('example:', next(iter(vocabulary.items())))
# Output of a previous run:
# vocabulary: 918065 <class 'dict'>
# example: ('thi', 816096)

# Reverse lookup (index -> term) for translating indices back into terms.
inv_vocabulary = {index: term for term, index in vocabulary.items()}
print('inv_vocabulary:', len(inv_vocabulary), type(inv_vocabulary))
print('example:', next(iter(inv_vocabulary.items())))

vocabulary: 918065 <class 'dict'>
example: ('thi', 816096)
inv_vocabulary: 918065 <class 'dict'>
example: (816096, 'thi')


In [19]:
# Mapping from vectorizer ID to review number, stored as a bz2-compressed pickle.
with bz2.BZ2File(file_storage.get_filepath('AMORE-CountVec-VecidRevno'), 'r') as file:
    vecid_revno = pickle.load(file)

print('vectorizer ID to review no:', len(vecid_revno), type(vecid_revno))
print('example:', next(iter(vecid_revno.items())))
# Output of a previous run:
# vectorizer ID to review no: 1203682 <class 'dict'>
# example: (0, 3)

vectorizer ID to review no: 1203682 <class 'dict'>
example: (0, 3)


In [40]:
# Do not load everything: cap the number of documents to limit memory usage and time.
max_docs = 100
reader = AmazonReviewsReader(
    file_storage.get_filepath('amazon_gz_file'),
    AmazonReviewsReader.MODE_TYPED,
    max_docs=max_docs,
)
# Review number -> review text; filled by the loop at the end of this cell.
revno_to_text = {}
def get_texts(item):
    """Return the summary and the text of a review item as one string.

    Both fields are joined by a single space; every '<br />' marker in
    the joined string is replaced by a space.
    """
    summary = item[AmazonReviewsReader.KEY_SUMMARY]
    body = item[AmazonReviewsReader.KEY_TEXT]
    combined = summary + " " + body
    return combined.replace('<br />', ' ')
# Index the combined review texts by review number for the lookups further down.
for item in reader:
    revno_to_text[item[AmazonReviewsReader.KEY_NUMBER]] = get_texts(item)

## Check data

In [69]:
# Take the first vectorizer ID and trace it back to its review number and text.
vecid = 0
revno = vecid_revno[vecid]
# Works only if this review is among the first `max_docs` reviews that were read.
text  = revno_to_text[revno]
print('vecid:', vecid, '  revno:', revno, '  text:', text)

vecid: 0   revno: 3   text: This movie needed to be made. The scenes in this film can be very disquieting due to their graphic re-enactment of real events, but this story needs to be told. I will say the violence was injected into the movie with as much taste as manageable when dealing with rape scenes, etc. Inspired by true events, women are being murdered in Juarez after they leave the factory where they work. A fearful community is suddenly given some hope when one of the young victims not only lives, but experiences 'stigmata' after seeing the Virgin Mary.  I was shocked to learn that murders in Juarez are still happening and many are unsolved. I believe this director brought a very important story to the surface. Though it's never pleasant to think about young women being murdered, this movie depicts a harsh reality of the high cost of exploited-cheap labor.  Chrissy K. McVay - Author


In [70]:
# Print the vocabulary term of every column that has a nonzero entry in row `vecid`.
# nonzero() returns (row indices, column indices); [1] selects the column indices.
for dim_value in doc_term_matrix[vecid].nonzero()[1]:
    print(inv_vocabulary[dim_value], end='  ')

thi  movi  need  scene  film  veri  disquiet  graphic  reenact  real  event  stori  told  violenc  wa  inject  tast  manag  deal  rape  inspir  true  women  murder  juarez  leav  factori  work  fear  commun  suddenli  given  hope  young  victim  onli  live  experi  stigmata  virgin  mari  shock  learn  happen  mani  unsolv  believ  director  brought  import  surfac  pleasant  think  depict  harsh  realiti  high  cost  exploitedcheap  labor  chrissi  mcvay  author  

In [71]:
# movie        not included
# disquieting

# thi     why included? 'this'?
# inspir                'Inspired'?
# tast                  'taste'?
# leav                  'leave'?
# onli                  ?

# exploited-cheap -> exploitedcheap

## Check methods

In [80]:
from nltk.stem.porter import PorterStemmer
# One stemmer instance shared by stem_tokens, stem_tokens2 and stem_tokens3.
stemmer = PorterStemmer()

def stem_tokens(tokens):
    """Stem every token of at least three characters; shorter tokens are dropped.

    The length check is applied to the token before it is stemmed.
    """
    stems = []
    for token in tokens:
        if len(token) < 3:
            continue
        stems.append(stemmer.stem(token))
    return stems

import string
# Translation table for str.translate: every character of string.punctuation
# maps to None, which makes translate() delete it.
remove_punctuation_map = {ord(symbol): None for symbol in string.punctuation}

from nltk import word_tokenize
def normalize(text):
    """Turn a text into a list of stemmed tokens.

    The text is lower-cased, translated with the module-level
    remove_punctuation_map (looked up at call time), tokenized with
    word_tokenize and finally passed through stem_tokens.
    """
    lowered = text.lower()
    translated = lowered.translate(remove_punctuation_map)
    tokens = word_tokenize(translated)
    return stem_tokens(tokens)

In [87]:
# Show the sample review, a blank line, and then the token list normalize() derives from it.
print(text)
print()
print(normalize(text))

This movie needed to be made. The scenes in this film can be very disquieting due to their graphic re-enactment of real events, but this story needs to be told. I will say the violence was injected into the movie with as much taste as manageable when dealing with rape scenes, etc. Inspired by true events, women are being murdered in Juarez after they leave the factory where they work. A fearful community is suddenly given some hope when one of the young victims not only lives, but experiences 'stigmata' after seeing the Virgin Mary.  I was shocked to learn that murders in Juarez are still happening and many are unsolved. I believe this director brought a very important story to the surface. Though it's never pleasant to think about young women being murdered, this movie depicts a harsh reality of the high cost of exploited-cheap labor.  Chrissy K. McVay - Author

['thi', 'movi', 'need', 'made', 'the', 'scene', 'thi', 'film', 'can', 'veri', 'disquiet', 'due', 'their', 'graphic', 'reenac

In [88]:
# Inspect the translation table and the punctuation characters it was built from.
print(remove_punctuation_map)
print()
# Each character is followed by two spaces so that all of them stay on one line.
for char in string.punctuation:
    print(char, end='  ')

{33: None, 34: None, 35: None, 36: None, 37: None, 38: None, 39: None, 40: None, 41: None, 42: None, 43: None, 44: None, 45: None, 46: None, 47: None, 58: None, 59: None, 60: None, 61: None, 62: None, 63: None, 64: None, 91: None, 92: None, 93: None, 94: None, 95: None, 96: None, 123: None, 124: None, 125: None, 126: None}

!  "  #  $  %  &  '  (  )  *  +  ,  -  .  /  :  ;  <  =  >  ?  @  [  \  ]  ^  _  `  {  |  }  ~  

In [94]:
# include words with len < 3

def stem_tokens2(tokens):
    """Stem every token, whatever its length (no minimum-length filter)."""
    stems = []
    for token in tokens:
        stems.append(stemmer.stem(token))
    return stems

def normalize2(text):
    """Like normalize, but stems via stem_tokens2 so that short tokens are kept.

    The text is lower-cased, translated with the module-level
    remove_punctuation_map (looked up at call time) and tokenized with
    word_tokenize before stemming.
    """
    lowered = text.lower()
    translated = lowered.translate(remove_punctuation_map)
    tokens = word_tokenize(translated)
    return stem_tokens2(tokens)

In [95]:
# Show the stems of the sample review when tokens shorter than three characters are kept.
print(normalize2(text))

['thi', 'movi', 'need', 'to', 'be', 'made', 'the', 'scene', 'in', 'thi', 'film', 'can', 'be', 'veri', 'disquiet', 'due', 'to', 'their', 'graphic', 'reenact', 'of', 'real', 'event', 'but', 'thi', 'stori', 'need', 'to', 'be', 'told', 'i', 'will', 'say', 'the', 'violenc', 'wa', 'inject', 'into', 'the', 'movi', 'with', 'as', 'much', 'tast', 'as', 'manag', 'when', 'deal', 'with', 'rape', 'scene', 'etc', 'inspir', 'by', 'true', 'event', 'women', 'are', 'be', 'murder', 'in', 'juarez', 'after', 'they', 'leav', 'the', 'factori', 'where', 'they', 'work', 'a', 'fear', 'commun', 'is', 'suddenli', 'given', 'some', 'hope', 'when', 'one', 'of', 'the', 'young', 'victim', 'not', 'onli', 'live', 'but', 'experi', 'stigmata', 'after', 'see', 'the', 'virgin', 'mari', 'i', 'wa', 'shock', 'to', 'learn', 'that', 'murder', 'in', 'juarez', 'are', 'still', 'happen', 'and', 'mani', 'are', 'unsolv', 'i', 'believ', 'thi', 'director', 'brought', 'a', 'veri', 'import', 'stori', 'to', 'the', 'surfac', 'though', 'it', 

In [96]:
# plus: no translate

def stem_tokens3(tokens):
    """Stem every token without any length filter (same logic as stem_tokens2)."""
    return list(stemmer.stem(token) for token in tokens)

def normalize3(text):
    """Lower-case and tokenize the text, then stem all tokens via stem_tokens3.

    No translate step is applied, so the text reaches word_tokenize with
    its punctuation intact.
    """
    lowered = text.lower()
    tokens = word_tokenize(lowered)
    return stem_tokens3(tokens)

In [97]:
# Show the stems of the sample review when no translate step is applied before tokenizing.
print(normalize3(text))

['thi', 'movi', 'need', 'to', 'be', 'made', '.', 'the', 'scene', 'in', 'thi', 'film', 'can', 'be', 'veri', 'disquiet', 'due', 'to', 'their', 'graphic', 're-enact', 'of', 'real', 'event', ',', 'but', 'thi', 'stori', 'need', 'to', 'be', 'told', '.', 'i', 'will', 'say', 'the', 'violenc', 'wa', 'inject', 'into', 'the', 'movi', 'with', 'as', 'much', 'tast', 'as', 'manag', 'when', 'deal', 'with', 'rape', 'scene', ',', 'etc', '.', 'inspir', 'by', 'true', 'event', ',', 'women', 'are', 'be', 'murder', 'in', 'juarez', 'after', 'they', 'leav', 'the', 'factori', 'where', 'they', 'work', '.', 'a', 'fear', 'commun', 'is', 'suddenli', 'given', 'some', 'hope', 'when', 'one', 'of', 'the', 'young', 'victim', 'not', 'onli', 'live', ',', 'but', 'experi', "'stigmata", "'", 'after', 'see', 'the', 'virgin', 'mari', '.', 'i', 'wa', 'shock', 'to', 'learn', 'that', 'murder', 'in', 'juarez', 'are', 'still', 'happen', 'and', 'mani', 'are', 'unsolv', '.', 'i', 'believ', 'thi', 'director', 'brought', 'a', 'veri', '

In [98]:
# plus: no stemmer

def normalize4(text):
    """Lower-case the text and tokenize it; no translate step and no stemming."""
    lowered = text.lower()
    return word_tokenize(lowered)

In [99]:
# Show the plain lower-cased tokens of the sample review (no translate step, no stemming).
print(normalize4(text))

['this', 'movie', 'needed', 'to', 'be', 'made', '.', 'the', 'scenes', 'in', 'this', 'film', 'can', 'be', 'very', 'disquieting', 'due', 'to', 'their', 'graphic', 're-enactment', 'of', 'real', 'events', ',', 'but', 'this', 'story', 'needs', 'to', 'be', 'told', '.', 'i', 'will', 'say', 'the', 'violence', 'was', 'injected', 'into', 'the', 'movie', 'with', 'as', 'much', 'taste', 'as', 'manageable', 'when', 'dealing', 'with', 'rape', 'scenes', ',', 'etc', '.', 'inspired', 'by', 'true', 'events', ',', 'women', 'are', 'being', 'murdered', 'in', 'juarez', 'after', 'they', 'leave', 'the', 'factory', 'where', 'they', 'work', '.', 'a', 'fearful', 'community', 'is', 'suddenly', 'given', 'some', 'hope', 'when', 'one', 'of', 'the', 'young', 'victims', 'not', 'only', 'lives', ',', 'but', 'experiences', "'stigmata", "'", 'after', 'seeing', 'the', 'virgin', 'mary', '.', 'i', 'was', 'shocked', 'to', 'learn', 'that', 'murders', 'in', 'juarez', 'are', 'still', 'happening', 'and', 'many', 'are', 'unsolved',

In [106]:
# only the stemmer removed / punctuation substituted with whitespace instead of removed -> good result

def stem_tokens5(tokens):
    """Keep the tokens of at least three characters, unchanged.

    Despite the name, no stemming is applied here.
    """
    return list(filter(lambda token: len(token) >= 3, tokens))

# Translation table that turns every punctuation character into a space
# instead of deleting it.
# NOTE(review): this rebinds the module-level remove_punctuation_map, so
# normalize() and normalize2() also use the space-substituting table when
# they are called after this cell has run.
remove_punctuation_map = {ord(symbol): ' ' for symbol in string.punctuation}

def normalize5(text):
    """Lower-case, translate and tokenize the text, then filter via stem_tokens5.

    The translate step uses the module-level remove_punctuation_map as
    bound at call time.
    """
    lowered = text.lower()
    translated = lowered.translate(remove_punctuation_map)
    tokens = word_tokenize(translated)
    return stem_tokens5(tokens)

In [107]:
# Show the unstemmed tokens of the sample review that normalize5 keeps (length >= 3).
print(normalize5(text))

['this', 'movie', 'needed', 'made', 'the', 'scenes', 'this', 'film', 'can', 'very', 'disquieting', 'due', 'their', 'graphic', 'enactment', 'real', 'events', 'but', 'this', 'story', 'needs', 'told', 'will', 'say', 'the', 'violence', 'was', 'injected', 'into', 'the', 'movie', 'with', 'much', 'taste', 'manageable', 'when', 'dealing', 'with', 'rape', 'scenes', 'etc', 'inspired', 'true', 'events', 'women', 'are', 'being', 'murdered', 'juarez', 'after', 'they', 'leave', 'the', 'factory', 'where', 'they', 'work', 'fearful', 'community', 'suddenly', 'given', 'some', 'hope', 'when', 'one', 'the', 'young', 'victims', 'not', 'only', 'lives', 'but', 'experiences', 'stigmata', 'after', 'seeing', 'the', 'virgin', 'mary', 'was', 'shocked', 'learn', 'that', 'murders', 'juarez', 'are', 'still', 'happening', 'and', 'many', 'are', 'unsolved', 'believe', 'this', 'director', 'brought', 'very', 'important', 'story', 'the', 'surface', 'though', 'never', 'pleasant', 'think', 'about', 'young', 'women', 'being'