# Rating

Test to be integrated into amore.py.

In [40]:
# Reload modules every time before executing the Python code typed
%load_ext autoreload
%autoreload 2

# Import from project root
import sys; sys.path.insert(0, '../')

from access.file_storage import FileStorage
from amore.opinion_lexicon import OpinionLexicon
from amore.amazon_reviews_reader import AmazonReviewsReader
from amore.review import Review

import os
import datetime
import pprint
from gensim.utils import simple_preprocess
from operator import methodcaller

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [41]:
# Print helper
def pp(value):
    """Pretty-print *value* to stdout, packing sequences compactly at a width of 120 columns."""
    # https://docs.python.org/3/library/pprint.html#pprint.PrettyPrinter
    printer = pprint.PrettyPrinter(width=120, compact=True)
    printer.pprint(value)

In [3]:
# File access: the FileStorage instance is queried via get_filepath() for the
# 'opinion-words' and 'amazon_gz_file' resources used in later cells.
file_storage = FileStorage()

In [4]:
# Load opinion words from the file registered under 'opinion-words',
# then print the sizes of the positive and the negative word sets.
opinion_lexicon = OpinionLexicon(file_storage.get_filepath('opinion-words'))
print(len(opinion_lexicon.get_positive_set()), len(opinion_lexicon.get_negative_set()))

2006 4783


In [5]:
# Check length of opinion words. Result: Min 2 (too short to be useful, use 3), Max 24.
# One-off exploration, disabled by default; flip the guard to re-run it.
if False:
    # The word collections are read from the loaded lexicon; the previous
    # version referenced `pos_words` / `neg_words`, which are not defined
    # anywhere in this notebook and raised NameError when the guard was enabled.
    pos_words = opinion_lexicon.get_positive_set()
    neg_words = opinion_lexicon.get_negative_set()

    word_lengths = [len(word) for words in (pos_words, neg_words) for word in words]
    # The defaults keep the old sentinel values when both sets are empty.
    opinion_min = min(word_lengths, default=100)
    opinion_max = max(word_lengths, default=-1)
    print(opinion_min, opinion_max)

    if False:
        # List the very short words (length <= 2). Previously observed: a+ bs ax
        for word in pos_words:
            if len(word) <= 2:
                print(word)
        print()
        for word in neg_words:
            if len(word) <= 2:
                print(word)

In [6]:
# Function extracts opinion words
def extract_opinion_words(text, positive=True, min_len=3, max_len=24):
    """Return the opinion words of one polarity that occur in *text*.

    The text is tokenised with gensim's ``simple_preprocess`` (tokens shorter
    than *min_len* or longer than *max_len* are dropped) and de-duplicated
    into a set, which is then handed to the module-level ``opinion_lexicon``.

    :param text: raw text to scan
    :param positive: True selects positive words, False negative words
    :param min_len: minimum token length passed to ``simple_preprocess``
    :param max_len: maximum token length passed to ``simple_preprocess``
    :return: the result of ``extract_positive_words`` / ``extract_negative_words``
    """
    tokens = simple_preprocess(text, min_len=min_len, max_len=max_len)
    unique_tokens = set(tokens)
    extract = (opinion_lexicon.extract_positive_words if positive
               else opinion_lexicon.extract_negative_words)
    return extract(unique_tokens)

In [7]:
# Example review
review1 = {'productId': 'B003AI2VGA', 'userId': 'A141HP4LYPWMSR', 'profileName': 'Brian E. Erland "Rainbow Sphinx"', 'helpfulness': (7, 7), 'score': 3, 'time': datetime.datetime(2007, 6, 25, 2, 0), 'summary': '"There Is So Much Darkness Now ~ Come For The Miracle"', 'text': 'Synopsis: On the daily trek from Juarez, Mexico to El Paso, Texas an ever increasing number of female workers are found raped and murdered in the surrounding desert. Investigative reporter Karina Danes (Minnie Driver) arrives from Los Angeles to pursue the story and angers both the local police and the factory owners who employee the undocumented aliens with her pointed questions and relentless quest for the truth.<br /><br />Her story goes nationwide when a young girl named Mariela (Ana Claudia Talancon) survives a vicious attack and walks out of the desert crediting the Blessed Virgin for her rescue. Her story is further enhanced when the "Wounds of Christ" (stigmata) appear in her palms. She also claims to have received a message of hope for the Virgin Mary and soon a fanatical movement forms around her to fight against the evil that holds such a stranglehold on the area.<br /><br />Critique: Possessing a lifelong fascination with such esoteric matters as Catholic mysticism, miracles and the mysterious appearance of the stigmata, I was immediately attracted to the \'05 DVD release `Virgin of Juarez\'. The film offers a rather unique storyline blending current socio-political concerns, the constant flow of Mexican migrant workers back and forth across the U.S./Mexican border and the traditional Catholic beliefs of the Hispanic population. I must say I was quite surprised by the unexpected route taken by the plot and the means and methods by which the heavenly message unfolds.<br /><br />`Virgin of Juarez\' is not a film that you would care to watch over and over again, but it was interesting enough to merit at least one viewing. 
Minnie Driver delivers a solid performance and Ana Claudia Talancon is perfect as the fragile and innocent visionary Mariela. Also starring Esai Morales and Angus Macfadyen (Braveheart).', 'number': 1}
# Toggle: set to False to skip dumping the example review.
if True:
    pp(review1)

{'helpfulness': (7, 7),
 'number': 1,
 'productId': 'B003AI2VGA',
 'profileName': 'Brian E. Erland "Rainbow Sphinx"',
 'score': 3,
 'summary': '"There Is So Much Darkness Now ~ Come For The Miracle"',
 'text': 'Synopsis: On the daily trek from Juarez, Mexico to El Paso, Texas an ever increasing number of female '
         'workers are found raped and murdered in the surrounding desert. Investigative reporter Karina Danes (Minnie '
         'Driver) arrives from Los Angeles to pursue the story and angers both the local police and the factory owners '
         'who employee the undocumented aliens with her pointed questions and relentless quest for the truth.<br /><br '
         '/>Her story goes nationwide when a young girl named Mariela (Ana Claudia Talancon) survives a vicious attack '
         'and walks out of the desert crediting the Blessed Virgin for her rescue. Her story is further enhanced when '
         'the "Wounds of Christ" (stigmata) appear in her palms. She also claims t

In [8]:
# Positive opinion words found in the text of the example review.
pp(extract_opinion_words(review1['text'], positive=True))

['perfect', 'miracles', 'enhanced', 'heavenly', 'interesting', 'merit', 'enough', 'fascination', 'visionary', 'solid']


In [9]:
# Negative opinion words found in the text of the example review.
pp(extract_opinion_words(review1['text'], positive=False))

['plot', 'vicious', 'relentless', 'evil', 'mysterious', 'unexpected', 'attack', 'fragile', 'undocumented', 'raped',
 'desert', 'concerns', 'wounds', 'fanatical']


In [10]:
def review_extract(review):
    """Summarise one review as ``(number, positive_count, negative_count)``.

    Summary and text of the review are joined with a single space; the counts
    are the lengths of the positive and negative results returned by
    ``extract_opinion_words`` for that combined string.

    :param review: mapping indexed by the ``AmazonReviewsReader.KEY_*`` constants
    :return: tuple of the review number and the two opinion-word counts
    """
    summary = review[AmazonReviewsReader.KEY_SUMMARY]
    body = review[AmazonReviewsReader.KEY_TEXT]
    combined = summary + ' ' + body
    number = review[AmazonReviewsReader.KEY_NUMBER]
    positive_count = len(extract_opinion_words(combined, positive=True))
    negative_count = len(extract_opinion_words(combined, positive=False))
    return (number, positive_count, negative_count)

In [17]:
# Reader settings: cap on the number of reviews and the year window.
max_docs = 10
min_year=2001
# 2011; the next cell uses this as the exclusive upper bound of range().
# NOTE(review): presumably the reader treats max_year as exclusive too - confirm.
max_year=2010+1
reviews = AmazonReviewsReader(file_storage.get_filepath('amazon_gz_file'), AmazonReviewsReader.MODE_TYPED, max_docs=max_docs, min_year=min_year, max_year=max_year)

In [42]:
# Build counter[year][star] -> list of Review objects, then fill it from the reader.
years = range(min_year, max_year)
# NOTE(review): the meaning of the values 0 and 4 is not evident from this
# notebook (the example review carries 'score': 3) - confirm.
stars = [0,4]

# Create an empty list for every (year, star) combination.
counter = {}
for year in years:
    counter[year] = {}
    for star in stars:
        counter[year][star] = []
    
if True:
    for review in reviews:
        text = review[AmazonReviewsReader.KEY_SUMMARY] + ' ' + review[AmazonReviewsReader.KEY_TEXT]
        # NOTE(review): `year` and `star` are NOT taken from `review`. They are the
        # leftover loop variables of the initialisation loops above (last year,
        # last star), so every review is appended to the same bucket - the
        # recorded output shows all ten reviews under 2010 / 4. They should
        # presumably be derived from the review's time and score; confirm the
        # corresponding key constants in AmazonReviewsReader before fixing.
        counter[year][star].append(Review(
            review[AmazonReviewsReader.KEY_NUMBER],
            len(extract_opinion_words(text, positive=True)),
            len(extract_opinion_words(text, positive=False))))

In [45]:
# Show the filled year/star buckets.
pp(counter)

{2001: {0: [], 4: []},
 2002: {0: [], 4: []},
 2003: {0: [], 4: []},
 2004: {0: [], 4: []},
 2005: {0: [], 4: []},
 2006: {0: [], 4: []},
 2007: {0: [], 4: []},
 2008: {0: [], 4: []},
 2009: {0: [], 4: []},
 2010: {0: [],
        4: [Review("1","11",15), Review("2","10",22), Review("3","4",6), Review("4","7",15), Review("5","0",8),
            Review("6","3",3), Review("7","0",6), Review("8","12",1), Review("9","10",1), Review("10","7",0)]}}


In [46]:
# Pick the 2010 / star-4 bucket as sample data and display it.
x = counter[2010][4]
x

[Review("1","11",15),
 Review("2","10",22),
 Review("3","4",6),
 Review("4","7",15),
 Review("5","0",8),
 Review("6","3",3),
 Review("7","0",6),
 Review("8","12",1),
 Review("9","10",1),
 Review("10","7",0)]

In [53]:

# Order the sample bucket ascending (sorted's default) by the value each
# Review returns from get_negative_sort_value().
sorted(x, key=methodcaller('get_negative_sort_value'))

[Review("5","0",8),
 Review("7","0",6),
 Review("2","10",22),
 Review("4","7",15),
 Review("3","4",6),
 Review("1","11",15),
 Review("6","3",3),
 Review("9","10",1),
 Review("8","12",1),
 Review("10","7",0)]