In [5]:
# change kernel to 2.7 and run these cells to add politeness
# scores to the data
import sys
import os
import cPickle
import nltk.data
import json, requests
from politeness.features.vectorizer import PolitenessFeatureVectorizer

"""
This file provides an interface to 
a pre-trained politeness SVM. 
"""

#####
# Ensure the proper python dependencies exist

# Each third-party dependency is imported in its own guard so the user gets a
# one-line hint naming the missing package, followed by exit status 2.
# Only ImportError is handled: a bare `except:` would also trap
# KeyboardInterrupt/SystemExit and would mislabel any other failure raised
# while importing the package as "Package not found".

try:
    import numpy as np
except ImportError:
    sys.stderr.write("Package not found: Politeness model requires python package numpy\n")
    sys.exit(2)

try:
    import scipy
    from scipy.sparse import csr_matrix
except ImportError:
    sys.stderr.write("Package not found: Politeness model requires python package scipy\n")
    sys.exit(2)

try:
    import sklearn
except ImportError:
    sys.stderr.write("Package not found: Politeness model requires python package scikit-learn\n")
    sys.exit(2)

try:
    import nltk
except ImportError:
    sys.stderr.write("Package not found: Politeness model requires python package nltk\n")
    sys.exit(2)

####
# Check versions for sklearn, scipy, numpy, nltk
# Don't error out, just notify

# (display name, imported module, minimum expected version)
packages2versions = [("scikit-learn", sklearn, "0.15.1"), ("numpy", np, "1.9.0"), ("nltk", nltk, "3.0.0"), ("scipy", scipy, "0.12.0")]


def _version_key(version):
    """
    Convert a dotted version string into a tuple of ints for comparison.

    Only the leading digits of each dot-separated component are used, so a
    suffix such as "rc1" is ignored and a component without leading digits
    counts as 0, e.g. "1.10.0rc1" -> (1, 10, 0).
    """
    key = []
    for component in version.split("."):
        digits = ""
        for char in component:
            if not char.isdigit():
                break
            digits += char
        key.append(int(digits) if digits else 0)
    return tuple(key)


for name, package, expected_v in packages2versions:
    # Compare numerically: plain string comparison sorts "0.9.0" after
    # "0.15.1" and "1.10.0" before "1.9.0", giving missed or spurious warnings.
    if _version_key(package.__version__) < _version_key(expected_v):
        sys.stderr.write("Warning: package '%s', expected version >= %s, detected %s. Code functionality not guaranteed.\n" % (name, expected_v, package.__version__))

####

####
# Serialized model filename

MODEL_FILENAME = 'politeness/politeness-svm.p'
####
# Load model, initialize vectorizer

# NOTE: unpickling can execute arbitrary code, so MODEL_FILENAME must only
# ever point at a trusted model file.
# The file is opened in binary mode (pickle data is bytes) and inside a
# `with` block so the handle is closed as soon as the model is loaded.
with open(MODEL_FILENAME, 'rb') as model_file:
    clf = cPickle.load(model_file)
vectorizer = PolitenessFeatureVectorizer()
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

def score(request):
    """
    Score a parsed request document with the loaded politeness classifier.

    :param request - The request document to score
    :type request - dict with 'sentences' and 'parses' field
        sample (taken from test_documents.py)--
        {
            'sentences': [
                "Have you found the answer for your question?",
                "If yes would you please share it?"
            ],
            'parses': [
                ["csubj(found-3, Have-1)", "dobj(Have-1, you-2)", "root(ROOT-0, found-3)", "det(answer-5, the-4)", "dobj(found-3, answer-5)", "poss(question-8, your-7)", "prep_for(found-3, question-8)"],
                ["prep_if(would-3, yes-2)", "root(ROOT-0, would-3)", "nsubj(would-3, you-4)", "ccomp(would-3, please-5)", "nsubj(it-7, share-6)", "xcomp(please-5, it-7)"]
            ]
        }

    returns class probabilities as a dict
        {
            'polite': float,
            'impolite': float
        }
    """
    # The vectorizer hands back a {feature-name: value} mapping; the feature
    # vector is laid out in sorted feature-name order.
    feature_map = vectorizer.features(request)
    ordered_values = []
    for feature_name in sorted(feature_map):
        ordered_values.append(feature_map[feature_name])

    # The classifier is given a sparse matrix holding this single row.
    single_row = csr_matrix(np.asarray([ordered_values]))
    row_probs = clf.predict_proba(single_row)[0]

    # Column 1 is reported as 'polite', column 0 as 'impolite'.
    return {"polite": row_probs[1], "impolite": row_probs[0]}

class StanfordCoreNLP:
    """
    Small HTTP client for an already-running Stanford CoreNLP server.

    Modified from https://github.com/smilli/py-corenlp
    """

    def __init__(self, server_url):
        """
        :param server_url - base URL of the CoreNLP server; one trailing '/'
            is dropped before the URL is stored on the instance.
        """
        # TODO: Error handling? More checking on the url?
        # endswith() rather than server_url[-1] so that an empty URL does not
        # raise IndexError.
        if server_url.endswith('/'):
            server_url = server_url[:-1]
        self.server_url = server_url

    def annotate(self, text, properties=None):
        """
        POST `text` to the server and return its annotation.

        :param text - the text to annotate; must be a str
        :param properties - optional dict of CoreNLP properties, sent to the
            server as the 'properties' query parameter
        :returns the response body as text; when properties['outputFormat']
            is 'json' and the body is valid JSON, the decoded JSON instead
        :raises AssertionError if an argument has the wrong type
        :raises Exception if the server cannot be reached
        """
        assert isinstance(text, str)
        if properties is None:
            properties = {}
        else:
            assert isinstance(properties, dict)

        # Checks that the Stanford CoreNLP server is started.
        try:
            requests.get(self.server_url)
        except requests.exceptions.ConnectionError:
            raise Exception('Check whether you have started the CoreNLP server e.g.\n'
                            '$ cd <path_to_core_nlp_folder>/stanford-corenlp-full-2016-10-31/ \n'
                            '$ java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port <port> -timeout <timeout_in_ms>')

        data = text.encode()
        r = requests.post(
            self.server_url, params={
                'properties': str(properties)
            }, data=data, headers={'Connection': 'close'})
        output = r.text
        if properties.get('outputFormat') == 'json':
            try:
                # r.text is already decoded text, so no `encoding` argument is
                # passed (json.loads rejects that keyword on Python 3.9+).
                output = json.loads(output, strict=True)
            except ValueError:
                # Body is not valid JSON: deliberately fall back to returning
                # the raw response text. Any other error is left to propagate.
                pass
        return output

def dep_parse_sentence(sentence):
    """
    Annotate `sentence` using the CoreNLP server at http://127.0.0.1:9000.

    The "parse" and "sentiment" annotators are requested with JSON output.
    The value returned by StanfordCoreNLP.annotate is passed back unchanged;
    callers are expected to pass one sentence and read the first entry.
    """
    annotation_properties = {
        "annotators": "parse,sentiment",
        "outputFormat": "json",
    }
    # Server properties that were considered but are left disabled:
    #   "ssplit.eolonly": "true"        - split sentences only at end of line
    #   "enforceRequirements": "false"  - skip some annotators to run faster
    client = StanfordCoreNLP('http://127.0.0.1:9000')
    return client.annotate(sentence, properties=annotation_properties)

def getpoliteImpoliteScores(text):
    """
    Split `text` into sentences, dependency-parse each one and score the result.

    Each sentence is sent to dep_parse_sentence separately. The
    'basicDependencies' of the first sentence in each response are rewritten
    as "rel(governorGloss-governor, dependentGloss-dependent)" strings, with
    the relation name lower-cased.

    Returns the dict produced by score() for the assembled document.
    """
    sentences = sent_detector.tokenize(text.strip())

    parses = []
    for sentence in sentences:
        annotation = dep_parse_sentence(sentence)
        first_sentence_deps = annotation['sentences'][0]['basicDependencies']
        parses.append([
            "".join([
                edge['dep'].lower(), "(",
                edge['governorGloss'], "-", str(edge['governor']), ", ",
                edge['dependentGloss'], "-", str(edge['dependent']), ")",
            ])
            for edge in first_sentence_deps
        ])

    document = {"text": text, "sentences": sentences, "parses": parses}
    return score(document)

def addPolitenessScoresToProducts(furl):
    """
    Add politeness scores to every product in a JSON file, in place.

    The file at `furl` is loaded as a JSON object keyed by asin. For each
    product, the 'polite' probability returned by getpoliteImpoliteScores is
    stored under 'descriptionPoliteness' (computed from 'description') and
    'reviewPoliteness' (computed from 'Best review'). Scoring is best-effort:
    if it raises for a product, the failure is printed and processing moves
    on, leaving that product without whichever score(s) were not yet set.
    Progress is printed about every tenth of the products, and the updated
    data is written back over `furl`.

    :param furl - path of the JSON file to read and overwrite
    """
    with open(furl) as infile:
        cleanerProducts = json.load(infile)

    # Progress interval is a tenth of the products, but never less than 1:
    # with fewer than ten products the integer division yields 0, which would
    # make the modulo below raise ZeroDivisionError.
    progressStep = max(1, len(cleanerProducts) // 10)

    for productsProsessed, asin in enumerate(cleanerProducts, 1):
        product = cleanerProducts[asin]
        try:
            descriptionPoliteImpoliteScores = getpoliteImpoliteScores(str(product['description']))
            product['descriptionPoliteness'] = descriptionPoliteImpoliteScores['polite']
            reviewPoliteImpoliteScores = getpoliteImpoliteScores(str(product['Best review']))
            product['reviewPoliteness'] = reviewPoliteImpoliteScores['polite']
        except Exception as exc:
            print('description or review too big for computing politeness for the asin: ',asin)
            # The message above assumes the input was too large; also show the
            # real exception so other failures are not misdiagnosed.
            print('underlying error: ', repr(exc))
        if productsProsessed % progressStep == 0:
            print("products processed: ",productsProsessed)

    with open(furl, 'w') as outfile:
        json.dump(cleanerProducts, outfile)
    print("done!")
#    return cleanerProducts

In [9]:
# Driver cell: each call loads the given JSON file, adds politeness scores and
# writes the result back over the same file. The commented-out calls are the
# other category files.
# addPolitenessScoresToProducts("./out/reviews_Beauty_5_Beauty_clean.json")
# addPolitenessScoresToProducts("./out/reviews_Health_and_Personal_Care_5_Health & Personal Care_clean.json")
# addPolitenessScoresToProducts("./out/reviews_Grocery_and_Gourmet_Food_5_Grocery & Gourmet Food_clean.json")
# NOTE(review): unlike the paths above, this filename contains the HTML entity
# "&amp;" rather than a literal "&" - confirm it matches the file on disk.
addPolitenessScoresToProducts("./out/reviews_Tools_and_Home_Improvement_5_Sports &amp; Outdoors_clean.json")

('products processed: ', 16)
('products processed: ', 32)
('products processed: ', 48)
('products processed: ', 64)
('products processed: ', 80)
('products processed: ', 96)
('products processed: ', 112)
('products processed: ', 128)
('products processed: ', 144)
('products processed: ', 160)


{u'B00008BFS8': {u'Best review': u'I have become a huge fan of this light.  I used to only use the SureFire and Streamlight "Lithium Battery" type lights, but after a hundred dollars worth of replacement batteries I thought I should find something that doesnt take $15-$20 worth of batteries at a shot. I then found a major disapointment in the LED/Regular bulb selectable setup of another Streamlight I purchased.  My next option was this one, I am completely satasfied and now have two for around the house and one for my wifes car and another for my car.  These are small, sturdy, easy to manipulate, have a long battery life and give out a fantastic amount of light.As the actual light goes, it is very crisp and bluish white colored.  It lights up a large area nicely.  It does not travel a long distace but easily provides enough light to light up a room or trail, or to work on a car at night.Another great thing about this light, funny as it sounds, is that the handle is not round, it is obl