In [56]:
import sys
import os
import cPickle

In [57]:
"""
This file provides an interface to 
a pre-trained politeness SVM. 
"""

#####
# Ensure the proper python dependencies exist

try:
    import numpy as np
except:
    sys.stderr.write("Package not found: Politeness model requires python package numpy\n")
    sys.exit(2)

try:
    import scipy
    from scipy.sparse import csr_matrix
except:
    sys.stderr.write("Package not found: Politeness model requires python package scipy\n")
    sys.exit(2)

try:
    import sklearn
except:
    sys.stderr.write("Package not found: Politeness model requires python package scikit-learn\n")
    sys.exit(2)

try:
    import nltk
except:
    sys.stderr.write("Package not found: Politeness model requires python package nltk\n")
    sys.exit(2)

####
# Check versions for sklearn, scipy, numpy, nltk
# Don't error out, just notify

packages2versions = [("scikit-learn", sklearn, "0.15.1"), ("numpy", np, "1.9.0"), ("nltk", nltk, "3.0.0"), ("scipy", scipy, "0.12.0")]

for name, package, expected_v in packages2versions:
    if package.__version__ < expected_v:
        sys.stderr.write("Warning: package '%s', expected version >= %s, detected %s. Code functionality not guaranteed.\n" % (name, expected_v, package.__version__))


####

from politeness.features.vectorizer import PolitenessFeatureVectorizer


####
# Serialized model filename

MODEL_FILENAME = 'politeness/politeness-svm.p'
####
# Load model, initialize vectorizer

clf = cPickle.load(open(MODEL_FILENAME))
vectorizer = PolitenessFeatureVectorizer()

def score(request):
    """
    :param request - The request document to score
    :type request - dict with 'sentences' and 'parses' field
        sample (taken from test_documents.py)--
        {
            'sentences': [
                "Have you found the answer for your question?", 
                "If yes would you please share it?"
            ],
            'parses': [
                ["csubj(found-3, Have-1)", "dobj(Have-1, you-2)", "root(ROOT-0, found-3)", "det(answer-5, the-4)", "dobj(found-3, answer-5)", "poss(question-8, your-7)", "prep_for(found-3, question-8)"], 
                ["prep_if(would-3, yes-2)", "root(ROOT-0, would-3)", "nsubj(would-3, you-4)", "ccomp(would-3, please-5)", "nsubj(it-7, share-6)", "xcomp(please-5, it-7)"]
            ]
        } 

    returns class probabilities as a dict
        {
            'polite': float, 
            'impolite': float
        }
    """
    # vectorizer returns {feature-name: value} dict
    features = vectorizer.features(request)
    fv = [features[f] for f in sorted(features.iterkeys())]
    # Single-row sparse matrix
    X = csr_matrix(np.asarray([fv]))
    probs = clf.predict_proba(X)
    # Massage return format
    probs = {"polite": probs[0][1], "impolite": probs[0][0]}
    return probs

In [16]:
from politeness.test_documents import TEST_DOCUMENTS

for doc in TEST_DOCUMENTS:
    
    probs = score(doc)

    print "===================="
    print "Text: ", doc['text']
    print "\tP(polite) = %.3f" % probs['polite']
    print "\tP(impolite) = %.3f" % probs['impolite']
    print "\n"

Text:  Have you found the answer for your question? If yes would you please share it?
	P(polite) = 0.719
	P(impolite) = 0.281


Text:  Sorry :) I dont want to hack the system!! :) is there another way?
	P(polite) = 0.640
	P(impolite) = 0.360


Text:  What are you trying to do?  Why can't you just store the "Range"?
	P(polite) = 0.034
	P(impolite) = 0.966


Text:  This was supposed to have been moved to &lt;url&gt; per the cfd. why wasn't it moved?
	P(polite) = 0.068
	P(impolite) = 0.932






In [20]:
import nltk.data
text = "Punkt knows that the periods in Mr. Smith and Johann S. Bach\
do not mark sentence boundaries.  And sometimes sentences\
can start with non-capitalized words.  i is a good variable\
name."
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
print(sent_detector.tokenize(text.strip()))

['Punkt knows that the periods in Mr. Smith and Johann S. Bachdo not mark sentence boundaries.', 'And sometimes sentencescan start with non-capitalized words.', 'i is a good variablename.']


In [58]:
import json, requests
class StanfordCoreNLP:
    """
    Modified from https://github.com/smilli/py-corenlp
    """
 
    def __init__(self, server_url):
        # TODO: Error handling? More checking on the url?
        if server_url[-1] == '/':
            server_url = server_url[:-1]
        self.server_url = server_url
 
    def annotate(self, text, properties=None):
        assert isinstance(text, str)
        if properties is None:
            properties = {}
        else:
            assert isinstance(properties, dict)
 
        # Checks that the Stanford CoreNLP server is started.
        try:
            requests.get(self.server_url)
        except requests.exceptions.ConnectionError:
            raise Exception('Check whether you have started the CoreNLP server e.g.\n'
                            '$ cd <path_to_core_nlp_folder>/stanford-corenlp-full-2016-10-31/ \n'
                            '$ java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port <port> -timeout <timeout_in_ms>')
 
        data = text.encode()
        r = requests.post(
            self.server_url, params={
                'properties': str(properties)
            }, data=data, headers={'Connection': 'close'})
        output = r.text
        if ('outputFormat' in properties
            and properties['outputFormat'] == 'json'):
            try:
                output = json.loads(output, encoding='utf-8', strict=True)
            except:
                pass
        return output

def dep_parse_sentence(sentence):
    # The StanfordCoreNLP server is running on http://127.0.0.1:9000
    nlp = StanfordCoreNLP('http://127.0.0.1:9000')
    # Json response of all the annotations
    output = nlp.annotate(sentence, properties={
        "annotators": "parse,sentiment",
        "outputFormat": "json"
        # Only split the sentence at End Of Line. We assume that this method only takes in one single sentence.
        #"ssplit.eolonly": "true",
        # Setting enforceRequirements to skip some annotators and make the process faster
        #"enforceRequirements": "false"
    }
                         )
    # Only care about the result of the first sentence because we assume we only annotate a single sentence in this method.
    return output

In [45]:
# input [{u'dep': u'ROOT',
#   u'dependent': 3,
#   u'dependentGloss': u'would',
#   u'governor': 0,
#   u'governorGloss': u'ROOT'},... ]

# output ["root(ROOT-0, would-3)", ... ]
#l = ["prep_if(would-3, yes-2)", "root(ROOT-0, would-3)", "nsubj(would-3, you-4)", "ccomp(would-3, please-5)", "nsubj(it-7, share-6)", "xcomp(please-5, it-7)"]
#for ll in l:
#    print(ll)
out = sentiment_analysis_on_sentence("If yes would you please share it?")
depsList = []
deps = out['sentences'][0]['basicDependencies']
for dep in deps:
    d = dep
    newDepFmt = d['dep'].lower() + "(" + d['governorGloss']+"-"+str(d['governor'])+", "+d['dependentGloss']+"-"+str(d['dependent'])+")"
    depsList.append(newDepFmt)


In [59]:
# {
#         "text": "What are you trying to do?  Why can't you just store the \"Range\"?",
#         "sentences": [
#             "What are you trying to do?",
#             "Why can't you just store the 'Range'?"
#         ],
#         "parses": [
#             ["dep(trying-4, What-1)", "aux(trying-4, are-2)", "nsubj(trying-4, you-3)", "xsubj(do-6, you-3)", "root(ROOT-0, trying-4)", "aux(do-6, to-5)", "xcomp(trying-4, do-6)"],
#             ["advmod(ca-2, Why-1)", "advcl(store-6, ca-2)", "neg(ca-2, n't-3)", "nsubj(store-6, you-4)", "advmod(store-6, just-5)", "root(ROOT-0, store-6)", "det(Range-9, the-7)", "dobj(store-6, Range-9)"]
#         ]
# }
# putting it all together

import nltk.data
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

def getpoliteImpoliteScores(text):
    dataDict = {}
    dataDict["text"] = text
    sentences = sent_detector.tokenize(dataDict["text"].strip())
    dataDict["sentences"] = sentences
    parses = []
    for sentence in sentences:
        out = dep_parse_sentence(sentence)
        depsList = []
        deps = out['sentences'][0]['basicDependencies']
        for dep in deps:
            d = dep
            newDepFmt = d['dep'].lower() + "(" + d['governorGloss']+"-"+str(d['governor'])+", "+d['dependentGloss']+"-"+str(d['dependent'])+")"
            depsList.append(newDepFmt)
        parses.append(depsList)
    dataDict["parses"] = parses
    return score(dataDict)


In [62]:
import json
json_data=open('./out/HomeKitchenCleanBOWSentimentDict.json').read()
HomeKitchenCleanBOWSentimentDict = json.loads(json_data)

In [65]:
HomeKitchenCleanBOWSentimentPolitenessDict = {}
for asin in HomeKitchenCleanBOWSentimentDict:
    try:
        HomeKitchenCleanBOWSentimentPolitenessDict[asin] = HomeKitchenCleanBOWSentimentDict[asin]
        descriptionPoliteImpoliteScores = getpoliteImpoliteScores(str(HomeKitchenCleanBOWSentimentDict[asin]['description']))
        reviewPoliteImpoliteScores = getpoliteImpoliteScores(str(HomeKitchenCleanBOWSentimentDict[asin]['Best review']))
        HomeKitchenCleanBOWSentimentPolitenessDict[asin]['descriptionPoliteness'] = descriptionPoliteImpoliteScores['polite']
        HomeKitchenCleanBOWSentimentPolitenessDict[asin]['reviewPoliteness'] = reviewPoliteImpoliteScores['polite']
    except Exception:
        print(asin)
        pass

B000REMVGK
B001NJ4J6I
B0018O2XFC
B000QJ1MZC


In [69]:
HomeKitchenCleanBOWSentimentPolitenessDict['B003ISJ4L2']

{u'Best review': u"I opted for regular shipping as I didn't really want it to arrive the day before Christmas (for an extra $69 too). However, that's exactly when it did arrive! It was a gift to my daughter & son-in-law. They assembled it Christmas night after we left and had it together in no time. They love it!",
 u'description': u'Versatile, functional and stylish all describe this kitchen island with its cottage oak finish. The island not only provides added kitchen work surface and storage but with its 11-1/2-inch breakfast bar extended also provides a convenient place to grab a bite to eat or enjoy a morning cup of coffee. Find adjustable shelving on both ends of the island as well as two utility drawers and two cabinets, each with two adjustable shelves yielding lots of storage. Enhancing the style and cottage oak finish, the island features both hand applied physical and finish distressing. Overall construction is of sustainable hardwood with a clear coat finish helping to prot

In [67]:
with open('./out/HomeKitchenCleanBOWSentimentPolitenessDict.json', 'w') as outfile:
    json.dump(HomeKitchenCleanBOWSentimentPolitenessDict, outfile)