# Resources

* https://nlp.stanford.edu/sentiment/index.html
* Download all of the data files from https://nlp.stanford.edu/sentiment/code.html (see the "Dataset Downloads" section); the code below reads them from `../data/stanfordSentimentTreebank/`
* This code runs the CoreNLP sentiment pipeline from the command line (via `java`). You may need to change the paths or the code depending on your operating system

In [191]:
import os
import re
import subprocess

import numpy as np
import spacy
import pandas as pd

In [20]:
# Tab-separated sentence table; the file's first row supplies the column names.
sentences_path = '../data/stanfordSentimentTreebank/datasetSentences.txt'
sentences = pd.read_csv(sentences_path, delimiter='\t')
sentences.head()

Unnamed: 0,sentence_index,sentence
0,1,The Rock is destined to be the 21st Century 's...
1,2,The gorgeously elaborate continuation of `` Th...
2,3,Effective but too-tepid biopic
3,4,If you sometimes like to go to the movies to h...
4,5,"Emerges as something rare , an issue movie tha..."


In [22]:
# Comma-separated table that assigns each sentence_index a splitset_label.
split_path = '../data/stanfordSentimentTreebank/datasetSplit.txt'
datasetSplit = pd.read_csv(split_path, delimiter=',')
datasetSplit.head()

Unnamed: 0,sentence_index,splitset_label
0,1,1
1,2,1
2,3,2
3,4,2
4,5,2


In [23]:
# Pipe-separated table of sentiment values keyed by phrase id.
labels_path = '../data/stanfordSentimentTreebank/sentiment_labels.txt'
sentiment_labels = pd.read_csv(labels_path, delimiter='|')
sentiment_labels.head()

Unnamed: 0,phrase ids,sentiment values
0,0,0.5
1,1,0.5
2,2,0.44444
3,3,0.5
4,4,0.42708


In [25]:
# Pipe-separated phrase -> phrase id table. The file is read without a header
# row, so the column names are supplied here.
dictionary_path = '../data/stanfordSentimentTreebank/dictionary.txt'
dictionary_columns = ['phrase', 'phrase ids']
dictionary = pd.read_csv(
    dictionary_path,
    delimiter='|',
    header=None,
    names=dictionary_columns,
)
dictionary.head()

Unnamed: 0,phrase,phrase ids
0,!,0
1,! ',22935
2,! '',18235
3,! Alas,179257
4,! Brilliant,22936


In [114]:
# Attach a sentiment value to every phrase by joining on the shared phrase id.
phrase_sentiments = pd.merge(dictionary, sentiment_labels, on='phrase ids')
phrase_sentiments.head()

Unnamed: 0,phrase,phrase ids,sentiment values
0,!,0,0.5
1,! ',22935,0.52778
2,! '',18235,0.5
3,! Alas,179257,0.44444
4,! Brilliant,22936,0.86111


In [57]:
# Keep the sentences whose split label is 2, then look up each full sentence in
# the phrase dictionary to obtain its phrase id and sentiment value.
labelled_sentences = pd.merge(sentences, datasetSplit, on='sentence_index')
out_df = labelled_sentences[labelled_sentences['splitset_label'] == 2]
out_df = pd.merge(out_df, dictionary, how='left', left_on='sentence', right_on='phrase')
# Default (inner) join on the phrase id
out_df = pd.merge(out_df, sentiment_labels, on='phrase ids')
# 'phrase' duplicates 'sentence' for every matched row
out_df = out_df.drop('phrase', axis=1)

print(out_df.shape)
out_df.head(10)

(2125, 5)


Unnamed: 0,sentence_index,sentence,splitset_label,phrase ids,sentiment values
0,3,Effective but too-tepid biopic,2,13995,0.51389
1,4,If you sometimes like to go to the movies to h...,2,14123,0.73611
2,5,"Emerges as something rare , an issue movie tha...",2,13999,0.86111
3,6,The film provides some great insight into the ...,2,14498,0.59722
4,7,Offers that rare combination of entertainment ...,2,14351,0.83333
5,8,Perhaps no picture ever made has more literall...,2,14371,0.69444
6,9,Steers turns in a snappy screenplay that curls...,2,225968,0.77778
7,10,But he somehow pulls it off .,2,222746,0.73611
8,11,Take Care of My Cat offers a refreshingly diff...,2,14475,0.76389
9,12,"This is a film well worth seeing , talking and...",2,14534,0.90278


In [58]:
# Write only the sentence text, one sentence per row, without header or index.
out_df[['sentence']].to_csv('../data/small_test.csv', header=False, index=False)

In [62]:
# Write every sentence to its own text file, named after its sentence index,
# and remember each path wrapped in double quotes for later use.
file_list = []
base_dir = 'D:/Users/john/My Documents/UIC/CS 594 - AIMA/papers/Sentiment Analysis - RNTN/data/test_files/'

for sentence_index, sentence in zip(out_df['sentence_index'], out_df['sentence']):
    target_path = base_dir + 'test_' + str(sentence_index) + '.txt'
    file_list.append('"' + target_path + '"')

    with open(target_path, 'w') as f:
        f.write(sentence)
        
#with open('../data/file_list.txt', 'w') as f:
#    for i in file_list:
#        f.write(i + ',')

In [80]:
# Run the CoreNLP sentiment pipeline once per sentence file.
# The command is passed as an argument list rather than one string: a plain
# string without a shell is only accepted on Windows, whereas a list works on
# every platform and needs no manual quoting of paths that contain spaces.
corenlp_dir = 'D:/Users/john/My Documents/stanford-corenlp-full-2017-06-09'

for file_name in file_list:
    subprocess.run(
        [
            'java', '-cp', '*', '-mx5g',
            'edu.stanford.nlp.sentiment.SentimentPipeline',
            # file_list entries carry surrounding double quotes; strip them
            '-fileList', file_name[1:-1],
            '-output', 'pennTrees',
        ],
        cwd=corenlp_dir,
        # Fail here, with the java exit status, instead of later when the
        # expected '.out' file turns out to be missing.
        check=True,
    )

In [92]:
# Collect one prediction per sentence file from the matching '.out' file.
predictions = []

for file_name in file_list:
    # file_list entries carry surrounding double quotes; strip them
    with open(file_name[1:-1] + '.out', 'r') as f:
        f.readline()  # the first line is skipped; the second line is kept
        # rstrip only removes a trailing newline, so a final line without one
        # keeps its last character (a blind [:-1] would chop it off)
        predictions.append(f.readline().rstrip('\n'))

In [94]:
# Assign positionally: predictions were collected in out_df's row order.
# A plain list raises on a length mismatch, whereas a pd.Series would be
# aligned on index labels and silently fill NaN wherever labels differ.
out_df['predictions'] = predictions

In [95]:
# Persist the sentences together with their predictions (row index omitted).
predictions_path = '../data/test_predictions.txt'
out_df.to_csv(predictions_path, index=False)

# Named entities

In [3]:
# Load the spaCy English pipeline; its entity annotations (doc.ents) are used
# further down.
# NOTE(review): 'en' is a shortcut model name. Newer spaCy releases appear to
# require the full package name (e.g. 'en_core_web_sm') -- confirm against the
# installed spaCy version before re-running.
nlp = spacy.load('en')

In [201]:
# nlp.pipe yields documents lazily and can only be consumed once; materialise
# it so that the cell which iterates over `docs` can be re-run on its own.
docs = list(nlp.pipe(out_df['sentence'].tolist()))

In [202]:
# Collect every named entity whose exact text is a known phrase, together with
# its entity label and the sentiment value stored for that phrase.
ne_text = []
ne_label = []
ne_sentiment = []

# Build the phrase -> sentiment lookup once instead of scanning the whole
# phrase table for every entity. setdefault keeps the first occurrence of a
# phrase, matching a "take the first matching row" lookup.
sentiment_by_phrase = {}
for known_phrase, sentiment_value in zip(phrase_sentiments['phrase'],
                                         phrase_sentiments['sentiment values']):
    sentiment_by_phrase.setdefault(known_phrase, sentiment_value)

for doc in docs:
    for ent in doc.ents:
        # Entities whose text is not a known phrase are skipped
        if ent.text not in sentiment_by_phrase:
            continue

        ne_text.append(ent.text)
        ne_label.append(ent.label_)
        ne_sentiment.append(sentiment_by_phrase[ent.text])

sent_ents = pd.DataFrame(
    {'text': ne_text, 'label': ne_label, 'sentiment_val': ne_sentiment},
    columns=['text', 'label', 'sentiment_val'],
)
# Bucket the continuous sentiment value into five classes 0-4:
# [0, 0.2], (0.2, 0.4], (0.4, 0.6], (0.6, 0.8], (0.8, 1.0]
sent_ents['sent_bin'] = pd.cut(
    sent_ents['sentiment_val'],
    bins=[0.0, 0.2, 0.4, 0.6, 0.8, 1.0],
    labels=[0, 1, 2, 3, 4],
    include_lowest=True,
)
sent_ents.head()

Unnamed: 0,text,label,sentiment_val,sent_bin
0,Wasabi,ORG,0.5,2
1,Steers,ORG,0.5,2
2,Asian,NORP,0.5,2
3,Wisegirls,GPE,0.5,2
4,Wendigo,PERSON,0.5,2


In [158]:
# Number of named entities in each sentiment bin
entity_sentiment_bins = sent_ents['sent_bin']
entity_sentiment_bins.value_counts()

2    1412
3      92
1      46
4       8
0       1
Name: sent_bin, dtype: int64

In [203]:
# How many named entities consist of 1, 2, 3, ... space-separated tokens
tokens_per_entity = sent_ents['text'].str.split(' ').str.len()
tokens_per_entity.value_counts()

1    1057
2     366
3     103
4      21
5      11
6       1
Name: text, dtype: int64

In [187]:
# Examples: the first few named entities made of exactly three tokens
has_three_tokens = sent_ents['text'].str.split(' ').str.len() == 3
sent_ents.loc[has_three_tokens, :].head()

Unnamed: 0,text,label,sentiment_val,sent_bin
7,Lovely & Amazing,ORG,0.77778,3
9,about 95 minutes,TIME,0.5,2
47,a good half-hour,TIME,0.79167,3
71,the Discovery Channel,ORG,0.58333,2
74,Steve Irwin 's,PERSON,0.45833,2


In [204]:
# Parse predicted trees to get the predicted sentiment label for a phrase
def get_phrase_prediction(phrase, df):
    """Return the sentiment class predicted for `phrase`, or NaN if not found.

    The first row of `df` whose 'sentence' contains `phrase` (as a literal
    substring) is selected, and its 'predictions' tree -- nodes written as
    ``(<digit> ...)`` -- is searched for a node that spans exactly the
    tokens of `phrase`. Only phrases of one, two or three space-separated
    tokens are supported.

    Parameters
    ----------
    phrase : str
        Phrase to look up; tokens are separated by single spaces.
    df : pandas.DataFrame
        Must provide the columns 'sentence' and 'predictions'.

    Returns
    -------
    float
        The digit labelling the matched node, as a float. NaN when no
        sentence contains the phrase, the selected row has no tree, the
        phrase has more than three tokens, or no supported tree shape
        matches.
    """
    # Literal (non-regex) match so characters such as '(' or '.' in the phrase
    # are taken verbatim; na=False keeps missing sentences out of the mask.
    contains_phrase = df['sentence'].str.contains(phrase, regex=False, na=False)
    trees = df.loc[contains_phrase, 'predictions'].values

    if len(trees) == 0:
        return np.nan

    # Only the first matching sentence is consulted
    tree = trees[0]
    if not isinstance(tree, str):
        # Missing prediction (e.g. NaN) -- nothing to parse
        return np.nan

    # Escape the tokens so regex metacharacters in them cannot alter the pattern
    tokens = [re.escape(token) for token in phrase.split(' ')]
    node_open = r'\(\d '
    leaves = [node_open + token + r'\)' for token in tokens]

    if len(tokens) == 1:
        # (d w0)
        patterns = [leaves[0]]
    elif len(tokens) == 2:
        # (d (d w0) (d w1)
        patterns = [node_open + leaves[0] + ' ' + leaves[1]]
    elif len(tokens) == 3:
        patterns = [
            # flat:         (d (d w0) (d w1) (d w2)
            node_open + leaves[0] + ' ' + leaves[1] + ' ' + leaves[2],
            # left-nested:  (d (d (d w0) (d w1)) (d w2)
            node_open + node_open + leaves[0] + ' ' + leaves[1] + r'\) ' + leaves[2],
            # right-nested: (d (d w0) (d (d w1) (d w2)
            node_open + leaves[0] + ' ' + node_open + leaves[1] + ' ' + leaves[2],
        ]
    else:
        patterns = []

    for pattern in patterns:
        match = re.search(pattern, tree)
        if match:
            # Character 1 of the match is the digit of the outermost node
            return float(match.group(0)[1])

    return np.nan

In [197]:
# Look up the predicted sentiment class of every named entity in out_df
sent_ents['sent_pred_bin'] = sent_ents['text'].apply(
    lambda entity_text: get_phrase_prediction(entity_text, df=out_df)
)

In [199]:
# Contingency table: sentiment bin (rows) against predicted bin (columns)
pd.crosstab(index=sent_ents['sent_bin'], columns=sent_ents['sent_pred_bin'])

sent_pred_bin,1.0,2.0,3.0
sent_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,1,0
1,26,13,3
2,1,1242,11
3,1,30,52
4,0,3,3
