In [1]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd
from helper.constantes import *

In [2]:
# One shared VADER analyzer instance; sentiment_scores() below reads it as a global.
sid_obj = SentimentIntensityAnalyzer()

def sentiment_scores(sentence, metric):
    """Score ``sentence`` with the shared VADER analyzer and return one metric.

    ``metric`` is the key looked up in the dictionary produced by
    ``sid_obj.polarity_scores``, which holds the pos, neg, neu and compound
    scores.
    """
    return sid_obj.polarity_scores(sentence)[metric]

In [3]:
# Read hp1.csv through hp8.csv from the movies folder and stack them into one frame.
all_df = []
for i in range(1, 9):
    all_df.append(pd.read_csv(data_folder + movies_folder + f"hp{i}.csv"))
# `df` stays bound to the combined frame as well, as a second name for `df_tot`.
df_tot = df = pd.concat(all_df)

In [4]:
# Score every dialogue line with VADER's "compound" metric. Only the `dialog`
# column is needed, so map over that Series rather than using a row-wise
# DataFrame.apply(axis=1), which builds a full row Series for every line.
# str() is kept so that missing dialogues (NaN) reach the analyzer as text.
df_tot['polarity_score'] = df_tot['dialog'].map(
    lambda dialog: sentiment_scores(str(dialog), "compound")
)

In [5]:
# Show the combined frame, now including the polarity_score column.
df_tot

Unnamed: 0,movie,chapter,character,dialog,polarity_score
0,Harry Potter and the Philosopher's Stone,Doorstep Delivery,Albus Dumbledore,I should have known that you would be here...P...,0.0000
1,Harry Potter and the Philosopher's Stone,Doorstep Delivery,Minerva McGonagall,"Good evening, Professor Dumbledore. Are the ru...",0.6908
2,Harry Potter and the Philosopher's Stone,Doorstep Delivery,Albus Dumbledore,"I'm afraid so, Professor. The good, and the bad.",-0.1531
3,Harry Potter and the Philosopher's Stone,Doorstep Delivery,Minerva McGonagall,And the boy?,0.0000
4,Harry Potter and the Philosopher's Stone,Doorstep Delivery,Albus Dumbledore,Hagrid is bringing him.,0.0000
...,...,...,...,...,...
707,Harry Potter and the Deathly Hallows Part 2,Nineteen Years Later,Harry Potter,Then Slytherin House will have gained a wonder...,0.4854
708,Harry Potter and the Deathly Hallows Part 2,Nineteen Years Later,Albus Potter,Really?,0.0000
709,Harry Potter and the Deathly Hallows Part 2,Nineteen Years Later,Harry Potter,Really.,0.0000
710,Harry Potter and the Deathly Hallows Part 2,Nineteen Years Later,Harry Potter,Ready?,0.3612


In [23]:
import text2emotion as te

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alessio/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/alessio/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/alessio/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [34]:
# Sample sentence passed to te.get_emotion() in the next cell.
text = "We'll be crushed! Hurry!"

In [35]:
# Ask text2emotion for its emotion scores on the sample sentence
# (the recorded output is 0 for all five emotions).
te.get_emotion(text)

{'Happy': 0, 'Angry': 0, 'Surprise': 0, 'Sad': 0, 'Fear': 0}

In [36]:
import re 
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

def read_data(file):
    """Parse a labelled text file into ``[label, text]`` pairs.

    Each non-blank line is expected to look like ``[<label tokens>] <text>``.
    The label is everything between the first character and the first ``]``,
    with runs of whitespace collapsed to single spaces; the text is whatever
    follows that ``]``, stripped of surrounding whitespace.

    Args:
        file: Path of the file to read (decoded as UTF-8).

    Returns:
        A list of ``[label, text]`` lists, one per non-blank line, in file
        order.

    Raises:
        ValueError: If a non-blank line contains no ``]``.
    """
    data = []
    # Pin the encoding so parsing does not depend on the platform's locale default.
    with open(file, 'r', encoding='utf-8') as f:
        for lineno, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                # A blank line would otherwise be stored as an empty ["", ""] record.
                continue
            closing = line.find("]")
            if closing == -1:
                # str.find signals "not found" with -1; slicing with that value
                # would silently drop the last character and mislabel the line.
                raise ValueError(
                    "line {} of {!r} has no ']' closing the label: {!r}".format(
                        lineno, file, line
                    )
                )
            label = ' '.join(line[1:closing].split())
            text = line[closing + 1:].strip()
            data.append([label, text])
    return data

# Load the labelled dataset from text.txt (path relative to the working directory)
# and report how many instances were parsed.
file = 'text.txt'
data = read_data(file)
print("Number of instances: {}".format(len(data)))

Number of instances: 7480


In [37]:
def ngram(token, n):
    """Return every run of ``n`` consecutive tokens, each joined by a space.

    ``token`` is a sequence of strings. Results keep left-to-right order, and
    for ``n >= 1`` the list is empty when ``token`` has fewer than ``n`` items.
    """
    return [' '.join(token[start:start + n]) for start in range(len(token) - n + 1)]

def create_feature(text, nrange=(1, 1)):
    """Count the n-gram and punctuation features of ``text``.

    The text is lower-cased first. Word tokens come from the text with every
    character outside ``a-z``, ``0-9`` and ``#`` blanked out; n-grams of each
    size from ``nrange[0]`` to ``nrange[1]`` (inclusive) are counted. Symbol
    tokens come from the same lower-cased text with ``a-z`` and ``0-9``
    blanked out, and are counted individually.

    Returns a ``Counter`` mapping each feature string to its frequency.
    """
    lowered = text.lower()
    words = re.sub('[^a-z0-9#]', ' ', lowered).split()
    symbols = re.sub('[a-z0-9]', ' ', lowered).split()

    smallest, largest = nrange[0], nrange[1]
    counts = Counter()
    for size in range(smallest, largest + 1):
        counts.update(ngram(words, size))
    # Whatever survives the alphanumeric blanking is added as unigrams.
    counts.update(ngram(symbols, 1))
    return counts

In [38]:
def convert_label(item, name):
    """Translate a whitespace-separated indicator string into label names.

    Every token of ``item`` is parsed as a float. Wherever the value equals 1,
    the entry of ``name`` at the same position is selected. The selected names
    are returned as one space-separated string (empty when none is selected).
    """
    flags = [float(token) for token in item.split()]
    chosen = [name[pos] for pos, flag in enumerate(flags) if flag == 1]
    return ' '.join(chosen).strip()

emotions = ["joy", 'fear', "anger", "sadness", "disgust", "shame", "guilt"]

# Build the target labels and feature counts for every instance.
# Comprehensions are used on purpose: their variables stay local, whereas a
# notebook-level `for label, text in data:` loop rebinds the global `text`, so
# the sample sentence defined in an earlier cell would be replaced by the last
# training instance before the prediction cell reads it.
y_all = [convert_label(raw_label, emotions) for raw_label, _ in data]
X_all = [create_feature(sample, nrange=(1, 4)) for _, sample in data]

In [39]:
# Hold out 20% of the instances for testing; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size = 0.2, random_state = 123)

def train_test(clf, X_train, X_test, y_train, y_test):
    """Fit ``clf`` on the training split and score it on both splits.

    Returns a ``(train_accuracy, test_accuracy)`` tuple computed with
    ``accuracy_score`` on the classifier's own predictions.
    """
    clf.fit(X_train, y_train)
    splits = ((X_train, y_train), (X_test, y_test))
    scores = [accuracy_score(targets, clf.predict(inputs)) for inputs, targets in splits]
    return scores[0], scores[1]

from sklearn.feature_extraction import DictVectorizer
# Turn the per-instance feature counts into sparse matrices. The vectorizer is
# fitted on the training split only and then applied as-is to the test split.
vectorizer = DictVectorizer(sparse = True)
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [49]:
# Candidate classifiers to compare on the vectorized features.
svc = SVC(max_iter=2000)
lsvc = LinearSVC(random_state=123)
rforest = RandomForestClassifier(random_state=123)
# Seeded like LinearSVC and RandomForest above so the fitted tree, and the
# accuracy printed for it, is the same on every run.
dtree = DecisionTreeClassifier(random_state=123)

# Keep this order: the prediction cell picks a model by position (clifs[1]).
clifs = [svc, lsvc, rforest, dtree]

# Fit each classifier and print one table row with its train/test accuracy.
print("| {:25} | {} | {} |".format("Classifier", "Training Accuracy", "Test Accuracy"))
print("| {} | {} | {} |".format("-"*25, "-"*17, "-"*13))
for clf in clifs:
    clf_name = clf.__class__.__name__
    train_acc, test_acc = train_test(clf, X_train, X_test, y_train, y_test)
    print("| {:25} | {:17.7f} | {:13.7f} |".format(clf_name, train_acc, test_acc))

| Classifier                | Training Accuracy | Test Accuracy |
| ------------------------- | ----------------- | ------------- |
| SVC                       |         0.9067513 |     0.4512032 |




| LinearSVC                 |         0.9988302 |     0.5768717 |


KeyboardInterrupt: 

In [47]:


# Count the 1- to 4-gram features of `text`, vectorize them with the fitted
# vectorizer, and classify them with the second model in `clifs`.
features = vectorizer.transform(create_feature(text, nrange=(1, 4)))
prediction = clifs[1].predict(features)

In [48]:
# Display the predicted label array.
prediction

array(['guilt'], dtype='<U7')