In [246]:
import re
import urllib
import json
import numpy as np
import pandas as pd
import csv
import nltk
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from nltk import FreqDist
import random

stopwords = stopwords.words("english")

In [247]:
with open('config.json') as config_file:
    data = json.load(config_file)
    
wassa = data["WASSA"]
archive = data["archive"]
corporate = data["corporate"]

global_dataset = np.array([])

## WASSA

In [248]:
def clearify_post(post):
    data = list(map(lambda x: x.rstrip(), post.split('\t')))
    return { "text": data[1], "emotion": data[2] }

def get_wassa_dataset(url):
    file = urllib.request.urlopen(url)
    posts = list(map(lambda x : x.decode("utf-8"), file.readlines()))
    return list(map(clearify_post, posts))

In [249]:
total_local_dataset = np.array([])
for key in wassa:
    for dataset_version in wassa[key]:
        local_dataset = get_wassa_dataset(wassa[key][dataset_version])
        global_dataset = np.concatenate((global_dataset, np.array(local_dataset)))
        total_local_dataset = np.concatenate((total_local_dataset, np.array(local_dataset)))
        print("local dataset:", len(local_dataset))
print("total local dataset:", total_local_dataset.shape[0])
print("global dataset:", global_dataset.shape[0])

local dataset: 857
local dataset: 84
local dataset: 1147
local dataset: 110
local dataset: 823
local dataset: 79
local dataset: 786
local dataset: 74
total local dataset: 3960
global dataset: 3960


In [250]:
df = pd.DataFrame(list(total_local_dataset))
df[['emotion', 'text']].groupby(['emotion']).count()

Unnamed: 0_level_0,text
emotion,Unnamed: 1_level_1
anger,941
fear,1257
joy,902
sadness,860


## Archive

In [251]:
def clearify_comment(comment):
    clear_comment = comment.rstrip().split(';')
    return { "text" : clear_comment[0], "emotion": clear_comment[1] }

def get_archive_dataset(url):
    with open(url) as file:
        data = file.readlines()
    return list(map(clearify_comment, data))

In [252]:
total_local_dataset = np.array([])
for key in archive:
    for dataset_version in archive[key]:
        local_dataset = get_archive_dataset(archive[key][dataset_version])
        total_local_dataset = np.concatenate((total_local_dataset, np.array(local_dataset)))
        global_dataset = np.concatenate((global_dataset, np.array(local_dataset)))
        print("local dataset:", len(local_dataset))
print("total local dataset:", total_local_dataset.shape[0])
print("global dataset:", global_dataset.shape[0])

local dataset: 2000
local dataset: 2000
local dataset: 16000
total local dataset: 20000
global dataset: 23960


In [253]:
df = pd.DataFrame(list(total_local_dataset))
df[['emotion', 'text']].groupby(['emotion']).count()

Unnamed: 0_level_0,text
emotion,Unnamed: 1_level_1
anger,2709
fear,2373
joy,6761
love,1641
sadness,5797
surprise,719


In [254]:
def clearify_corporate(text):
    return { "text" : text, "emotion" : "corporate" }

total_local_dataset = np.array([])
with open(corporate, encoding = "ISO-8859-1") as csvfile:
    corporate_reader = csv.DictReader(csvfile, delimiter=',')
    reviews = [row['text'] for row in corporate_reader]
    local_dataset = list(map(clearify_corporate, reviews))
    total_local_dataset = np.concatenate((total_local_dataset, np.array(local_dataset)))
    global_dataset = np.concatenate((global_dataset, np.array(local_dataset)))
    print("local dataset:", len(local_dataset))
print("total local dataset:", total_local_dataset.shape[0])
print("global dataset:", global_dataset.shape[0])

local dataset: 3118
total local dataset: 3118
global dataset: 27078


In [255]:
df = pd.DataFrame(list(total_local_dataset))
df[['emotion', 'text']].groupby(['emotion']).count()

Unnamed: 0_level_0,text
emotion,Unnamed: 1_level_1
corporate,3118


## Pandas & NLTK

In [258]:
# list(np.array) wtf?
df = pd.DataFrame(list(global_dataset))
print(df.count())
df[['emotion', 'text']].groupby(['emotion']).count()

text       27078
emotion    27078
dtype: int64


Unnamed: 0_level_0,text
emotion,Unnamed: 1_level_1
anger,3650
corporate,3118
fear,3630
joy,7663
love,1641
sadness,6657
surprise,719


In [259]:
def remove_noise(tokens, stop_words = ()):
    stemmer = SnowballStemmer("english")
    cleaned_tokens = []
    for token in tokens:
        if len(token) > 0 and not re.search(r'[^0-9a-zA-Z]+', token) and token.lower() not in stop_words:
            cleaned_tokens.append(stemmer.stem(token))
    return cleaned_tokens

def get_all_words(cleaned_tokens_list):
    for tokens in cleaned_tokens_list:
        for token in tokens:
            yield token

def get_tokens_for_model(cleaned_tokens_list):
    for tokens in cleaned_tokens_list:
        yield dict([token, True] for token in tokens)

In [263]:
emotions = df['emotion'].drop_duplicates().tolist()
global_dataset = {}

for em in emotions:
    print("NLTK:", em)
    dataset = df[df['emotion'] == em]['text'].astype('str').to_numpy()
    
    tokens = [nltk.word_tokenize(text) for text in dataset]
    cleaned_tokens = [remove_noise(token, stopwords) for token in tokens]
    words = get_all_words(cleaned_tokens)
    
    freq_dist = FreqDist(words)
    print(freq_dist_pos.most_common(10))
    
    global_dataset[em] = [(text_dict, em) for text_dict in get_tokens_for_model(cleaned_tokens)]

global_dataset

NLTK: anger
[('feel', 2843), ('like', 563), ('im', 444), ('get', 282), ('peopl', 203), ('time', 202), ('want', 196), ('know', 188), ('realli', 186), ('think', 177)]
NLTK: fear
[('feel', 2843), ('like', 563), ('im', 444), ('get', 282), ('peopl', 203), ('time', 202), ('want', 196), ('know', 188), ('realli', 186), ('think', 177)]
NLTK: joy
[('feel', 2843), ('like', 563), ('im', 444), ('get', 282), ('peopl', 203), ('time', 202), ('want', 196), ('know', 188), ('realli', 186), ('think', 177)]
NLTK: sadness
[('feel', 2843), ('like', 563), ('im', 444), ('get', 282), ('peopl', 203), ('time', 202), ('want', 196), ('know', 188), ('realli', 186), ('think', 177)]
NLTK: love
[('feel', 2843), ('like', 563), ('im', 444), ('get', 282), ('peopl', 203), ('time', 202), ('want', 196), ('know', 188), ('realli', 186), ('think', 177)]
NLTK: surprise
[('feel', 2843), ('like', 563), ('im', 444), ('get', 282), ('peopl', 203), ('time', 202), ('want', 196), ('know', 188), ('realli', 186), ('think', 177)]
NLTK: cor

array(['Barclays CEO stresses the importance of regulatory and cultural reform in financial services at Brussels conference  http://t.co/Ge9Lp7hpyG',
       'Barclays announces result of Rights Issue http://t.co/LbIqqh3wwG',
       'Barclays publishes its prospectus for its å£5.8bn Rights Issue: http://t.co/YZk24iE8G6',
       ...,
       'Yesterday, these #HealthyKids lit up Broadway with #Nestle, @iaaforg and some sporting stars: http://t.co/YdtBj60Ofz',
       'Yo-Jelly, Danone new brand in South Africa : the fun taste sensation of jelly &amp; the health benefits of yoghurt ! #Danone #Yojelly',
       'Z Bhutta: Problems with food&amp;land systems include land acquistion, commodity speculation affecting food prices&amp;lack of discussion #NINS2013'],
      dtype=object)

In [265]:
dataset_for_model = []
for key in global_dataset.keys():
    dataset_for_model += global_dataset[key]
random.shuffle(dataset_for_model)
len(dataset_for_model)

27078

In [266]:
text_count = (df.shape[0] * 80) // 100

train_data = dataset_for_model[:text_count]
test_data = dataset_for_model[text_count:]

print("train data:", len(train_data))
print("test data:", len(test_data))

train data: 21662
test data: 5416


In [267]:
from nltk import classify
from nltk import NaiveBayesClassifier

classifier = NaiveBayesClassifier.train(train_data)

print("Accuracy is:", classify.accuracy(classifier, test_data))

print(classifier.show_most_informative_features(10))

Accuracy is: 0.7324593796159528
Most Informative Features
                    feel = True           surpri : corpor =    550.8 : 1.0
                    feel = None           corpor : love   =    526.2 : 1.0
                    http = True           corpor : fear   =    395.1 : 1.0
                   nestl = True           corpor : sadnes =    329.1 : 1.0
                  health = True           corpor : fear   =    215.8 : 1.0
                      rt = True           corpor : sadnes =    215.3 : 1.0
                       1 = True           corpor : joy    =    193.1 : 1.0
                    daze = True           surpri : joy    =    188.3 : 1.0
                 curious = True           surpri : anger  =    185.8 : 1.0
               overwhelm = True           surpri : corpor =    173.7 : 1.0
None


In [268]:
result = [classifier.classify(test_data[i][0]) == test_data[i][1] for i in range(len(test_data))]
print("Correct answers:", sum(result) * 100 / len(result), "%")

Correct answers: 73.24593796159527 %


In [269]:
def classify(text):
    tokens = remove_noise(nltk.tokenize.word_tokenize(text))
    return classifier.classify(dict([token, True] for token in tokens))

In [273]:
classify("we prefer to work with kanban, so please answer me asap")

'corporate'