In [1]:
import regex as re
import sys
import os
import json
import random
import math
import operator as op
import docopt
import numpy as np
import os.path
import itertools
from os import path
from tqdm import tqdm
from scipy.spatial.distance import jensenshannon, cosine
from numpy import asarray
import statistics 
from collections import Counter, defaultdict, namedtuple

from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics.pairwise import cosine_similarity, chi2_kernel
from scipy.spatial import distance
import joblib
import shutil

Report = namedtuple("Report", ["precision", "recall", "accuracy", "f1", "tp", "tn", "fp", "fn"])
JSON = "unified-dataset.jsonl"

In [2]:
#this method is used to get the classifier mode and decide whether to single-label or multi-label classification
#this method comes from the original authors and is kept to replicate their results
def get_clf_mode(train, test):
    first = "single"
    for example in train:
        if example.get("labeled", "multi") == "multi":
            first = "multi"
    print(first)
    for example in test:
        if example.get("labeled", "multi") == "multi":
            return first, "multi"
    print("oof")
    return first, "single"

In [3]:
#This methods is used to extract the training and testing data from the unified corpus json.
#The unified corpus json must be produced using the authors original code
#this version is only used in getting the benchmarks for the previous paper
#this version takes the jsonfile, the name of the train file and the name of the test file as parameters
def get_train_test(jsonfile, train, test):
    print("get_train_test param:")
    print("json ", jsonfile)
    print("train ", train)
    print("test ", test)
    same = test in train.split(",") #used if train and test corpus are same
    training, testing = [], []
    count1 = 0
    count2 = 0
    count3 = 0
    count4 = 0
    with open(jsonfile) as f:
        for line in f:
            data = json.loads(line)
            if(data["source"] == test):
                count1 += 1
            if(data["source"] != test):
                count2 += 1
            if((train == None or train == "all-vs") and data["source"] != test):
                count3 += 1
                training.append(data)
            elif data["source"] == test:
                count4 += 1
                testing.append(data)
            elif(data["source"] in train.split(",")):
                count3 += 1
                training.append(data)
    print("there were ", count1, " entries that were in test and ", count2, "that were not in test",
          "and ", count3, " that were in train")
    print("test was appended ", count4, " times")
    if same:
        training, testing = hacky_train_test_split(testing, train_size=0.8, first=train, second=test)
        print("revised", "there were ", len(testing), " entries that were in test and ", len(training), " that were in train")
    return training, testing

In [4]:
#this method stays as is from the original paper
def get_labels(train, test, operation=op.and_, mode="multi"):
    """Return a list of the emotional intersection of two sources."""
    emotions = set()
    if mode == "single":
        emotions.add("noemo")
    train_emotions = set(
        emotion
        for data in train
        for emotion in data["emotions"]
        if data["emotions"][emotion] is not None
    )
    # print(train_emotions)
    test_emotions = set(
        emotion
        for emotion in test[0]["emotions"]
        if test[0]["emotions"][emotion] is not None
    )
    # print(test_emotions)
    return list(emotions | operation(train_emotions, test_emotions))

In [5]:
#expects corpus list in data form
#returns compatible labels
def getMatchingLabels(corpora):
    emotionSetList = []
    for corpus in corpora:
        emoSet = set()
        for data in corpus:
            for emotion in data["emotions"]:
                if data["emotions"][emotion] == 1 or data["emotions"][emotion] == 0:
                    emoSet.add(emotion)
#         emoSet = set(emotion for data in corpus for emotion in data["emotions"] if data["emotions"][emotion] is not None)
        print(emoSet)
        emotionSetList.append(emoSet)
    intersectionSet = set.intersection(*emotionSetList)
    print("matched Labels:", intersectionSet)
    return intersectionSet

In [6]:
#this method stays as is from the original paper
def get_emotion(emovals, labels, emotions, mode="multi"):
#     print("get emotion mode ", mode)
#     print("emovals ",emovals)
#     print("labels ",labels)
#     print("emotions ",emotions)
    if mode == "single":
        truthy = len(list(filter(bool, emovals.values())))
        if truthy == 1:
            emotion = [v for v in emovals if emovals[v]][0]
        elif truthy == 0:
            emotion = "noemo"
        else:
            raise ValueError("Dataset marked as 'single' contains multiple emotions")
        return emotions.get(emotion, emotions.get("noemo"))
    else:
        el = [int((emovals[label] or 0) > 0.1) for label in labels]
        return np.array(el)

In [7]:
#this method stays as is from the original paper
def get_vector(text, wordlist):
    tokens = set(tokenize(text))
#     print(tokens)
    return [1 if word in tokens else 0 for word in wordlist]

In [8]:
#The comment below was left by the original authors. As you can see, their results were unable to use the full bag of words
# this is bad. memory error for all_vs (too many words...)
def get_wordlist(dataset):
    """Get a bag of words from a dataset."""
    bag = set()
    for data in dataset:
        bag.update({token for token in tokenize(data["text"])})
    return list(bag)

In [9]:
#bag of word limit of 5000 is kept from the original authors to match their results
def getTop5000Words(dataset):
    """Get a bag of words from a dataset."""
    bag = Counter()
    for data in dataset:
        bag.update({token for token in tokenize(data["text"])})
    print("bag size", len(bag))
#     print("bag", bag)
    out = list(map(op.itemgetter(0), bag.most_common(5000)))
#     print("this is the output", out)
    return out

In [10]:
#Taken from my own Ling 413 final project, I was going to run trials with lemmatization and other tokenization
#but by the time I was far enough in the project to do this, I didn't have time to run trials with this
# def cleanDataLemma(dataset):
#     taggedDataset = nltk.pos_tag(dataset)
#     filteredString = []
#     for token, tag in taggedDataset:
#         for char in token:
#             if char in string.punctuation:
#                 token = token.replace(char,"") #remove punctuation
#         if (token not in stopWords):
#             lemmatizedToken = ""
#             if tag[0] == 'N':
#                 lemmatizedToken = lemmatizer.lemmatize(token, 'n')
#             elif tag[0] == 'V':
#                 lemmatizedToken = lemmatizer.lemmatize(token, 'v')
#             else:
#                 lemmatizedToken = token
#             if len(lemmatizedToken) > 2:
#                 filteredString.append(lemmatizedToken)
#     return filteredString

In [11]:
#tokenization is kept the same so that performance results match the ones used in the paper as closely as possible
#if there is improvement, it should be because of my changes
def tokenize(text):
    return re.findall(r"\p{L}+", text.lower())

In [12]:
def getWordCountsByEmotion(dataset, emotionLabels):
    emotionCounts = []
    for emotion in emotionLabels:
        emotionDict = Counter()
        for data in dataset:
#             if data["emotions"][emotion] == 1:
#                 print(emotion)
#                 print(data)
#                 print(data["emotions"][emotion])
            emotionDict.update({token for token in tokenize(data["text"]) if data["emotions"][emotion] == 1})
#         print("emotionsWordCounts", len(emotionDict))
        emotionCounts.append(emotionDict)
    return emotionCounts

In [13]:
def getTokenFrequency(dataset):
    token2DocFreq = {}
    for data in dataset:
        tempDict = {}
        for word in data:
            if word not in tempDict:
                tempDict[word] = 1
        for key, value in tempDict.items():
            if key in token2DocFreq:
                token2DocFreq[key] += value
            else:
                token2DocFreq[key] = value
    return token2DocFreq

In [14]:
def getTokenizedCorpusTextPair(corpus1, corpus2):
    with open(JSON) as f:
        for line in f:
            data = json.loads(line)
            if (data["source"] in corpus1) or (corpus1 == None and data["source"] != corpus2):
                corpus1Text.append(tokenize(data["text"]))
                corpus1Data.append(data)
            if data["source"] in corpus2:
                corpus2Text.append(tokenize(data["text"]))
                corpus2Data.append(data)
    corporaData = [corpus1Data,corpus2Data]
    return corpus1Text, corpus2Text, corporaData

In [15]:
def getNormalizedFreq(tokenFreq):
# def getNormalizedFreq(corpus):
#     newCorpus = []
#     for entry in corpus:
#         newCorpus.append(tokenize(entry))
#     tokenFreq = getTokenFrequency(newCorpus)
#     print(tokenFreq.items())
#     print("freq values", tokenFreq)
    newTokenFreq = tokenFreq.copy()
    for item, freq in newTokenFreq.items():
        if(freq == 0):
            newTokenFreq[item] = 0
        else:
            newTokenFreq[item] = 1 + math.log10(freq)
#     print("log weighted values", tokenFreq)
    docLength = 0
    for freq in newTokenFreq.values():
        docLength += freq*freq
    docLength = math.sqrt(docLength)
#     print("doclength", docLength)
    for item, freq in newTokenFreq.items():
        newTokenFreq[item] = freq/docLength
    # logFreq = freq for freq in math.log() 
#     print("normalized")
#     print(tokenFreq)
    return newTokenFreq

In [16]:
def getCosineSimilarityFromTokenFreq(tokenFreq1, tokenFreq2):
    normFreq1 = getNormalizedFreq(tokenFreq1)
    normFreq2 = getNormalizedFreq(tokenFreq2)
    cosineSum = 0
    normFreq1.items()
#     print("tokenFrequency length 1", len(normFreq1))
#     print("tokenFrequency length 2", len(normFreq2))
    intersection = normFreq1.keys() & normFreq2.keys()
    #only loop intersection because unshared values will be multiplied by 0 anyway
    for item in intersection:
#         if normFreq1[item] < 0 or normFreq2[item] < 0 :
#             print("negative?", item, normFreq1[item],normFreq2[item])
#             sys.exit()
#         print(item)
#         print(normFreq1[item])
#         print(normFreq2[item])
        x = normFreq1[item] * normFreq2[item]
        cosineSum += x
    return cosineSum

In [17]:
def getCosineSimilarityFromCorpus(corpus1,corpus2):
    corpus1Text, corpus2Text, corporaData = getTokenizedCorpusTextPair(corpus1, corpus2)
    corpus1Data,corpus2Data = corporaData
    emotionLabels = getMatchingLabels(corporaData)
    tokenFreq1 = getTokenFrequency(corpus1Text)
    tokenFreq2 = getTokenFrequency(corpus2Text)
    sim = getCosineSimilarityFromTokenFreq(tokenFreq1, tokenFreq2)
    return sim

In [72]:
def getCosineSimilarityFromCorpusEmotions(corpus1,corpus2):
    corpus1Text, corpus2Text, corporaData = getTokenizedCorpusTextPair(corpus1, corpus2)
    corpus1Data,corpus2Data = corporaData
    emotionLabels = list(getMatchingLabels(corporaData))
    emotionDicts1 = getWordCountsByEmotion(corpus1Data, emotionLabels)
    emotionDicts2 = getWordCountsByEmotion(corpus2Data, emotionLabels)
    emoSim = {}
    for emotion in range(len(emotionLabels)):
        sim = getCosineSimilarityFromTokenFreq(emotionDicts1[emotion], emotionDicts2[emotion])
        emotion = emotionLabels[emotion]
        emoSim[emotion] = sim
    return emoSim

In [73]:
#averages the values the come from jensenshannon into a single value
def getJensenShannonFromNPArrays(np1,np2):
    js_pq = jensenshannon(np1, np2)
    print(js_pq)
    sumJS = 0
    length = len(js_pq)
    for x in js_pq:
        if math.isnan(x): #assume nan values should be interpretted as 0
            sumJS += 0
        else:
            sumJS += x
    js = sumJS/length
    return js

In [46]:
#test code for understanding how NP arrays are distributed
arr1 = np.load("ssec_emotion-causetrain_xNP.npy")
arr2 = np.load("ssec_grounded_emotionstrain_xNP.npy")
arr3 = np.load("isear_ssectest_xNP.npy")
arr4 = np.load("isear_ssectrain_xNP.npy")
arr5 = np.load("ssec_iseartest_xNP.npy")
arr5 = np.load("ssec_iseartrain_xNP.npy")
print(np.array_equal(arr1,arr2))
print(np.array_equal(arr2,arr3))
print(np.array_equal(arr3,arr4))
print(np.array_equal(arr4,arr5))
print(np.array_equal(arr1,arr5))
print(np.array_equal(arr3,arr5))
print(arr1.shape)
print(arr2.shape)
print(arr3.shape)
print(arr4.shape)
print(arr5.shape)
print(getJensenShannonFromNPArrays(arr1,arr2))
print(getJensenShannonFromNPArrays(arr2,arr3))
# print(getJensenShannonFromNPArrays(arr4,arr5))

True
False
False
False
True
False
(4868, 5000)
(4868, 5000)
(4868, 5000)
(7666, 5000)
(4868, 5000)
[0. 0. 0. ... 0. 0. 0.]
0.0


  q = q / np.sum(q, axis=0)


[0.65300289 0.71217394 0.65470119 ...        nan 0.83255461        nan]
0.48101544698034715


In [47]:
# def getChiSquare(observed,calculated):
#     chiSquare = ((observed - calculated)**2)/calculated
#     return chiSquare

In [48]:
#Calculates Chi Square, or at least it would have if I had finished implementing
#see paper for details on why it was not implemented
# def getChiSquareFromTokenFreq(tokenFreq1, tokenFreq2):
# columnTotal1 = sum(tokenFreq1.values())
# columnTotal2 = sum(tokenFreq2.values())
# intersection = tokenFreq1.keys() & tokenFreq2.keys()
# rowTotals = {key: tokenFreq1.get(key, 0) + tokenFreq2.get(key, 0)
#           for key in set(dict1) | set(dict2)}
# grandTotal = columnTotal1 + columnTotal2
# chiSquareTotal = 0
# calculated1 = []
# calculated2 = []
# for item, rowTotal in rowTotals.items():
#     calculated1[item] = (rowTotal * columnTotal1) / grandTotal
#     calculated2[item] = (rowTotal * columnTotal2) / grandTotal
# for item, value in calculated.items():
#     getChiSquare(observed,calculated[item])
#     calculated[item]
#     return js

In [49]:
#This is a validation of my corpus similarity metrics
corpus1 = "ssec"
corpus1Data = []
corpus1Text = []
corpus2 = "isear"
corpus2Data = []
corpus2Text = []
with open(JSON) as f:
    for line in f:
        data = json.loads(line)
        if data["source"] == corpus1:
            corpus1Text.append(tokenize(data["text"]))
            corpus1Data.append(data)
        if data["source"] == corpus2:
            corpus2Text.append(tokenize(data["text"]))
            corpus2Data.append(data)
print("loaded data")
combinedCorpus = corpus1Data + corpus2Data
combinedCorpusText = corpus1Text + corpus2Text
# tokenFreq = getTokenFrequency(corpus1Text)
# print("tokenFreq", tokenFreq)
words = getTop5000Words(combinedCorpus)
corporaData = [corpus1Data,corpus2Data]
emotionLabels = list(getMatchingLabels(corporaData))
emotions1 = getWordCountsByEmotion(corpus1Data, emotionLabels)
# print(emotions1)
emotions2 = getWordCountsByEmotion(corpus2Data, emotionLabels)
# print(emotions2)
for emotion in range(len(emotionLabels)):
    sim = getCosineSimilarityFromTokenFreq(emotions1[emotion], emotions2[emotion])
    print(emotionLabels[emotion], sim)
fullCorpus1Words = getTokenFrequency(corpus1Text)
fullCorpus2Words = getTokenFrequency(corpus2Text)
sim1 = getCosineSimilarityFromTokenFreq(fullCorpus1Words, fullCorpus2Words)
sim2 = getCosineSimilarityFromTokenFreq(fullCorpus2Words, fullCorpus1Words)
print(sim1)
print(sim2)

loaded data
bag size 17756
{'anger', 'surprise', 'trust', 'disgust', 'fear', 'sadness', 'joy'}
{'anger', 'shame', 'disgust', 'fear', 'sadness', 'guilt', 'joy'}
matched Labels: {'anger', 'fear', 'disgust', 'sadness', 'joy'}
anger 0.46229743778369403
fear 0.4368899238278968
disgust 0.4516565863648767
sadness 0.4538342791036561
joy 0.4439890786057686
0.5107359175632173
0.5107359175632176


In [50]:
# import gensim
# import numpy as np
# from nltk.tokenize import word_tokenize
# print(corpus1Text[:5])
# gen_docs = corpus1Text[:5]
# dictionary = gensim.corpora.Dictionary(gen_docs)
# # print(dictionary.token2id)
# corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]
# tf_idf = gensim.models.TfidfModel(corpus)
# for doc in tf_idf [corpus]:
#     print([[dictionary[id], np.around(freq, decimals=2)] for id, freq in doc])
# sims = gensim.similarities.Similarity("../Ling506TermProject/",tf_idf[corpus],
#                                         num_features=len(dictionary))



# file2_docs = ["Mars is the fourth planet in our solar system.",
#         "It is second-smallest planet in the Solar System after Mercury.",
#         "Saturn is yellow planet."]
# tf_idf = gensim.models.TfidfModel(corpus)

# print("Number of documents:",len(file2_docs))  
# for line in file2_docs:
#     query_doc = [w.lower() for w in word_tokenize(line)]
#     query_doc_bow = dictionary.doc2bow(query_doc) #update an existing dictionary and create bag of words

# # perform a similarity query against the corpus
# query_doc_tf_idf = tf_idf[query_doc_bow]
# # print(document_number, document_similarity)
# print('Comparing Result:', sims[query_doc_tf_idf]) 

# sum_of_sims =(np.sum(sims[query_doc_tf_idf], dtype=np.float32))
# print(sum_of_sims)

# avg_sims = [] # array of averages


# # for line in query documents
# for line in file2_docs:
#     # tokenize words
#     query_doc = [w.lower() for w in word_tokenize(line)]
#     # create bag of words
#     query_doc_bow = dictionary.doc2bow(query_doc)
#     # find similarity for each document
#     query_doc_tf_idf = tf_idf[query_doc_bow]
#     # print (document_number, document_similarity)
#     print('Comparing Result:', sims[query_doc_tf_idf]) 
#     # calculate sum of similarities for each query doc
#     sum_of_sims =(np.sum(sims[query_doc_tf_idf], dtype=np.float32))
#     # calculate average of similarity for each query doc
#     avg = sum_of_sims / len(file_docs)
#     # print average of similarity for each query doc
#     print(f'avg: {sum_of_sims / len(file_docs)}')
#     # add average values into array
#     avg_sims.append(avg)  
# # calculate total average
# total_avg = np.sum(avg_sims, dtype=np.float)
# # round the value and multiply by 100 to format it as percentage
# percentage_of_similarity = round(float(total_avg) * 100)
# # if percentage is greater than 100
# # that means documents are almost same
# if percentage_of_similarity >= 100:
#     percentage_of_similarity = 100
    

In [51]:
#Bad attempt at using prebuilt functions for distancing
# arr1 = np.load("ssec_emotion-causetrain_xNP.npy")
# arr2 = np.load("ssec_grounded_emotionstrain_xNP.npy")
# arr3 = np.load("isear_ssectest_xNP.npy")
# corpus1 = "ssec"
# corpus1Data = []
# corpus1Text = []
# corpus2 = "isear"
# corpus2Data = []
# corpus2Text = []
# with open(JSON) as f:
#     for line in f:
#         data = json.loads(line)
#         if data["source"] == corpus1:
#             corpus1Data.append(data)
#         if data["source"] == corpus2:
#             corpus2Data.append(data)
# print("loaded data")
# words1 = getTop5000Words(corpus1Data)
# print(words1)
# words2 = getTop5000Words(corpus2Data)
# for data in tqdm(corpus1Data):
#     corpus1Text.append(get_vector(data["text"], words1))
# for data in tqdm(corpus1Data):
#     corpus2Text.append(get_vector(data["text"], words2))
# # print(corpus1Text[:30])
# print(np.array_equal(arr1,arr2))
# print(np.array_equal(arr1,arr3))
# print(cosine_similarity(arr1,arr3))
# print(chi2_kernel(arr1,arr3))

In [52]:
#this method is modified to track 
def make_arrays(train, test, words, labels, mode="multi", all_vs=False):
    emotions = {label: x for x, label in enumerate(labels)}
    print("emotions in make_arrays: ", emotions)
    train_x, train_y, test_x, test_y = [], [], [], []
    
    print("train raw text: ", sys.getsizeof(train)/1000000)

    for data in tqdm(train):
        # Discard examples where we don't have all selected emotions
        if (mode == "single" or all_vs or all(data["emotions"][emo] is not None for emo in labels)):
            train_y.append(get_emotion(data["emotions"], labels, emotions, mode))
            train_x.append(get_vector(data["text"], words))
    for data in tqdm(test):
        test_y.append(get_emotion(data["emotions"], labels, emotions, mode))
        test_x.append(get_vector(data["text"], words))

    print("train_x length ", len(train_x))
    print("train_x dimension of element ", len(train_x[0]))
    train_xSize = sys.getsizeof(train_x)/1000000
    train_ySize = sys.getsizeof(train_y)/1000000
    train_xLength = len(train_x)
    train_yLength = len(train_y)
    print("train_x (text) size RAW:", train_xSize,"megabytes")
    print("train_y (labels) size RAW:", train_ySize,"megabytes")
    test_xSize = sys.getsizeof(test_x)/1000000
    test_ySize = sys.getsizeof(test_y)/1000000
    test_xLength = len(test_x)
    test_yLength = len(test_y)
    print("test_x (text) size RAW:", test_xSize,"megabytes")
    print("test_y (labels) size RAW:", test_ySize,"megabytes")

    train_x = np.array(train_x)
    train_y = np.array(train_y)
    test_x = np.array(test_x)
    test_y = np.array(test_y)
    train_xNPSize = (train_x.nbytes)/1000000
    train_yNPSize = (train_y.nbytes)/1000000
    test_xNPSize = (test_x.nbytes)/1000000
    test_yNPSize = (test_y.nbytes)/1000000
    
    print("saved test_y")
    print("train_x Size stays the same", train_xSize == train_xNPSize)
    print("train_y Size stays the same", train_ySize == train_yNPSize)
    print("test_x Size stays the same", test_xSize == test_xNPSize)
    print("test_y Size stays the same", test_ySize == test_yNPSize)
    print("train_xNPSize (text) size:", train_xNPSize,"megabytes")
    print("train_yNPSize (labels) size:", train_yNPSize,"megabytes")
    print("test_xNPSize (text) size:", test_xNPSize,"megabytes")
    print("test_yNPSize (labels) size:", test_yNPSize,"megabytes")
    print("train_xNP length ", len(train_x))
    print("train_xNP dimension of element ", train_x.ndim)
    print("train_xNP size ", train_x.size)
    sizes = train_xNPSize, train_yNPSize, test_xNPSize, test_yNPSize
    return train_x, train_y, test_x, test_y, sizes

In [53]:
#kept as part of classification definitions, prevents division by 0 errors
def cheatydiv(x, y):
    return math.nan if y == 0 else x / y

In [54]:
#classification reporting is kept the same for simplicity (ie, no need to reinvent the wheel)
def classification_report_own_single(test_y, predict_y, labels):
    reports = {}
    num2emo = {i: label for i, label in enumerate(labels)}
    decisions = defaultdict(Counter)
    for t, p in zip(test_y, predict_y):
        decisions[t][p] += 1
    for label in decisions:
        tp = decisions[label][label]
        fp = sum(decisions[x][label] for x in decisions if x != label)
        tn = sum(
            decisions[x][y]
            for x in decisions
            for y in decisions[x]
            if x != label and y != label
        )
        fn = sum(decisions[label][y] for y in decisions[label] if y != label)
        precision = tp / (tp + fp) if tp + fp else math.nan
        recall = tp / (tp + fn) if tp + fn else math.nan
        f1 = 2 * cheatydiv((precision * recall), (precision + recall))
        accuracy = (tp + tn) / (tp + tn + fp + fn)
        reports[num2emo[label]] = Report(precision, recall, accuracy, f1, tp, tn, fp, fn)
    return reports

In [55]:
#classification reporting is kept the same for simplicity (ie, no need to reinvent the wheel)
def classification_report_own_multi(test_y, predict_y, labels):
    reports = {}
    num2emo = {i: label for i, label in enumerate(labels)}
    emo2num = {label: i for i, label in enumerate(labels)}
    decisions = defaultdict(Counter)
    for label in labels:
        tp = fp = tn = fn = 0
        for t, p in zip(test_y, predict_y):
            # decisions[t][p] += 1
            tp += bool(t[emo2num[label]] and p[emo2num[label]])
            fp += bool(p[emo2num[label]] and not t[emo2num[label]])
            fn += bool(t[emo2num[label]] and not p[emo2num[label]])
            tn += bool(not t[emo2num[label]] and not p[emo2num[label]])
        precision = tp / (tp + fp) if tp + fp else math.nan
        recall = tp / (tp + fn) if tp + fn else math.nan
        f1 = 2 * cheatydiv((precision * recall), (precision + recall))
        accuracy = (tp + tn) / (tp + tn + fp + fn)
        reports[label] = Report(precision, recall, accuracy, f1, tp, tn, fp, fn)
    return reports

In [56]:
#classification reporting is kept the same for simplicity (ie, no need to reinvent the wheel)
def analyse_results(test_y, predict_y, labels, test, first, second, output, mode):
    print("analyse_results")
    prefix = f"{first}_vs_{second}_{mode}"
    fprefix = output + "/" + prefix
    with open(fprefix + ".txt", "w", encoding="utf-8") as f, open(fprefix + ".json", "w") as g:
        print("hello")
        prec, reca, f1, supp = precision_recall_fscore_support(
            test_y, predict_y, pos_label=None, average="micro"
        )
        accuracy = accuracy_score(test_y, predict_y)
        scoreNameArray = [(prec, "Precision"),(reca, "Recall"),(f1, "F1-score"),(accuracy, "Accuracy")]
        for score, name in scoreNameArray:
            print(name, score, sep="\t", file=f)
            print(name, score, sep="\t")
            
        # print("real:", Counter(test_y), file=f)
        # print("predicted:", Counter(predict_y), file=f)
        
        print(test_y[:10], predict_y[:10], file=f)
        emotions = {i: label for i, label in enumerate(labels)}
        for text, real, predicted, _ in zip(test, test_y, predict_y, range(20)):
            if mode == "multi" and np.array_equal(real, predicted):
                continue
            elif mode == "single" and real == predicted:
                continue
            print(text, "=> predicted:", predicted, ", truth:", real, file=f)
        if mode == "multi":
            results = classification_report_own_multi(test_y, predict_y, labels)
        elif mode == "single":
            results = classification_report_own_single(test_y, predict_y, labels)
        json.dump(
            {
                "precision": prec,
                "recall": reca,
                "f1": f1,
                "accuracy": accuracy,
                "name": prefix,
                **{
                    (emotion + "_" + metric): getattr(results[emotion], metric)
                    for emotion in results
                    for metric in Report._fields
                },
            },
            g,
        )
        g.write("\n")

In [57]:
#used for benchmarking/validating the results of the authors, but not in the final version
#method is kept here for documentation
def hacky_train_test_split(training, train_size=0.8, first=None, second=None):
    tra, tes = [], []
    for example in training:
        if example.get("split") == "train" or example["source"] != second:
            tra.append(example)
        elif example.get("split") == "test":
            tes.append(example)
        else:
            # don't try this at home
            [tes, tra][random.random()<train_size].append(example)
    return tra, tes

In [58]:
#used for benchmarking/validating the results of the authors, but not in the final version
#method is kept here for documentation
def splitTrainAndTestData(training, train_size=0.8, first=None, second=None):
    tra, tes = [], []
    for example in training:
        if example.get("split") == "train" or example["source"] != second:
            tra.append(example)
        elif example.get("split") == "test":
            tes.append(example)
        else:
            # don't try this at home
            [tes, tra][random.random()<train_size].append(example)
    return tra, tes

In [59]:
#this method is used in my testing to generate the combinations that I use in my trials automation
def getPowerset(s):
    x = len(s)
    masks = [1 << i for i in range(x)]
    for i in range(1 << x):
        yield [ss for mask, ss in zip(masks, s) if i & mask]

In [60]:
#this method is used in my testing to generate the combinations that I use in my trials automation
def getPermutations(s):
    subsets = set()
    for L in range(2, 3): #this 
        for subset in itertools.permutations(s, L):
            subsets.add(subset)
    return subsets

In [61]:
#this method is simply in place to get a measure of hard drive space left on my computer
def getHardDriveSpaceLeft():
    total, used, free = shutil.disk_usage("/")
    total = (total // (2**30))
    used = (used // (2**30))
    free = (free // (2**30))
    print("Total: %d GB" % total)
    print("Used: %d GB" % used)
    print("Free: %d GB" % free)
    return total, used, free

In [62]:
def getCrossCorpusValuesWithOrder(possibleChoices):
    #gets the runtime values for cross corpus trials
    #ordering will matter if using the original authors version
    permutations = list(getPermutations(possibleChoices))
    print("permutations length: ",len(permutations))
#         print(permutations)
    corporaSets = []
    for choice in permutations:
#         print("choice ", choice)
        if(len(choice) == 2):
#             print("pair")
            first, second = choice
            firstCorpus, domain1 = first
            secondCorpus, domain2 = second
        corpusPairData = (firstCorpus, secondCorpus, domain1, domain2)
        corporaSets.append(corpusPairData)
    return(corporaSets)

In [63]:
# This method adds the combinations relating to the ALl-VS trials
def getAllVsCorpusValues(possibleChoices):
    corporaSets = []
    for entry in possibleChoices:
        firstCorpus, domain1 = (None, None)
        secondCorpus, domain2 = entry
        corpusPairData = (firstCorpus, secondCorpus, domain1, domain2)
        corporaSets.append(corpusPairData)
    return corporaSets

In [64]:
#Gets the corpora pairs of the same domain
#powerSet is specified because it is only used in the case with a powerSet where ordering does not matter
#but hypothetically, you could put in any list of possible entries
def getCorporaPairsOfSameDomain(powerSet, sizeBoundLower=1, sizeBoundUpper=3):
    for entry in powerSet:
#       if len(entry) < 3 and len(entry) > 0:
        if len(entry) < sizeBoundUpper and len(entry) > sizeBoundLower:
            domainMatch = entry[0][1]
            shouldAppend = True
            for corpus, domain in entry:
                if domain != domainMatch:
                    shouldAppend = False
            if(shouldAppend):
                powerSetCondensed.append(entry)
    print("CorporaPairsOfSameDomain:",len(powerSetCondensed))
    return sameDomainCorporaPairs

In [65]:
#this method adds the trials where the corpus is trained and tested on itself
def getCorporaPairsWithItself(possibleChoices):
    corporaSets = []
    for entry in possibleChoices:
        firstCorpus, domain1 = entry
        secondCorpus, domain2 = entry
        corpusPairData = (firstCorpus, secondCorpus, domain1, domain2)
        corporaSets.append(corpusPairData)
    return corporaSets

In [66]:
def exampleLabel(x):
    if x == 1:
        return "ssec train"
    if x == 2:
        return "ssec train"
    if x == 3:
        return "ssec test"
    if x == 4:
        return "grounded emotion test"
    if x == 5:
        return "grounded emotion train"

In [67]:
#test code for understanding how NP arrays are distributed
#after testing this, I realized that while my NP arrays are useful for classifiers, I think they are useless
#for comparing under the jensen shannon distance, as the only things I can compare are train and test of the same corpora
arr1 = np.load("ssec_emotion-causetrain_xNP.npy") #ssec
arr2 = np.load("ssec_grounded_emotionstrain_xNP.npy") #ssec
arr3 = np.load("grounded_emotions_ssectest_xNP.npy") #ssec
arr4 = np.load("ssec_grounded_emotionstest_xNP.npy") #ssec
arr5 = np.load("grounded_emotions_ssectrain_xNP.npy") #ssec
# arr4 = np.load("isear_ssectrain_xNP.npy")
# arr5 = np.load("ssec_iseartest_xNP.npy")
# arr6 = np.load("ssec_iseartrain_xNP.npy")
print(np.array_equal(arr1,arr2))
print(np.array_equal(arr1,arr3))
print(np.array_equal(arr1,arr4))
print(np.array_equal(arr1,arr5))
print("2")
print(np.array_equal(arr2,arr1))
print(np.array_equal(arr2,arr3))
print(np.array_equal(arr2,arr4))
print(np.array_equal(arr2,arr5))
print("3")
print(np.array_equal(arr3,arr1))
print(np.array_equal(arr3,arr2))
print(np.array_equal(arr3,arr4))
print(np.array_equal(arr3,arr5))
print("4")
print(np.array_equal(arr4,arr1))
print(np.array_equal(arr4,arr2))
print(np.array_equal(arr4,arr3))
print(np.array_equal(arr4,arr5))
print("5")
print(np.array_equal(arr5,arr1))
print(np.array_equal(arr5,arr2))
print(np.array_equal(arr5,arr3))
print(np.array_equal(arr5,arr4))

print("ssec_emotion-causetrain_xNP.npy", arr1.shape)
print("ssec_grounded_emotionstrain_xNP.npy",arr2.shape)
print("grounded_emotions_ssectest_xNP.npy", arr3.shape)
print("ssec_grounded_emotionstest_xNP.npy", arr4.shape)
print("grounded_emotions_ssectrain_xNP.npy", arr5.shape)
# print("ssec_iseartrain_xNP.npy", arr6.shape)
arrList = [arr1,arr2,arr3,arr4,arr5]
for i in range(5):
    for j in range(5):
        if i == j:
            continue
        if arrList[i].shape == arrList[j].shape:
            print(exampleLabel(i+1),"|", exampleLabel(j+1))
            print(getJensenShannonFromNPArrays(arrList[i],arrList[j]))
        else:
            print(exampleLabel(i+1),"|", exampleLabel(j+1), "=== different shapes")
# print(getJensenShannonFromNPArrays(arr1,arr2))
# print(getJensenShannonFromNPArrays(arr1,arr3))
# print(getJensenShannonFromNPArrays(arr1,arr4))
# print(getJensenShannonFromNPArrays(arr1,arr5))
# print("2")
# print(getJensenShannonFromNPArrays(arr2,arr1))
# print(getJensenShannonFromNPArrays(arr2,arr3))
# print(getJensenShannonFromNPArrays(arr2,arr4))
# print(getJensenShannonFromNPArrays(arr2,arr5))
# print("3")
# print(getJensenShannonFromNPArrays(arr3,arr1))
# print(getJensenShannonFromNPArrays(arr3,arr2))
# print(getJensenShannonFromNPArrays(arr3,arr4))
# print(getJensenShannonFromNPArrays(arr3,arr5))
# print("4")
# print(getJensenShannonFromNPArrays(arr4,arr1))
# print(getJensenShannonFromNPArrays(arr4,arr2))
# print(getJensenShannonFromNPArrays(arr4,arr3))
# print(getJensenShannonFromNPArrays(arr4,arr5))
# print("5")
# print(getJensenShannonFromNPArrays(arr5,arr1))
# print(getJensenShannonFromNPArrays(arr5,arr2))
# print(getJensenShannonFromNPArrays(arr5,arr3))
# print(getJensenShannonFromNPArrays(arr5,arr4))
# print(getJensenShannonFromNPArrays(arr5,arr6))

True
False
False
False
2
True
False
False
False
3
False
False
False
False
4
False
False
False
False
5
False
False
False
False
ssec_emotion-causetrain_xNP.npy (4868, 5000)
ssec_grounded_emotionstrain_xNP.npy (4868, 5000)
grounded_emotions_ssectest_xNP.npy (4868, 5000)
ssec_grounded_emotionstest_xNP.npy (2585, 5000)
grounded_emotions_ssectrain_xNP.npy (2585, 5000)
ssec train | ssec train
[0. 0. 0. ... 0. 0. 0.]
0.0
ssec train | ssec test
[0.70660613        nan 0.82713948 ... 0.83255461        nan 0.83255461]
0.41554195650531944
ssec train | grounded emotion test === different shapes
ssec train | grounded emotion train === different shapes
ssec train | ssec train
[0. 0. 0. ... 0. 0. 0.]
0.0
ssec train | ssec test
[0.70660613        nan 0.82713948 ... 0.83255461        nan 0.83255461]
0.41554195650531944
ssec train | grounded emotion test === different shapes
ssec train | grounded emotion train === different shapes
ssec test | ssec train


  p = p / np.sum(p, axis=0)


[0.70660613        nan 0.82713948 ... 0.83255461        nan 0.83255461]
0.41554195650531944
ssec test | ssec train
[0.70660613        nan 0.82713948 ... 0.83255461        nan 0.83255461]
0.41554195650531944
ssec test | grounded emotion test === different shapes
ssec test | grounded emotion train === different shapes
grounded emotion test | ssec train === different shapes
grounded emotion test | ssec train === different shapes
grounded emotion test | ssec test === different shapes
grounded emotion test | grounded emotion train
[       nan 0.65382893 0.69251332 ...        nan        nan        nan]
0.4374044772966029
grounded emotion train | ssec train === different shapes
grounded emotion train | ssec train === different shapes
grounded emotion train | ssec test === different shapes
grounded emotion train | grounded emotion test
[       nan 0.65382893 0.69251332 ...        nan        nan        nan]
0.4374044772966029


In [90]:
def getCorpusSimilarities(corpusPair):
    print("------------------------------",corpusPair,"-------------------------------------------")
    (first, second, domain1, domain2) = corpusPair
    print("Getting data")
    jsonfile = "unified-dataset.jsonl"
    output = "."
    debug = True
    forceMulti = False
    isAllVS = False
    if first == None:
        isAllVS = True
    similaritiesFileName = first + "_" + second + "Similarities" +".txt"
#     numPyFileName1 = first + "_" + second + "train_xNP" +".npy"
#     numPyFileName2 = first + "_" + second + "test_xNP" +".npy"
#     np1 = np.load(numPyFileName1)
#     np2 = np.load(numPyFileName2)
    cosineSim = getCosineSimilarityFromCorpus(first,second)
    cosineSimEmotions = getCosineSimilarityFromCorpusEmotions(first,second)
    print("creating file for corpus similarity")
    with open(similaritiesFileName, 'w', encoding="utf-8") as f:
        print("CosineSimilarity", cosineSim)
        print("CosineSimilarity", cosineSim, file=f)
        print(cosineSimEmotions)
        for emotion, cosSim in cosineSimEmotions.items():
            print(emotion, cosSim, file=f)
#         if(np1.shape == np2.shape):
#             print(np1.shape)
#             print(np2.shape)
#             js = getJensenShannonFromNPArrays(np1,np2)
#             print("jensen Shannon", js)
#         else:
#             print(np1.shape)
#             print(np2.shape)
#             print("jensenShannon", math.nan)
    print("file saved to ", similaritiesFileName)
    
        
    

In [91]:
def performTrialUsingCorpusPair(corpusPair, verifyResults):
    print("------------------------------",corpusPair,"-------------------------------------------")
    (first, second, domain1, domain2) = corpusPair
    print("Getting data")
    jsonfile = "unified-dataset.jsonl"
    output = "."
    debug = True
    forceMulti = False
    isAllVS = False
    if first == None:
        isAllVS = True
    
    if(verifyResults == False):
        if(first == None):
            first = "all-vs"
        train_xNPFileName = first + "_" + second + "train_xNP" +".npy"
        train_yNPFileName = first + "_" + second + "train_yNP" +".npy"
        test_xNPFileName = first + "_" + second + "test_xNP" +".npy"
        test_yNPFileName = first + "_" + second + "test_yNP" +".npy"
        classifierName = "RandomForestClassifier"
        classiferSaveFile = first+"_"+second+classifierName+".pkl"
        filesValid = (path.exists(train_xNPFileName) 
                       and path.exists(train_yNPFileName)
                       and path.exists(test_xNPFileName)
                       and path.exists(test_yNPFileName)
                       and path.exists(classiferSaveFile))
        print("do pickle files exist?", filesValid)
        if(filesValid):
            print("skipping trial")
            return
    
    
    training_data, testing_data = get_train_test(jsonfile, first,second)
    firstCLF, secondCLF = (["multi", "multi"] if forceMulti else get_clf_mode(training_data, testing_data))
    mode = "multi" if "multi" in [firstCLF, secondCLF] else "single"

    print("Detected mode: {}...".format(mode))
    print(len(training_data), len(testing_data))
    print("Getting wordlist...")
    if debug:
        wordlist = getTop5000Words(training_data)
    else:
        wordlist = getTop5000Words(training_data)
        # wordlist = get_wordlist(training_data)
    print("Getting emotions")
    labels = get_labels(training_data, testing_data, mode=mode)
    print(labels)
    print("Making arrays")
    print("checking for save files")
    if(first == None):
        first = "all-vs"
    train_xNPFileName = first + "_" + second + "train_xNP" +".npy"
    train_yNPFileName = first + "_" + second + "train_yNP" +".npy"
    test_xNPFileName = first + "_" + second + "test_xNP" +".npy"
    test_yNPFileName = first + "_" + second + "test_yNP" +".npy"

    if(path.exists(train_xNPFileName) 
       and path.exists(train_yNPFileName)
       and path.exists(test_xNPFileName)
       and path.exists(test_yNPFileName)):
        print("loading from np")
        train_x = np.load(train_xNPFileName)
        train_y = np.load(train_yNPFileName)
        test_x = np.load(test_xNPFileName)
        test_y = np.load(test_yNPFileName)
        train_xNPSize = (train_x.nbytes)/1000000
        train_yNPSize = (train_y.nbytes)/1000000
        test_xNPSize = (test_x.nbytes)/1000000
        test_yNPSize = (test_y.nbytes)/1000000
        print("loaded directly from NP.load")
        print("train_xNPSize (text) size loaded:", train_xNPSize,"megabytes")
        print("train_yNPSize (labels) size loaded:", train_yNPSize,"megabytes")
        print("test_xNPSize (text) size loaded:", test_xNPSize,"megabytes")
        print("test_yNPSize (labels) size loaded:", test_yNPSize,"megabytes")
    else:
#         print("training_data", training_data)
#         print("testing_data", testing_data)
        train_x, train_y, test_x, test_y, sizes = make_arrays(training_data, testing_data, wordlist, labels, mode, isAllVS)
        train_xSize, train_ySize, test_xSize, test_ySize = sizes
        if any(not part.size for part in [train_x, train_y, test_x, test_y]):
            print("Train or test empty. Did you misspell the dataset name?")
            return
        #             sys.exit(1)
        print("saving NP arrays")
        np.save(train_xNPFileName, train_x)
        np.save(train_yNPFileName, train_y)
        np.save(test_xNPFileName, test_x)
        np.save(test_yNPFileName, test_y)
        print("NP arrays saved")

    print("Initializing classifier")
    trainClassifier = True
    if debug:
        classifierName = "RandomForestClassifier"
        print("Searching for a ", classifierName)
        classiferSaveFile = first+"_"+second+classifierName+".pkl"
        print(path.exists(classiferSaveFile))
        if(path.exists(classiferSaveFile)):
            trainClassifier = False
            print("Loading classifier from file")
            classifier = joblib.load(classiferSaveFile)
            print("classifier loaded successfully")
        else:
            print("file not found, creating new classifier")
            classifier = RandomForestClassifier()
    elif mode == "single":
        classifierName = "LogisticRegressionCV"
        print("Searching for a ", classifierName)
        classiferSaveFile = first+"_"+second+classifierName+".pkl"
        print(path.exists(classiferSaveFile))
        if(path.exists(classiferSaveFile)):
            trainClassifier = False
            print("Loading classifier from file")
            classifier = joblib.load(classiferSaveFile)
            print("classifier loaded successfully")
        else:
            print("file not found, creating new classifier")
            classifier = LogisticRegressionCV(
                cv=10,
                penalty="l2",
                fit_intercept=True,
                solver="sag",
                scoring="f1",
                refit=True,
                # n_jobs=-1,
                class_weight="balanced",
            )
    else:
        classifierName = "OneVsRestClassifier"
        print("Searching for a ", classifierName)
        classiferSaveFile = first+"_"+second+classifierName+".pkl"
        print(path.exists(classiferSaveFile))
        if(path.exists(classiferSaveFile)):
            trainClassifier = False
            print("Loading classifier from file")
            classifier = joblib.load(classiferSaveFile)
            print("classifier loaded successfully")
        else:
            print("file not found, creating new classifier")
            classifier = OneVsRestClassifier(
                LogisticRegressionCV(
                    cv=10,
                    penalty="l2",
                    fit_intercept=True,
                    solver="sag",
                    scoring="f1",
                    refit=True,
                    class_weight="balanced",
                    tol = 0.1,
                ),
                n_jobs=-1,
            )
    if(trainClassifier):
        print("this is the classifierName: ", classifierName)
        print("Training...")
        print("train_x (text) size:", (train_x.nbytes)/1000000,"megabytes")
        print("train_y (labels) size:", (train_y.nbytes)/1000000,"megabytes")
        print("train_x (text) length:", len(train_x))
        print("train_y (labels) length:", len(train_y))
        print(train_x[:5])
        print(train_y[:5])

        classifier.fit(train_x, train_y)
        print("finished training, classifier size:", sys.getsizeof(classifier)/1000000,"megabytes")
    print("Predicting...")
    if first == "multi" and second == "single":
        predict_y = classifier.predict_proba(test_x)
        helper = np.zeros_like(predict_y)
        helper[range(len(predict_y)), predict_y.argmax(1)] = 1
        predict_y = helper
    else:
        predict_y = classifier.predict(test_x)

    print("Analysing...")

    analyse_results(
        test_y,
        predict_y,
        labels,
        testing_data,
        first,
        second,
        output,
        mode,  # TODO
    )
    if(path.exists(classiferSaveFile)):
        print("classifier already saved")
    else:
#         classiferSaveFile = first+"_"+second+classifierName+".pkl"
        print("classiferSaveFile: ", classiferSaveFile)
        joblib.dump(classifier, classiferSaveFile)
        print("Saved Successfully")
    total, used, free = getHardDriveSpaceLeft()
    if(free < 10):
        sys.exit("Error: less than 10 gb remaining on disk")
    print("-----------------------------------------------------------------------------------------")
    return

In [92]:
def runTrials(version, verifyResults, classifierTrials, similarityTrials, crossCorpus=True, sameCorpus=True, allVs=False):
    possibleChoices = [('affectivetext','headlines'), ('crowdflower','tweets'), ('dailydialog','conversations'), 
                       ('emoint','tweets'), ('emotion-cause','paragraphs'), ('grounded_emotions','tweets'), 
                       ('isear','descriptions'), ('ssec','tweets'),('tales-emotion','tales'), ('tec','tweets')]
                        #excluded ('emobank','headlines') because it is isn't emotion annotated
                        #and ('electoraltweets','tweets') because it has incompatible annotation
                        #and  ('fb-valence-arousal-anon','tweets') because it isn't emotion annotated
    corporaSets = []
    if version == "previous":
        corporaSets = []
        if crossCorpus:
            corporaSets = (getCrossCorpusValuesWithOrder(possibleChoices))
            #this was added to sort the lists by domain of the first, then by the first corpus name, then the second.
            #it is placed in reverse order simply because if it was put in regular order, the largest of the trials would be first
            #sorting in reverse will (loosely) make the smaller trials run first, while having no impact on the ability to obtain all results
            sortedPermutations = sorted(corporaSets, key = lambda x: (x[2], x[0], x[1]), reverse = True)
        if sameCorpus:
            sortedPermutations += (getCorporaPairsWithItself(possibleChoices))
        if allVs:
            sortedPermutations += (getAllVsCorpusValues(possibleChoices))
        for corpusPair in sortedPermutations:
            if(classifierTrials):
                performTrialUsingCorpusPair(corpusPair, verifyResults)
            if(similarityTrials):
                getCorpusSimilarities(corpusPair)
    else: #version == "myTrials"
        powerSet = list(getPowerset(possibleChoices))
    
    print("End of program!")

In [93]:
if __name__ == "__main__":
    version = "previous"
#     version = "myTrials"
    crossCorpus = True
    sameCorpus = False
    allVs = False
    verifyResults = True
    classifierTrials = False
    similarityTrials = True
    runTrials(version, verifyResults, classifierTrials, similarityTrials, crossCorpus, sameCorpus, allVs)

permutations length:  90
------------------------------ ('tec', 'tales-emotion', 'tweets', 'tales') -------------------------------------------
Getting data
{'anger', 'surprise', 'trust', 'disgust', 'fear', 'sadness', 'joy'}
{'anger', 'surprise', 'trust', 'shame', 'disgust', 'fear', 'noemo', 'sadness', 'guilt', 'joy'}
matched Labels: {'anger', 'surprise', 'trust', 'disgust', 'fear', 'sadness', 'joy'}
{'anger', 'surprise', 'trust', 'disgust', 'fear', 'sadness', 'joy'}
{'anger', 'surprise', 'trust', 'shame', 'disgust', 'fear', 'noemo', 'sadness', 'guilt', 'joy'}
matched Labels: {'anger', 'surprise', 'trust', 'disgust', 'fear', 'sadness', 'joy'}
creating file for corpus similarity
CosineSimilarity 0.5208380734001562
{'anger': 0.6949168102796758, 'surprise': 0.469556995823313, 'trust': 0.998182774928435, 'disgust': 0.7252183955756708, 'fear': 0.6000328385623378, 'sadness': 0.594206163179562, 'joy': 0.5461644236745259}
file saved to  tec_tales-emotionSimilarities.txt
-----------------------

{'anger', 'surprise', 'trust', 'shame', 'love', 'disgust', 'fear', 'noemo', 'sadness', 'guilt', 'joy'}
matched Labels: {'anger', 'surprise', 'trust', 'disgust', 'fear', 'sadness', 'joy'}
creating file for corpus similarity
CosineSimilarity 0.5136445512583515
{'anger': 0.6621017981730385, 'surprise': 0.4983905604641875, 'trust': 0.9948006378543295, 'disgust': 0.7106004598283716, 'fear': 0.5493963173972677, 'sadness': 0.5801998644819535, 'joy': 0.5472453009778845}
file saved to  tec_affectivetextSimilarities.txt
------------------------------ ('ssec', 'tec', 'tweets', 'tweets') -------------------------------------------
Getting data
{'anger', 'surprise', 'trust', 'disgust', 'fear', 'sadness', 'joy'}
{'anger', 'surprise', 'trust', 'shame', 'love', 'disgust', 'fear', 'noemo', 'sadness', 'guilt', 'joy'}
matched Labels: {'anger', 'surprise', 'trust', 'disgust', 'fear', 'sadness', 'joy'}
{'anger', 'surprise', 'trust', 'disgust', 'fear', 'sadness', 'joy'}
{'anger', 'surprise', 'trust', 'shame

{'anger': 0.7888190079571392, 'surprise': 0.8283091705098922, 'trust': 0.9984618567238803, 'disgust': 0.8623413227532574, 'fear': 0.6970371966192173, 'sadness': 0.7506316042606068, 'joy': 0.71263927126315}
file saved to  ssec_crowdflowerSimilarities.txt
------------------------------ ('ssec', 'affectivetext', 'tweets', 'headlines') -------------------------------------------
Getting data
{'anger', 'surprise', 'trust', 'disgust', 'fear', 'sadness', 'joy'}
{'anger', 'surprise', 'trust', 'shame', 'love', 'disgust', 'fear', 'noemo', 'sadness', 'guilt', 'joy'}
matched Labels: {'anger', 'surprise', 'trust', 'disgust', 'fear', 'sadness', 'joy'}
{'anger', 'surprise', 'trust', 'disgust', 'fear', 'sadness', 'joy'}
{'anger', 'surprise', 'trust', 'shame', 'love', 'disgust', 'fear', 'noemo', 'sadness', 'guilt', 'joy'}
matched Labels: {'anger', 'surprise', 'trust', 'disgust', 'fear', 'sadness', 'joy'}
creating file for corpus similarity
CosineSimilarity 0.6757430597349495
{'anger': 0.789107140037819

------------------------------ ('grounded_emotions', 'crowdflower', 'tweets', 'tweets') -------------------------------------------
Getting data
{'anger', 'surprise', 'trust', 'disgust', 'fear', 'sadness', 'joy'}
{'anger', 'surprise', 'trust', 'shame', 'love', 'disgust', 'fear', 'noemo', 'sadness', 'guilt', 'joy'}
matched Labels: {'anger', 'surprise', 'trust', 'disgust', 'fear', 'sadness', 'joy'}
{'anger', 'surprise', 'trust', 'disgust', 'fear', 'sadness', 'joy'}
{'anger', 'surprise', 'trust', 'shame', 'love', 'disgust', 'fear', 'noemo', 'sadness', 'guilt', 'joy'}
matched Labels: {'anger', 'surprise', 'trust', 'disgust', 'fear', 'sadness', 'joy'}
creating file for corpus similarity
CosineSimilarity 0.7274802793669567
{'anger': 0.7965467665284212, 'surprise': 0.8478462714383751, 'trust': 0.9992109372442328, 'disgust': 0.8740437964092731, 'fear': 0.7044793929382163, 'sadness': 0.8198515985961012, 'joy': 0.7853285374225585}
file saved to  grounded_emotions_crowdflowerSimilarities.txt
----

matched Labels: {'anger', 'surprise', 'trust', 'disgust', 'fear', 'sadness', 'joy'}
{'anger', 'surprise', 'trust', 'disgust', 'fear', 'sadness', 'joy'}
{'anger', 'surprise', 'trust', 'shame', 'love', 'disgust', 'fear', 'noemo', 'sadness', 'guilt', 'joy'}
matched Labels: {'anger', 'surprise', 'trust', 'disgust', 'fear', 'sadness', 'joy'}
creating file for corpus similarity
CosineSimilarity 0.7818541173870399
{'anger': 0.8775650469732952, 'surprise': 0.8618189343665197, 'trust': 0.9996042593756155, 'disgust': 0.8799270018747395, 'fear': 0.7985546528726704, 'sadness': 0.875715991644953, 'joy': 0.8301310650112523}
file saved to  emoint_dailydialogSimilarities.txt
------------------------------ ('emoint', 'crowdflower', 'tweets', 'tweets') -------------------------------------------
Getting data
{'anger', 'surprise', 'trust', 'disgust', 'fear', 'sadness', 'joy'}
{'anger', 'surprise', 'trust', 'shame', 'love', 'disgust', 'fear', 'noemo', 'sadness', 'guilt', 'joy'}
matched Labels: {'anger', '

------------------------------ ('crowdflower', 'emotion-cause', 'tweets', 'paragraphs') -------------------------------------------
Getting data
{'anger', 'surprise', 'trust', 'love', 'disgust', 'fear', 'noemo', 'sadness', 'joy'}
{'anger', 'surprise', 'trust', 'shame', 'love', 'disgust', 'fear', 'noemo', 'sadness', 'guilt', 'joy'}
matched Labels: {'anger', 'surprise', 'trust', 'love', 'disgust', 'fear', 'noemo', 'sadness', 'joy'}
{'anger', 'surprise', 'trust', 'love', 'disgust', 'fear', 'noemo', 'sadness', 'joy'}
{'anger', 'surprise', 'trust', 'shame', 'love', 'disgust', 'fear', 'noemo', 'sadness', 'guilt', 'joy'}
matched Labels: {'anger', 'surprise', 'trust', 'love', 'disgust', 'fear', 'noemo', 'sadness', 'joy'}
creating file for corpus similarity
CosineSimilarity 0.928070723206218
{'anger': 0.9235721716828474, 'surprise': 0.9436728670640441, 'trust': 0.9997999732552842, 'love': 0.9998882016677906, 'disgust': 0.8948712030963768, 'fear': 0.9588458250848761, 'noemo': 0.7364999196475945,

{'anger', 'surprise', 'trust', 'love', 'disgust', 'fear', 'noemo', 'sadness', 'joy'}
{'anger', 'surprise', 'trust', 'shame', 'love', 'disgust', 'fear', 'noemo', 'sadness', 'guilt', 'joy'}
matched Labels: {'anger', 'surprise', 'trust', 'love', 'disgust', 'fear', 'noemo', 'sadness', 'joy'}
creating file for corpus similarity
CosineSimilarity 0.9549971931565993
{'anger': 0.9517538398972355, 'surprise': 0.9766259535526319, 'trust': 0.9999041502920539, 'love': 0.9996123898611282, 'disgust': 0.9243191821933393, 'fear': 0.9741589473964044, 'noemo': 0.8646747692702178, 'sadness': 0.9745199557225721, 'joy': 0.9687974343661813}
file saved to  tales-emotion_grounded_emotionsSimilarities.txt
------------------------------ ('tales-emotion', 'emotion-cause', 'tales', 'paragraphs') -------------------------------------------
Getting data
{'anger', 'surprise', 'trust', 'love', 'disgust', 'fear', 'noemo', 'sadness', 'joy'}
{'anger', 'surprise', 'trust', 'shame', 'love', 'disgust', 'fear', 'noemo', 'sad

{'anger': 0.9706886684563688, 'surprise': 0.987098989128282, 'trust': 0.9999041502920539, 'love': 0.9998131613768365, 'disgust': 0.9374216201003379, 'fear': 0.9850870977666286, 'noemo': 0.8696046017073195, 'sadness': 0.9861368290123251, 'joy': 0.9768432280025922}
file saved to  emotion-cause_tales-emotionSimilarities.txt
------------------------------ ('emotion-cause', 'ssec', 'paragraphs', 'tweets') -------------------------------------------
Getting data
{'anger', 'surprise', 'trust', 'love', 'disgust', 'fear', 'noemo', 'sadness', 'joy'}
{'anger', 'surprise', 'trust', 'shame', 'love', 'disgust', 'fear', 'noemo', 'sadness', 'guilt', 'joy'}
matched Labels: {'anger', 'surprise', 'trust', 'love', 'disgust', 'fear', 'noemo', 'sadness', 'joy'}
{'anger', 'surprise', 'trust', 'love', 'disgust', 'fear', 'noemo', 'sadness', 'joy'}
{'anger', 'surprise', 'trust', 'shame', 'love', 'disgust', 'fear', 'noemo', 'sadness', 'guilt', 'joy'}
matched Labels: {'anger', 'surprise', 'trust', 'love', 'disgus

KeyboardInterrupt: 