In [193]:
import regex as re
import sys
import os
import json
import random
import math
import operator as op
import docopt
import numpy as np
import os.path
import itertools
from os import path
from tqdm import tqdm
from scipy.spatial.distance import jensenshannon
from numpy import asarray
import statistics 
from collections import Counter, defaultdict, namedtuple

from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics.pairwise import cosine_similarity, chi2_kernel
from scipy.spatial import distance
import joblib
import shutil

Report = namedtuple("Report", ["precision", "recall", "accuracy", "f1", "tp", "tn", "fp", "fn"])
JSON = "unified-dataset.jsonl"

In [194]:
#test code for understanding how NP arrays are distributed
# arr1 = np.load("ssec_emotion-causetrain_xNP.npy")
# arr2 = np.load("ssec_grounded_emotionstrain_xNP.npy")
# arr3 = np.load("isear_ssectest_xNP.npy")
# arr4 = np.load("isear_ssectrain_xNP.npy")
# arr5 = np.load("ssec_iseartest_xNP.npy")
# arr5 = np.load("ssec_iseartrain_xNP.npy")
# print(np.array_equal(arr1,arr2))
# print(np.array_equal(arr2,arr3))
# print(np.array_equal(arr3,arr4))
# print(np.array_equal(arr4,arr5))
# print(np.array_equal(arr1,arr5))
# print(np.array_equal(arr3,arr5))
# print(arr1.shape)
# print(arr2.shape)
# print(arr3.shape)
# print(arr4.shape)
# print(arr5.shape)
# print(getJensenShannonFromNPArrays(arr1,arr2))
# print(getJensenShannonFromNPArrays(arr2,arr3))
# print(getJensenShannonFromNPArrays(arr4,arr5))

In [195]:
#this method is used to get the classifier mode and decide whether to single-label or multi-label classification
#this method comes from the original authors and is kept to replicate their results
def get_clf_mode(train, test):
    first = "single"
    for example in train:
        if example.get("labeled", "multi") == "multi":
            first = "multi"
    print(first)
    for example in test:
        if example.get("labeled", "multi") == "multi":
            return first, "multi"
    print("oof")
    return first, "single"

In [196]:
#This methods is used to extract the training and testing data from the unified corpus json.
#The unified corpus json must be produced using the authors original code
#this version is only used in getting the benchmarks for the previous paper
#this version takes the jsonfile, the name of the train file and the name of the test file as parameters
def get_train_test(jsonfile, train, test):
    print("get_train_test param:")
    print("json ", jsonfile)
    print("train ", train)
    print("test ", test)
#     same = test in train.split(",") #used if train and test corpus are same
    training, testing = [], []
    count1 = 0
    count2 = 0
    count3 = 0
    count4 = 0
    with open(jsonfile) as f:
        for line in f:
            data = json.loads(line)
            if(data["source"] == test):
                count1 += 1
            if(data["source"] != test):
                count2 += 1
            if(train == None and data["source"] != test):
                count3 += 1
                training.append(data)
            elif data["source"] == test:
                count4 += 1
                testing.append(data)
            elif(data["source"] in train.split(",")):
                count3 += 1
                training.append(data)
    print("there were ", count1, " entries that were in test and ", count2, "that were not in test",
          "and ", count3, " that were in train")
    print("test was appended ", count4, " times")
#     if same:
#         training, testing = hacky_train_test_split(training, train_size=0.8, first=train, second=test)
    return training, testing

In [197]:
#this method stays as is from the original paper
def get_labels(train, test, operation=op.and_, mode="multi"):
    """Return a list of the emotional intersection of two sources."""
    emotions = set()
    if mode == "single":
        emotions.add("noemo")
    train_emotions = set(
        emotion
        for data in train
        for emotion in data["emotions"]
        if data["emotions"][emotion] is not None
    )
    # print(train_emotions)
    test_emotions = set(
        emotion
        for emotion in test[0]["emotions"]
        if test[0]["emotions"][emotion] is not None
    )
    # print(test_emotions)
    return list(emotions | operation(train_emotions, test_emotions))

In [198]:
#expects corpus list in data form
#returns compatible labels
def getMatchingLabels(corpora):
    emotionSetList = []
    for corpus in corpora:
        emoSet = set(emotion for data in corpus for emotion in data["emotions"] if data["emotions"][emotion] is not None)
        emotionSetList.append(emoSet)
    intersectionSet = set.intersection(*emotionSetList)
    print(intersectionSet)
    return intersectionSet

In [199]:
#this method stays as is from the original paper
def get_emotion(emovals, labels, emotions, mode="multi"):
#     print("get emotion mode ", mode)
#     print("emovals ",emovals)
#     print("labels ",labels)
#     print("emotions ",emotions)
    if mode == "single":
        truthy = len(list(filter(bool, emovals.values())))
        if truthy == 1:
            emotion = [v for v in emovals if emovals[v]][0]
        elif truthy == 0:
            emotion = "noemo"
        else:
            raise ValueError("Dataset marked as 'single' contains multiple emotions")
        return emotions.get(emotion, emotions.get("noemo"))
    else:
        el = [int((emovals[label] or 0) > 0.1) for label in labels]
        return np.array(el)

In [200]:
#this method stays as is from the original paper
def get_vector(text, wordlist):
    tokens = set(tokenize(text))
    print(tokens)
    return [1 if word in tokens else 0 for word in wordlist]

In [201]:
#The comment below was left by the original authors. As you can see, their results were unable to use the full bag of words
# this is bad. memory error for all_vs (too many words...)
def get_wordlist(dataset):
    """Get a bag of words from a dataset."""
    bag = set()
    for data in dataset:
        bag.update({token for token in tokenize(data["text"])})
    return list(bag)

In [202]:
#averages the values the come from jensenshannon into a single value
def getJensenShannonFromNPArrays(np1,np2):
    js_pq = jensenshannon(np1, np2)
    print(js_pq)
    sumJS = 0
    length = len(js_pq)
    for x in js_pq:
        if math.isnan(x): #assume nan values should be interpretted as 0
            sumJS += 0
        else:
            sumJS += x
    js = sumJS/length
    return js

In [203]:
#bag of word limit of 5000 is kept from the original authors to match their results
def getTop5000Words(dataset):
    """Get a bag of words from a dataset."""
    bag = Counter()
    for data in dataset:
        bag.update({token for token in tokenize(data["text"])})
    print("bag size", len(bag))
#     print("bag", bag)
    out = list(map(op.itemgetter(0), bag.most_common(5000)))
#     print("this is the output", out)
    return out

In [204]:
# def cleanReviewDataLemma(dataset):
#     taggedDataset = nltk.pos_tag(dataset)
#     filteredString = []
#     for token, tag in taggedReview:
#         for char in token:
#             if char in string.punctuation:
#                 token = token.replace(char,"") #remove punctuation
#         if (token not in stopWords):
#             lemmatizedToken = ""
#             if tag[0] == 'N':
#                 lemmatizedToken = lemmatizer.lemmatize(token, 'n')
#             elif tag[0] == 'V':
#                 lemmatizedToken = lemmatizer.lemmatize(token, 'v')
#             else:
#                 lemmatizedToken = token
#             if len(lemmatizedToken) > 2:
#                 filteredString.append(lemmatizedToken)
#     return filteredString

In [205]:
#tokenization is kept the same so that performance results match the ones used in the paper as closely as possible
#if there is improvement, it should be because of my changes
def tokenize(text):
    return re.findall(r"\p{L}+", text.lower())

In [206]:
def getTop5000WordsByEmotion(dataset, wordsList, emotionLabels):
    emotionCounts = []
    print("emotions")
    for emotion in emotionLabels:
        emotionDict = Counter()
        for data in dataset:
#             if data["emotions"][emotion] == 1:
#                 print(emotion)
#                 print(data)
#                 print(data["emotions"][emotion])
            emotionDict.update({token for token in tokenize(data["text"]) if data["emotions"][emotion] == 1})
        print(len(emotionDict))
        emotionCounts.append(emotionDict)
    return emotionCounts

In [207]:
def getTokenFrequency(dataset):
    token2DocFreq = {}
    for data in dataset:
        tempDict = {}
        for word in data:
            if word not in tempDict:
                tempDict[word] = 1
        for key, value in tempDict.items():
            if key in token2DocFreq:
                token2DocFreq[key] += value
            else:
                token2DocFreq[key] = value
    return token2DocFreq

In [215]:
def getTokenizedCorpusTextPair(corpus1, corpus2):
    with open(JSON) as f:
        for line in f:
            data = json.loads(line)
            if data["source"] in corporaNameList:
                corpus1Text.append(tokenize(data["text"]))
                corpus1Data.append(data)
            if data["source"] in corporaNameList:
                corpus2Text.append(tokenize(data["text"]))
                corpus2Data.append(data)
    corporaData = [corpus1Data,corpus2Data]
    return corpus1Text, corpus2Text, corporaData

In [237]:
def getNormalizedFreq(tokenFreq):
# def getNormalizedFreq(corpus):
#     newCorpus = []
#     for entry in corpus:
#         newCorpus.append(tokenize(entry))
#     tokenFreq = getTokenFrequency(newCorpus)
#     print(tokenFreq.items())
    print("freq values", tokenFreq)
    for item, freq in tokenFreq.items():
        if(freq == 0):
            tokenFreq[item] = 0
        else:
            tokenFreq[item] = 1 + math.log10(freq)
    print("log weighted values", tokenFreq)
    docLength = 0
    for freq in tokenFreq.values():
        docLength += freq*freq
    docLength = math.sqrt(docLength)
    print("doclength", docLength)
    for item, freq in tokenFreq.items():
        tokenFreq[item] = freq/docLength
    # logFreq = freq for freq in math.log() 
    print("normalized")
    print(tokenFreq)
    return tokenFreq

In [238]:
def getCosineSimilarityFromTokenFreq(tokenFreq1, tokenFreq2):
    normFreq1 = getNormalizedFreq(tokenFreq1)
    normFreq2 = getNormalizedFreq(tokenFreq2)
    cosineSum = 0
    for (item1,freq1) in normFreq1.items():
        if item1 in normFreq2:
            x = freq1 * normFreq2[item1]
            cosineSum += x
#     for (item2,freq2) in normFreq2.items():
#         if item2 in normFreq1:
#             x = freq1 * normFreq2[item1]
#             cosineSum += x
    return cosineSum

In [239]:
def getCosineSimilarityFromCorpus(corpus1,corpus2):
    corpus1Text, corpus2Text, corpus1Data, corpus2Data = getTokenizedCorpusTextPair(corpus1, corpus2)
    emotionLabels = getMatchingLabels(corporaData)
    emotionDicts1 = getTop5000WordsByEmotion(corpus1Data, words, emotionLabels)
    emotionDicts2 = getTop5000WordsByEmotion(corpus2Data, words, emotionLabels)
    for emotion in range(len(emotionLabels)):
        sim = getCosineSimilarityFromTokenFreq(emotionDicts1[emotion], emotionDicts2[emotion])
        print(sim)

In [240]:
corpus1 = "ssec"
corpus1Data = []
corpus1Text = []
corpus2 = "isear"
corpus2Data = []
corpus2Text = []
with open(JSON) as f:
    for line in f:
        data = json.loads(line)
        if data["source"] == corpus1:
            corpus1Text.append(tokenize(data["text"]))
            corpus1Data.append(data)
        if data["source"] == corpus2:
            corpus2Text.append(tokenize(data["text"]))
            corpus2Data.append(data)
print("loaded data")
combinedCorpus = corpus1Data + corpus2Data
combinedCorpusText = corpus1Text + corpus2Text
tokenFreq = getTokenFrequency(corpus1Text)
# print("tokenFreq", tokenFreq)
words = getTop5000Words(combinedCorpus)
corporaData = [corpus1Data,corpus2Data]
emotionLabels = getMatchingLabels(corporaData)
emotions1 = getTop5000WordsByEmotion(corpus1Data, words, emotionLabels)
# print(emotions1)
emotions2 = getTop5000WordsByEmotion(corpus2Data, words, emotionLabels)
# print(emotions2)
tokenFreq1 = {"affection":115, "jealous":10, "gossip":2}
tokenFreq2 = {"affection":58, "jealous":7, "gossip":0}
tokenFreq3 = {"affection":20, "jealous":11, "gossip":6}
sim = getCosineSimilarityFromTokenFreq(tokenFreq1, tokenFreq2)
print("test", sim)
for emotion in range(len(emotionLabels)):
    sim = getCosineSimilarityFromTokenFreq(emotions1[emotion], emotions2[emotion])
    print(sim)

loaded data
bag size 17756
{'joy', 'sadness', 'fear', 'disgust', 'anger'}
emotions
6938
8349
6707
7353
8864
emotions
2527
2523
3118
3401
3339
freq values {'affection': 115, 'jealous': 10, 'gossip': 2}
log weighted values {'affection': 3.060697840353612, 'jealous': 2.0, 'gossip': 1.3010299956639813}
doclength 3.880792486021725
normalized
{'affection': 0.7886785627878784, 'jealous': 0.5153586560486872, 'gossip': 0.33524853502220936}
freq values {'affection': 58, 'jealous': 7, 'gossip': 0}
log weighted values {'affection': 2.7634279935629373, 'jealous': 1.845098040014257, 'gossip': 0}
doclength 3.3227881444461267
normalized
{'affection': 0.8316593997067999, 'jealous': 0.5552860910191477, 'gossip': 0.0}
test 0.9420834336799457


freq values Counter({'semst': 2029, 'the': 836, 'to': 721, 'i': 528, 'a': 515, 'is': 460, 'and': 451, 'of': 417, 'in': 398, 'you': 391, 'for': 378, 's': 311, 'it': 271, 'be': 265, 'that': 233, 'on': 218, 't': 214, 'are': 206, 'we': 192, 'my': 181, 'with': 162, 'this': 159, 'god': 154, 'all': 152, 'can': 152, 'have': 151, 'your': 151, 'not': 148, 'will': 136, 'me': 128, 'what': 126, 'hillaryclinton': 126, 'but': 118, 'if': 117, 'so': 117, 'realdonaldtrump': 117, 'love': 110, 'm': 109, 'just': 104, 'they': 102, 'hillary': 102, 'trump': 102, 'like': 101, 'at': 101, 'he': 99, 'do': 97, 'women': 95, 'who': 94, 'don': 92, 'people': 91, 'about': 88, 'our': 85, 'life': 84, 'one': 82, 'how': 81, 'as': 81, 'has': 79, 'when': 79, 'no': 79, 'get': 79, 'rt': 77, 'us': 77, 'she': 77, 'by': 76, 'up': 76, 'now': 75, 'out': 74, 'from': 72, 'or': 71, 'there': 70, 're': 68, 'good': 67, 'more': 66, 'need': 66, 'great': 65, 'want': 62, 'her': 59, 'time': 59, 'make': 59, 'world': 56, 'an': 56, 'right': 56, 

log weighted values Counter({'semst': 4.307282047033346, 'the': 3.9222062774390163, 'to': 3.857935264719429, 'i': 3.722633922533812, 'a': 3.711807229041191, 'is': 3.662757831681574, 'and': 3.6541765418779604, 'of': 3.6201360549737576, 'in': 3.5998830720736876, 'you': 3.5921767573958667, 'for': 3.5774917998372255, 's': 3.4927603890268375, 'it': 3.432969290874406, 'be': 3.423245873936808, 'that': 3.367355921026019, 'on': 3.3384564936046046, 't': 3.330413773349191, 'are': 3.3138672203691533, 'we': 3.2833012287035497, 'my': 3.2576785748691846, 'with': 3.2095150145426308, 'this': 3.2013971243204513, 'god': 3.187520720836463, 'all': 3.1818435879447726, 'can': 3.1818435879447726, 'have': 3.1789769472931693, 'your': 3.1789769472931693, 'not': 3.1702617153949575, 'will': 3.1335389083702174, 'me': 3.1072099696478683, 'what': 3.100370545117563, 'hillaryclinton': 3.100370545117563, 'but': 3.0718820073061255, 'if': 3.0681858617461617, 'so': 3.0681858617461617, 'realdonaldtrump': 3.0681858617461617,


doclength 107.30520610354347
normalized


Counter({'semst': 0.04014047596979649, 'the': 0.03655187310906713, 'to': 0.035952917894745376, 'i': 0.03469201595812304, 'a': 0.034591119702612624, 'is': 0.034134017953865343, 'and': 0.034054047092103676, 'of': 0.033736816566761266, 'in': 0.03354807471876065, 'you': 0.0334762579359814, 'for': 0.0333394056984071, 's': 0.03254977568988145, 'it': 0.03199256975063991, 'be': 0.03190195516360659, 'that': 0.0313811048252096, 'on': 0.03111178492479836, 't': 0.03103683310701188, 'are': 0.030882632266429445, 'we': 0.030597781299961806, 'my': 0.03035899834837192, 'with': 0.02991015190302724, 'this': 0.02983449956035948, 'god': 0.02970518240988872, 'all': 0.029652276003034495, 'can': 0.029652276003034495, 'have': 0.029625561170122872, 'your': 0.029625561170122872, 'not': 0.029544342073541462, 'will': 0.029202114437453568, 'me': 0.028956749467025727, 'what': 0.028893011417599628, 'hillaryclinton': 0.028893011417599628, 'but': 0.02862752068470874, 'if': 0.028593075519425734, 'so': 0.0285930755194257

freq values Counter({'i': 807, 'when': 583, 'a': 565, 'the': 510, 'my': 500, 'was': 410, 'and': 388, 'to': 377, 'had': 310, 'of': 281, 'that': 278, 'in': 278, 'for': 252, 'me': 229, 'with': 193, 'at': 173, 'very': 160, 'after': 137, 'it': 133, 'friend': 131, 'got': 115, 'on': 115, 'an': 104, 'from': 103, 'time': 101, 'not': 100, 'passed': 100, 'felt': 95, 'first': 93, 'happy': 90, 'joy': 86, 'been': 84, 'we': 81, 'as': 76, 'university': 75, 'good': 73, 'one': 73, 'this': 73, 'school': 73, 'year': 70, 'he': 59, 'were': 59, 'long': 57, 'friends': 57, 'exam': 56, 'came': 54, 'about': 54, 'her': 53, 'out': 52, 'she': 51, 'which': 50, 'met': 49, 'told': 49, 'who': 49, 'received': 49, 'love': 47, 'accepted': 47, 's': 46, 'day': 45, 'by': 44, 'have': 44, 'last': 44, 'heard': 43, 'went': 42, 'boyfriend': 42, 'made': 42, 'examination': 41, 'home': 41, 't': 41, 'girl': 41, 'so': 41, 'saw': 39, 'well': 38, 'all': 38, 'exams': 38, 'but': 38, 'selected': 38, 'is': 37, 'letter': 36, 'having': 36, 'o




log weighted values Counter({'i': 3.90687353472207, 'when': 3.765668554759014, 'a': 3.7520484478194387, 'the': 3.7075701760979363, 'my': 3.6989700043360187, 'was': 3.6127838567197355, 'and': 3.5888317255942073, 'to': 3.576341350205793, 'had': 3.4913616938342726, 'of': 3.44870631990508, 'that': 3.444044795918076, 'in': 3.444044795918076, 'for': 3.401400540781544, 'me': 3.359835482339888, 'with': 3.285557309007774, 'at': 3.2380461031287955, 'very': 3.2041199826559246, 'after': 3.1367205671564067, 'it': 3.123851640967086, 'friend': 3.1172712956557644, 'got': 3.060697840353612, 'on': 3.060697840353612, 'an': 3.0170333392987803, 'from': 3.012837224705172, 'time': 3.0043213737826426, 'not': 3.0, 'passed': 3.0, 'felt': 2.9777236052888476, 'first': 2.968482948553935, 'happy': 2.954242509439325, 'joy': 2.9344984512435675, 'been': 2.9242792860618816, 'we': 2.9084850188786495, 'as': 2.8808135922807914, 'university': 2.8750612633917, 'good': 2.863322860120456, 'one': 2.863322860120456, 'this': 2.8

doclength 71.32637706073776
normalized


Counter({'i': 0.05477459666001519, 'when': 0.05279489453883759, 'a': 0.05260393983875549, 'the': 0.05198035185413618, 'my': 0.05185977694039012, 'was': 0.05065144208352656, 'and': 0.05031563179688979, 'to': 0.05014051599957713, 'had': 0.04894909622090036, 'of': 0.04835106537050585, 'that': 0.048285710530135445, 'in': 0.048285710530135445, 'for': 0.04768783556586776, 'me': 0.04710509100271321, 'with': 0.046063706645438726, 'at': 0.045397596745611335, 'very': 0.04492195054190774, 'after': 0.04397700677388033, 'it': 0.0437965836720822, 'friend': 0.043704326843928464, 'got': 0.04291116367437651, 'on': 0.04291116367437651, 'an': 0.042298984802349275, 'from': 0.04224015502903785, 'time': 0.04212076229841762, 'not': 0.04206017638391137, 'passed': 0.04206017638391137, 'felt': 0.04174786002032847, 'first': 0.04161830546960393, 'happy': 0.04141865367595566, 'joy': 0.041141840819206396, 'been': 0.040998567522527056, 'we': 0.04077713096799993, 'as': 0.04038917594016648, 'university': 0.04030852795

0.4439890786057672







doclength 119.9092028563165
normalized











doclength 70.67471883669863
normalized




0.4538342791036567







doclength 105.78685464072434
normalized











doclength 78.1703130436185
normalized




0.43688992382789704







doclength 112.00556072232439
normalized




freq values Counter({'i': 687, 'a': 669, 'the': 522, 'and': 443, 'when': 430, 'to': 421, 'was': 388, 'in': 380, 'of': 378, 'my': 329, 'me': 225, 'with': 214, 'had': 197, 'that': 193, 'at': 174, 'on': 173, 'it': 147, 'disgusted': 147, 'not': 143, 'for': 140, 'saw': 134, 'he': 126, 'who': 124, 'felt': 108, 'people': 103, 'about': 103, 'an': 101, 'his': 96, 'very': 94, 'one': 93, 'friend': 89, 'by': 88, 'man': 82, 'this': 82, 't': 80, 'which': 79, 'disgust': 76, 'her': 76, 'she': 75, 'were': 75, 'from': 74, 'some': 71, 'they': 67, 'as': 64, 'is': 62, 'but': 61, 'out': 60, 'person': 60, 'did': 58, 'because': 58, 'we': 57, 'drunk': 56, 'someone': 56, 's': 55, 'after': 55, 'friends': 53, 'been': 53, 'time': 50, 'him': 50, 'there': 50, 'all': 49, 'like': 49, 'up': 48, 'so': 47, 'girl': 45, 'found': 43, 'got': 43, 'have': 43, 'came': 42, 'do': 42, 'other': 42, 'day': 42, 'went': 40, 'á': 40, 'be': 39, 'way': 39, 'mine': 39, 'having': 38, 'while': 37, 'feel': 37, 'them': 37, 'our': 37, 'made': 




log weighted values Counter({'i': 3.8369567370595505, 'a': 3.8254261177678233, 'the': 3.717670503002262, 'and': 3.6464037262230695, 'when': 3.6334684555795866, 'to': 3.6242820958356683, 'was': 3.5888317255942073, 'in': 3.57978359661681, 'of': 3.5774917998372255, 'my': 3.5171958979499744, 'me': 3.3521825181113627, 'with': 3.330413773349191, 'had': 3.294466226161593, 'that': 3.285557309007774, 'at': 3.2405492482826, 'on': 3.2380461031287955, 'it': 3.167317334748176, 'disgusted': 3.167317334748176, 'not': 3.155336037465062, 'for': 3.146128035678238, 'saw': 3.1271047983648077, 'he': 3.100370545117563, 'who': 3.093421685162235, 'felt': 3.03342375548695, 'people': 3.012837224705172, 'about': 3.012837224705172, 'an': 3.0043213737826426, 'his': 2.9822712330395684, 'very': 2.9731278535996983, 'one': 2.968482948553935, 'friend': 2.949390006644913, 'by': 2.9444826721501687, 'man': 2.9138138523837167, 'this': 2.9138138523837167, 't': 2.9030899869919438, 'which': 2.8976270912904414, 'disgust': 2.88

doclength 79.46071336754694
normalized


Counter({'i': 0.04828746904538396, 'a': 0.04814235809931943, 'the': 0.046786271422031046, 'and': 0.0458893907654285, 'when': 0.045726602513281166, 'to': 0.04561099368780503, 'was': 0.04516485661277671, 'in': 0.04505098739874707, 'of': 0.045022145513462404, 'my': 0.04426333151177642, 'me': 0.04218666528458891, 'with': 0.041912709214480655, 'had': 0.04146031524941112, 'that': 0.04134819799326958, 'at': 0.04078177895651882, 'on': 0.040750277286728545, 'it': 0.03986016737727603, 'disgusted': 0.03986016737727603, 'not': 0.03970938472286297, 'for': 0.03959350353583874, 'saw': 0.039354099225114285, 'he': 0.03901765304795017, 'who': 0.0389302027890633, 'felt': 0.03817513871862431, 'people': 0.03791606061688925, 'about': 0.03791606061688925, 'an': 0.03780889003457722, 'his': 0.037531392642361765, 'very': 0.03741632471693833, 'one': 0.03735786935140091, 'friend': 0.037117587819813, 'by': 0.03705582982285109, 'man': 0.03666986777359799, 'this': 0.03666986777359799, 't': 0.03653490969258795, 'whic

0.4516565863648794







doclength 123.91516769963692
normalized




freq values Counter({'i': 714, 'a': 608, 'my': 583, 'to': 526, 'the': 518, 'when': 507, 'and': 493, 'was': 450, 'me': 411, 'of': 346, 'in': 314, 'had': 310, 'that': 270, 'with': 248, 'not': 243, 'for': 236, 'at': 219, 'it': 189, 'angry': 184, 'he': 165, 'friend': 162, 'on': 141, 'an': 130, 'very': 116, 'she': 109, 't': 108, 'one': 104, 'did': 103, 'this': 101, 'about': 100, 'by': 97, 'who': 95, 'as': 92, 'because': 92, 'him': 89, 'but': 89, 'her': 88, 'we': 87, 'out': 86, 'were': 85, 'up': 82, 'his': 81, 'time': 78, 'got': 78, 's': 78, 'told': 76, 'which': 75, 'some': 73, 'from': 70, 'felt': 69, 'been': 68, 'they': 66, 'so': 63, 'after': 62, 'someone': 62, 'be': 62, 'person': 62, 'people': 61, 'do': 60, 'have': 60, 'no': 59, 'friends': 59, 'mother': 58, 'day': 56, 'without': 55, 'other': 55, 'brother': 55, 'being': 54, 'mine': 54, 'would': 53, 'didn': 53, 'anger': 52, 'school': 52, 'made': 52, 'another': 51, 'all': 50, 'our': 50, 'father': 50, 'home': 47, 'them': 47, 'came': 46, 'is': 




log weighted values Counter({'i': 3.8536982117761744, 'a': 3.783903579272735, 'my': 3.765668554759014, 'to': 3.7209857441537393, 'the': 3.714329759745233, 'when': 3.705007959333336, 'and': 3.69284691927723, 'was': 3.6532125137753435, 'me': 3.6138418218760693, 'of': 3.5390760987927767, 'in': 3.496929648073215, 'had': 3.4913616938342726, 'that': 3.4313637641589874, 'with': 3.3944516808262164, 'not': 3.385606273598312, 'for': 3.3729120029701067, 'at': 3.3404441148401185, 'it': 3.2764618041732443, 'angry': 3.2648178230095364, 'he': 3.2174839442139063, 'friend': 3.2095150145426308, 'on': 3.1492191126553797, 'an': 3.113943352306837, 'very': 3.0644579892269186, 'she': 3.037426497940624, 't': 3.03342375548695, 'one': 3.0170333392987803, 'did': 3.012837224705172, 'this': 3.0043213737826426, 'about': 3.0, 'by': 2.9867717342662448, 'who': 2.9777236052888476, 'as': 2.9637878273455556, 'because': 2.9637878273455556, 'him': 2.949390006644913, 'but': 2.949390006644913, 'her': 2.9444826721501687, 'we'

doclength 80.60429090068108
normalized


Counter({'i': 0.04781008763571434, 'a': 0.04694419536467583, 'my': 0.04671796640949292, 'to': 0.046163618618500844, 'the': 0.04608104256288233, 'when': 0.045965393627723485, 'and': 0.04581452026949134, 'was': 0.04532280444321203, 'me': 0.04483436032368264, 'of': 0.043906795274131896, 'in': 0.043383914292876274, 'had': 0.043314836652260304, 'that': 0.042570485092252, 'with': 0.042112543177245845, 'not': 0.04200280451285137, 'for': 0.04184531574288196, 'at': 0.04144250979090113, 'it': 0.040648726855130234, 'angry': 0.040504268278128974, 'he': 0.039917030573203886, 'friend': 0.03981816574129196, 'on': 0.03907011745237957, 'an': 0.038632476230623665, 'very': 0.038018546593293394, 'she': 0.03768318614307118, 't': 0.03763352696973256, 'one': 0.0374301827556092, 'did': 0.037378124551924005, 'this': 0.03727247445777427, 'about': 0.03721886225258823, 'by': 0.03705474858585981, 'who': 0.03694249489717534, 'as': 0.036769603630623995, 'because': 0.036769603630623995, 'him': 0.036590980128825765, '

0.46229743778369753


In [None]:
# import gensim
# import numpy as np
# from nltk.tokenize import word_tokenize
# print(corpus1Text[:5])
# gen_docs = corpus1Text[:5]
# dictionary = gensim.corpora.Dictionary(gen_docs)
# # print(dictionary.token2id)
# corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]
# tf_idf = gensim.models.TfidfModel(corpus)
# for doc in tf_idf [corpus]:
#     print([[dictionary[id], np.around(freq, decimals=2)] for id, freq in doc])
# sims = gensim.similarities.Similarity("../Ling506TermProject/",tf_idf[corpus],
#                                         num_features=len(dictionary))



# file2_docs = ["Mars is the fourth planet in our solar system.",
#         "It is second-smallest planet in the Solar System after Mercury.",
#         "Saturn is yellow planet."]
# tf_idf = gensim.models.TfidfModel(corpus)

# print("Number of documents:",len(file2_docs))  
# for line in file2_docs:
#     query_doc = [w.lower() for w in word_tokenize(line)]
#     query_doc_bow = dictionary.doc2bow(query_doc) #update an existing dictionary and create bag of words

# # perform a similarity query against the corpus
# query_doc_tf_idf = tf_idf[query_doc_bow]
# # print(document_number, document_similarity)
# print('Comparing Result:', sims[query_doc_tf_idf]) 





# sum_of_sims =(np.sum(sims[query_doc_tf_idf], dtype=np.float32))
# print(sum_of_sims)

# avg_sims = [] # array of averages


# # for line in query documents
# for line in file2_docs:
#     # tokenize words
#     query_doc = [w.lower() for w in word_tokenize(line)]
#     # create bag of words
#     query_doc_bow = dictionary.doc2bow(query_doc)
#     # find similarity for each document
#     query_doc_tf_idf = tf_idf[query_doc_bow]
#     # print (document_number, document_similarity)
#     print('Comparing Result:', sims[query_doc_tf_idf]) 
#     # calculate sum of similarities for each query doc
#     sum_of_sims =(np.sum(sims[query_doc_tf_idf], dtype=np.float32))
#     # calculate average of similarity for each query doc
#     avg = sum_of_sims / len(file_docs)
#     # print average of similarity for each query doc
#     print(f'avg: {sum_of_sims / len(file_docs)}')
#     # add average values into array
#     avg_sims.append(avg)  
# # calculate total average
# total_avg = np.sum(avg_sims, dtype=np.float)
# # round the value and multiply by 100 to format it as percentage
# percentage_of_similarity = round(float(total_avg) * 100)
# # if percentage is greater than 100
# # that means documents are almost same
# if percentage_of_similarity >= 100:
#     percentage_of_similarity = 100
    

In [None]:
arr1 = np.load("ssec_emotion-causetrain_xNP.npy")
arr2 = np.load("ssec_grounded_emotionstrain_xNP.npy")
arr3 = np.load("isear_ssectest_xNP.npy")
corpus1 = "ssec"
corpus1Data = []
corpus1Text = []
corpus2 = "isear"
corpus2Data = []
corpus2Text = []
with open(JSON) as f:
    for line in f:
        data = json.loads(line)
        if data["source"] == corpus1:
            corpus1Data.append(data)
        if data["source"] == corpus2:
            corpus2Data.append(data)
print("loaded data")
words1 = getTop5000Words(corpus1Data)
print(words1)
words2 = getTop5000Words(corpus2Data)
for data in tqdm(corpus1Data):
    corpus1Text.append(get_vector(data["text"], words1))
for data in tqdm(corpus1Data):
    corpus2Text.append(get_vector(data["text"], words2))
# print(corpus1Text[:30])
# print(np.array_equal(arr1,arr2))
# print(np.array_equal(arr1,arr3))
# print(cosine_similarity(arr1,arr3))
# print(chi2_kernel(arr1,arr3))
from scipy import spatial

dataSetI = [3, 45, 7, 2]
dataSetII = [2, 54, 13, 15]
result = 1 - spatial.distance.cosine(arr1, arr2)
print(result)

In [None]:
#this method is modified to track 
def make_arrays(train, test, words, labels, mode="multi", all_vs=False):
    emotions = {label: x for x, label in enumerate(labels)}
    print("emotions in make_arrays: ", emotions)
    train_x, train_y, test_x, test_y = [], [], [], []
    
    print("train raw text: ", sys.getsizeof(train)/1000000)

    for data in tqdm(train):
        # Discard examples where we don't have all selected emotions
        if (mode == "single" or all_vs or all(data["emotions"][emo] is not None for emo in labels)):
            train_y.append(get_emotion(data["emotions"], labels, emotions, mode))
            train_x.append(get_vector(data["text"], words))
    for data in tqdm(test):
        test_y.append(get_emotion(data["emotions"], labels, emotions, mode))
        test_x.append(get_vector(data["text"], words))

    print("train_x length ", len(train_x))
    print("train_x dimension of element ", len(train_x[0]))
    train_xSize = sys.getsizeof(train_x)/1000000
    train_ySize = sys.getsizeof(train_y)/1000000
    train_xLength = len(train_x)
    train_yLength = len(train_y)
    print("train_x (text) size RAW:", train_xSize,"megabytes")
    print("train_y (labels) size RAW:", train_ySize,"megabytes")
    test_xSize = sys.getsizeof(test_x)/1000000
    test_ySize = sys.getsizeof(test_y)/1000000
    test_xLength = len(test_x)
    test_yLength = len(test_y)
    print("test_x (text) size RAW:", test_xSize,"megabytes")
    print("test_y (labels) size RAW:", test_ySize,"megabytes")

    train_x = np.array(train_x)
    train_y = np.array(train_y)
    test_x = np.array(test_x)
    test_y = np.array(test_y)
    train_xNPSize = (train_x.nbytes)/1000000
    train_yNPSize = (train_y.nbytes)/1000000
    test_xNPSize = (test_x.nbytes)/1000000
    test_yNPSize = (test_y.nbytes)/1000000
    
    print("saved test_y")
    print("train_x Size stays the same", train_xSize == train_xNPSize)
    print("train_y Size stays the same", train_ySize == train_yNPSize)
    print("test_x Size stays the same", test_xSize == test_xNPSize)
    print("test_y Size stays the same", test_ySize == test_yNPSize)
    print("train_xNPSize (text) size:", train_xNPSize,"megabytes")
    print("train_yNPSize (labels) size:", train_yNPSize,"megabytes")
    print("test_xNPSize (text) size:", test_xNPSize,"megabytes")
    print("test_yNPSize (labels) size:", test_yNPSize,"megabytes")
    print("train_xNP length ", len(train_x))
    print("train_xNP dimension of element ", train_x.ndim)
    print("train_xNP size ", train_x.size)
    sizes = train_xNPSize, train_yNPSize, test_xNPSize, test_yNPSize
    return train_x, train_y, test_x, test_y, sizes

In [None]:
#kept as part of classification definitions, prevents division by 0 errors
def cheatydiv(x, y):
    return math.nan if y == 0 else x / y

In [None]:
# from scipy.spatial import distance
# distance.jensenshannon([0, 0, 1], [0, 1, 0])

In [60]:
#classification reporting is kept the same for simplicity (ie, no need to reinvent the wheel)
def classification_report_own_single(test_y, predict_y, labels):
    reports = {}
    num2emo = {i: label for i, label in enumerate(labels)}
    decisions = defaultdict(Counter)
    for t, p in zip(test_y, predict_y):
        decisions[t][p] += 1
    for label in decisions:
        tp = decisions[label][label]
        fp = sum(decisions[x][label] for x in decisions if x != label)
        tn = sum(
            decisions[x][y]
            for x in decisions
            for y in decisions[x]
            if x != label and y != label
        )
        fn = sum(decisions[label][y] for y in decisions[label] if y != label)
        precision = tp / (tp + fp) if tp + fp else math.nan
        recall = tp / (tp + fn) if tp + fn else math.nan
        if y == 0:
        f1 = 2 * cheatydiv((precision * recall), (precision + recall))
        accuracy = (tp + tn) / (tp + tn + fp + fn)
        reports[num2emo[label]] = Report(precision, recall, accuracy, f1, tp, tn, fp, fn)
    return reports

In [61]:
#classification reporting is kept the same for simplicity (ie, no need to reinvent the wheel)
def classification_report_own_multi(test_y, predict_y, labels):
    reports = {}
    num2emo = {i: label for i, label in enumerate(labels)}
    emo2num = {label: i for i, label in enumerate(labels)}
    decisions = defaultdict(Counter)
    for label in labels:
        tp = fp = tn = fn = 0
        for t, p in zip(test_y, predict_y):
            # decisions[t][p] += 1
            tp += bool(t[emo2num[label]] and p[emo2num[label]])
            fp += bool(p[emo2num[label]] and not t[emo2num[label]])
            fn += bool(t[emo2num[label]] and not p[emo2num[label]])
            tn += bool(not t[emo2num[label]] and not p[emo2num[label]])
        precision = tp / (tp + fp) if tp + fp else math.nan
        recall = tp / (tp + fn) if tp + fn else math.nan
        f1 = 2 * cheatydiv((precision * recall), (precision + recall))
        accuracy = (tp + tn) / (tp + tn + fp + fn)
        reports[label] = Report(precision, recall, accuracy, f1, tp, tn, fp, fn)
    return reports

In [62]:
#classification reporting is kept the same for simplicity (ie, no need to reinvent the wheel)
def analyse_results(test_y, predict_y, labels, test, first, second, output, mode):
    print("analyse_results")
    prefix = f"{first}_vs_{second}_{mode}"
    fprefix = output + "/" + prefix
    with open(fprefix + ".txt", "w", encoding="utf-8") as f, open(fprefix + ".json", "w") as g:
        print("hello")
        prec, reca, f1, supp = precision_recall_fscore_support(
            test_y, predict_y, pos_label=None, average="micro"
        )
        accuracy = accuracy_score(test_y, predict_y)
        scoreNameArray = [(prec, "Precision"),(reca, "Recall"),(f1, "F1-score"),(accuracy, "Accuracy")]
        for score, name in scoreNameArray:
            print(name, score, sep="\t", file=f)
            print(name, score, sep="\t")
            
        # print("real:", Counter(test_y), file=f)
        # print("predicted:", Counter(predict_y), file=f)
        
        print(test_y[:10], predict_y[:10], file=f)
        emotions = {i: label for i, label in enumerate(labels)}
        for text, real, predicted, _ in zip(test, test_y, predict_y, range(20)):
            if mode == "multi" and np.array_equal(real, predicted):
                continue
            elif mode == "single" and real == predicted:
                continue
            print(text, "=> predicted:", predicted, ", truth:", real, file=f)
        if mode == "multi":
            results = classification_report_own_multi(test_y, predict_y, labels)
        elif mode == "single":
            results = classification_report_own_single(test_y, predict_y, labels)
        json.dump(
            {
                "precision": prec,
                "recall": reca,
                "f1": f1,
                "accuracy": accuracy,
                "name": prefix,
                **{
                    (emotion + "_" + metric): getattr(results[emotion], metric)
                    for emotion in results
                    for metric in Report._fields
                },
            },
            g,
        )
        g.write("\n")

In [94]:
#used for benchmarking/validating the results of the authors, but not in the final version
#method is kept here for documentation
def hacky_train_test_split(training, train_size=0.8, first=None, second=None):
    tra, tes = [], []
    for example in training:
        if example.get("split") == "train" or example["source"] != second:
            tra.append(example)
        elif example.get("split") == "test":
            tes.append(example)
        else:
            # don't try this at home
            [tes, tra][random.random()<train_size].append(example)
    return tra, tes

In [95]:
# def getTrainTest(jsonfile, corporaList):
#     for corpus in corporaList:

In [98]:
#this method is used in my testing to generate the combinations that I use in my trials automation
def getPowerset(s):
    x = len(s)
    masks = [1 << i for i in range(x)]
    for i in range(1 << x):
        yield [ss for mask, ss in zip(masks, s) if i & mask]

In [99]:
#this method is used in my testing to generate the combinations that I use in my trials automation
def getPermutations(s):
    subsets = set()
    for L in range(2, 3): #this 
        for subset in itertools.permutations(s, L):
            subsets.add(subset)
    return subsets

In [100]:
#this method is simply in place to get a measure of hard drive space left on my computer
def getHardDriveSpaceLeft():
    total, used, free = shutil.disk_usage("/")
    total = (total // (2**30))
    used = (used // (2**30))
    free = (free // (2**30))
    print("Total: %d GB" % total)
    print("Used: %d GB" % used)
    print("Free: %d GB" % free)
    return total, used, free

In [101]:
#gets the runtime values for cross corpus trials
#ordering will matter if using the original authors version
def getCrossCorpusValuesWithOrder(possibleChoices):
    if orderingMatters:
        permutations = list(getPermutations(possibleChoices))
        print("permutations length: ",len(permutations))
#         print(permutations)
        corporaSets = []
        for choice in permutations:
    #         print("choice ", choice)
            if(len(choice) == 2):
    #             print("pair")
                first, second = choice
                firstCorpus, domain1 = first
                secondCorpus, domain2 = second
            corpusPairData = (firstCorpus, secondCorpus, domain1, domain2)
            corporaSets.append(corpusPairData)
    return(corporaSets)

In [102]:
# This method adds the combinations relating to the ALl-VS trials
def getAllVsCorpusValues(possibleChoices):
    corporaSets = []
    for entry in possibleChoices:
        firstCorpus, domain1 = (None, None)
        secondCorpus, domain2 = entry
        corpusPairData = (firstCorpus, secondCorpus, domain1, domain2)
        corporaSets.append(corpusPairData)
    return corporaSets

In [103]:
#Gets the corpora pairs of the same domain
def getCorporaPairsOfSameDomain(powerSet, sizeBoundLower=1, sizeBoundUpper=3):
    for entry in powerSet:
#       if len(entry) < 3 and len(entry) > 0:
        if len(entry) < sizeBoundUpper and len(entry) > sizeBoundLower:
            domainMatch = entry[0][1]
            shouldAppend = True
            for corpus, domain in entry:
                if domain != domainMatch:
                    shouldAppend = False
            if(shouldAppend):
                powerSetCondensed.append(entry)
    print("CorporaPairsOfSameDomain:",len(powerSetCondensed))
    return sameDomainCorporaPairs

In [104]:
#this method adds the trials where the corpus is trained and tested on itself
def getCorporaPairsWithItself(possibleChoices):
    corporaSets = []
    for entry in possibleChoices:
        firstCorpus, domain1 = entry
        secondCorpus, domain2 = entry
        corpusPairData = (firstCorpus, secondCorpus, domain1, domain2)
        corporaSets.append(corpusPairData)
    return corporaSets

In [105]:
# def getCorpusSimilarity(corpusList):
    

In [None]:
def performTrialUsingCorpusPair(corpusPair):
    print(entry)
    (first, second, domain1, domain2) = entry
    print("Getting data")
    jsonfile = "unified-dataset.jsonl"
    output = "."
    debug = True
    forceMulti = False
    isAllVS = False
    if first == None:
        isAllVS = True

    training_data, testing_data = get_train_test(jsonfile, first,second)
    firstCLF, secondCLF = (["multi", "multi"] if forceMulti else get_clf_mode(training_data, testing_data))
    mode = "multi" if "multi" in [firstCLF, secondCLF] else "single"

    print("Detected mode: {}...".format(mode))
    print(len(training_data), len(testing_data))
    print("Getting wordlist...")
    if debug:
        wordlist = getTop5000Words(training_data)
    else:
        wordlist = getTop5000Words(training_data)
        # wordlist = get_wordlist(training_data)
    print("Getting emotions")
    labels = get_labels(training_data, testing_data, mode=mode)
    print(labels)
    print("Making arrays")
    print("checking for save files")
    if(first == None):
        first = "all-vs"
    train_xNPFileName = first + "_" + second + "train_xNP" +".npy"
    train_yNPFileName = first + "_" + second + "train_yNP" +".npy"
    test_xNPFileName = first + "_" + second + "test_xNP" +".npy"
    test_yNPFileName = first + "_" + second + "test_yNP" +".npy"

    if(path.exists(train_xNPFileName) 
       and path.exists(train_yNPFileName)
       and path.exists(test_xNPFileName)
       and path.exists(test_yNPFileName)):
        print('saved train_xNP as', train_xNPFileName)
        print('saved train_yNP as', train_yNPFileName)
        print('saved test_xNP as', test_xNPFileName)
        print('saved test_yNP as', test_yNPFileName)
        print("loading from np")
        train_x = np.load(train_xNPFileName)
        train_y = np.load(train_yNPFileName)
        test_x = np.load(test_xNPFileName)
        test_y = np.load(test_yNPFileName)
        train_xNPSize = (train_x.nbytes)/1000000
        train_yNPSize = (train_y.nbytes)/1000000
        test_xNPSize = (test_x.nbytes)/1000000
        test_yNPSize = (test_y.nbytes)/1000000
        print("loaded directly from NP.load")
        print("train_xNPSize (text) size loaded:", train_xNPSize,"megabytes")
        print("train_yNPSize (labels) size loaded:", train_yNPSize,"megabytes")
        print("test_xNPSize (text) size loaded:", test_xNPSize,"megabytes")
        print("test_yNPSize (labels) size loaded:", test_yNPSize,"megabytes")
    else:
        train_x, train_y, test_x, test_y, sizes = make_arrays(training_data, testing_data, wordlist, labels, mode, isAllVS)
        train_xSize, train_ySize, test_xSize, test_ySize = sizes
        if any(not part.size for part in [train_x, train_y, test_x, test_y]):
            print("Train or test empty. Did you misspell the dataset name?")
            continue
        #             sys.exit(1)
        print("saving NP arrays")
        np.save(train_xNPFileName, train_x)
        np.save(train_yNPFileName, train_y)
        np.save(test_xNPFileName, test_x)
        np.save(test_yNPFileName, test_y)
        print("NP arrays saved")

    print("Initializing classifier")
    trainClassifier = True
    if debug:
        classifierName = "RandomForestClassifier"
        print("Searching for a ", classifierName)
        classiferSaveFile = first+"_"+second+classifierName+".pkl"
        print(path.exists(classiferSaveFile))
        if(path.exists(classiferSaveFile)):
            trainClassifier = False
            print("Loading classifier from file")
            classifier = joblib.load(classiferSaveFile)
            print("classifier loaded successfully")
        else:
            print("file not found, creating new classifier")
            classifier = RandomForestClassifier()
    elif mode == "single":
        classifierName = "LogisticRegressionCV"
        print("Searching for a ", classifierName)
        classiferSaveFile = first+"_"+second+classifierName+".pkl"
        print(path.exists(classiferSaveFile))
        if(path.exists(classiferSaveFile)):
            trainClassifier = False
            print("Loading classifier from file")
            classifier = joblib.load(classiferSaveFile)
            print("classifier loaded successfully")
        else:
            print("file not found, creating new classifier")
            classifier = LogisticRegressionCV(
                cv=10,
                penalty="l2",
                fit_intercept=True,
                solver="sag",
                scoring="f1",
                refit=True,
                # n_jobs=-1,
                class_weight="balanced",
            )
    else:
        classifierName = "OneVsRestClassifier"
        print("Searching for a ", classifierName)
        classiferSaveFile = first+"_"+second+classifierName+".pkl"
        print(path.exists(classiferSaveFile))
        if(path.exists(classiferSaveFile)):
            trainClassifier = False
            print("Loading classifier from file")
            classifier = joblib.load(classiferSaveFile)
            print("classifier loaded successfully")
        else:
            print("file not found, creating new classifier")
            classifier = OneVsRestClassifier(
                LogisticRegressionCV(
                    cv=10,
                    penalty="l2",
                    fit_intercept=True,
                    solver="sag",
                    scoring="f1",
                    refit=True,
                    class_weight="balanced",
                    tol = 0.1,
                ),
                n_jobs=-1,
            )
    if(trainClassifier):
        print("this is the classifierName: ", classifierName)
        print("Training...")
        print("train_x (text) size:", (train_x.nbytes)/1000000,"megabytes")
        print("train_y (labels) size:", (train_y.nbytes)/1000000,"megabytes")
        print("train_x (text) length:", len(train_x))
        print("train_y (labels) length:", len(train_y))
        print(train_x[:5])
        print(train_y[:5])

        classifier.fit(train_x, train_y)
        print("finished training, classifier size:", sys.getsizeof(classifier)/1000000,"megabytes")
    print("Predicting...")
    if first == "multi" and second == "single":
        predict_y = classifier.predict_proba(test_x)
        helper = np.zeros_like(predict_y)
        helper[range(len(predict_y)), predict_y.argmax(1)] = 1
        predict_y = helper
    else:
        predict_y = classifier.predict(test_x)

    print("Analysing...")

    analyse_results(
        test_y,
        predict_y,
        labels,
        testing_data,
        first,
        second,
        output,
        mode,  # TODO
    )
    if(path.exists(classiferSaveFile)):
        print("classifier already saved")
    else:
#         classiferSaveFile = first+"_"+second+classifierName+".pkl"
        print("classiferSaveFile: ", classiferSaveFile)
        joblib.dump(classifier, classiferSaveFile)
        print("Saved Successfully")
    total, used, free = getHardDriveSpaceLeft()
    if(free < 10):
        sys.exit("Error: less than 10 gb remaining on disk")
    print("-----------------------------------------------------------------------------------------")

In [None]:
def runTrials(version, crossCorpus=True, sameCorpus=True, allVs=False):
    possibleChoices = [('affectivetext','headlines'), ('crowdflower','tweets'), ('dailydialog','conversations'), 
                       ('emobank','headlines'), ('emoint','tweets'), 
                       ('emotion-cause','paragraphs'), ('grounded_emotions','tweets'), ('isear','descriptions'),
                       ('ssec','tweets'),('tales-emotion','tales'), ('tec','tweets')]
    corporaSets = []
    if version == "previous":
        corporaSets.append(getCrossCorpusValuesWithOrder(possibleChoices))
        sortedPermutations = sorted(corporaSets, key = lambda x: (x[2], x[0], x[1]), reverse = True)
        sortedPermutations.append(getCorporaPairsWithItself(possibleChoices))
        sortedPermutations.append(getAllVsCorpusValues(possibleChoices))
        return sortedPermutations
        #this was added to sort the lists by domain of the first, then by the first corpus name, then the second.
        #it is placed in reverse order simply because if it was put in regular order, the largest of the trials would be first
        #sorting in reverse will (loosely) make the smaller trials run first, while having no impact on the ability to obtain all results
        sortedPermutations = sorted(corporaSets, key = lambda x: (x[2], x[0], x[1]), reverse = True)
        for corpusPair in sortedPermutations:
            performTrialUsingCorpusPair(corpusPair)
    else: #version == "my trials"
        
    
    print("End of program!")

In [43]:
if __name__ == "__main__":
    
    possibleChoices = [('affectivetext','headlines'), ('crowdflower','tweets'), ('dailydialog','conversations'), 
                       ('emobank','headlines'), ('emoint','tweets'), 
                       ('emotion-cause','paragraphs'), ('grounded_emotions','tweets'), ('isear','descriptions'),
                       ('ssec','tweets'),('tales-emotion','tales'), ('tec','tweets')] 
                        #('electoraltweets','tweets') <- incompatible due to labelling
    #     print(possibleChoices)
    permutations = list(getPermutations(possibleChoices))
    powerSet = list(getPowerset(possibleChoices))
#     print("powerset: ", powerSet)
    print("permutations length: ",len(permutations))
#     print(permutations)
    corporaSets = []
    for choice in permutations:
#         print("choice ", choice)
        if(len(choice) == 2):
#             print("pair")
            first, second = choice
            firstCorpus, domain1 = first
            secondCorpus, domain2 = second
        corpusPairData = (firstCorpus, secondCorpus, domain1, domain2)
        corporaSets.append(corpusPairData)
#     print(corporaSets)
    #this was added to sort the lists by domain of the first, then by the first corpus name, then the second.
    #it is placed in reverse order simply because if it was put in regular order, the largest of the trials would be first
    #sorting in reverse will (loosely) make the smaller trials run first, while having no impact on the ability to obtain all results
    sortedPermutations = sorted(corporaSets, key = lambda x: (x[2], x[0], x[1]), reverse = True)
    
    #this for loop adds the trials where the corpus is trained and tested on itself
    for entry in possibleChoices:
        firstCorpus, domain1 = entry
        secondCorpus, domain2 = entry
        corpusPairData = (firstCorpus, secondCorpus, domain1, domain2)
        sortedPermutations.append(corpusPairData)
        
    #This loop adds the combinations relating to the ALl-VS trials
    for entry in possibleChoices:
        firstCorpus, domain1 = (None, None)
        secondCorpus, domain2 = entry
        corpusPairData = (firstCorpus, secondCorpus, domain1, domain2)
        sortedPermutations.append(corpusPairData)
    for entry in sortedPermutations:
        print(entry)
    powerSetCondensedGood = []
    powerSetCondensedBad = []
    powerSetCondensed = []
    for entry in powerSet:
#         if len(entry) < 3 and len(entry) > 0:
            if len(entry) == 2:
                domainMatch = entry[0][1]
                appendGood = True
                for corpus, domain in entry:
                    if domain != domainMatch:
                        appendGood = False
                if(appendGood):
                    print("good entry",entry)
                    powerSetCondensedGood.append(entry)
                else:
                    print("bad entry",entry)
                    powerSetCondensedBad.append(entry)
            if len(entry) < 3 and len(entry) > 0:
                powerSetCondensed.append(entry)
    print("powerSetCondensed length: ",len(powerSetCondensed))
    print("powerSetCondensedGood length: ",len(powerSetCondensedGood))
    print("powerSetCondensedGood: ", powerSetCondensedGood)
    print("powerSetCondensedBad length: ",len(powerSetCondensedBad))
    print("powerSetCondensedBad: ", powerSetCondensedBad)
    print(powerSetCondensedGood[0])
    
#     example1 = ('ssec', 'tec', 'tweets', 'tweets')
#     example2 = (None, 'affectivetext', None, 'headlines')
    for entry in sortedPermutations:
        print(entry)
        (first, second, domain1, domain2) = entry
        print("Getting data")
        jsonfile = "unified-dataset.jsonl"
#         first = example2[0] #use first = None if you want to do ALl vs
#         second = example2[1]
    #     first = "isear" #use first = None if you want to do ALl vs
    #     second = "crowdflower"

        output = "."
        debug = True
        forceMulti = False
        isAllVS = False
        if first == None:
            isAllVS = True

        training_data, testing_data = get_train_test(jsonfile, first,second)
        firstCLF, secondCLF = (["multi", "multi"] if forceMulti else get_clf_mode(training_data, testing_data))
        mode = "multi" if "multi" in [firstCLF, secondCLF] else "single"

        print("Detected mode: {}...".format(mode))
        print(len(training_data), len(testing_data))
        print("Getting wordlist...")
        if debug:
            wordlist = getTop5000Words(training_data)
        else:
            wordlist = getTop5000Words(training_data)
        print("Getting emotions")
        labels = get_labels(training_data, testing_data, mode=mode)
        print(labels)
        print("Making arrays")
        print("checking for save files")
        if(first == None):
            first = "all-vs"
        train_xNPFileName = first + "_" + second + "train_xNP" +".npy"
        train_yNPFileName = first + "_" + second + "train_yNP" +".npy"
        test_xNPFileName = first + "_" + second + "test_xNP" +".npy"
        test_yNPFileName = first + "_" + second + "test_yNP" +".npy"

        if(path.exists(train_xNPFileName) 
           and path.exists(train_yNPFileName)
           and path.exists(test_xNPFileName)
           and path.exists(test_yNPFileName)):
            print('saved train_xNP as', train_xNPFileName)
            print('saved train_yNP as', train_yNPFileName)
            print('saved test_xNP as', test_xNPFileName)
            print('saved test_yNP as', test_yNPFileName)
            print("loading from np")
            train_x = np.load(train_xNPFileName)
            train_y = np.load(train_yNPFileName)
            test_x = np.load(test_xNPFileName)
            test_y = np.load(test_yNPFileName)
            train_xNPSize = (train_x.nbytes)/1000000
            train_yNPSize = (train_y.nbytes)/1000000
            test_xNPSize = (test_x.nbytes)/1000000
            test_yNPSize = (test_y.nbytes)/1000000
            print("loaded directly from NP.load")
            print("train_xNPSize (text) size loaded:", train_xNPSize,"megabytes")
            print("train_yNPSize (labels) size loaded:", train_yNPSize,"megabytes")
            print("test_xNPSize (text) size loaded:", test_xNPSize,"megabytes")
            print("test_yNPSize (labels) size loaded:", test_yNPSize,"megabytes")
        else:
            train_x, train_y, test_x, test_y, sizes = make_arrays(training_data, testing_data, wordlist, labels, mode, isAllVS)
            train_xSize, train_ySize, test_xSize, test_ySize = sizes
            if any(not part.size for part in [train_x, train_y, test_x, test_y]):
                print("Train or test empty. Did you misspell the dataset name?")
                continue
            #             sys.exit(1)
            print("saving NP arrays")
            np.save(train_xNPFileName, train_x)
            np.save(train_yNPFileName, train_y)
            np.save(test_xNPFileName, test_x)
            np.save(test_yNPFileName, test_y)
            print("NP arrays saved")
        
        print("Initializing classifier")
        trainClassifier = True
        if debug:
            classifierName = "RandomForestClassifier"
            print("Searching for a ", classifierName)
            classiferSaveFile = first+"_"+second+classifierName+".pkl"
            print(path.exists(classiferSaveFile))
            if(path.exists(classiferSaveFile)):
                trainClassifier = False
                print("Loading classifier from file")
                classifier = joblib.load(classiferSaveFile)
                print("classifier loaded successfully")
            else:
                print("file not found, creating new classifier")
                classifier = RandomForestClassifier()
        elif mode == "single":
            classifierName = "LogisticRegressionCV"
            print("Searching for a ", classifierName)
            classiferSaveFile = first+"_"+second+classifierName+".pkl"
            print(path.exists(classiferSaveFile))
            if(path.exists(classiferSaveFile)):
                trainClassifier = False
                print("Loading classifier from file")
                classifier = joblib.load(classiferSaveFile)
                print("classifier loaded successfully")
            else:
                print("file not found, creating new classifier")
                classifier = LogisticRegressionCV(
                    cv=10,
                    penalty="l2",
                    fit_intercept=True,
                    solver="sag",
                    scoring="f1",
                    refit=True,
                    # n_jobs=-1,
                    class_weight="balanced",
                )
        else:
            classifierName = "OneVsRestClassifier"
            print("Searching for a ", classifierName)
            classiferSaveFile = first+"_"+second+classifierName+".pkl"
            print(path.exists(classiferSaveFile))
            if(path.exists(classiferSaveFile)):
                trainClassifier = False
                print("Loading classifier from file")
                classifier = joblib.load(classiferSaveFile)
                print("classifier loaded successfully")
            else:
                print("file not found, creating new classifier")
                classifier = OneVsRestClassifier(
                    LogisticRegressionCV(
                        cv=10,
                        penalty="l2",
                        fit_intercept=True,
                        solver="sag",
                        scoring="f1",
                        refit=True,
                        class_weight="balanced",
                        tol = 0.1,
                    ),
                    n_jobs=-1,
                )
        if(trainClassifier):
            print("this is the classifierName: ", classifierName)
            print("Training...")
            print("train_x (text) size:", (train_x.nbytes)/1000000,"megabytes")
            print("train_y (labels) size:", (train_y.nbytes)/1000000,"megabytes")
            print("train_x (text) length:", len(train_x))
            print("train_y (labels) length:", len(train_y))
            print(train_x[:5])
            print(train_y[:5])

            classifier.fit(train_x, train_y)
            print("finished training, classifier size:", sys.getsizeof(classifier)/1000000,"megabytes")
        print("Predicting...")
        if first == "multi" and second == "single":
            predict_y = classifier.predict_proba(test_x)
            helper = np.zeros_like(predict_y)
            helper[range(len(predict_y)), predict_y.argmax(1)] = 1
            predict_y = helper
        else:
            predict_y = classifier.predict(test_x)

        print("Analysing...")

        analyse_results(
            test_y,
            predict_y,
            labels,
            testing_data,
            first,
            second,
            output,
            mode,  # TODO
        )
        if(path.exists(classiferSaveFile)):
            print("classifier already saved")
        else:
    #         classiferSaveFile = first+"_"+second+classifierName+".pkl"
            print("classiferSaveFile: ", classiferSaveFile)
            joblib.dump(classifier, classiferSaveFile)
            print("Saved Successfully")
        total, used, free = getHardDriveSpaceLeft()
        if(free < 10):
            sys.exit("Error: less than 10 gb remaining on disk")
        print("-----------------------------------------------------------------------------------------")
print("End of program!")

permutations length:  110
('tec', 'tales-emotion', 'tweets', 'tales')
('tec', 'ssec', 'tweets', 'tweets')
('tec', 'isear', 'tweets', 'descriptions')
('tec', 'grounded_emotions', 'tweets', 'tweets')
('tec', 'emotion-cause', 'tweets', 'paragraphs')
('tec', 'emoint', 'tweets', 'tweets')
('tec', 'emobank', 'tweets', 'headlines')
('tec', 'dailydialog', 'tweets', 'conversations')
('tec', 'crowdflower', 'tweets', 'tweets')
('tec', 'affectivetext', 'tweets', 'headlines')
('ssec', 'tec', 'tweets', 'tweets')
('ssec', 'tales-emotion', 'tweets', 'tales')
('ssec', 'isear', 'tweets', 'descriptions')
('ssec', 'grounded_emotions', 'tweets', 'tweets')
('ssec', 'emotion-cause', 'tweets', 'paragraphs')
('ssec', 'emoint', 'tweets', 'tweets')
('ssec', 'emobank', 'tweets', 'headlines')
('ssec', 'dailydialog', 'tweets', 'conversations')
('ssec', 'crowdflower', 'tweets', 'tweets')
('ssec', 'affectivetext', 'tweets', 'headlines')
('grounded_emotions', 'tec', 'tweets', 'tweets')
('grounded_emotions', 'tales-emo

100%|██████████████████████████████████████████████████████████████████████████| 21051/21051 [00:08<00:00, 2627.20it/s]
100%|██████████████████████████████████████████████████████████████████████████| 10062/10062 [00:04<00:00, 2169.32it/s]


train_x length  21051
train_x dimension of element  5000
train_x (text) size RAW: 0.178016 megabytes
train_y (labels) size RAW: 0.178016 megabytes
test_x (text) size RAW: 0.087616 megabytes
test_y (labels) size RAW: 0.087616 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 421.02 megabytes
train_yNPSize (labels) size: 0.0 megabytes
test_xNPSize (text) size: 201.24 megabytes
test_yNPSize (labels) size: 0.0 megabytes
train_xNP length  21051
train_xNP dimension of element  2
train_xNP size  105255000
Train or test empty. Did you misspell the dataset name?
('tec', 'dailydialog', 'tweets', 'conversations')
Getting data
get_train_test param:
json  unified-dataset.jsonl
train  tec
test  dailydialog
there were  102979  entries that were in test and  118460 that were not in test and  21051  that were in train
test was appended  102979  times
single
oof
Detected 

100%|██████████████████████████████████████████████████████████████████████████| 21051/21051 [00:07<00:00, 2726.93it/s]
100%|████████████████████████████████████████████████████████████████████████| 102979/102979 [00:45<00:00, 2248.01it/s]


train_x length  21051
train_x dimension of element  5000
train_x (text) size RAW: 0.178016 megabytes
train_y (labels) size RAW: 0.178016 megabytes
test_x (text) size RAW: 0.824456 megabytes
test_y (labels) size RAW: 0.824456 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 421.02 megabytes
train_yNPSize (labels) size: 0.084204 megabytes
test_xNPSize (text) size: 2059.58 megabytes
test_yNPSize (labels) size: 0.411916 megabytes
train_xNP length  21051
train_xNP dimension of element  2
train_xNP size  105255000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 421.02 megabytes
train_y (labels) size: 0.084204 megabytes
train_x (text) length: 21051
train_y (labels) length: 21051
[[0 

100%|██████████████████████████████████████████████████████████████████████████| 21051/21051 [00:07<00:00, 2717.54it/s]
100%|██████████████████████████████████████████████████████████████████████████| 39740/39740 [00:16<00:00, 2399.72it/s]


train_x length  21051
train_x dimension of element  5000
train_x (text) size RAW: 0.178016 megabytes
train_y (labels) size RAW: 0.178016 megabytes
test_x (text) size RAW: 0.321096 megabytes
test_y (labels) size RAW: 0.321096 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 421.02 megabytes
train_yNPSize (labels) size: 0.084204 megabytes
test_xNPSize (text) size: 794.8 megabytes
test_yNPSize (labels) size: 0.15896 megabytes
train_xNP length  21051
train_xNP dimension of element  2
train_xNP size  105255000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 421.02 megabytes
train_y (labels) size: 0.084204 megabytes
train_x (text) length: 21051
train_y (labels) length: 21051
[[0 0 0

100%|██████████████████████████████████████████████████████████████████████████| 21051/21051 [00:07<00:00, 2654.00it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1250/1250 [00:00<00:00, 2859.63it/s]


train_x length  21051
train_x dimension of element  5000
train_x (text) size RAW: 0.178016 megabytes
train_y (labels) size RAW: 0.178016 megabytes
test_x (text) size RAW: 0.010192 megabytes
test_y (labels) size RAW: 0.010192 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 421.02 megabytes
train_yNPSize (labels) size: 0.505224 megabytes
test_xNPSize (text) size: 25.0 megabytes
test_yNPSize (labels) size: 0.03 megabytes
train_xNP length  21051
train_xNP dimension of element  2
train_xNP size  105255000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 421.02 megabytes
train_y (labels) size: 0.505224 megabytes
train_x (text) length: 21051
train_y (labels) length: 21051
[[0 0 0 ...

100%|████████████████████████████████████████████████████████████████████████████| 4868/4868 [00:02<00:00, 1836.17it/s]
100%|██████████████████████████████████████████████████████████████████████████| 14771/14771 [00:05<00:00, 2551.23it/s]


train_x length  4868
train_x dimension of element  5000
train_x (text) size RAW: 0.043032 megabytes
train_y (labels) size RAW: 0.043032 megabytes
test_x (text) size RAW: 0.124912 megabytes
test_y (labels) size RAW: 0.124912 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 97.36 megabytes
train_yNPSize (labels) size: 0.116832 megabytes
test_xNPSize (text) size: 295.42 megabytes
test_yNPSize (labels) size: 0.354504 megabytes
train_xNP length  4868
train_xNP dimension of element  2
train_xNP size  24340000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 97.36 megabytes
train_y (labels) size: 0.116832 megabytes
train_x (text) length: 4868
train_y (labels) length: 4868
[[1 0 0 ... 

100%|████████████████████████████████████████████████████████████████████████████| 4868/4868 [00:02<00:00, 2171.56it/s]
100%|████████████████████████████████████████████████████████████████████████████| 7666/7666 [00:03<00:00, 2101.59it/s]


train_x length  4868
train_x dimension of element  5000
train_x (text) size RAW: 0.043032 megabytes
train_y (labels) size RAW: 0.043032 megabytes
test_x (text) size RAW: 0.061424 megabytes
test_y (labels) size RAW: 0.061424 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 97.36 megabytes
train_yNPSize (labels) size: 0.09736 megabytes
test_xNPSize (text) size: 153.32 megabytes
test_yNPSize (labels) size: 0.15332 megabytes
train_xNP length  4868
train_xNP dimension of element  2
train_xNP size  24340000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 97.36 megabytes
train_y (labels) size: 0.09736 megabytes
train_x (text) length: 4868
train_y (labels) length: 4868
[[1 0 0 ... 0 0

100%|████████████████████████████████████████████████████████████████████████████| 4868/4868 [00:02<00:00, 2199.62it/s]
100%|████████████████████████████████████████████████████████████████████████████| 2585/2585 [00:01<00:00, 2123.12it/s]


train_x length  4868
train_x dimension of element  5000
train_x (text) size RAW: 0.043032 megabytes
train_y (labels) size RAW: 0.043032 megabytes
test_x (text) size RAW: 0.02104 megabytes
test_y (labels) size RAW: 0.02104 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 97.36 megabytes
train_yNPSize (labels) size: 0.038944 megabytes
test_xNPSize (text) size: 51.7 megabytes
test_yNPSize (labels) size: 0.02068 megabytes
train_xNP length  4868
train_xNP dimension of element  2
train_xNP size  24340000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 97.36 megabytes
train_y (labels) size: 0.038944 megabytes
train_x (text) length: 4868
train_y (labels) length: 4868
[[1 0 0 ... 0 0 0

100%|████████████████████████████████████████████████████████████████████████████| 4868/4868 [00:02<00:00, 2366.29it/s]
100%|████████████████████████████████████████████████████████████████████████████| 2414/2414 [00:00<00:00, 2659.38it/s]


train_x length  4868
train_x dimension of element  5000
train_x (text) size RAW: 0.043032 megabytes
train_y (labels) size RAW: 0.043032 megabytes
test_x (text) size RAW: 0.02104 megabytes
test_y (labels) size RAW: 0.02104 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 97.36 megabytes
train_yNPSize (labels) size: 0.116832 megabytes
test_xNPSize (text) size: 48.28 megabytes
test_yNPSize (labels) size: 0.057936 megabytes
train_xNP length  4868
train_xNP dimension of element  2
train_xNP size  24340000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 97.36 megabytes
train_y (labels) size: 0.116832 megabytes
train_x (text) length: 4868
train_y (labels) length: 4868
[[1 0 0 ... 0 0

100%|████████████████████████████████████████████████████████████████████████████| 4868/4868 [00:01<00:00, 2674.10it/s]
100%|████████████████████████████████████████████████████████████████████████████| 7102/7102 [00:02<00:00, 2594.80it/s]


train_x length  4868
train_x dimension of element  5000
train_x (text) size RAW: 0.043032 megabytes
train_y (labels) size RAW: 0.043032 megabytes
test_x (text) size RAW: 0.061424 megabytes
test_y (labels) size RAW: 0.061424 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 97.36 megabytes
train_yNPSize (labels) size: 0.077888 megabytes
test_xNPSize (text) size: 142.04 megabytes
test_yNPSize (labels) size: 0.113632 megabytes
train_xNP length  4868
train_xNP dimension of element  2
train_xNP size  24340000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 97.36 megabytes
train_y (labels) size: 0.077888 megabytes
train_x (text) length: 4868
train_y (labels) length: 4868
[[1 0 0 ... 

100%|████████████████████████████████████████████████████████████████████████████| 4868/4868 [00:01<00:00, 2463.20it/s]
100%|██████████████████████████████████████████████████████████████████████████| 10062/10062 [00:03<00:00, 2594.24it/s]


train_x length  4868
train_x dimension of element  5000
train_x (text) size RAW: 0.043032 megabytes
train_y (labels) size RAW: 0.043032 megabytes
test_x (text) size RAW: 0.087616 megabytes
test_y (labels) size RAW: 0.087616 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 97.36 megabytes
train_yNPSize (labels) size: 0.0 megabytes
test_xNPSize (text) size: 201.24 megabytes
test_yNPSize (labels) size: 0.0 megabytes
train_xNP length  4868
train_xNP dimension of element  2
train_xNP size  24340000
Train or test empty. Did you misspell the dataset name?
('ssec', 'dailydialog', 'tweets', 'conversations')
Getting data
get_train_test param:
json  unified-dataset.jsonl
train  ssec
test  dailydialog
there were  102979  entries that were in test and  118460 that were not in test and  4868  that were in train
test was appended  102979  times
multi
oof
Detected mode

100%|████████████████████████████████████████████████████████████████████████████| 4868/4868 [00:01<00:00, 2549.72it/s]
100%|████████████████████████████████████████████████████████████████████████| 102979/102979 [00:44<00:00, 2290.94it/s]


train_x length  4868
train_x dimension of element  5000
train_x (text) size RAW: 0.043032 megabytes
train_y (labels) size RAW: 0.043032 megabytes
test_x (text) size RAW: 0.824456 megabytes
test_y (labels) size RAW: 0.824456 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 97.36 megabytes
train_yNPSize (labels) size: 0.116832 megabytes
test_xNPSize (text) size: 2059.58 megabytes
test_yNPSize (labels) size: 2.471496 megabytes
train_xNP length  4868
train_xNP dimension of element  2
train_xNP size  24340000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 97.36 megabytes
train_y (labels) size: 0.116832 megabytes
train_x (text) length: 4868
train_y (labels) length: 4868
[[1 0 0 ...

100%|████████████████████████████████████████████████████████████████████████████| 4868/4868 [00:01<00:00, 2701.92it/s]
100%|██████████████████████████████████████████████████████████████████████████| 39740/39740 [00:16<00:00, 2465.08it/s]


train_x length  4868
train_x dimension of element  5000
train_x (text) size RAW: 0.043032 megabytes
train_y (labels) size RAW: 0.043032 megabytes
test_x (text) size RAW: 0.321096 megabytes
test_y (labels) size RAW: 0.321096 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 97.36 megabytes
train_yNPSize (labels) size: 0.116832 megabytes
test_xNPSize (text) size: 794.8 megabytes
test_yNPSize (labels) size: 0.95376 megabytes
train_xNP length  4868
train_xNP dimension of element  2
train_xNP size  24340000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 97.36 megabytes
train_y (labels) size: 0.116832 megabytes
train_x (text) length: 4868
train_y (labels) length: 4868
[[1 0 0 ... 0 

100%|████████████████████████████████████████████████████████████████████████████| 4868/4868 [00:01<00:00, 2719.84it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1250/1250 [00:00<00:00, 2354.82it/s]


train_x length  4868
train_x dimension of element  5000
train_x (text) size RAW: 0.043032 megabytes
train_y (labels) size RAW: 0.043032 megabytes
test_x (text) size RAW: 0.010192 megabytes
test_y (labels) size RAW: 0.010192 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 97.36 megabytes
train_yNPSize (labels) size: 0.116832 megabytes
test_xNPSize (text) size: 25.0 megabytes
test_yNPSize (labels) size: 0.03 megabytes
train_xNP length  4868
train_xNP dimension of element  2
train_xNP size  24340000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 97.36 megabytes
train_y (labels) size: 0.116832 megabytes
train_x (text) length: 4868
train_y (labels) length: 4868
[[1 0 0 ... 0 0 0]

100%|████████████████████████████████████████████████████████████████████████████| 2585/2585 [00:00<00:00, 2869.51it/s]
100%|██████████████████████████████████████████████████████████████████████████| 21051/21051 [00:08<00:00, 2557.60it/s]


train_x length  2585
train_x dimension of element  5000
train_x (text) size RAW: 0.02104 megabytes
train_y (labels) size RAW: 0.02104 megabytes
test_x (text) size RAW: 0.178016 megabytes
test_y (labels) size RAW: 0.178016 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 51.7 megabytes
train_yNPSize (labels) size: 0.01034 megabytes
test_xNPSize (text) size: 421.02 megabytes
test_yNPSize (labels) size: 0.084204 megabytes
train_xNP length  2585
train_xNP dimension of element  2
train_xNP size  12925000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 51.7 megabytes
train_y (labels) size: 0.01034 megabytes
train_x (text) length: 2585
train_y (labels) length: 2585
[[0 0 0 ... 0 0 0]

100%|████████████████████████████████████████████████████████████████████████████| 2585/2585 [00:00<00:00, 2734.80it/s]
100%|██████████████████████████████████████████████████████████████████████████| 14771/14771 [00:05<00:00, 2580.25it/s]


train_x length  2585
train_x dimension of element  5000
train_x (text) size RAW: 0.02104 megabytes
train_y (labels) size RAW: 0.02104 megabytes
test_x (text) size RAW: 0.124912 megabytes
test_y (labels) size RAW: 0.124912 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 51.7 megabytes
train_yNPSize (labels) size: 0.01034 megabytes
test_xNPSize (text) size: 295.42 megabytes
test_yNPSize (labels) size: 0.059084 megabytes
train_xNP length  2585
train_xNP dimension of element  2
train_xNP size  12925000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 51.7 megabytes
train_y (labels) size: 0.01034 megabytes
train_x (text) length: 2585
train_y (labels) length: 2585
[[0 0 0 ... 0 0 0]

100%|████████████████████████████████████████████████████████████████████████████| 2585/2585 [00:00<00:00, 2793.03it/s]
100%|████████████████████████████████████████████████████████████████████████████| 4868/4868 [00:01<00:00, 2756.26it/s]


train_x length  2585
train_x dimension of element  5000
train_x (text) size RAW: 0.02104 megabytes
train_y (labels) size RAW: 0.02104 megabytes
test_x (text) size RAW: 0.043032 megabytes
test_y (labels) size RAW: 0.043032 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 51.7 megabytes
train_yNPSize (labels) size: 0.02068 megabytes
test_xNPSize (text) size: 97.36 megabytes
test_yNPSize (labels) size: 0.038944 megabytes
train_xNP length  2585
train_xNP dimension of element  2
train_xNP size  12925000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 51.7 megabytes
train_y (labels) size: 0.02068 megabytes
train_x (text) length: 2585
train_y (labels) length: 2585
[[0 0 0 ... 0 0 0]


100%|████████████████████████████████████████████████████████████████████████████| 2585/2585 [00:00<00:00, 2726.26it/s]
100%|████████████████████████████████████████████████████████████████████████████| 7666/7666 [00:02<00:00, 2755.15it/s]


train_x length  2585
train_x dimension of element  5000
train_x (text) size RAW: 0.02104 megabytes
train_y (labels) size RAW: 0.02104 megabytes
test_x (text) size RAW: 0.061424 megabytes
test_y (labels) size RAW: 0.061424 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 51.7 megabytes
train_yNPSize (labels) size: 0.01034 megabytes
test_xNPSize (text) size: 153.32 megabytes
test_yNPSize (labels) size: 0.030664 megabytes
train_xNP length  2585
train_xNP dimension of element  2
train_xNP size  12925000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 51.7 megabytes
train_y (labels) size: 0.01034 megabytes
train_x (text) length: 2585
train_y (labels) length: 2585
[[0 0 0 ... 0 0 0]

100%|████████████████████████████████████████████████████████████████████████████| 2585/2585 [00:01<00:00, 2179.64it/s]
100%|████████████████████████████████████████████████████████████████████████████| 2414/2414 [00:01<00:00, 2352.52it/s]


train_x length  2585
train_x dimension of element  5000
train_x (text) size RAW: 0.02104 megabytes
train_y (labels) size RAW: 0.02104 megabytes
test_x (text) size RAW: 0.02104 megabytes
test_y (labels) size RAW: 0.02104 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 51.7 megabytes
train_yNPSize (labels) size: 0.01034 megabytes
test_xNPSize (text) size: 48.28 megabytes
test_yNPSize (labels) size: 0.009656 megabytes
train_xNP length  2585
train_xNP dimension of element  2
train_xNP size  12925000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 51.7 megabytes
train_y (labels) size: 0.01034 megabytes
train_x (text) length: 2585
train_y (labels) length: 2585
[[0 0 0 ... 0 0 0]
 [

100%|████████████████████████████████████████████████████████████████████████████| 2585/2585 [00:00<00:00, 2749.05it/s]
100%|████████████████████████████████████████████████████████████████████████████| 7102/7102 [00:02<00:00, 2814.98it/s]


train_x length  2585
train_x dimension of element  5000
train_x (text) size RAW: 0.02104 megabytes
train_y (labels) size RAW: 0.02104 megabytes
test_x (text) size RAW: 0.061424 megabytes
test_y (labels) size RAW: 0.061424 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 51.7 megabytes
train_yNPSize (labels) size: 0.01034 megabytes
test_xNPSize (text) size: 142.04 megabytes
test_yNPSize (labels) size: 0.028408 megabytes
train_xNP length  2585
train_xNP dimension of element  2
train_xNP size  12925000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
True
Loading classifier from file
classifier loaded successfully
Predicting...
Analysing...
analyse_results
hello
Precision	0.51957195156294
Recall	0.51957195156294
F1-score	0.51957195156294
Accuracy	0.51957195156294
classifier already saved
Total: 475 GiB
Used: 

100%|████████████████████████████████████████████████████████████████████████████| 2585/2585 [00:01<00:00, 2509.45it/s]
100%|██████████████████████████████████████████████████████████████████████████| 10062/10062 [00:03<00:00, 2612.86it/s]


train_x length  2585
train_x dimension of element  5000
train_x (text) size RAW: 0.02104 megabytes
train_y (labels) size RAW: 0.02104 megabytes
test_x (text) size RAW: 0.087616 megabytes
test_y (labels) size RAW: 0.087616 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 51.7 megabytes
train_yNPSize (labels) size: 0.0 megabytes
test_xNPSize (text) size: 201.24 megabytes
test_yNPSize (labels) size: 0.0 megabytes
train_xNP length  2585
train_xNP dimension of element  2
train_xNP size  12925000
Train or test empty. Did you misspell the dataset name?
('grounded_emotions', 'dailydialog', 'tweets', 'conversations')
Getting data
get_train_test param:
json  unified-dataset.jsonl
train  grounded_emotions
test  dailydialog
there were  102979  entries that were in test and  118460 that were not in test and  2585  that were in train
test was appended  102979  times


100%|████████████████████████████████████████████████████████████████████████████| 2585/2585 [00:01<00:00, 2526.46it/s]
100%|████████████████████████████████████████████████████████████████████████| 102979/102979 [00:43<00:00, 2380.23it/s]


train_x length  2585
train_x dimension of element  5000
train_x (text) size RAW: 0.02104 megabytes
train_y (labels) size RAW: 0.02104 megabytes
test_x (text) size RAW: 0.824456 megabytes
test_y (labels) size RAW: 0.824456 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 51.7 megabytes
train_yNPSize (labels) size: 0.01034 megabytes
test_xNPSize (text) size: 2059.58 megabytes
test_yNPSize (labels) size: 0.411916 megabytes
train_xNP length  2585
train_xNP dimension of element  2
train_xNP size  12925000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 51.7 megabytes
train_y (labels) size: 0.01034 megabytes
train_x (text) length: 2585
train_y (labels) length: 2585
[[0 0 0 ... 0 0 0

100%|████████████████████████████████████████████████████████████████████████████| 2585/2585 [00:01<00:00, 2341.32it/s]
100%|██████████████████████████████████████████████████████████████████████████| 39740/39740 [00:15<00:00, 2501.68it/s]


train_x length  2585
train_x dimension of element  5000
train_x (text) size RAW: 0.02104 megabytes
train_y (labels) size RAW: 0.02104 megabytes
test_x (text) size RAW: 0.321096 megabytes
test_y (labels) size RAW: 0.321096 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 51.7 megabytes
train_yNPSize (labels) size: 0.01034 megabytes
test_xNPSize (text) size: 794.8 megabytes
test_yNPSize (labels) size: 0.15896 megabytes
train_xNP length  2585
train_xNP dimension of element  2
train_xNP size  12925000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 51.7 megabytes
train_y (labels) size: 0.01034 megabytes
train_x (text) length: 2585
train_y (labels) length: 2585
[[0 0 0 ... 0 0 0]
 

100%|████████████████████████████████████████████████████████████████████████████| 2585/2585 [00:01<00:00, 2185.10it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1250/1250 [00:00<00:00, 2372.45it/s]


train_x length  2585
train_x dimension of element  5000
train_x (text) size RAW: 0.02104 megabytes
train_y (labels) size RAW: 0.02104 megabytes
test_x (text) size RAW: 0.010192 megabytes
test_y (labels) size RAW: 0.010192 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 51.7 megabytes
train_yNPSize (labels) size: 0.02068 megabytes
test_xNPSize (text) size: 25.0 megabytes
test_yNPSize (labels) size: 0.01 megabytes
train_xNP length  2585
train_xNP dimension of element  2
train_xNP size  12925000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
True
Loading classifier from file
classifier loaded successfully
Predicting...
Analysing...
analyse_results
hello
Precision	0.524
Recall	0.5161544523246651
F1-score	0.5200476379515682
Accuracy	0.3896
classifier already saved
Total: 475 GiB
Used: 309 GiB
Free: 165 GiB
-

100%|████████████████████████████████████████████████████████████████████████████| 7102/7102 [00:03<00:00, 1896.70it/s]
100%|██████████████████████████████████████████████████████████████████████████| 21051/21051 [00:09<00:00, 2168.89it/s]


train_x length  7102
train_x dimension of element  5000
train_x (text) size RAW: 0.061424 megabytes
train_y (labels) size RAW: 0.061424 megabytes
test_x (text) size RAW: 0.178016 megabytes
test_y (labels) size RAW: 0.178016 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 142.04 megabytes
train_yNPSize (labels) size: 0.028408 megabytes
test_xNPSize (text) size: 421.02 megabytes
test_yNPSize (labels) size: 0.084204 megabytes
train_xNP length  7102
train_xNP dimension of element  2
train_xNP size  35510000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 142.04 megabytes
train_y (labels) size: 0.028408 megabytes
train_x (text) length: 7102
train_y (labels) length: 7102
[[1 0 0 ..

100%|████████████████████████████████████████████████████████████████████████████| 7102/7102 [00:02<00:00, 2721.35it/s]
100%|██████████████████████████████████████████████████████████████████████████| 14771/14771 [00:05<00:00, 2609.47it/s]


train_x length  7102
train_x dimension of element  5000
train_x (text) size RAW: 0.061424 megabytes
train_y (labels) size RAW: 0.061424 megabytes
test_x (text) size RAW: 0.124912 megabytes
test_y (labels) size RAW: 0.124912 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 142.04 megabytes
train_yNPSize (labels) size: 0.028408 megabytes
test_xNPSize (text) size: 295.42 megabytes
test_yNPSize (labels) size: 0.059084 megabytes
train_xNP length  7102
train_xNP dimension of element  2
train_xNP size  35510000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 142.04 megabytes
train_y (labels) size: 0.028408 megabytes
train_x (text) length: 7102
train_y (labels) length: 7102
[[1 0 0 ..

100%|████████████████████████████████████████████████████████████████████████████| 7102/7102 [00:02<00:00, 2722.38it/s]
100%|████████████████████████████████████████████████████████████████████████████| 4868/4868 [00:01<00:00, 2613.24it/s]


train_x length  7102
train_x dimension of element  5000
train_x (text) size RAW: 0.061424 megabytes
train_y (labels) size RAW: 0.061424 megabytes
test_x (text) size RAW: 0.043032 megabytes
test_y (labels) size RAW: 0.043032 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 142.04 megabytes
train_yNPSize (labels) size: 0.113632 megabytes
test_xNPSize (text) size: 97.36 megabytes
test_yNPSize (labels) size: 0.077888 megabytes
train_xNP length  7102
train_xNP dimension of element  2
train_xNP size  35510000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 142.04 megabytes
train_y (labels) size: 0.113632 megabytes
train_x (text) length: 7102
train_y (labels) length: 7102
[[1 0 0 ...

100%|████████████████████████████████████████████████████████████████████████████| 7102/7102 [00:02<00:00, 2653.12it/s]
100%|████████████████████████████████████████████████████████████████████████████| 7666/7666 [00:02<00:00, 2649.03it/s]


train_x length  7102
train_x dimension of element  5000
train_x (text) size RAW: 0.061424 megabytes
train_y (labels) size RAW: 0.061424 megabytes
test_x (text) size RAW: 0.061424 megabytes
test_y (labels) size RAW: 0.061424 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 142.04 megabytes
train_yNPSize (labels) size: 0.028408 megabytes
test_xNPSize (text) size: 153.32 megabytes
test_yNPSize (labels) size: 0.030664 megabytes
train_xNP length  7102
train_xNP dimension of element  2
train_xNP size  35510000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 142.04 megabytes
train_y (labels) size: 0.028408 megabytes
train_x (text) length: 7102
train_y (labels) length: 7102
[[1 0 0 ..

100%|████████████████████████████████████████████████████████████████████████████| 7102/7102 [00:02<00:00, 2721.35it/s]
100%|████████████████████████████████████████████████████████████████████████████| 2585/2585 [00:00<00:00, 2746.19it/s]


train_x length  7102
train_x dimension of element  5000
train_x (text) size RAW: 0.061424 megabytes
train_y (labels) size RAW: 0.061424 megabytes
test_x (text) size RAW: 0.02104 megabytes
test_y (labels) size RAW: 0.02104 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 142.04 megabytes
train_yNPSize (labels) size: 0.028408 megabytes
test_xNPSize (text) size: 51.7 megabytes
test_yNPSize (labels) size: 0.01034 megabytes
train_xNP length  7102
train_xNP dimension of element  2
train_xNP size  35510000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 142.04 megabytes
train_y (labels) size: 0.028408 megabytes
train_x (text) length: 7102
train_y (labels) length: 7102
[[1 0 0 ... 0 0

100%|████████████████████████████████████████████████████████████████████████████| 7102/7102 [00:02<00:00, 2708.02it/s]
100%|████████████████████████████████████████████████████████████████████████████| 2414/2414 [00:00<00:00, 2506.79it/s]


train_x length  7102
train_x dimension of element  5000
train_x (text) size RAW: 0.061424 megabytes
train_y (labels) size RAW: 0.061424 megabytes
test_x (text) size RAW: 0.02104 megabytes
test_y (labels) size RAW: 0.02104 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 142.04 megabytes
train_yNPSize (labels) size: 0.028408 megabytes
test_xNPSize (text) size: 48.28 megabytes
test_yNPSize (labels) size: 0.009656 megabytes
train_xNP length  7102
train_xNP dimension of element  2
train_xNP size  35510000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 142.04 megabytes
train_y (labels) size: 0.028408 megabytes
train_x (text) length: 7102
train_y (labels) length: 7102
[[1 0 0 ... 0

100%|████████████████████████████████████████████████████████████████████████████| 7102/7102 [00:02<00:00, 2635.66it/s]
100%|██████████████████████████████████████████████████████████████████████████| 10062/10062 [00:03<00:00, 2560.38it/s]


train_x length  7102
train_x dimension of element  5000
train_x (text) size RAW: 0.061424 megabytes
train_y (labels) size RAW: 0.061424 megabytes
test_x (text) size RAW: 0.087616 megabytes
test_y (labels) size RAW: 0.087616 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 142.04 megabytes
train_yNPSize (labels) size: 0.0 megabytes
test_xNPSize (text) size: 201.24 megabytes
test_yNPSize (labels) size: 0.0 megabytes
train_xNP length  7102
train_xNP dimension of element  2
train_xNP size  35510000
Train or test empty. Did you misspell the dataset name?
('emoint', 'dailydialog', 'tweets', 'conversations')
Getting data
get_train_test param:
json  unified-dataset.jsonl
train  emoint
test  dailydialog
there were  102979  entries that were in test and  118460 that were not in test and  7102  that were in train
test was appended  102979  times
single
oof
Detecte

100%|████████████████████████████████████████████████████████████████████████████| 7102/7102 [00:02<00:00, 2766.32it/s]
100%|████████████████████████████████████████████████████████████████████████| 102979/102979 [00:41<00:00, 2484.57it/s]


train_x length  7102
train_x dimension of element  5000
train_x (text) size RAW: 0.061424 megabytes
train_y (labels) size RAW: 0.061424 megabytes
test_x (text) size RAW: 0.824456 megabytes
test_y (labels) size RAW: 0.824456 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 142.04 megabytes
train_yNPSize (labels) size: 0.028408 megabytes
test_xNPSize (text) size: 2059.58 megabytes
test_yNPSize (labels) size: 0.411916 megabytes
train_xNP length  7102
train_xNP dimension of element  2
train_xNP size  35510000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 142.04 megabytes
train_y (labels) size: 0.028408 megabytes
train_x (text) length: 7102
train_y (labels) length: 7102
[[1 0 0 .

100%|████████████████████████████████████████████████████████████████████████████| 7102/7102 [00:02<00:00, 2690.83it/s]
100%|██████████████████████████████████████████████████████████████████████████| 39740/39740 [00:15<00:00, 2555.97it/s]


train_x length  7102
train_x dimension of element  5000
train_x (text) size RAW: 0.061424 megabytes
train_y (labels) size RAW: 0.061424 megabytes
test_x (text) size RAW: 0.321096 megabytes
test_y (labels) size RAW: 0.321096 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 142.04 megabytes
train_yNPSize (labels) size: 0.028408 megabytes
test_xNPSize (text) size: 794.8 megabytes
test_yNPSize (labels) size: 0.15896 megabytes
train_xNP length  7102
train_xNP dimension of element  2
train_xNP size  35510000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 142.04 megabytes
train_y (labels) size: 0.028408 megabytes
train_x (text) length: 7102
train_y (labels) length: 7102
[[1 0 0 ... 

100%|████████████████████████████████████████████████████████████████████████████| 7102/7102 [00:02<00:00, 2718.27it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1250/1250 [00:00<00:00, 2853.36it/s]


train_x length  7102
train_x dimension of element  5000
train_x (text) size RAW: 0.061424 megabytes
train_y (labels) size RAW: 0.061424 megabytes
test_x (text) size RAW: 0.010192 megabytes
test_y (labels) size RAW: 0.010192 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 142.04 megabytes
train_yNPSize (labels) size: 0.113632 megabytes
test_xNPSize (text) size: 25.0 megabytes
test_yNPSize (labels) size: 0.02 megabytes
train_xNP length  7102
train_xNP dimension of element  2
train_xNP size  35510000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 142.04 megabytes
train_y (labels) size: 0.113632 megabytes
train_x (text) length: 7102
train_y (labels) length: 7102
[[1 0 0 ... 0 0 

100%|██████████████████████████████████████████████████████████████████████████| 39740/39740 [00:15<00:00, 2561.66it/s]
100%|██████████████████████████████████████████████████████████████████████████| 21051/21051 [00:08<00:00, 2434.45it/s]


train_x length  39740
train_x dimension of element  5000
train_x (text) size RAW: 0.321096 megabytes
train_y (labels) size RAW: 0.321096 megabytes
test_x (text) size RAW: 0.178016 megabytes
test_y (labels) size RAW: 0.178016 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 794.8 megabytes
train_yNPSize (labels) size: 0.15896 megabytes
test_xNPSize (text) size: 421.02 megabytes
test_yNPSize (labels) size: 0.084204 megabytes
train_xNP length  39740
train_xNP dimension of element  2
train_xNP size  198700000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 794.8 megabytes
train_y (labels) size: 0.15896 megabytes
train_x (text) length: 39740
train_y (labels) length: 39740
[[1 1 0 .

100%|██████████████████████████████████████████████████████████████████████████| 39740/39740 [00:15<00:00, 2590.00it/s]
100%|██████████████████████████████████████████████████████████████████████████| 14771/14771 [00:06<00:00, 2296.46it/s]


train_x length  39740
train_x dimension of element  5000
train_x (text) size RAW: 0.321096 megabytes
train_y (labels) size RAW: 0.321096 megabytes
test_x (text) size RAW: 0.124912 megabytes
test_y (labels) size RAW: 0.124912 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 794.8 megabytes
train_yNPSize (labels) size: 0.15896 megabytes
test_xNPSize (text) size: 295.42 megabytes
test_yNPSize (labels) size: 0.059084 megabytes
train_xNP length  39740
train_xNP dimension of element  2
train_xNP size  198700000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 794.8 megabytes
train_y (labels) size: 0.15896 megabytes
train_x (text) length: 39740
train_y (labels) length: 39740
[[1 1 0 .

100%|██████████████████████████████████████████████████████████████████████████| 39740/39740 [00:17<00:00, 2269.13it/s]
100%|████████████████████████████████████████████████████████████████████████████| 4868/4868 [00:01<00:00, 2495.59it/s]


train_x length  39740
train_x dimension of element  5000
train_x (text) size RAW: 0.321096 megabytes
train_y (labels) size RAW: 0.321096 megabytes
test_x (text) size RAW: 0.043032 megabytes
test_y (labels) size RAW: 0.043032 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 794.8 megabytes
train_yNPSize (labels) size: 0.95376 megabytes
test_xNPSize (text) size: 97.36 megabytes
test_yNPSize (labels) size: 0.116832 megabytes
train_xNP length  39740
train_xNP dimension of element  2
train_xNP size  198700000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 794.8 megabytes
train_y (labels) size: 0.95376 megabytes
train_x (text) length: 39740
train_y (labels) length: 39740
[[1 1 0 ..

100%|██████████████████████████████████████████████████████████████████████████| 39740/39740 [00:16<00:00, 2350.57it/s]
100%|████████████████████████████████████████████████████████████████████████████| 7666/7666 [00:03<00:00, 2208.53it/s]


train_x length  39740
train_x dimension of element  5000
train_x (text) size RAW: 0.321096 megabytes
train_y (labels) size RAW: 0.321096 megabytes
test_x (text) size RAW: 0.061424 megabytes
test_y (labels) size RAW: 0.061424 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 794.8 megabytes
train_yNPSize (labels) size: 0.15896 megabytes
test_xNPSize (text) size: 153.32 megabytes
test_yNPSize (labels) size: 0.030664 megabytes
train_xNP length  39740
train_xNP dimension of element  2
train_xNP size  198700000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 794.8 megabytes
train_y (labels) size: 0.15896 megabytes
train_x (text) length: 39740
train_y (labels) length: 39740
[[1 1 0 .

100%|██████████████████████████████████████████████████████████████████████████| 39740/39740 [00:16<00:00, 2443.39it/s]
100%|████████████████████████████████████████████████████████████████████████████| 2585/2585 [00:01<00:00, 2255.96it/s]


train_x length  39740
train_x dimension of element  5000
train_x (text) size RAW: 0.321096 megabytes
train_y (labels) size RAW: 0.321096 megabytes
test_x (text) size RAW: 0.02104 megabytes
test_y (labels) size RAW: 0.02104 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 794.8 megabytes
train_yNPSize (labels) size: 0.15896 megabytes
test_xNPSize (text) size: 51.7 megabytes
test_yNPSize (labels) size: 0.01034 megabytes
train_xNP length  39740
train_xNP dimension of element  2
train_xNP size  198700000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 794.8 megabytes
train_y (labels) size: 0.15896 megabytes
train_x (text) length: 39740
train_y (labels) length: 39740
[[1 1 0 ... 0 

100%|██████████████████████████████████████████████████████████████████████████| 39740/39740 [00:16<00:00, 2383.12it/s]
100%|████████████████████████████████████████████████████████████████████████████| 2414/2414 [00:01<00:00, 2297.30it/s]


train_x length  39740
train_x dimension of element  5000
train_x (text) size RAW: 0.321096 megabytes
train_y (labels) size RAW: 0.321096 megabytes
test_x (text) size RAW: 0.02104 megabytes
test_y (labels) size RAW: 0.02104 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 794.8 megabytes
train_yNPSize (labels) size: 0.15896 megabytes
test_xNPSize (text) size: 48.28 megabytes
test_yNPSize (labels) size: 0.009656 megabytes
train_xNP length  39740
train_xNP dimension of element  2
train_xNP size  198700000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 794.8 megabytes
train_y (labels) size: 0.15896 megabytes
train_x (text) length: 39740
train_y (labels) length: 39740
[[1 1 0 ... 

100%|██████████████████████████████████████████████████████████████████████████| 39740/39740 [00:16<00:00, 2349.06it/s]
100%|████████████████████████████████████████████████████████████████████████████| 7102/7102 [00:03<00:00, 2289.43it/s]


train_x length  39740
train_x dimension of element  5000
train_x (text) size RAW: 0.321096 megabytes
train_y (labels) size RAW: 0.321096 megabytes
test_x (text) size RAW: 0.061424 megabytes
test_y (labels) size RAW: 0.061424 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 794.8 megabytes
train_yNPSize (labels) size: 0.15896 megabytes
test_xNPSize (text) size: 142.04 megabytes
test_yNPSize (labels) size: 0.028408 megabytes
train_xNP length  39740
train_xNP dimension of element  2
train_xNP size  198700000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 794.8 megabytes
train_y (labels) size: 0.15896 megabytes
train_x (text) length: 39740
train_y (labels) length: 39740
[[1 1 0 .

100%|██████████████████████████████████████████████████████████████████████████| 39740/39740 [00:15<00:00, 2535.86it/s]
100%|██████████████████████████████████████████████████████████████████████████| 10062/10062 [00:04<00:00, 2202.11it/s]


train_x length  39740
train_x dimension of element  5000
train_x (text) size RAW: 0.321096 megabytes
train_y (labels) size RAW: 0.321096 megabytes
test_x (text) size RAW: 0.087616 megabytes
test_y (labels) size RAW: 0.087616 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 794.8 megabytes
train_yNPSize (labels) size: 0.0 megabytes
test_xNPSize (text) size: 201.24 megabytes
test_yNPSize (labels) size: 0.0 megabytes
train_xNP length  39740
train_xNP dimension of element  2
train_xNP size  198700000
Train or test empty. Did you misspell the dataset name?
('crowdflower', 'dailydialog', 'tweets', 'conversations')
Getting data
get_train_test param:
json  unified-dataset.jsonl
train  crowdflower
test  dailydialog
there were  102979  entries that were in test and  118460 that were not in test and  39740  that were in train
test was appended  102979  times
singl

100%|██████████████████████████████████████████████████████████████████████████| 39740/39740 [00:15<00:00, 2509.79it/s]
100%|████████████████████████████████████████████████████████████████████████| 102979/102979 [00:46<00:00, 2194.18it/s]


train_x length  39740
train_x dimension of element  5000
train_x (text) size RAW: 0.321096 megabytes
train_y (labels) size RAW: 0.321096 megabytes
test_x (text) size RAW: 0.824456 megabytes
test_y (labels) size RAW: 0.824456 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 794.8 megabytes
train_yNPSize (labels) size: 0.15896 megabytes
test_xNPSize (text) size: 2059.58 megabytes
test_yNPSize (labels) size: 0.411916 megabytes
train_xNP length  39740
train_xNP dimension of element  2
train_xNP size  198700000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 794.8 megabytes
train_y (labels) size: 0.15896 megabytes
train_x (text) length: 39740
train_y (labels) length: 39740
[[1 1 0 

100%|██████████████████████████████████████████████████████████████████████████| 39740/39740 [00:16<00:00, 2468.56it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1250/1250 [00:00<00:00, 2328.72it/s]


train_x length  39740
train_x dimension of element  5000
train_x (text) size RAW: 0.321096 megabytes
train_y (labels) size RAW: 0.321096 megabytes
test_x (text) size RAW: 0.010192 megabytes
test_y (labels) size RAW: 0.010192 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 794.8 megabytes
train_yNPSize (labels) size: 0.95376 megabytes
test_xNPSize (text) size: 25.0 megabytes
test_yNPSize (labels) size: 0.03 megabytes
train_xNP length  39740
train_xNP dimension of element  2
train_xNP size  198700000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 794.8 megabytes
train_y (labels) size: 0.95376 megabytes
train_x (text) length: 39740
train_y (labels) length: 39740
[[1 1 0 ... 0 0

100%|██████████████████████████████████████████████████████████████████████████| 14771/14771 [00:07<00:00, 1998.75it/s]
100%|██████████████████████████████████████████████████████████████████████████| 21051/21051 [00:11<00:00, 1888.10it/s]


train_x length  14771
train_x dimension of element  5000
train_x (text) size RAW: 0.124912 megabytes
train_y (labels) size RAW: 0.124912 megabytes
test_x (text) size RAW: 0.178016 megabytes
test_y (labels) size RAW: 0.178016 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 295.42 megabytes
train_yNPSize (labels) size: 0.059084 megabytes
test_xNPSize (text) size: 421.02 megabytes
test_yNPSize (labels) size: 0.084204 megabytes
train_xNP length  14771
train_xNP dimension of element  2
train_xNP size  73855000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 295.42 megabytes
train_y (labels) size: 0.059084 megabytes
train_x (text) length: 14771
train_y (labels) length: 14771
[[0 0 

100%|██████████████████████████████████████████████████████████████████████████| 14771/14771 [00:05<00:00, 2527.54it/s]
100%|████████████████████████████████████████████████████████████████████████████| 4868/4868 [00:02<00:00, 2270.49it/s]


train_x length  14771
train_x dimension of element  5000
train_x (text) size RAW: 0.124912 megabytes
train_y (labels) size RAW: 0.124912 megabytes
test_x (text) size RAW: 0.043032 megabytes
test_y (labels) size RAW: 0.043032 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 295.42 megabytes
train_yNPSize (labels) size: 0.354504 megabytes
test_xNPSize (text) size: 97.36 megabytes
test_yNPSize (labels) size: 0.116832 megabytes
train_xNP length  14771
train_xNP dimension of element  2
train_xNP size  73855000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 295.42 megabytes
train_y (labels) size: 0.354504 megabytes
train_x (text) length: 14771
train_y (labels) length: 14771
[[0 0 0

100%|██████████████████████████████████████████████████████████████████████████| 14771/14771 [00:05<00:00, 2498.02it/s]
100%|████████████████████████████████████████████████████████████████████████████| 7666/7666 [00:03<00:00, 2194.18it/s]


train_x length  14771
train_x dimension of element  5000
train_x (text) size RAW: 0.124912 megabytes
train_y (labels) size RAW: 0.124912 megabytes
test_x (text) size RAW: 0.061424 megabytes
test_y (labels) size RAW: 0.061424 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 295.42 megabytes
train_yNPSize (labels) size: 0.059084 megabytes
test_xNPSize (text) size: 153.32 megabytes
test_yNPSize (labels) size: 0.030664 megabytes
train_xNP length  14771
train_xNP dimension of element  2
train_xNP size  73855000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 295.42 megabytes
train_y (labels) size: 0.059084 megabytes
train_x (text) length: 14771
train_y (labels) length: 14771
[[0 0 

100%|██████████████████████████████████████████████████████████████████████████| 14771/14771 [00:05<00:00, 2556.89it/s]
100%|████████████████████████████████████████████████████████████████████████████| 2585/2585 [00:01<00:00, 2457.65it/s]


train_x length  14771
train_x dimension of element  5000
train_x (text) size RAW: 0.124912 megabytes
train_y (labels) size RAW: 0.124912 megabytes
test_x (text) size RAW: 0.02104 megabytes
test_y (labels) size RAW: 0.02104 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 295.42 megabytes
train_yNPSize (labels) size: 0.059084 megabytes
test_xNPSize (text) size: 51.7 megabytes
test_yNPSize (labels) size: 0.01034 megabytes
train_xNP length  14771
train_xNP dimension of element  2
train_xNP size  73855000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 295.42 megabytes
train_y (labels) size: 0.059084 megabytes
train_x (text) length: 14771
train_y (labels) length: 14771
[[0 0 0 ...

100%|██████████████████████████████████████████████████████████████████████████| 14771/14771 [00:05<00:00, 2534.81it/s]
100%|████████████████████████████████████████████████████████████████████████████| 2414/2414 [00:01<00:00, 2308.08it/s]


train_x length  14771
train_x dimension of element  5000
train_x (text) size RAW: 0.124912 megabytes
train_y (labels) size RAW: 0.124912 megabytes
test_x (text) size RAW: 0.02104 megabytes
test_y (labels) size RAW: 0.02104 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 295.42 megabytes
train_yNPSize (labels) size: 0.059084 megabytes
test_xNPSize (text) size: 48.28 megabytes
test_yNPSize (labels) size: 0.009656 megabytes
train_xNP length  14771
train_xNP dimension of element  2
train_xNP size  73855000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 295.42 megabytes
train_y (labels) size: 0.059084 megabytes
train_x (text) length: 14771
train_y (labels) length: 14771
[[0 0 0 .

100%|██████████████████████████████████████████████████████████████████████████| 14771/14771 [00:09<00:00, 1618.97it/s]
100%|████████████████████████████████████████████████████████████████████████████| 7102/7102 [00:04<00:00, 1527.91it/s]


train_x length  14771
train_x dimension of element  5000
train_x (text) size RAW: 0.124912 megabytes
train_y (labels) size RAW: 0.124912 megabytes
test_x (text) size RAW: 0.061424 megabytes
test_y (labels) size RAW: 0.061424 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 295.42 megabytes
train_yNPSize (labels) size: 0.059084 megabytes
test_xNPSize (text) size: 142.04 megabytes
test_yNPSize (labels) size: 0.028408 megabytes
train_xNP length  14771
train_xNP dimension of element  2
train_xNP size  73855000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 295.42 megabytes
train_y (labels) size: 0.059084 megabytes
train_x (text) length: 14771
train_y (labels) length: 14771
[[0 0 

100%|██████████████████████████████████████████████████████████████████████████| 14771/14771 [00:06<00:00, 2425.18it/s]
100%|██████████████████████████████████████████████████████████████████████████| 10062/10062 [00:04<00:00, 2318.78it/s]


train_x length  14771
train_x dimension of element  5000
train_x (text) size RAW: 0.124912 megabytes
train_y (labels) size RAW: 0.124912 megabytes
test_x (text) size RAW: 0.087616 megabytes
test_y (labels) size RAW: 0.087616 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 295.42 megabytes
train_yNPSize (labels) size: 0.0 megabytes
test_xNPSize (text) size: 201.24 megabytes
test_yNPSize (labels) size: 0.0 megabytes
train_xNP length  14771
train_xNP dimension of element  2
train_xNP size  73855000
Train or test empty. Did you misspell the dataset name?
('tales-emotion', 'dailydialog', 'tales', 'conversations')
Getting data
get_train_test param:
json  unified-dataset.jsonl
train  tales-emotion
test  dailydialog
there were  102979  entries that were in test and  118460 that were not in test and  14771  that were in train
test was appended  102979  times
si

100%|██████████████████████████████████████████████████████████████████████████| 14771/14771 [00:06<00:00, 2454.20it/s]
100%|████████████████████████████████████████████████████████████████████████| 102979/102979 [00:43<00:00, 2348.05it/s]


train_x length  14771
train_x dimension of element  5000
train_x (text) size RAW: 0.124912 megabytes
train_y (labels) size RAW: 0.124912 megabytes
test_x (text) size RAW: 0.824456 megabytes
test_y (labels) size RAW: 0.824456 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 295.42 megabytes
train_yNPSize (labels) size: 0.059084 megabytes
test_xNPSize (text) size: 2059.58 megabytes
test_yNPSize (labels) size: 0.411916 megabytes
train_xNP length  14771
train_xNP dimension of element  2
train_xNP size  73855000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 295.42 megabytes
train_y (labels) size: 0.059084 megabytes
train_x (text) length: 14771
train_y (labels) length: 14771
[[0 0

100%|██████████████████████████████████████████████████████████████████████████| 14771/14771 [00:06<00:00, 2454.19it/s]
100%|██████████████████████████████████████████████████████████████████████████| 39740/39740 [00:16<00:00, 2385.94it/s]


train_x length  14771
train_x dimension of element  5000
train_x (text) size RAW: 0.124912 megabytes
train_y (labels) size RAW: 0.124912 megabytes
test_x (text) size RAW: 0.321096 megabytes
test_y (labels) size RAW: 0.321096 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 295.42 megabytes
train_yNPSize (labels) size: 0.059084 megabytes
test_xNPSize (text) size: 794.8 megabytes
test_yNPSize (labels) size: 0.15896 megabytes
train_xNP length  14771
train_xNP dimension of element  2
train_xNP size  73855000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 295.42 megabytes
train_y (labels) size: 0.059084 megabytes
train_x (text) length: 14771
train_y (labels) length: 14771
[[0 0 0 

100%|██████████████████████████████████████████████████████████████████████████| 14771/14771 [00:06<00:00, 2354.24it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1250/1250 [00:00<00:00, 2129.23it/s]


train_x length  14771
train_x dimension of element  5000
train_x (text) size RAW: 0.124912 megabytes
train_y (labels) size RAW: 0.124912 megabytes
test_x (text) size RAW: 0.010192 megabytes
test_y (labels) size RAW: 0.010192 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 295.42 megabytes
train_yNPSize (labels) size: 0.354504 megabytes
test_xNPSize (text) size: 25.0 megabytes
test_yNPSize (labels) size: 0.03 megabytes
train_xNP length  14771
train_xNP dimension of element  2
train_xNP size  73855000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 295.42 megabytes
train_y (labels) size: 0.354504 megabytes
train_x (text) length: 14771
train_y (labels) length: 14771
[[0 0 0 ... 

100%|████████████████████████████████████████████████████████████████████████████| 2414/2414 [00:00<00:00, 2647.86it/s]
100%|██████████████████████████████████████████████████████████████████████████| 21051/21051 [00:08<00:00, 2489.85it/s]


train_x length  2414
train_x dimension of element  5000
train_x (text) size RAW: 0.02104 megabytes
train_y (labels) size RAW: 0.02104 megabytes
test_x (text) size RAW: 0.178016 megabytes
test_y (labels) size RAW: 0.178016 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 48.28 megabytes
train_yNPSize (labels) size: 0.009656 megabytes
test_xNPSize (text) size: 421.02 megabytes
test_yNPSize (labels) size: 0.084204 megabytes
train_xNP length  2414
train_xNP dimension of element  2
train_xNP size  12070000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 48.28 megabytes
train_y (labels) size: 0.009656 megabytes
train_x (text) length: 2414
train_y (labels) length: 2414
[[1 1 0 ... 0 

100%|████████████████████████████████████████████████████████████████████████████| 2414/2414 [00:00<00:00, 2958.43it/s]
100%|██████████████████████████████████████████████████████████████████████████| 14771/14771 [00:05<00:00, 2505.95it/s]


train_x length  2414
train_x dimension of element  5000
train_x (text) size RAW: 0.02104 megabytes
train_y (labels) size RAW: 0.02104 megabytes
test_x (text) size RAW: 0.124912 megabytes
test_y (labels) size RAW: 0.124912 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 48.28 megabytes
train_yNPSize (labels) size: 0.009656 megabytes
test_xNPSize (text) size: 295.42 megabytes
test_yNPSize (labels) size: 0.059084 megabytes
train_xNP length  2414
train_xNP dimension of element  2
train_xNP size  12070000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 48.28 megabytes
train_y (labels) size: 0.009656 megabytes
train_x (text) length: 2414
train_y (labels) length: 2414
[[1 1 0 ... 0 

100%|████████████████████████████████████████████████████████████████████████████| 2414/2414 [00:00<00:00, 2527.43it/s]
100%|████████████████████████████████████████████████████████████████████████████| 4868/4868 [00:02<00:00, 2393.88it/s]


train_x length  2414
train_x dimension of element  5000
train_x (text) size RAW: 0.02104 megabytes
train_y (labels) size RAW: 0.02104 megabytes
test_x (text) size RAW: 0.043032 megabytes
test_y (labels) size RAW: 0.043032 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 48.28 megabytes
train_yNPSize (labels) size: 0.057936 megabytes
test_xNPSize (text) size: 97.36 megabytes
test_yNPSize (labels) size: 0.116832 megabytes
train_xNP length  2414
train_xNP dimension of element  2
train_xNP size  12070000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 48.28 megabytes
train_y (labels) size: 0.057936 megabytes
train_x (text) length: 2414
train_y (labels) length: 2414
[[1 1 0 ... 0 0

100%|████████████████████████████████████████████████████████████████████████████| 2414/2414 [00:00<00:00, 2983.68it/s]
100%|████████████████████████████████████████████████████████████████████████████| 7666/7666 [00:02<00:00, 2948.58it/s]


train_x length  2414
train_x dimension of element  5000
train_x (text) size RAW: 0.02104 megabytes
train_y (labels) size RAW: 0.02104 megabytes
test_x (text) size RAW: 0.061424 megabytes
test_y (labels) size RAW: 0.061424 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 48.28 megabytes
train_yNPSize (labels) size: 0.009656 megabytes
test_xNPSize (text) size: 153.32 megabytes
test_yNPSize (labels) size: 0.030664 megabytes
train_xNP length  2414
train_xNP dimension of element  2
train_xNP size  12070000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 48.28 megabytes
train_y (labels) size: 0.009656 megabytes
train_x (text) length: 2414
train_y (labels) length: 2414
[[1 1 0 ... 0 

100%|████████████████████████████████████████████████████████████████████████████| 2414/2414 [00:00<00:00, 2937.13it/s]
100%|████████████████████████████████████████████████████████████████████████████| 2585/2585 [00:00<00:00, 3093.14it/s]


train_x length  2414
train_x dimension of element  5000
train_x (text) size RAW: 0.02104 megabytes
train_y (labels) size RAW: 0.02104 megabytes
test_x (text) size RAW: 0.02104 megabytes
test_y (labels) size RAW: 0.02104 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 48.28 megabytes
train_yNPSize (labels) size: 0.009656 megabytes
test_xNPSize (text) size: 51.7 megabytes
test_yNPSize (labels) size: 0.01034 megabytes
train_xNP length  2414
train_xNP dimension of element  2
train_xNP size  12070000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 48.28 megabytes
train_y (labels) size: 0.009656 megabytes
train_x (text) length: 2414
train_y (labels) length: 2414
[[1 1 0 ... 0 0 0]


100%|████████████████████████████████████████████████████████████████████████████| 2414/2414 [00:00<00:00, 2954.86it/s]
100%|████████████████████████████████████████████████████████████████████████████| 7102/7102 [00:02<00:00, 2797.51it/s]


train_x length  2414
train_x dimension of element  5000
train_x (text) size RAW: 0.02104 megabytes
train_y (labels) size RAW: 0.02104 megabytes
test_x (text) size RAW: 0.061424 megabytes
test_y (labels) size RAW: 0.061424 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 48.28 megabytes
train_yNPSize (labels) size: 0.009656 megabytes
test_xNPSize (text) size: 142.04 megabytes
test_yNPSize (labels) size: 0.028408 megabytes
train_xNP length  2414
train_xNP dimension of element  2
train_xNP size  12070000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 48.28 megabytes
train_y (labels) size: 0.009656 megabytes
train_x (text) length: 2414
train_y (labels) length: 2414
[[1 1 0 ... 0 

100%|████████████████████████████████████████████████████████████████████████████| 2414/2414 [00:00<00:00, 2745.85it/s]
100%|██████████████████████████████████████████████████████████████████████████| 10062/10062 [00:03<00:00, 2662.64it/s]


train_x length  2414
train_x dimension of element  5000
train_x (text) size RAW: 0.02104 megabytes
train_y (labels) size RAW: 0.02104 megabytes
test_x (text) size RAW: 0.087616 megabytes
test_y (labels) size RAW: 0.087616 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 48.28 megabytes
train_yNPSize (labels) size: 0.0 megabytes
test_xNPSize (text) size: 201.24 megabytes
test_yNPSize (labels) size: 0.0 megabytes
train_xNP length  2414
train_xNP dimension of element  2
train_xNP size  12070000
Train or test empty. Did you misspell the dataset name?
('emotion-cause', 'dailydialog', 'paragraphs', 'conversations')
Getting data
get_train_test param:
json  unified-dataset.jsonl
train  emotion-cause
test  dailydialog
there were  102979  entries that were in test and  118460 that were not in test and  2414  that were in train
test was appended  102979  times
sin

100%|████████████████████████████████████████████████████████████████████████████| 2414/2414 [00:01<00:00, 2248.68it/s]
100%|████████████████████████████████████████████████████████████████████████| 102979/102979 [00:41<00:00, 2488.16it/s]


train_x length  2414
train_x dimension of element  5000
train_x (text) size RAW: 0.02104 megabytes
train_y (labels) size RAW: 0.02104 megabytes
test_x (text) size RAW: 0.824456 megabytes
test_y (labels) size RAW: 0.824456 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 48.28 megabytes
train_yNPSize (labels) size: 0.009656 megabytes
test_xNPSize (text) size: 2059.58 megabytes
test_yNPSize (labels) size: 0.411916 megabytes
train_xNP length  2414
train_xNP dimension of element  2
train_xNP size  12070000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 48.28 megabytes
train_y (labels) size: 0.009656 megabytes
train_x (text) length: 2414
train_y (labels) length: 2414
[[1 1 0 ... 0

100%|████████████████████████████████████████████████████████████████████████████| 2414/2414 [00:00<00:00, 2783.41it/s]
100%|██████████████████████████████████████████████████████████████████████████| 39740/39740 [00:15<00:00, 2596.01it/s]


train_x length  2414
train_x dimension of element  5000
train_x (text) size RAW: 0.02104 megabytes
train_y (labels) size RAW: 0.02104 megabytes
test_x (text) size RAW: 0.321096 megabytes
test_y (labels) size RAW: 0.321096 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 48.28 megabytes
train_yNPSize (labels) size: 0.009656 megabytes
test_xNPSize (text) size: 794.8 megabytes
test_yNPSize (labels) size: 0.15896 megabytes
train_xNP length  2414
train_xNP dimension of element  2
train_xNP size  12070000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 48.28 megabytes
train_y (labels) size: 0.009656 megabytes
train_x (text) length: 2414
train_y (labels) length: 2414
[[1 1 0 ... 0 0 

100%|████████████████████████████████████████████████████████████████████████████| 2414/2414 [00:01<00:00, 2013.64it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1250/1250 [00:00<00:00, 2100.99it/s]


train_x length  2414
train_x dimension of element  5000
train_x (text) size RAW: 0.02104 megabytes
train_y (labels) size RAW: 0.02104 megabytes
test_x (text) size RAW: 0.010192 megabytes
test_y (labels) size RAW: 0.010192 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 48.28 megabytes
train_yNPSize (labels) size: 0.057936 megabytes
test_xNPSize (text) size: 25.0 megabytes
test_yNPSize (labels) size: 0.03 megabytes
train_xNP length  2414
train_xNP dimension of element  2
train_xNP size  12070000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 48.28 megabytes
train_y (labels) size: 0.057936 megabytes
train_x (text) length: 2414
train_y (labels) length: 2414
[[1 1 0 ... 0 0 0]
 

100%|██████████████████████████████████████████████████████████████████████████| 10062/10062 [00:04<00:00, 2433.31it/s]
100%|██████████████████████████████████████████████████████████████████████████| 21051/21051 [00:09<00:00, 2295.12it/s]


train_x length  10062
train_x dimension of element  5000
train_x (text) size RAW: 0.087616 megabytes
train_y (labels) size RAW: 0.087616 megabytes
test_x (text) size RAW: 0.178016 megabytes
test_y (labels) size RAW: 0.178016 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 201.24 megabytes
train_yNPSize (labels) size: 0.0 megabytes
test_xNPSize (text) size: 421.02 megabytes
test_yNPSize (labels) size: 0.0 megabytes
train_xNP length  10062
train_xNP dimension of element  2
train_xNP size  50310000
Train or test empty. Did you misspell the dataset name?
('emobank', 'tales-emotion', 'headlines', 'tales')
Getting data
get_train_test param:
json  unified-dataset.jsonl
train  emobank
test  tales-emotion
there were  14771  entries that were in test and  206668 that were not in test and  10062  that were in train
test was appended  14771  times
multi
oof
Detect

100%|██████████████████████████████████████████████████████████████████████████| 10062/10062 [00:04<00:00, 2434.45it/s]
100%|██████████████████████████████████████████████████████████████████████████| 14771/14771 [00:06<00:00, 2436.22it/s]


train_x length  10062
train_x dimension of element  5000
train_x (text) size RAW: 0.087616 megabytes
train_y (labels) size RAW: 0.087616 megabytes
test_x (text) size RAW: 0.124912 megabytes
test_y (labels) size RAW: 0.124912 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 201.24 megabytes
train_yNPSize (labels) size: 0.0 megabytes
test_xNPSize (text) size: 295.42 megabytes
test_yNPSize (labels) size: 0.0 megabytes
train_xNP length  10062
train_xNP dimension of element  2
train_xNP size  50310000
Train or test empty. Did you misspell the dataset name?
('emobank', 'ssec', 'headlines', 'tweets')
Getting data
get_train_test param:
json  unified-dataset.jsonl
train  emobank
test  ssec
there were  4868  entries that were in test and  216571 that were not in test and  10062  that were in train
test was appended  4868  times
multi
Detected mode: multi...
10062

100%|██████████████████████████████████████████████████████████████████████████| 10062/10062 [00:03<00:00, 2564.88it/s]
100%|████████████████████████████████████████████████████████████████████████████| 4868/4868 [00:02<00:00, 2410.25it/s]


train_x length  10062
train_x dimension of element  5000
train_x (text) size RAW: 0.087616 megabytes
train_y (labels) size RAW: 0.087616 megabytes
test_x (text) size RAW: 0.043032 megabytes
test_y (labels) size RAW: 0.043032 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 201.24 megabytes
train_yNPSize (labels) size: 0.0 megabytes
test_xNPSize (text) size: 97.36 megabytes
test_yNPSize (labels) size: 0.0 megabytes
train_xNP length  10062
train_xNP dimension of element  2
train_xNP size  50310000
Train or test empty. Did you misspell the dataset name?
('emobank', 'isear', 'headlines', 'descriptions')
Getting data
get_train_test param:
json  unified-dataset.jsonl
train  emobank
test  isear
there were  7666  entries that were in test and  213773 that were not in test and  10062  that were in train
test was appended  7666  times
multi
oof
Detected mode: mul

100%|██████████████████████████████████████████████████████████████████████████| 10062/10062 [00:04<00:00, 2484.28it/s]
100%|████████████████████████████████████████████████████████████████████████████| 7666/7666 [00:05<00:00, 1478.52it/s]


train_x length  10062
train_x dimension of element  5000
train_x (text) size RAW: 0.087616 megabytes
train_y (labels) size RAW: 0.087616 megabytes
test_x (text) size RAW: 0.061424 megabytes
test_y (labels) size RAW: 0.061424 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 201.24 megabytes
train_yNPSize (labels) size: 0.0 megabytes
test_xNPSize (text) size: 153.32 megabytes
test_yNPSize (labels) size: 0.0 megabytes
train_xNP length  10062
train_xNP dimension of element  2
train_xNP size  50310000
Train or test empty. Did you misspell the dataset name?
('emobank', 'grounded_emotions', 'headlines', 'tweets')
Getting data
get_train_test param:
json  unified-dataset.jsonl
train  emobank
test  grounded_emotions
there were  2585  entries that were in test and  218854 that were not in test and  10062  that were in train
test was appended  2585  times
multi
oof

100%|██████████████████████████████████████████████████████████████████████████| 10062/10062 [00:04<00:00, 2458.53it/s]
100%|████████████████████████████████████████████████████████████████████████████| 2585/2585 [00:01<00:00, 1870.05it/s]


train_x length  10062
train_x dimension of element  5000
train_x (text) size RAW: 0.087616 megabytes
train_y (labels) size RAW: 0.087616 megabytes
test_x (text) size RAW: 0.02104 megabytes
test_y (labels) size RAW: 0.02104 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 201.24 megabytes
train_yNPSize (labels) size: 0.0 megabytes
test_xNPSize (text) size: 51.7 megabytes
test_yNPSize (labels) size: 0.0 megabytes
train_xNP length  10062
train_xNP dimension of element  2
train_xNP size  50310000
Train or test empty. Did you misspell the dataset name?
('emobank', 'emotion-cause', 'headlines', 'paragraphs')
Getting data
get_train_test param:
json  unified-dataset.jsonl
train  emobank
test  emotion-cause
there were  2414  entries that were in test and  219025 that were not in test and  10062  that were in train
test was appended  2414  times
multi
oof
Detecte

100%|██████████████████████████████████████████████████████████████████████████| 10062/10062 [00:04<00:00, 2302.01it/s]
100%|████████████████████████████████████████████████████████████████████████████| 2414/2414 [00:01<00:00, 2398.65it/s]


train_x length  10062
train_x dimension of element  5000
train_x (text) size RAW: 0.087616 megabytes
train_y (labels) size RAW: 0.087616 megabytes
test_x (text) size RAW: 0.02104 megabytes
test_y (labels) size RAW: 0.02104 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 201.24 megabytes
train_yNPSize (labels) size: 0.0 megabytes
test_xNPSize (text) size: 48.28 megabytes
test_yNPSize (labels) size: 0.0 megabytes
train_xNP length  10062
train_xNP dimension of element  2
train_xNP size  50310000
Train or test empty. Did you misspell the dataset name?
('emobank', 'emoint', 'headlines', 'tweets')
Getting data
get_train_test param:
json  unified-dataset.jsonl
train  emobank
test  emoint
there were  7102  entries that were in test and  214337 that were not in test and  10062  that were in train
test was appended  7102  times
multi
oof
Detected mode: multi...


100%|██████████████████████████████████████████████████████████████████████████| 10062/10062 [00:04<00:00, 2410.85it/s]
100%|████████████████████████████████████████████████████████████████████████████| 7102/7102 [00:02<00:00, 2497.56it/s]


train_x length  10062
train_x dimension of element  5000
train_x (text) size RAW: 0.087616 megabytes
train_y (labels) size RAW: 0.087616 megabytes
test_x (text) size RAW: 0.061424 megabytes
test_y (labels) size RAW: 0.061424 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 201.24 megabytes
train_yNPSize (labels) size: 0.0 megabytes
test_xNPSize (text) size: 142.04 megabytes
test_yNPSize (labels) size: 0.0 megabytes
train_xNP length  10062
train_xNP dimension of element  2
train_xNP size  50310000
Train or test empty. Did you misspell the dataset name?
('emobank', 'dailydialog', 'headlines', 'conversations')
Getting data
get_train_test param:
json  unified-dataset.jsonl
train  emobank
test  dailydialog
there were  102979  entries that were in test and  118460 that were not in test and  10062  that were in train
test was appended  102979  times
multi
oof


100%|██████████████████████████████████████████████████████████████████████████| 10062/10062 [00:04<00:00, 2449.08it/s]
100%|████████████████████████████████████████████████████████████████████████| 102979/102979 [00:44<00:00, 2315.64it/s]


train_x length  10062
train_x dimension of element  5000
train_x (text) size RAW: 0.087616 megabytes
train_y (labels) size RAW: 0.087616 megabytes
test_x (text) size RAW: 0.824456 megabytes
test_y (labels) size RAW: 0.824456 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 201.24 megabytes
train_yNPSize (labels) size: 0.0 megabytes
test_xNPSize (text) size: 2059.58 megabytes
test_yNPSize (labels) size: 0.0 megabytes
train_xNP length  10062
train_xNP dimension of element  2
train_xNP size  50310000
Train or test empty. Did you misspell the dataset name?
('emobank', 'crowdflower', 'headlines', 'tweets')
Getting data
get_train_test param:
json  unified-dataset.jsonl
train  emobank
test  crowdflower
there were  39740  entries that were in test and  181699 that were not in test and  10062  that were in train
test was appended  39740  times
multi
oof
Detected

100%|██████████████████████████████████████████████████████████████████████████| 10062/10062 [00:03<00:00, 2619.57it/s]
100%|██████████████████████████████████████████████████████████████████████████| 39740/39740 [00:16<00:00, 2362.43it/s]


train_x length  10062
train_x dimension of element  5000
train_x (text) size RAW: 0.087616 megabytes
train_y (labels) size RAW: 0.087616 megabytes
test_x (text) size RAW: 0.321096 megabytes
test_y (labels) size RAW: 0.321096 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 201.24 megabytes
train_yNPSize (labels) size: 0.0 megabytes
test_xNPSize (text) size: 794.8 megabytes
test_yNPSize (labels) size: 0.0 megabytes
train_xNP length  10062
train_xNP dimension of element  2
train_xNP size  50310000
Train or test empty. Did you misspell the dataset name?
('emobank', 'affectivetext', 'headlines', 'headlines')
Getting data
get_train_test param:
json  unified-dataset.jsonl
train  emobank
test  affectivetext
there were  1250  entries that were in test and  220189 that were not in test and  10062  that were in train
test was appended  1250  times
multi
Detected 

100%|██████████████████████████████████████████████████████████████████████████| 10062/10062 [00:04<00:00, 2446.73it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1250/1250 [00:00<00:00, 2230.41it/s]


train_x length  10062
train_x dimension of element  5000
train_x (text) size RAW: 0.087616 megabytes
train_y (labels) size RAW: 0.087616 megabytes
test_x (text) size RAW: 0.010192 megabytes
test_y (labels) size RAW: 0.010192 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 201.24 megabytes
train_yNPSize (labels) size: 0.0 megabytes
test_xNPSize (text) size: 25.0 megabytes
test_yNPSize (labels) size: 0.0 megabytes
train_xNP length  10062
train_xNP dimension of element  2
train_xNP size  50310000
Train or test empty. Did you misspell the dataset name?
('affectivetext', 'tec', 'headlines', 'tweets')
Getting data
get_train_test param:
json  unified-dataset.jsonl
train  affectivetext
test  tec
there were  21051  entries that were in test and  200388 that were not in test and  1250  that were in train
test was appended  21051  times
multi
oof
Detected mode: m

100%|████████████████████████████████████████████████████████████████████████████| 1250/1250 [00:00<00:00, 3959.05it/s]
100%|██████████████████████████████████████████████████████████████████████████| 21051/21051 [00:05<00:00, 3664.63it/s]


train_x length  1250
train_x dimension of element  3321
train_x (text) size RAW: 0.010192 megabytes
train_y (labels) size RAW: 0.010192 megabytes
test_x (text) size RAW: 0.178016 megabytes
test_y (labels) size RAW: 0.178016 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 16.605 megabytes
train_yNPSize (labels) size: 0.03 megabytes
test_xNPSize (text) size: 279.641484 megabytes
test_yNPSize (labels) size: 0.505224 megabytes
train_xNP length  1250
train_xNP dimension of element  2
train_xNP size  4151250
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 16.605 megabytes
train_y (labels) size: 0.03 megabytes
train_x (text) length: 1250
train_y (labels) length: 1250
[[0 0 0 ... 0 0

100%|████████████████████████████████████████████████████████████████████████████| 1250/1250 [00:00<00:00, 4508.53it/s]
100%|██████████████████████████████████████████████████████████████████████████| 14771/14771 [00:03<00:00, 3936.53it/s]


train_x length  1250
train_x dimension of element  3321
train_x (text) size RAW: 0.010192 megabytes
train_y (labels) size RAW: 0.010192 megabytes
test_x (text) size RAW: 0.124912 megabytes
test_y (labels) size RAW: 0.124912 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 16.605 megabytes
train_yNPSize (labels) size: 0.03 megabytes
test_xNPSize (text) size: 196.217964 megabytes
test_yNPSize (labels) size: 0.354504 megabytes
train_xNP length  1250
train_xNP dimension of element  2
train_xNP size  4151250
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 16.605 megabytes
train_y (labels) size: 0.03 megabytes
train_x (text) length: 1250
train_y (labels) length: 1250
[[0 0 0 ... 0 0

100%|████████████████████████████████████████████████████████████████████████████| 1250/1250 [00:00<00:00, 4153.33it/s]
100%|████████████████████████████████████████████████████████████████████████████| 4868/4868 [00:01<00:00, 3994.88it/s]


train_x length  1250
train_x dimension of element  3321
train_x (text) size RAW: 0.010192 megabytes
train_y (labels) size RAW: 0.010192 megabytes
test_x (text) size RAW: 0.043032 megabytes
test_y (labels) size RAW: 0.043032 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 16.605 megabytes
train_yNPSize (labels) size: 0.03 megabytes
test_xNPSize (text) size: 64.666512 megabytes
test_yNPSize (labels) size: 0.116832 megabytes
train_xNP length  1250
train_xNP dimension of element  2
train_xNP size  4151250
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 16.605 megabytes
train_y (labels) size: 0.03 megabytes
train_x (text) length: 1250
train_y (labels) length: 1250
[[0 0 0 ... 0 0 

100%|████████████████████████████████████████████████████████████████████████████| 1250/1250 [00:00<00:00, 3499.70it/s]
100%|████████████████████████████████████████████████████████████████████████████| 7666/7666 [00:02<00:00, 3808.58it/s]


train_x length  1250
train_x dimension of element  3321
train_x (text) size RAW: 0.010192 megabytes
train_y (labels) size RAW: 0.010192 megabytes
test_x (text) size RAW: 0.061424 megabytes
test_y (labels) size RAW: 0.061424 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 16.605 megabytes
train_yNPSize (labels) size: 0.025 megabytes
test_xNPSize (text) size: 101.835144 megabytes
test_yNPSize (labels) size: 0.15332 megabytes
train_xNP length  1250
train_xNP dimension of element  2
train_xNP size  4151250
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 16.605 megabytes
train_y (labels) size: 0.025 megabytes
train_x (text) length: 1250
train_y (labels) length: 1250
[[0 0 0 ... 0 

100%|████████████████████████████████████████████████████████████████████████████| 1250/1250 [00:00<00:00, 4280.01it/s]
100%|████████████████████████████████████████████████████████████████████████████| 2585/2585 [00:00<00:00, 4309.10it/s]


train_x length  1250
train_x dimension of element  3321
train_x (text) size RAW: 0.010192 megabytes
train_y (labels) size RAW: 0.010192 megabytes
test_x (text) size RAW: 0.02104 megabytes
test_y (labels) size RAW: 0.02104 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 16.605 megabytes
train_yNPSize (labels) size: 0.01 megabytes
test_xNPSize (text) size: 34.33914 megabytes
test_yNPSize (labels) size: 0.02068 megabytes
train_xNP length  1250
train_xNP dimension of element  2
train_xNP size  4151250
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 16.605 megabytes
train_y (labels) size: 0.01 megabytes
train_x (text) length: 1250
train_y (labels) length: 1250
[[0 0 0 ... 0 0 0]
 

100%|████████████████████████████████████████████████████████████████████████████| 1250/1250 [00:00<00:00, 4099.63it/s]
100%|████████████████████████████████████████████████████████████████████████████| 2414/2414 [00:00<00:00, 4269.75it/s]


train_x length  1250
train_x dimension of element  3321
train_x (text) size RAW: 0.010192 megabytes
train_y (labels) size RAW: 0.010192 megabytes
test_x (text) size RAW: 0.02104 megabytes
test_y (labels) size RAW: 0.02104 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 16.605 megabytes
train_yNPSize (labels) size: 0.03 megabytes
test_xNPSize (text) size: 32.067576 megabytes
test_yNPSize (labels) size: 0.057936 megabytes
train_xNP length  1250
train_xNP dimension of element  2
train_xNP size  4151250
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 16.605 megabytes
train_y (labels) size: 0.03 megabytes
train_x (text) length: 1250
train_y (labels) length: 1250
[[0 0 0 ... 0 0 0]

100%|████████████████████████████████████████████████████████████████████████████| 1250/1250 [00:00<00:00, 4508.07it/s]
100%|████████████████████████████████████████████████████████████████████████████| 7102/7102 [00:01<00:00, 4338.75it/s]


train_x length  1250
train_x dimension of element  3321
train_x (text) size RAW: 0.010192 megabytes
train_y (labels) size RAW: 0.010192 megabytes
test_x (text) size RAW: 0.061424 megabytes
test_y (labels) size RAW: 0.061424 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 16.605 megabytes
train_yNPSize (labels) size: 0.02 megabytes
test_xNPSize (text) size: 94.342968 megabytes
test_yNPSize (labels) size: 0.113632 megabytes
train_xNP length  1250
train_xNP dimension of element  2
train_xNP size  4151250
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 16.605 megabytes
train_y (labels) size: 0.02 megabytes
train_x (text) length: 1250
train_y (labels) length: 1250
[[0 0 0 ... 0 0 

100%|████████████████████████████████████████████████████████████████████████████| 1250/1250 [00:00<00:00, 4265.63it/s]
100%|██████████████████████████████████████████████████████████████████████████| 10062/10062 [00:02<00:00, 3920.77it/s]


train_x length  1250
train_x dimension of element  3321
train_x (text) size RAW: 0.010192 megabytes
train_y (labels) size RAW: 0.010192 megabytes
test_x (text) size RAW: 0.087616 megabytes
test_y (labels) size RAW: 0.087616 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 16.605 megabytes
train_yNPSize (labels) size: 0.0 megabytes
test_xNPSize (text) size: 133.663608 megabytes
test_yNPSize (labels) size: 0.0 megabytes
train_xNP length  1250
train_xNP dimension of element  2
train_xNP size  4151250
Train or test empty. Did you misspell the dataset name?
('affectivetext', 'dailydialog', 'headlines', 'conversations')
Getting data
get_train_test param:
json  unified-dataset.jsonl
train  affectivetext
test  dailydialog
there were  102979  entries that were in test and  118460 that were not in test and  1250  that were in train
test was appended  102979  time

100%|████████████████████████████████████████████████████████████████████████████| 1250/1250 [00:00<00:00, 4383.31it/s]
100%|████████████████████████████████████████████████████████████████████████| 102979/102979 [00:26<00:00, 3871.02it/s]


train_x length  1250
train_x dimension of element  3321
train_x (text) size RAW: 0.010192 megabytes
train_y (labels) size RAW: 0.010192 megabytes
test_x (text) size RAW: 0.824456 megabytes
test_y (labels) size RAW: 0.824456 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 16.605 megabytes
train_yNPSize (labels) size: 0.03 megabytes
test_xNPSize (text) size: 1367.973036 megabytes
test_yNPSize (labels) size: 2.471496 megabytes
train_xNP length  1250
train_xNP dimension of element  2
train_xNP size  4151250
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 16.605 megabytes
train_y (labels) size: 0.03 megabytes
train_x (text) length: 1250
train_y (labels) length: 1250
[[0 0 0 ... 0 

100%|████████████████████████████████████████████████████████████████████████████| 1250/1250 [00:00<00:00, 4208.94it/s]
100%|██████████████████████████████████████████████████████████████████████████| 39740/39740 [00:10<00:00, 3866.83it/s]


train_x length  1250
train_x dimension of element  3321
train_x (text) size RAW: 0.010192 megabytes
train_y (labels) size RAW: 0.010192 megabytes
test_x (text) size RAW: 0.321096 megabytes
test_y (labels) size RAW: 0.321096 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 16.605 megabytes
train_yNPSize (labels) size: 0.03 megabytes
test_xNPSize (text) size: 527.90616 megabytes
test_yNPSize (labels) size: 0.95376 megabytes
train_xNP length  1250
train_xNP dimension of element  2
train_xNP size  4151250
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 16.605 megabytes
train_y (labels) size: 0.03 megabytes
train_x (text) length: 1250
train_y (labels) length: 1250
[[0 0 0 ... 0 0 0

100%|████████████████████████████████████████████████████████████████████████████| 7666/7666 [00:03<00:00, 2516.88it/s]
100%|██████████████████████████████████████████████████████████████████████████| 21051/21051 [00:08<00:00, 2508.29it/s]


train_x length  7666
train_x dimension of element  5000
train_x (text) size RAW: 0.061424 megabytes
train_y (labels) size RAW: 0.061424 megabytes
test_x (text) size RAW: 0.178016 megabytes
test_y (labels) size RAW: 0.178016 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 153.32 megabytes
train_yNPSize (labels) size: 0.030664 megabytes
test_xNPSize (text) size: 421.02 megabytes
test_yNPSize (labels) size: 0.084204 megabytes
train_xNP length  7666
train_xNP dimension of element  2
train_xNP size  38330000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 153.32 megabytes
train_y (labels) size: 0.030664 megabytes
train_x (text) length: 7666
train_y (labels) length: 7666
[[0 1 1 ..

100%|████████████████████████████████████████████████████████████████████████████| 7666/7666 [00:02<00:00, 2595.06it/s]
100%|██████████████████████████████████████████████████████████████████████████| 14771/14771 [00:05<00:00, 2524.13it/s]


train_x length  7666
train_x dimension of element  5000
train_x (text) size RAW: 0.061424 megabytes
train_y (labels) size RAW: 0.061424 megabytes
test_x (text) size RAW: 0.124912 megabytes
test_y (labels) size RAW: 0.124912 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 153.32 megabytes
train_yNPSize (labels) size: 0.030664 megabytes
test_xNPSize (text) size: 295.42 megabytes
test_yNPSize (labels) size: 0.059084 megabytes
train_xNP length  7666
train_xNP dimension of element  2
train_xNP size  38330000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 153.32 megabytes
train_y (labels) size: 0.030664 megabytes
train_x (text) length: 7666
train_y (labels) length: 7666
[[0 1 1 ..

100%|████████████████████████████████████████████████████████████████████████████| 7666/7666 [00:02<00:00, 2679.17it/s]
100%|████████████████████████████████████████████████████████████████████████████| 4868/4868 [00:01<00:00, 2509.56it/s]


train_x length  7666
train_x dimension of element  5000
train_x (text) size RAW: 0.061424 megabytes
train_y (labels) size RAW: 0.061424 megabytes
test_x (text) size RAW: 0.043032 megabytes
test_y (labels) size RAW: 0.043032 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 153.32 megabytes
train_yNPSize (labels) size: 0.15332 megabytes
test_xNPSize (text) size: 97.36 megabytes
test_yNPSize (labels) size: 0.09736 megabytes
train_xNP length  7666
train_xNP dimension of element  2
train_xNP size  38330000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 153.32 megabytes
train_y (labels) size: 0.15332 megabytes
train_x (text) length: 7666
train_y (labels) length: 7666
[[0 1 1 ... 0 

100%|████████████████████████████████████████████████████████████████████████████| 7666/7666 [00:03<00:00, 2525.06it/s]
100%|████████████████████████████████████████████████████████████████████████████| 2585/2585 [00:01<00:00, 2536.24it/s]


train_x length  7666
train_x dimension of element  5000
train_x (text) size RAW: 0.061424 megabytes
train_y (labels) size RAW: 0.061424 megabytes
test_x (text) size RAW: 0.02104 megabytes
test_y (labels) size RAW: 0.02104 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 153.32 megabytes
train_yNPSize (labels) size: 0.030664 megabytes
test_xNPSize (text) size: 51.7 megabytes
test_yNPSize (labels) size: 0.01034 megabytes
train_xNP length  7666
train_xNP dimension of element  2
train_xNP size  38330000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 153.32 megabytes
train_y (labels) size: 0.030664 megabytes
train_x (text) length: 7666
train_y (labels) length: 7666
[[0 1 1 ... 0 0

100%|████████████████████████████████████████████████████████████████████████████| 7666/7666 [00:02<00:00, 2575.25it/s]
100%|████████████████████████████████████████████████████████████████████████████| 2414/2414 [00:01<00:00, 2361.55it/s]


train_x length  7666
train_x dimension of element  5000
train_x (text) size RAW: 0.061424 megabytes
train_y (labels) size RAW: 0.061424 megabytes
test_x (text) size RAW: 0.02104 megabytes
test_y (labels) size RAW: 0.02104 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 153.32 megabytes
train_yNPSize (labels) size: 0.030664 megabytes
test_xNPSize (text) size: 48.28 megabytes
test_yNPSize (labels) size: 0.009656 megabytes
train_xNP length  7666
train_xNP dimension of element  2
train_xNP size  38330000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 153.32 megabytes
train_y (labels) size: 0.030664 megabytes
train_x (text) length: 7666
train_y (labels) length: 7666
[[0 1 1 ... 0

100%|████████████████████████████████████████████████████████████████████████████| 7666/7666 [00:03<00:00, 2534.92it/s]
100%|████████████████████████████████████████████████████████████████████████████| 7102/7102 [00:02<00:00, 2603.25it/s]


train_x length  7666
train_x dimension of element  5000
train_x (text) size RAW: 0.061424 megabytes
train_y (labels) size RAW: 0.061424 megabytes
test_x (text) size RAW: 0.061424 megabytes
test_y (labels) size RAW: 0.061424 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 153.32 megabytes
train_yNPSize (labels) size: 0.030664 megabytes
test_xNPSize (text) size: 142.04 megabytes
test_yNPSize (labels) size: 0.028408 megabytes
train_xNP length  7666
train_xNP dimension of element  2
train_xNP size  38330000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 153.32 megabytes
train_y (labels) size: 0.030664 megabytes
train_x (text) length: 7666
train_y (labels) length: 7666
[[0 1 1 ..

100%|████████████████████████████████████████████████████████████████████████████| 7666/7666 [00:03<00:00, 2466.52it/s]
100%|██████████████████████████████████████████████████████████████████████████| 10062/10062 [00:04<00:00, 2435.03it/s]


train_x length  7666
train_x dimension of element  5000
train_x (text) size RAW: 0.061424 megabytes
train_y (labels) size RAW: 0.061424 megabytes
test_x (text) size RAW: 0.087616 megabytes
test_y (labels) size RAW: 0.087616 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 153.32 megabytes
train_yNPSize (labels) size: 0.0 megabytes
test_xNPSize (text) size: 201.24 megabytes
test_yNPSize (labels) size: 0.0 megabytes
train_xNP length  7666
train_xNP dimension of element  2
train_xNP size  38330000
Train or test empty. Did you misspell the dataset name?
('isear', 'dailydialog', 'descriptions', 'conversations')
Getting data
get_train_test param:
json  unified-dataset.jsonl
train  isear
test  dailydialog
there were  102979  entries that were in test and  118460 that were not in test and  7666  that were in train
test was appended  102979  times
single
oof
Det

100%|████████████████████████████████████████████████████████████████████████████| 7666/7666 [00:03<00:00, 2486.24it/s]
100%|████████████████████████████████████████████████████████████████████████| 102979/102979 [00:45<00:00, 2277.39it/s]


train_x length  7666
train_x dimension of element  5000
train_x (text) size RAW: 0.061424 megabytes
train_y (labels) size RAW: 0.061424 megabytes
test_x (text) size RAW: 0.824456 megabytes
test_y (labels) size RAW: 0.824456 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 153.32 megabytes
train_yNPSize (labels) size: 0.030664 megabytes
test_xNPSize (text) size: 2059.58 megabytes
test_yNPSize (labels) size: 0.411916 megabytes
train_xNP length  7666
train_xNP dimension of element  2
train_xNP size  38330000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 153.32 megabytes
train_y (labels) size: 0.030664 megabytes
train_x (text) length: 7666
train_y (labels) length: 7666
[[0 1 1 .

100%|████████████████████████████████████████████████████████████████████████████| 7666/7666 [00:02<00:00, 2606.37it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1250/1250 [00:00<00:00, 2176.79it/s]


train_x length  7666
train_x dimension of element  5000
train_x (text) size RAW: 0.061424 megabytes
train_y (labels) size RAW: 0.061424 megabytes
test_x (text) size RAW: 0.010192 megabytes
test_y (labels) size RAW: 0.010192 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 153.32 megabytes
train_yNPSize (labels) size: 0.15332 megabytes
test_xNPSize (text) size: 25.0 megabytes
test_yNPSize (labels) size: 0.025 megabytes
train_xNP length  7666
train_xNP dimension of element  2
train_xNP size  38330000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 153.32 megabytes
train_y (labels) size: 0.15332 megabytes
train_x (text) length: 7666
train_y (labels) length: 7666
[[0 1 1 ... 0 0 0

100%|████████████████████████████████████████████████████████████████████████| 102979/102979 [00:45<00:00, 2248.06it/s]
100%|██████████████████████████████████████████████████████████████████████████| 21051/21051 [00:08<00:00, 2370.60it/s]


train_x length  102979
train_x dimension of element  5000
train_x (text) size RAW: 0.824456 megabytes
train_y (labels) size RAW: 0.824456 megabytes
test_x (text) size RAW: 0.178016 megabytes
test_y (labels) size RAW: 0.178016 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 2059.58 megabytes
train_yNPSize (labels) size: 0.411916 megabytes
test_xNPSize (text) size: 421.02 megabytes
test_yNPSize (labels) size: 0.084204 megabytes
train_xNP length  102979
train_xNP dimension of element  2
train_xNP size  514895000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 2059.58 megabytes
train_y (labels) size: 0.411916 megabytes
train_x (text) length: 102979
train_y (labels) length: 102979

100%|████████████████████████████████████████████████████████████████████████| 102979/102979 [00:38<00:00, 2674.80it/s]
100%|██████████████████████████████████████████████████████████████████████████| 14771/14771 [00:05<00:00, 2615.85it/s]


train_x length  102979
train_x dimension of element  5000
train_x (text) size RAW: 0.824456 megabytes
train_y (labels) size RAW: 0.824456 megabytes
test_x (text) size RAW: 0.124912 megabytes
test_y (labels) size RAW: 0.124912 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 2059.58 megabytes
train_yNPSize (labels) size: 0.411916 megabytes
test_xNPSize (text) size: 295.42 megabytes
test_yNPSize (labels) size: 0.059084 megabytes
train_xNP length  102979
train_xNP dimension of element  2
train_xNP size  514895000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 2059.58 megabytes
train_y (labels) size: 0.411916 megabytes
train_x (text) length: 102979
train_y (labels) length: 102979

100%|████████████████████████████████████████████████████████████████████████| 102979/102979 [00:42<00:00, 2429.94it/s]
100%|████████████████████████████████████████████████████████████████████████████| 4868/4868 [00:01<00:00, 2457.03it/s]


train_x length  102979
train_x dimension of element  5000
train_x (text) size RAW: 0.824456 megabytes
train_y (labels) size RAW: 0.824456 megabytes
test_x (text) size RAW: 0.043032 megabytes
test_y (labels) size RAW: 0.043032 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 2059.58 megabytes
train_yNPSize (labels) size: 2.471496 megabytes
test_xNPSize (text) size: 97.36 megabytes
test_yNPSize (labels) size: 0.116832 megabytes
train_xNP length  102979
train_xNP dimension of element  2
train_xNP size  514895000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 2059.58 megabytes
train_y (labels) size: 2.471496 megabytes
train_x (text) length: 102979
train_y (labels) length: 102979


100%|████████████████████████████████████████████████████████████████████████| 102979/102979 [00:45<00:00, 2267.10it/s]
100%|████████████████████████████████████████████████████████████████████████████| 7666/7666 [00:03<00:00, 2247.50it/s]


train_x length  102979
train_x dimension of element  5000
train_x (text) size RAW: 0.824456 megabytes
train_y (labels) size RAW: 0.824456 megabytes
test_x (text) size RAW: 0.061424 megabytes
test_y (labels) size RAW: 0.061424 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 2059.58 megabytes
train_yNPSize (labels) size: 0.411916 megabytes
test_xNPSize (text) size: 153.32 megabytes
test_yNPSize (labels) size: 0.030664 megabytes
train_xNP length  102979
train_xNP dimension of element  2
train_xNP size  514895000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 2059.58 megabytes
train_y (labels) size: 0.411916 megabytes
train_x (text) length: 102979
train_y (labels) length: 102979

100%|████████████████████████████████████████████████████████████████████████| 102979/102979 [00:45<00:00, 2276.25it/s]
100%|████████████████████████████████████████████████████████████████████████████| 2585/2585 [00:01<00:00, 2324.69it/s]


train_x length  102979
train_x dimension of element  5000
train_x (text) size RAW: 0.824456 megabytes
train_y (labels) size RAW: 0.824456 megabytes
test_x (text) size RAW: 0.02104 megabytes
test_y (labels) size RAW: 0.02104 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 2059.58 megabytes
train_yNPSize (labels) size: 0.411916 megabytes
test_xNPSize (text) size: 51.7 megabytes
test_yNPSize (labels) size: 0.01034 megabytes
train_xNP length  102979
train_xNP dimension of element  2
train_xNP size  514895000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 2059.58 megabytes
train_y (labels) size: 0.411916 megabytes
train_x (text) length: 102979
train_y (labels) length: 102979
[[0 

100%|████████████████████████████████████████████████████████████████████████| 102979/102979 [00:38<00:00, 2640.56it/s]
100%|████████████████████████████████████████████████████████████████████████████| 2414/2414 [00:00<00:00, 2700.38it/s]


train_x length  102979
train_x dimension of element  5000
train_x (text) size RAW: 0.824456 megabytes
train_y (labels) size RAW: 0.824456 megabytes
test_x (text) size RAW: 0.02104 megabytes
test_y (labels) size RAW: 0.02104 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 2059.58 megabytes
train_yNPSize (labels) size: 0.411916 megabytes
test_xNPSize (text) size: 48.28 megabytes
test_yNPSize (labels) size: 0.009656 megabytes
train_xNP length  102979
train_xNP dimension of element  2
train_xNP size  514895000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 2059.58 megabytes
train_y (labels) size: 0.411916 megabytes
train_x (text) length: 102979
train_y (labels) length: 102979
[[

100%|████████████████████████████████████████████████████████████████████████| 102979/102979 [00:38<00:00, 2681.33it/s]
100%|████████████████████████████████████████████████████████████████████████████| 7102/7102 [00:02<00:00, 2666.90it/s]


train_x length  102979
train_x dimension of element  5000
train_x (text) size RAW: 0.824456 megabytes
train_y (labels) size RAW: 0.824456 megabytes
test_x (text) size RAW: 0.061424 megabytes
test_y (labels) size RAW: 0.061424 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 2059.58 megabytes
train_yNPSize (labels) size: 0.411916 megabytes
test_xNPSize (text) size: 142.04 megabytes
test_yNPSize (labels) size: 0.028408 megabytes
train_xNP length  102979
train_xNP dimension of element  2
train_xNP size  514895000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 2059.58 megabytes
train_y (labels) size: 0.411916 megabytes
train_x (text) length: 102979
train_y (labels) length: 102979

100%|████████████████████████████████████████████████████████████████████████| 102979/102979 [00:39<00:00, 2615.87it/s]
100%|██████████████████████████████████████████████████████████████████████████| 10062/10062 [00:04<00:00, 2156.93it/s]


train_x length  102979
train_x dimension of element  5000
train_x (text) size RAW: 0.824456 megabytes
train_y (labels) size RAW: 0.824456 megabytes
test_x (text) size RAW: 0.087616 megabytes
test_y (labels) size RAW: 0.087616 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 2059.58 megabytes
train_yNPSize (labels) size: 0.0 megabytes
test_xNPSize (text) size: 201.24 megabytes
test_yNPSize (labels) size: 0.0 megabytes
train_xNP length  102979
train_xNP dimension of element  2
train_xNP size  514895000
Train or test empty. Did you misspell the dataset name?
('dailydialog', 'crowdflower', 'conversations', 'tweets')
Getting data
get_train_test param:
json  unified-dataset.jsonl
train  dailydialog
test  crowdflower
there were  39740  entries that were in test and  181699 that were not in test and  102979  that were in train
test was appended  39740  times
si

100%|████████████████████████████████████████████████████████████████████████| 102979/102979 [00:38<00:00, 2694.06it/s]
100%|██████████████████████████████████████████████████████████████████████████| 39740/39740 [00:16<00:00, 2404.03it/s]


train_x length  102979
train_x dimension of element  5000
train_x (text) size RAW: 0.824456 megabytes
train_y (labels) size RAW: 0.824456 megabytes
test_x (text) size RAW: 0.321096 megabytes
test_y (labels) size RAW: 0.321096 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 2059.58 megabytes
train_yNPSize (labels) size: 0.411916 megabytes
test_xNPSize (text) size: 794.8 megabytes
test_yNPSize (labels) size: 0.15896 megabytes
train_xNP length  102979
train_xNP dimension of element  2
train_xNP size  514895000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 2059.58 megabytes
train_y (labels) size: 0.411916 megabytes
train_x (text) length: 102979
train_y (labels) length: 102979
[

0it [00:00, ?it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1250/1250 [00:00<00:00, 9523.26it/s]


train_x length  0


IndexError: list index out of range