In [44]:
import regex as re
import sys
import os
import json
import random
import math
import operator as op
import docopt
import numpy as np
import os.path
import itertools
from os import path
from tqdm import tqdm
from scipy.spatial.distance import jensenshannon, cosine
from numpy import asarray
import statistics 
from collections import Counter, defaultdict, namedtuple

from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics.pairwise import cosine_similarity, chi2_kernel
from scipy.spatial import distance
import joblib
import shutil

# Per-emotion evaluation record: precision, recall, accuracy and F1 together with
# the raw confusion counts (true/false positives and negatives).
Report = namedtuple("Report", ["precision", "recall", "accuracy", "f1", "tp", "tn", "fp", "fn"])
# Path of the unified corpus file; the cells below read it line by line with json.loads.
JSON = "unified-dataset.jsonl"

In [45]:
#this method is used to get the classifier mode and decide whether to use single-label or multi-label classification
#this method comes from the original authors and is kept to replicate their results
def get_clf_mode(train, test):
    """Detect the labelling mode of the training and the testing examples.

    An example counts as multi-label when its "labeled" field equals "multi"
    or is missing altogether.

    Returns:
        A ``(train_mode, test_mode)`` tuple whose items are "single" or "multi".
        The train mode is printed, and "oof" is printed when no test example
        is multi-label.
    """
    # Every training example is inspected (no early exit), as before.
    train_flags = [example.get("labeled", "multi") for example in train]
    train_mode = "multi" if "multi" in train_flags else "single"
    print(train_mode)
    # The test side stops at the first multi-label example.
    if any(example.get("labeled", "multi") == "multi" for example in test):
        return train_mode, "multi"
    print("oof")
    return train_mode, "single"

In [46]:
#This method is used to extract the training and testing data from the unified corpus json.
#The unified corpus json must be produced using the authors original code
#this version is only used in getting the benchmarks for the previous paper
#this version takes the jsonfile, the name of the train file and the name of the test file as parameters
def get_train_test(jsonfile, train, test):
    """Split the unified corpus into training and testing examples by source.

    Args:
        jsonfile: path of the unified corpus in JSON Lines format; every
            non-blank line is a JSON object with a "source" key.
        train: comma-separated source name(s) to train on, or ``None`` /
            ``"all-vs"`` to train on every source except ``test``.
        test: name of the source to test on.

    Returns:
        A ``(training, testing)`` tuple of lists of example dicts. When
        ``test`` is one of the training sources, the test-source examples are
        re-split by ``hacky_train_test_split``.
    """
    print("get_train_test param:")
    print("json ", jsonfile)
    print("train ", train)
    print("test ", test)
    all_vs = train is None or train == "all-vs"
    # Split the source list once. The original called train.split(",") before
    # checking for None, which raised AttributeError for the all-vs case.
    train_sources = [] if train is None else train.split(",")
    same = test in train_sources  # used if train and test corpus are same
    training, testing = [], []
    in_test = 0
    not_in_test = 0
    train_appended = 0
    test_appended = 0
    # JSON text is UTF-8, so do not depend on the platform default encoding.
    with open(jsonfile, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # tolerate blank lines (e.g. a trailing newline at end of file)
                continue
            data = json.loads(line)
            source = data["source"]
            if source == test:
                in_test += 1
            else:
                not_in_test += 1
            if all_vs and source != test:
                train_appended += 1
                training.append(data)
            elif source == test:
                test_appended += 1
                testing.append(data)
            elif source in train_sources:
                train_appended += 1
                training.append(data)
    print("there were ", in_test, " entries that were in test and ", not_in_test, "that were not in test",
          "and ", train_appended, " that were in train")
    print("test was appended ", test_appended, " times")
    if same:
        # NOTE(review): only the test-source examples are re-split here, so
        # examples collected from any other training source are dropped.
        # Kept as-is to match the benchmark behaviour - confirm this is intended.
        training, testing = hacky_train_test_split(testing, train_size=0.8, first=train, second=test)
        print("revised", "there were ", len(testing), " entries that were in test and ", len(training), " that were in train")
    return training, testing

In [47]:
#this method stays as is from the original paper
def get_labels(train, test, operation=op.and_, mode="multi"):
    """Combine the emotion labels of the training data and the test data.

    Training labels are every emotion with a non-None value in any training
    example; test labels are taken from the first test example only.
    ``operation`` merges the two sets (intersection by default). In "single"
    mode the extra label "noemo" is always included.

    Returns:
        The resulting labels as a list (order follows set iteration order).
    """
    base = {"noemo"} if mode == "single" else set()
    train_emotions = {
        name
        for example in train
        for name, value in example["emotions"].items()
        if value is not None
    }
    first_test_emotions = test[0]["emotions"]
    test_emotions = {
        name for name, value in first_test_emotions.items() if value is not None
    }
    return list(base | operation(train_emotions, test_emotions))

In [48]:
#expects corpus list in data form
#returns compatible labels
def getMatchingLabels(corpora):
    """Return the emotion labels that are annotated in every given corpus.

    Args:
        corpora: iterable of corpora, each a list of example dicts with an
            "emotions" mapping of label -> value (None means "not annotated").

    Returns:
        The set of labels that carry a non-None value in at least one example
        of each corpus; an empty set when no corpus is given. The set is also
        printed.
    """
    emotionSetList = [
        {
            emotion
            for data in corpus
            for emotion, value in data["emotions"].items()
            if value is not None
        }
        for corpus in corpora
    ]
    # set.intersection(*[]) raises TypeError, so handle "no corpora" explicitly.
    intersectionSet = set.intersection(*emotionSetList) if emotionSetList else set()
    print(intersectionSet)
    return intersectionSet

In [49]:
#this method stays as is from the original paper
def get_emotion(emovals, labels, emotions, mode="multi"):
    """Encode the emotion annotation of one example as a classifier target.

    Args:
        emovals: mapping of emotion name -> annotated value (may be None).
        labels: emotion names selecting the columns of the multi-label target.
        emotions: mapping of emotion name -> class index (single mode).
        mode: "single" for one class index, anything else for a 0/1 vector.

    Returns:
        Single mode: the class index of the only truthy emotion, falling back
        to the index of "noemo" when none is set or the emotion is unknown.
        Otherwise: a numpy array with 1 where the value exceeds 0.1.

    Raises:
        ValueError: in single mode when more than one emotion is truthy.
    """
    if mode != "single":
        # None counts as 0; values above the 0.1 threshold switch the label on.
        return np.array([int((emovals[label] or 0) > 0.1) for label in labels])

    active = [name for name, value in emovals.items() if value]
    if len(active) > 1:
        raise ValueError("Dataset marked as 'single' contains multiple emotions")
    chosen = active[0] if active else "noemo"
    return emotions.get(chosen, emotions.get("noemo"))

In [50]:
#this method stays as is from the original paper
def get_vector(text, wordlist):
    """Return a binary bag-of-words vector of ``text`` over ``wordlist``.

    Position i is 1 when wordlist[i] occurs among the tokens of the text and
    0 otherwise.
    """
    present = set(tokenize(text))
    return [int(word in present) for word in wordlist]

In [51]:
#The comment below was left by the original authors. As you can see, their results were unable to use the full bag of words
# this is bad. memory error for all_vs (too many words...)
def get_wordlist(dataset):
    """Collect every distinct token of the dataset's texts into a list."""
    vocabulary = set()
    for example in dataset:
        vocabulary |= set(tokenize(example["text"]))
    return list(vocabulary)

In [52]:
#bag of word limit of 5000 is kept from the original authors to match their results
def getTop5000Words(dataset):
    """Return up to 5000 tokens ranked by the number of texts they occur in.

    The total number of distinct tokens is printed as "bag size".
    """
    documentFrequency = Counter()
    for example in dataset:
        # A set per text, so a token is counted at most once per example.
        documentFrequency.update(set(tokenize(example["text"])))
    print("bag size", len(documentFrequency))
    return [token for token, _count in documentFrequency.most_common(5000)]

In [53]:
#Taken from my own Ling 413 final project, I was going to run trials with lemmatization and other tokenization
#but by the time I was far enough in the project to do this, I didn't have time to run trials with this
# def cleanDataLemma(dataset):
#     taggedDataset = nltk.pos_tag(dataset)
#     filteredString = []
#     for token, tag in taggedDataset:
#         for char in token:
#             if char in string.punctuation:
#                 token = token.replace(char,"") #remove punctuation
#         if (token not in stopWords):
#             lemmatizedToken = ""
#             if tag[0] == 'N':
#                 lemmatizedToken = lemmatizer.lemmatize(token, 'n')
#             elif tag[0] == 'V':
#                 lemmatizedToken = lemmatizer.lemmatize(token, 'v')
#             else:
#                 lemmatizedToken = token
#             if len(lemmatizedToken) > 2:
#                 filteredString.append(lemmatizedToken)
#     return filteredString

In [54]:
#tokenization is kept the same so that performance results match the ones used in the paper as closely as possible
#if there is improvement, it should be because of my changes
def tokenize(text):
    """Lower-case ``text`` and return its runs of letters as a list of tokens.

    The pattern uses the ``\\p{L}`` letter class, which relies on the ``regex``
    module this notebook imports under the name ``re``.
    """
    lowered = text.lower()
    return re.findall(r"\p{L}+", lowered)

In [55]:
def getWordCountsByEmotion(dataset, emotionLabels):
    """Count, per emotion, in how many texts labelled with it each token occurs.

    Args:
        dataset: list of example dicts with "text" and an "emotions" mapping.
        emotionLabels: emotion names to build counters for.

    Returns:
        A list of Counters, one per entry of ``emotionLabels`` and in the same
        order. Only examples whose value for the emotion equals 1 contribute,
        and each token counts once per text. The size of every counter is
        printed.
    """
    emotionCounts = []
    print("emotions")
    for emotion in emotionLabels:
        emotionDict = Counter()
        for data in dataset:
            # Check the label once per text instead of once per token, and
            # skip tokenizing texts that do not carry this emotion at all.
            if data["emotions"][emotion] == 1:
                emotionDict.update(set(tokenize(data["text"])))
        print(len(emotionDict))
        emotionCounts.append(emotionDict)
    return emotionCounts

In [56]:
def getTokenFrequency(dataset):
    """Return the document frequency of every token.

    Args:
        dataset: iterable of documents, each an iterable of hashable tokens.

    Returns:
        A dict mapping each token to the number of documents it occurs in,
        with keys in order of first appearance.
    """
    documentFrequency = {}
    for document in dataset:
        # dict.fromkeys drops repeats inside one document but keeps their order.
        for token in dict.fromkeys(document):
            documentFrequency[token] = documentFrequency.get(token, 0) + 1
    return documentFrequency

In [57]:
def getTokenizedCorpusTextPair(corpus1, corpus2):
    """Load two sources from the unified corpus and tokenize their texts.

    Args:
        corpus1: source name of the first corpus (matched against "source").
        corpus2: source name of the second corpus.

    Returns:
        ``(corpus1Text, corpus2Text, corporaData)`` where the first two items
        are lists of token lists and ``corporaData`` is
        ``[corpus1Data, corpus2Data]``, the raw example dicts per corpus.
    """
    # Local accumulators: the original appended to notebook-level lists and
    # filtered on an undefined name instead of the two parameters.
    corpus1Text, corpus1Data = [], []
    corpus2Text, corpus2Data = [], []
    with open(JSON) as f:
        for line in f:
            data = json.loads(line)
            if data["source"] == corpus1:
                corpus1Text.append(tokenize(data["text"]))
                corpus1Data.append(data)
            if data["source"] == corpus2:
                corpus2Text.append(tokenize(data["text"]))
                corpus2Data.append(data)
    corporaData = [corpus1Data, corpus2Data]
    return corpus1Text, corpus2Text, corporaData

In [58]:
def getNormalizedFreq(tokenFreq):
    """Log-weight a token-frequency mapping and scale it to unit length.

    Every positive frequency f becomes 1 + log10(f) (zero stays zero), and the
    weights are then divided by their Euclidean norm.

    Args:
        tokenFreq: mapping of token -> frequency; it is not modified.

    Returns:
        A copy of the mapping (same type) holding the normalized weights. If
        every weight is zero the weights are returned unscaled, because there
        is no norm to divide by.
    """
    newTokenFreq = tokenFreq.copy()
    # Only values are reassigned while iterating, so the key set is stable.
    for item, freq in newTokenFreq.items():
        newTokenFreq[item] = 0 if freq == 0 else 1 + math.log10(freq)
    squaredSum = 0
    for weight in newTokenFreq.values():
        squaredSum += weight * weight
    docLength = math.sqrt(squaredSum)
    if docLength == 0:
        # All-zero (or empty) vector: dividing would raise ZeroDivisionError.
        return newTokenFreq
    for item, weight in newTokenFreq.items():
        newTokenFreq[item] = weight / docLength
    return newTokenFreq

In [59]:
def getCosineSimilarityFromTokenFreq(tokenFreq1, tokenFreq2):
    """Return the cosine similarity of two token-frequency mappings.

    Both mappings are normalized with getNormalizedFreq first; the similarity
    is then the dot product of the two weight vectors. The number of entries
    on each side is printed.
    """
    weights1 = getNormalizedFreq(tokenFreq1)
    weights2 = getNormalizedFreq(tokenFreq2)
    print("length 1", len(weights1))
    print("length 2", len(weights2))
    dotProduct = 0
    # Tokens missing on either side would contribute a zero product, so only
    # the shared tokens need to be visited.
    for token in weights1.keys() & weights2.keys():
        dotProduct += weights1[token] * weights2[token]
    return dotProduct

In [60]:
def getCosineSimilarityFromCorpus(corpus1,corpus2):
    """Return the cosine similarity of two corpora given by source name.

    The texts of both sources are loaded and tokenized, reduced to document
    frequencies, and compared with getCosineSimilarityFromTokenFreq.
    """
    # getTokenizedCorpusTextPair returns three items; the original unpacked
    # four, which raised ValueError. The raw example data is not needed here.
    corpus1Text, corpus2Text, _corporaData = getTokenizedCorpusTextPair(corpus1, corpus2)
    tokenFreq1 = getTokenFrequency(corpus1Text)
    tokenFreq2 = getTokenFrequency(corpus2Text)
    return getCosineSimilarityFromTokenFreq(tokenFreq1, tokenFreq2)

In [61]:
def getCosineSimilarityFromCorpusEmotions(corpus1,corpus2):
    """Print the per-emotion cosine similarity of two corpora.

    For every emotion label shared by both corpora, the per-emotion word
    statistics of the two corpora are compared and the similarity is printed.
    Nothing is returned.
    """
    # getTokenizedCorpusTextPair returns three items (the third holds both
    # data lists); the original unpacked four, which raised ValueError.
    _corpus1Text, _corpus2Text, corporaData = getTokenizedCorpusTextPair(corpus1, corpus2)
    corpus1Data, corpus2Data = corporaData
    # Freeze the label order so both corpora are indexed identically.
    emotionLabels = list(getMatchingLabels(corporaData))
    # NOTE(review): getTop5000WordsByEmotion is not defined in the visible part
    # of this notebook, and `words` is a notebook-level variable set by the
    # validation cell - confirm both exist before calling this function.
    emotionDicts1 = getTop5000WordsByEmotion(corpus1Data, words, emotionLabels)
    emotionDicts2 = getTop5000WordsByEmotion(corpus2Data, words, emotionLabels)
    for emotion in range(len(emotionLabels)):
        sim = getCosineSimilarityFromTokenFreq(emotionDicts1[emotion], emotionDicts2[emotion])
        print(sim)

In [62]:
#averages the values that come from jensenshannon into a single value
def getJensenShannonFromNPArrays(np1,np2):
    """Average the Jensen-Shannon distances between two arrays into one value.

    Args:
        np1, np2: array-likes of matching shape passed to scipy's
            ``jensenshannon``.

    Returns:
        The mean of the returned distances, where NaN distances count as 0 but
        still count towards the number of values. ``math.nan`` when there are
        no distances at all. The raw distances are printed.
    """
    js_pq = jensenshannon(np1, np2)
    print(js_pq)
    # 1-D inputs give a scalar; the original called len() on it and failed
    # with TypeError. Treat every result as a flat sequence of distances.
    distances = np.atleast_1d(js_pq).ravel()
    length = distances.size
    if length == 0:
        return math.nan
    sumJS = 0
    for value in distances:
        if not math.isnan(value):  # assume NaN values should be interpreted as 0
            sumJS += value
    return sumJS / length

In [63]:
#test code for understanding how NP arrays are distributed
# Load cached feature matrices from the working directory. The file names follow
# the "<first>_<second>train_xNP.npy" / "...test_xNP.npy" pattern that
# performTrialUsingCorpusPair checks for.
arr1 = np.load("ssec_emotion-causetrain_xNP.npy")
arr2 = np.load("ssec_grounded_emotionstrain_xNP.npy")
arr3 = np.load("isear_ssectest_xNP.npy")
arr4 = np.load("isear_ssectrain_xNP.npy")
arr5 = np.load("ssec_iseartest_xNP.npy")
# NOTE(review): arr5 is reassigned on the next line, so the array loaded just
# above is discarded. A separate name (e.g. arr6) was probably intended - confirm.
arr5 = np.load("ssec_iseartrain_xNP.npy")
# Element-wise equality between pairs of the loaded matrices.
print(np.array_equal(arr1,arr2))
print(np.array_equal(arr2,arr3))
print(np.array_equal(arr3,arr4))
print(np.array_equal(arr4,arr5))
print(np.array_equal(arr1,arr5))
print(np.array_equal(arr3,arr5))
# Shapes of the matrices.
print(arr1.shape)
print(arr2.shape)
print(arr3.shape)
print(arr4.shape)
print(arr5.shape)
# Averaged Jensen-Shannon distance; the two arrays of a pair must have the same shape.
print(getJensenShannonFromNPArrays(arr1,arr2))
print(getJensenShannonFromNPArrays(arr2,arr3))
# print(getJensenShannonFromNPArrays(arr4,arr5))

True
False
False
False
True
False
(4868, 5000)
(4868, 5000)
(4868, 5000)
(7666, 5000)
(4868, 5000)
[0. 0. 0. ... 0. 0. 0.]
0.0
[0.65300289 0.71217394 0.65470119 ...        nan 0.83255461        nan]
0.48101544698034715


In [64]:
# def getChiSquare(observed,calculated):
#     chiSquare = ((observed - calculated)**2)/calculated
#     return chiSquare

In [65]:
#Calculates Chi Square, or at least it would have if I had finished implementing
#see paper for details on why it was not implemented
# def getChiSquareFromTokenFreq(tokenFreq1, tokenFreq2):
# columnTotal1 = sum(tokenFreq1.values())
# columnTotal2 = sum(tokenFreq2.values())
# intersection = tokenFreq1.keys() & tokenFreq2.keys()
# rowTotals = {key: tokenFreq1.get(key, 0) + tokenFreq2.get(key, 0)
#           for key in set(dict1) | set(dict2)}
# grandTotal = columnTotal1 + columnTotal2
# chiSquareTotal = 0
# calculated1 = []
# calculated2 = []
# for item, rowTotal in rowTotals.items():
#     calculated1[item] = (rowTotal * columnTotal1) / grandTotal
#     calculated2[item] = (rowTotal * columnTotal2) / grandTotal
# for item, value in calculated.items():
#     getChiSquare(observed,calculated[item])
#     calculated[item]
#     return js

In [66]:
#This is a validation of my corpus similarity metrics
# Names of the two sources to compare, plus accumulators for their raw examples
# (…Data) and tokenized texts (…Text).
corpus1 = "ssec"
corpus1Data = []
corpus1Text = []
corpus2 = "isear"
corpus2Data = []
corpus2Text = []
# Single pass over the unified corpus, keeping only the two selected sources.
with open(JSON) as f:
    for line in f:
        data = json.loads(line)
        if data["source"] == corpus1:
            corpus1Text.append(tokenize(data["text"]))
            corpus1Data.append(data)
        if data["source"] == corpus2:
            corpus2Text.append(tokenize(data["text"]))
            corpus2Data.append(data)
print("loaded data")
combinedCorpus = corpus1Data + corpus2Data
# NOTE(review): combinedCorpusText is not used again in this cell.
combinedCorpusText = corpus1Text + corpus2Text
# tokenFreq = getTokenFrequency(corpus1Text)
# print("tokenFreq", tokenFreq)
# `words` is not used again in this cell, but getCosineSimilarityFromCorpusEmotions
# reads it as a notebook-level variable.
words = getTop5000Words(combinedCorpus)
corporaData = [corpus1Data,corpus2Data]
# Convert the shared-label set to a list so the labels can be indexed below.
emotionLabels = list(getMatchingLabels(corporaData))
emotions1 = getWordCountsByEmotion(corpus1Data, emotionLabels)
# print(emotions1)
emotions2 = getWordCountsByEmotion(corpus2Data, emotionLabels)
# print(emotions2)
# Per-emotion similarity between the two corpora.
for emotion in range(len(emotionLabels)):
    sim = getCosineSimilarityFromTokenFreq(emotions1[emotion], emotions2[emotion])
    print(emotionLabels[emotion], sim)
# Whole-corpus similarity, computed in both argument orders to check symmetry.
fullCorpus1Words = getTokenFrequency(corpus1Text)
fullCorpus2Words = getTokenFrequency(corpus2Text)
sim1 = getCosineSimilarityFromTokenFreq(fullCorpus1Words, fullCorpus2Words)
sim2 = getCosineSimilarityFromTokenFreq(fullCorpus2Words, fullCorpus1Words)
print(sim1)
print(sim2)

loaded data
bag size 17756
{'joy', 'disgust', 'fear', 'sadness', 'anger'}
emotions
6938
7353
6707
8349
8864
emotions
2527
3401
3118
2523
3339
length 1 6938
length 2 2527
joy 0.44398907860576825
length 1 7353
length 2 3401
disgust 0.45165658636487693
length 1 6707
length 2 3118
fear 0.43688992382789676
length 1 8349
length 2 2523
sadness 0.4538342791036568
length 1 8864
length 2 3339
anger 0.46229743778369375
length 1 12661
length 2 8888
length 1 8888
length 2 12661
0.5107359175632193
0.5107359175632193


In [67]:
# import gensim
# import numpy as np
# from nltk.tokenize import word_tokenize
# print(corpus1Text[:5])
# gen_docs = corpus1Text[:5]
# dictionary = gensim.corpora.Dictionary(gen_docs)
# # print(dictionary.token2id)
# corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]
# tf_idf = gensim.models.TfidfModel(corpus)
# for doc in tf_idf [corpus]:
#     print([[dictionary[id], np.around(freq, decimals=2)] for id, freq in doc])
# sims = gensim.similarities.Similarity("../Ling506TermProject/",tf_idf[corpus],
#                                         num_features=len(dictionary))



# file2_docs = ["Mars is the fourth planet in our solar system.",
#         "It is second-smallest planet in the Solar System after Mercury.",
#         "Saturn is yellow planet."]
# tf_idf = gensim.models.TfidfModel(corpus)

# print("Number of documents:",len(file2_docs))  
# for line in file2_docs:
#     query_doc = [w.lower() for w in word_tokenize(line)]
#     query_doc_bow = dictionary.doc2bow(query_doc) #update an existing dictionary and create bag of words

# # perform a similarity query against the corpus
# query_doc_tf_idf = tf_idf[query_doc_bow]
# # print(document_number, document_similarity)
# print('Comparing Result:', sims[query_doc_tf_idf]) 

# sum_of_sims =(np.sum(sims[query_doc_tf_idf], dtype=np.float32))
# print(sum_of_sims)

# avg_sims = [] # array of averages


# # for line in query documents
# for line in file2_docs:
#     # tokenize words
#     query_doc = [w.lower() for w in word_tokenize(line)]
#     # create bag of words
#     query_doc_bow = dictionary.doc2bow(query_doc)
#     # find similarity for each document
#     query_doc_tf_idf = tf_idf[query_doc_bow]
#     # print (document_number, document_similarity)
#     print('Comparing Result:', sims[query_doc_tf_idf]) 
#     # calculate sum of similarities for each query doc
#     sum_of_sims =(np.sum(sims[query_doc_tf_idf], dtype=np.float32))
#     # calculate average of similarity for each query doc
#     avg = sum_of_sims / len(file_docs)
#     # print average of similarity for each query doc
#     print(f'avg: {sum_of_sims / len(file_docs)}')
#     # add average values into array
#     avg_sims.append(avg)  
# # calculate total average
# total_avg = np.sum(avg_sims, dtype=np.float)
# # round the value and multiply by 100 to format it as percentage
# percentage_of_similarity = round(float(total_avg) * 100)
# # if percentage is greater than 100
# # that means documents are almost same
# if percentage_of_similarity >= 100:
#     percentage_of_similarity = 100
    

In [68]:
#Bad attempt at using prebuilt functions for distancing
# arr1 = np.load("ssec_emotion-causetrain_xNP.npy")
# arr2 = np.load("ssec_grounded_emotionstrain_xNP.npy")
# arr3 = np.load("isear_ssectest_xNP.npy")
# corpus1 = "ssec"
# corpus1Data = []
# corpus1Text = []
# corpus2 = "isear"
# corpus2Data = []
# corpus2Text = []
# with open(JSON) as f:
#     for line in f:
#         data = json.loads(line)
#         if data["source"] == corpus1:
#             corpus1Data.append(data)
#         if data["source"] == corpus2:
#             corpus2Data.append(data)
# print("loaded data")
# words1 = getTop5000Words(corpus1Data)
# print(words1)
# words2 = getTop5000Words(corpus2Data)
# for data in tqdm(corpus1Data):
#     corpus1Text.append(get_vector(data["text"], words1))
# for data in tqdm(corpus1Data):
#     corpus2Text.append(get_vector(data["text"], words2))
# # print(corpus1Text[:30])
# print(np.array_equal(arr1,arr2))
# print(np.array_equal(arr1,arr3))
# print(cosine_similarity(arr1,arr3))
# print(chi2_kernel(arr1,arr3))

In [69]:
#this method is modified to track the memory size of the generated arrays
def make_arrays(train, test, words, labels, mode="multi", all_vs=False):
    """Vectorize the train and test examples and report their memory footprint.

    Args:
        train: training example dicts with "text" and "emotions".
        test: testing example dicts with "text" and "emotions".
        words: word list that defines the bag-of-words columns.
        labels: emotion labels that define the target encoding.
        mode: "single" or "multi", forwarded to get_emotion.
        all_vs: when True, training examples are kept even if some of the
            selected emotions are not annotated (None).

    Returns:
        ``(train_x, train_y, test_x, test_y, sizes)`` where the first four are
        numpy arrays and ``sizes`` holds their ``nbytes`` in megabytes, in the
        same order.
    """
    emotions = {label: x for x, label in enumerate(labels)}
    print("emotions in make_arrays: ", emotions)
    train_x, train_y, test_x, test_y = [], [], [], []

    # sys.getsizeof reports the list object only, not the elements it refers
    # to, so the "RAW" figures below are container sizes.
    print("train raw text: ", sys.getsizeof(train)/1000000)

    for data in tqdm(train):
        # Discard examples where we don't have all selected emotions
        if (mode == "single" or all_vs or all(data["emotions"][emo] is not None for emo in labels)):
            train_y.append(get_emotion(data["emotions"], labels, emotions, mode))
            train_x.append(get_vector(data["text"], words))
    for data in tqdm(test):
        test_y.append(get_emotion(data["emotions"], labels, emotions, mode))
        test_x.append(get_vector(data["text"], words))

    print("train_x length ", len(train_x))
    # Guard the index: every training example may have been filtered out.
    print("train_x dimension of element ", len(train_x[0]) if train_x else 0)
    train_xSize = sys.getsizeof(train_x)/1000000
    train_ySize = sys.getsizeof(train_y)/1000000
    print("train_x (text) size RAW:", train_xSize,"megabytes")
    print("train_y (labels) size RAW:", train_ySize,"megabytes")
    test_xSize = sys.getsizeof(test_x)/1000000
    test_ySize = sys.getsizeof(test_y)/1000000
    print("test_x (text) size RAW:", test_xSize,"megabytes")
    print("test_y (labels) size RAW:", test_ySize,"megabytes")

    train_x = np.array(train_x)
    train_y = np.array(train_y)
    test_x = np.array(test_x)
    test_y = np.array(test_y)
    train_xNPSize = (train_x.nbytes)/1000000
    train_yNPSize = (train_y.nbytes)/1000000
    test_xNPSize = (test_x.nbytes)/1000000
    test_yNPSize = (test_y.nbytes)/1000000

    # NOTE(review): nothing is written to disk in this function; the message
    # below is kept only so the trial logs stay comparable.
    print("saved test_y")
    print("train_x Size stays the same", train_xSize == train_xNPSize)
    print("train_y Size stays the same", train_ySize == train_yNPSize)
    print("test_x Size stays the same", test_xSize == test_xNPSize)
    print("test_y Size stays the same", test_ySize == test_yNPSize)
    print("train_xNPSize (text) size:", train_xNPSize,"megabytes")
    print("train_yNPSize (labels) size:", train_yNPSize,"megabytes")
    print("test_xNPSize (text) size:", test_xNPSize,"megabytes")
    print("test_yNPSize (labels) size:", test_yNPSize,"megabytes")
    print("train_xNP length ", len(train_x))
    print("train_xNP dimension of element ", train_x.ndim)
    print("train_xNP size ", train_x.size)
    sizes = train_xNPSize, train_yNPSize, test_xNPSize, test_yNPSize
    return train_x, train_y, test_x, test_y, sizes

In [72]:
#kept as part of classification definitions, prevents division by 0 errors
def cheatydiv(x, y):
    """Divide x by y, returning NaN instead of raising when y equals zero."""
    if y == 0:
        return math.nan
    return x / y

In [73]:
#classification reporting is kept the same for simplicity (ie, no need to reinvent the wheel)
def classification_report_own_single(test_y, predict_y, labels):
    """Build a per-emotion Report for single-label predictions.

    Args:
        test_y: true class indices.
        predict_y: predicted class indices, aligned with ``test_y``.
        labels: emotion names; the position of a name is its class index.

    Returns:
        A dict mapping emotion name -> Report for every class that occurs in
        ``test_y``. Precision, recall and F1 are NaN when undefined.
    """
    num2emo = dict(enumerate(labels))
    # decisions[truth][prediction] is the confusion count for that pair.
    decisions = defaultdict(Counter)
    for truth, predicted in zip(test_y, predict_y):
        decisions[truth][predicted] += 1

    reports = {}
    for label in decisions:
        tp = decisions[label][label]
        fn = sum(count for guess, count in decisions[label].items() if guess != label)
        fp = sum(decisions[other][label] for other in decisions if other != label)
        tn = sum(
            count
            for other, row in decisions.items()
            if other != label
            for guess, count in row.items()
            if guess != label
        )
        precision = cheatydiv(tp, tp + fp)
        recall = cheatydiv(tp, tp + fn)
        f1 = 2 * cheatydiv((precision * recall), (precision + recall))
        accuracy = (tp + tn) / (tp + tn + fp + fn)
        reports[num2emo[label]] = Report(precision, recall, accuracy, f1, tp, tn, fp, fn)
    return reports

In [74]:
#classification reporting is kept the same for simplicity (ie, no need to reinvent the wheel)
def classification_report_own_multi(test_y, predict_y, labels):
    """Build a per-emotion Report for multi-label predictions.

    Args:
        test_y: true label vectors, one per example.
        predict_y: predicted label vectors, aligned with ``test_y``.
        labels: emotion names; the position of a name is its vector column.

    Returns:
        A dict mapping emotion name -> Report. Precision, recall and F1 are
        NaN when undefined.
    """
    emo2num = {label: i for i, label in enumerate(labels)}
    reports = {}
    for label in labels:
        column = emo2num[label]
        tp = fp = tn = fn = 0
        for truth, predicted in zip(test_y, predict_y):
            actual, guessed = truth[column], predicted[column]
            if actual and guessed:
                tp += 1
            elif guessed:
                fp += 1
            elif actual:
                fn += 1
            else:
                tn += 1
        precision = cheatydiv(tp, tp + fp)
        recall = cheatydiv(tp, tp + fn)
        f1 = 2 * cheatydiv((precision * recall), (precision + recall))
        accuracy = (tp + tn) / (tp + tn + fp + fn)
        reports[label] = Report(precision, recall, accuracy, f1, tp, tn, fp, fn)
    return reports

In [75]:
#classification reporting is kept the same for simplicity (ie, no need to reinvent the wheel)
def analyse_results(test_y, predict_y, labels, test, first, second, output, mode):
    """Score the predictions and write a text log plus a one-line JSON summary.

    Two files named ``<first>_vs_<second>_<mode>`` with the extensions ``.txt``
    and ``.json`` are created inside the ``output`` directory.

    Args:
        test_y: true targets.
        predict_y: predicted targets, aligned with ``test_y``.
        labels: emotion labels, forwarded to the per-emotion report functions.
        test: the test examples, aligned with ``test_y``; used for the log only.
        first: name of the training corpus (file name prefix).
        second: name of the testing corpus (file name prefix).
        output: directory the two result files are written to.
        mode: "multi" or "single"; selects the per-emotion report function.
    """
    print("analyse_results")
    prefix = f"{first}_vs_{second}_{mode}"
    fprefix = output + "/" + prefix
    # f receives the human-readable log, g the JSON summary.
    with open(fprefix + ".txt", "w", encoding="utf-8") as f, open(fprefix + ".json", "w") as g:
        print("hello")
        # Micro-averaged precision / recall / F1 over all labels.
        prec, reca, f1, supp = precision_recall_fscore_support(
            test_y, predict_y, pos_label=None, average="micro"
        )
        accuracy = accuracy_score(test_y, predict_y)
        scoreNameArray = [(prec, "Precision"),(reca, "Recall"),(f1, "F1-score"),(accuracy, "Accuracy")]
        # Each score goes both to the log file and to stdout.
        for score, name in scoreNameArray:
            print(name, score, sep="\t", file=f)
            print(name, score, sep="\t")
            
        # print("real:", Counter(test_y), file=f)
        # print("predicted:", Counter(predict_y), file=f)
        
        print(test_y[:10], predict_y[:10], file=f)
        # NOTE(review): `emotions` is not used in the rest of this function.
        emotions = {i: label for i, label in enumerate(labels)}
        # Zipping with range(20) restricts the loop to the first 20 examples;
        # only the misclassified ones among them are logged.
        for text, real, predicted, _ in zip(test, test_y, predict_y, range(20)):
            if mode == "multi" and np.array_equal(real, predicted):
                continue
            elif mode == "single" and real == predicted:
                continue
            print(text, "=> predicted:", predicted, ", truth:", real, file=f)
        # NOTE(review): for any other mode `results` stays unbound and the
        # json.dump below fails with NameError.
        if mode == "multi":
            results = classification_report_own_multi(test_y, predict_y, labels)
        elif mode == "single":
            results = classification_report_own_single(test_y, predict_y, labels)
        # One flat JSON object: the overall scores plus "<emotion>_<metric>"
        # entries for every field of each per-emotion Report.
        json.dump(
            {
                "precision": prec,
                "recall": reca,
                "f1": f1,
                "accuracy": accuracy,
                "name": prefix,
                **{
                    (emotion + "_" + metric): getattr(results[emotion], metric)
                    for emotion in results
                    for metric in Report._fields
                },
            },
            g,
        )
        g.write("\n")

In [76]:
#used for benchmarking/validating the results of the authors, but not in the final version
#method is kept here for documentation
def hacky_train_test_split(training, train_size=0.8, first=None, second=None):
    """Split examples into train and test lists, honouring predefined splits.

    An example goes to the train list when its "split" field is "train" or its
    source differs from ``second``; to the test list when its "split" field is
    "test"; otherwise it is assigned at random, landing in the train list with
    probability ``train_size``. ``first`` is accepted but not used.

    Returns:
        A ``(train, test)`` tuple of lists.
    """
    tra, tes = [], []
    for example in training:
        split = example.get("split")
        if split == "train" or example["source"] != second:
            bucket = tra
        elif split == "test":
            bucket = tes
        elif random.random() < train_size:
            bucket = tra
        else:
            bucket = tes
        bucket.append(example)
    return tra, tes

In [77]:
#used for benchmarking/validating the results of the authors, but not in the final version
#method is kept here for documentation
def splitTrainAndTestData(training, train_size=0.8, first=None, second=None):
    """Partition examples into a train list and a test list.

    Examples marked with split "train", or coming from a source other than
    ``second``, are used for training; examples marked with split "test" are
    used for testing; all remaining examples are drawn at random, with
    ``train_size`` as the probability of ending up in the train list.
    ``first`` is accepted but not used.

    Returns:
        A ``(train, test)`` tuple of lists.
    """
    tra, tes = [], []
    for example in training:
        if example.get("split") == "train" or example["source"] != second:
            goes_to_train = True
        elif example.get("split") == "test":
            goes_to_train = False
        else:
            goes_to_train = random.random() < train_size
        (tra if goes_to_train else tes).append(example)
    return tra, tes

In [78]:
#this method is used in my testing to generate the combinations that I use in my trials automation
def getPowerset(s):
    """Yield every subset of the sequence ``s`` as a list.

    Subsets are produced in binary counting order: bit ``position`` of the
    running counter decides whether ``s[position]`` is part of the subset.
    """
    size = len(s)
    for selector in range(2 ** size):
        yield [element for position, element in enumerate(s) if (selector >> position) & 1]

In [79]:
#this method is used in my testing to generate the combinations that I use in my trials automation
def getPermutations(s):
    """Return the set of all ordered pairs (2-permutations) of the items of ``s``."""
    return set(itertools.permutations(s, 2))

In [80]:
#this method is simply in place to get a measure of hard drive space left on my computer
def getHardDriveSpaceLeft():
    """Print and return the total, used and free space of "/" in whole GiB.

    Returns:
        A ``(total, used, free)`` tuple of integers.
    """
    gibibyte = 2 ** 30
    total, used, free = (amount // gibibyte for amount in shutil.disk_usage("/"))
    print("Total: %d GB" % total)
    print("Used: %d GB" % used)
    print("Free: %d GB" % free)
    return total, used, free

In [81]:
def getCrossCorpusValuesWithOrder(possibleChoices):
    """Build the ordered corpus pairs for the cross-corpus trials.

    Ordering matters when the original authors' version is used, so both
    (a, b) and (b, a) are produced.

    Args:
        possibleChoices: iterable of ``(corpusName, domain)`` tuples.

    Returns:
        A list of ``(firstCorpus, secondCorpus, domain1, domain2)`` tuples,
        one per ordered pair of distinct choices.
    """
    permutations = list(getPermutations(possibleChoices))
    print("permutations length: ",len(permutations))
    corporaSets = []
    for choice in permutations:
        if len(choice) != 2:
            # Only pairs can be unpacked. The original appended outside this
            # check, which would have reused the previous pair's values (or
            # failed on unbound names) for anything else.
            continue
        (firstCorpus, domain1), (secondCorpus, domain2) = choice
        corporaSets.append((firstCorpus, secondCorpus, domain1, domain2))
    return corporaSets

In [82]:
# This method adds the combinations relating to the All-VS trials
def getAllVsCorpusValues(possibleChoices):
    """Build the trial tuples in which each corpus is tested against all others.

    Args:
        possibleChoices: iterable of ``(corpusName, domain)`` tuples.

    Returns:
        A list of ``(None, corpusName, None, domain)`` tuples; the first corpus
        and its domain are left as None.
    """
    return [(None, corpus, None, domain) for corpus, domain in possibleChoices]

In [83]:
#Gets the corpora pairs of the same domain
#powerSet is specified because it is only used in the case with a powerSet where ordering does not matter
#but hypothetically, you could put in any list of possible entries
def getCorporaPairsOfSameDomain(powerSet, sizeBoundLower=1, sizeBoundUpper=3):
    """Select the corpus groups whose members all share one domain.

    Args:
        powerSet: iterable of groups, each a sequence of
            ``(corpusName, domain)`` tuples.
        sizeBoundLower: exclusive lower bound on the group size.
        sizeBoundUpper: exclusive upper bound on the group size (with the
            defaults only pairs are kept).

    Returns:
        The list of groups within the size bounds whose domains are all equal.
        Its length is printed.
    """
    # Collect into a local list: the original appended to one undefined name
    # and returned another, so it could not run on a fresh kernel.
    sameDomainCorporaPairs = []
    for entry in powerSet:
        # An empty group has no domain to compare against; skip it.
        if entry and sizeBoundLower < len(entry) < sizeBoundUpper:
            domainMatch = entry[0][1]
            if all(domain == domainMatch for _corpus, domain in entry):
                sameDomainCorporaPairs.append(entry)
    print("CorporaPairsOfSameDomain:",len(sameDomainCorporaPairs))
    return sameDomainCorporaPairs

In [84]:
#this method adds the trials where the corpus is trained and tested on itself
def getCorporaPairsWithItself(possibleChoices):
    """Build the trial tuples in which a corpus is trained and tested on itself.

    Args:
        possibleChoices: iterable of ``(corpusName, domain)`` tuples.

    Returns:
        A list of ``(corpusName, corpusName, domain, domain)`` tuples.
    """
    return [(corpus, corpus, domain, domain) for corpus, domain in possibleChoices]

In [86]:
def performTrialUsingCorpusPair(corpusPair, verifyResults):
    """Run a single train/test trial for one corpus pair, caching artefacts on disk.

    Parameters:
        corpusPair: 4-tuple ``(first, second, domain1, domain2)``. ``first`` is the
            training corpus name and ``second`` the testing corpus name, as passed to
            ``get_train_test``. ``first`` being ``None`` marks an all-vs trial.
            ``domain1``/``domain2`` are unpacked but not used further in this function.
        verifyResults: when equal to ``False``, the trial is skipped (early return) if
            the four cached ``.npy`` arrays and the RandomForest ``.pkl`` file already
            exist. Otherwise the trial is always run.

    Side effects:
        * Reads "unified-dataset.jsonl" via ``get_train_test``.
        * Loads or writes ``<first>_<second>{train,test}_{x,y}NP.npy`` and
          ``<first>_<second><classifierName>.pkl`` using relative paths.
        * Calls ``analyse_results`` with output directory ".".
        * Calls ``sys.exit`` when ``getHardDriveSpaceLeft`` reports ``free < 10``.

    Returns:
        None in every case (skipped trial, empty arrays, or completed trial).
    """
    print("------------------------------",corpusPair,"-------------------------------------------")
    (first, second, domain1, domain2) = corpusPair
    print("Getting data")
    # Trial configuration is hard-coded here rather than passed in.
    jsonfile = "unified-dataset.jsonl"
    output = "."
    debug = True
    forceMulti = False
    # A missing first corpus marks an all-vs trial; the flag is forwarded to make_arrays.
    isAllVS = False
    if first == None:
        isAllVS = True

    # Resume support: skip the trial when every cached artefact is already on disk.
    if(verifyResults == False):
        if(first == None):
            first = "all-vs"
        train_xNPFileName = first + "_" + second + "train_xNP" +".npy"
        train_yNPFileName = first + "_" + second + "train_yNP" +".npy"
        test_xNPFileName = first + "_" + second + "test_xNP" +".npy"
        test_yNPFileName = first + "_" + second + "test_yNP" +".npy"
        # Only the RandomForest pickle is checked; this matches the debug branch below.
        classifierName = "RandomForestClassifier"
        classiferSaveFile = first+"_"+second+classifierName+".pkl"
        filesValid = (path.exists(train_xNPFileName) 
                       and path.exists(train_yNPFileName)
                       and path.exists(test_xNPFileName)
                       and path.exists(test_yNPFileName)
                       and path.exists(classiferSaveFile))
        print("do pickle files exist?", filesValid)
        if(filesValid):
            print("skipping trial")
            return


    # NOTE(review): for an all-vs trial, `first` is "all-vs" here only if the skip check
    # above ran (verifyResults == False); otherwise it is still None at this call.
    # Confirm get_train_test handles both values.
    training_data, testing_data = get_train_test(jsonfile, first,second)
    firstCLF, secondCLF = (["multi", "multi"] if forceMulti else get_clf_mode(training_data, testing_data))
    # The trial is multi-label as soon as either side is multi-label.
    mode = "multi" if "multi" in [firstCLF, secondCLF] else "single"

    print("Detected mode: {}...".format(mode))
    print(len(training_data), len(testing_data))
    print("Getting wordlist...")
    # Both branches currently build the same wordlist; the alternative is commented out.
    if debug:
        wordlist = getTop5000Words(training_data)
    else:
        wordlist = getTop5000Words(training_data)
        # wordlist = get_wordlist(training_data)
    print("Getting emotions")
    labels = get_labels(training_data, testing_data, mode=mode)
    print(labels)
    print("Making arrays")
    print("checking for save files")
    # Cache file names are rebuilt here because the block above only runs when
    # verifyResults == False.
    if(first == None):
        first = "all-vs"
    train_xNPFileName = first + "_" + second + "train_xNP" +".npy"
    train_yNPFileName = first + "_" + second + "train_yNP" +".npy"
    test_xNPFileName = first + "_" + second + "test_xNP" +".npy"
    test_yNPFileName = first + "_" + second + "test_yNP" +".npy"

    # Load the feature/label arrays from disk when all four exist, otherwise build and save them.
    if(path.exists(train_xNPFileName) 
       and path.exists(train_yNPFileName)
       and path.exists(test_xNPFileName)
       and path.exists(test_yNPFileName)):
        # These messages say "saved", but this branch loads previously saved files.
        print('saved train_xNP as', train_xNPFileName)
        print('saved train_yNP as', train_yNPFileName)
        print('saved test_xNP as', test_xNPFileName)
        print('saved test_yNP as', test_yNPFileName)
        print("loading from np")
        train_x = np.load(train_xNPFileName)
        train_y = np.load(train_yNPFileName)
        test_x = np.load(test_xNPFileName)
        test_y = np.load(test_yNPFileName)
        # nbytes / 1e6 -> size in (decimal) megabytes, for logging only.
        train_xNPSize = (train_x.nbytes)/1000000
        train_yNPSize = (train_y.nbytes)/1000000
        test_xNPSize = (test_x.nbytes)/1000000
        test_yNPSize = (test_y.nbytes)/1000000
        print("loaded directly from NP.load")
        print("train_xNPSize (text) size loaded:", train_xNPSize,"megabytes")
        print("train_yNPSize (labels) size loaded:", train_yNPSize,"megabytes")
        print("test_xNPSize (text) size loaded:", test_xNPSize,"megabytes")
        print("test_yNPSize (labels) size loaded:", test_yNPSize,"megabytes")
    else:
#         print("training_data", training_data)
#         print("testing_data", testing_data)
        train_x, train_y, test_x, test_y, sizes = make_arrays(training_data, testing_data, wordlist, labels, mode, isAllVS)
        # The sizes are unpacked but not used afterwards in this function.
        train_xSize, train_ySize, test_xSize, test_ySize = sizes
        # Abort this trial (without exiting the program) when any array has zero elements.
        if any(not part.size for part in [train_x, train_y, test_x, test_y]):
            print("Train or test empty. Did you misspell the dataset name?")
            return
        #             sys.exit(1)
        print("saving NP arrays")
        np.save(train_xNPFileName, train_x)
        np.save(train_yNPFileName, train_y)
        np.save(test_xNPFileName, test_x)
        np.save(test_yNPFileName, test_y)
        print("NP arrays saved")

    print("Initializing classifier")
    # Stays True unless a previously saved classifier is loaded from disk.
    trainClassifier = True
    # `debug` is hard-coded to True above, so only the RandomForest branch is reachable
    # as written; the two LogisticRegressionCV branches are kept from the original setup.
    if debug:
        classifierName = "RandomForestClassifier"
        print("Searching for a ", classifierName)
        classiferSaveFile = first+"_"+second+classifierName+".pkl"
        print(path.exists(classiferSaveFile))
        if(path.exists(classiferSaveFile)):
            trainClassifier = False
            print("Loading classifier from file")
            classifier = joblib.load(classiferSaveFile)
            print("classifier loaded successfully")
        else:
            print("file not found, creating new classifier")
            classifier = RandomForestClassifier()
    elif mode == "single":
        classifierName = "LogisticRegressionCV"
        print("Searching for a ", classifierName)
        classiferSaveFile = first+"_"+second+classifierName+".pkl"
        print(path.exists(classiferSaveFile))
        if(path.exists(classiferSaveFile)):
            trainClassifier = False
            print("Loading classifier from file")
            classifier = joblib.load(classiferSaveFile)
            print("classifier loaded successfully")
        else:
            print("file not found, creating new classifier")
            classifier = LogisticRegressionCV(
                cv=10,
                penalty="l2",
                fit_intercept=True,
                solver="sag",
                scoring="f1",
                refit=True,
                # n_jobs=-1,
                class_weight="balanced",
            )
    else:
        classifierName = "OneVsRestClassifier"
        print("Searching for a ", classifierName)
        classiferSaveFile = first+"_"+second+classifierName+".pkl"
        print(path.exists(classiferSaveFile))
        if(path.exists(classiferSaveFile)):
            trainClassifier = False
            print("Loading classifier from file")
            classifier = joblib.load(classiferSaveFile)
            print("classifier loaded successfully")
        else:
            print("file not found, creating new classifier")
            # NOTE(review): OneVsRestClassifier is not among the imports visible at the
            # top of the notebook - verify it is imported before enabling this branch.
            classifier = OneVsRestClassifier(
                LogisticRegressionCV(
                    cv=10,
                    penalty="l2",
                    fit_intercept=True,
                    solver="sag",
                    scoring="f1",
                    refit=True,
                    class_weight="balanced",
                    tol = 0.1,
                ),
                n_jobs=-1,
            )
    if(trainClassifier):
        print("this is the classifierName: ", classifierName)
        print("Training...")
        print("train_x (text) size:", (train_x.nbytes)/1000000,"megabytes")
        print("train_y (labels) size:", (train_y.nbytes)/1000000,"megabytes")
        print("train_x (text) length:", len(train_x))
        print("train_y (labels) length:", len(train_y))
        print(train_x[:5])
        print(train_y[:5])

        classifier.fit(train_x, train_y)
        # sys.getsizeof reports the size of the estimator object itself, which need not
        # include the data it references.
        print("finished training, classifier size:", sys.getsizeof(classifier)/1000000,"megabytes")
    print("Predicting...")
    # NOTE(review): `first`/`second` hold corpus names at this point, while the detected
    # modes live in firstCLF/secondCLF. As written this branch only runs for corpora
    # literally named "multi" and "single" - confirm whether firstCLF/secondCLF were meant.
    if first == "multi" and second == "single":
        # Turn class probabilities into a one-hot prediction of the most likely class.
        predict_y = classifier.predict_proba(test_x)
        helper = np.zeros_like(predict_y)
        helper[range(len(predict_y)), predict_y.argmax(1)] = 1
        predict_y = helper
    else:
        predict_y = classifier.predict(test_x)

    print("Analysing...")

    analyse_results(
        test_y,
        predict_y,
        labels,
        testing_data,
        first,
        second,
        output,
        mode,  # TODO
    )
    # Persist the classifier only if no pickle with this name exists yet.
    if(path.exists(classiferSaveFile)):
        print("classifier already saved")
    else:
#         classiferSaveFile = first+"_"+second+classifierName+".pkl"
        print("classiferSaveFile: ", classiferSaveFile)
        joblib.dump(classifier, classiferSaveFile)
        print("Saved Successfully")
    # Stop the whole run when disk space runs low, since every trial writes large arrays.
    # Presumably `free` is in gigabytes, judging by the error message - confirm against
    # getHardDriveSpaceLeft.
    total, used, free = getHardDriveSpaceLeft()
    if(free < 10):
        sys.exit("Error: less than 10 gb remaining on disk")
    print("-----------------------------------------------------------------------------------------")
    return

In [87]:
def runTrials(version, verifyResults, crossCorpus=True, sameCorpus=True, allVs=False):
    """Build the list of corpus pairs and run one trial per pair.

    Parameters:
        version: ``"previous"`` runs the cross-corpus, same-corpus and all-vs
            trials. Any other value (e.g. ``"myTrials"``) only builds the power
            set of the corpora and runs no trials.
        verifyResults: forwarded unchanged to ``performTrialUsingCorpusPair``.
        crossCorpus, sameCorpus, allVs: accepted for the caller's convenience but
            not consulted by this function; all three trial families are always
            queued when ``version == "previous"``.

    Returns:
        None. Prints "End of program!" when done.
    """
    #excluded ('emobank','headlines') because it is isn't emotion annotated
    #and ('electoraltweets','tweets') because it has incompatible annotation
    #and  ('fb-valence-arousal-anon','tweets') because it isn't emotion annotated
    possibleChoices = [
        ('affectivetext','headlines'),
        ('crowdflower','tweets'),
        ('dailydialog','conversations'),
        ('emoint','tweets'),
        ('emotion-cause','paragraphs'),
        ('grounded_emotions','tweets'),
        ('isear','descriptions'),
        ('ssec','tweets'),
        ('tales-emotion','tales'),
        ('tec','tweets'),
    ]

    if version != "previous":
        # version == "myTrials": the power set is built but nothing is run with it yet
        powerSet = list(getPowerset(possibleChoices))
    else:
        crossCorpusPairs = getCrossCorpusValuesWithOrder(possibleChoices)
        # Order by domain of the first corpus, then first corpus name, then second.
        # Reversed because the regular order would put the largest trials first;
        # reversing (loosely) runs the smaller trials first without affecting
        # which results are obtained.
        trialQueue = sorted(
            crossCorpusPairs,
            key=lambda pair: (pair[2], pair[0], pair[1]),
            reverse=True,
        )
        trialQueue.extend(getCorporaPairsWithItself(possibleChoices))
        trialQueue.extend(getAllVsCorpusValues(possibleChoices))
        for trialPair in trialQueue:
            performTrialUsingCorpusPair(trialPair, verifyResults)

    print("End of program!")

In [88]:
if __name__ == "__main__":
    # Run configuration. The names are kept at module level, as before.
    version = "previous"  # alternative: "myTrials"
    verifyResults = False
    crossCorpus, sameCorpus, allVs = True, False, False
    runTrials(
        version,
        verifyResults,
        crossCorpus=crossCorpus,
        sameCorpus=sameCorpus,
        allVs=allVs,
    )

permutations length:  90
------------------------------ ('tec', 'tales-emotion', 'tweets', 'tales') -------------------------------------------
Getting data
do pickle files exist? True
skipping trial
------------------------------ ('tec', 'ssec', 'tweets', 'tweets') -------------------------------------------
Getting data
do pickle files exist? True
skipping trial
------------------------------ ('tec', 'isear', 'tweets', 'descriptions') -------------------------------------------
Getting data
do pickle files exist? True
skipping trial
------------------------------ ('tec', 'grounded_emotions', 'tweets', 'tweets') -------------------------------------------
Getting data
do pickle files exist? True
skipping trial
------------------------------ ('tec', 'emotion-cause', 'tweets', 'paragraphs') -------------------------------------------
Getting data
do pickle files exist? True
skipping trial
------------------------------ ('tec', 'emoint', 'tweets', 'tweets') ------------------------------

------------------------------ ('tales-emotion', 'tec', 'tales', 'tweets') -------------------------------------------
Getting data
do pickle files exist? True
skipping trial
------------------------------ ('tales-emotion', 'ssec', 'tales', 'tweets') -------------------------------------------
Getting data
do pickle files exist? True
skipping trial
------------------------------ ('tales-emotion', 'isear', 'tales', 'descriptions') -------------------------------------------
Getting data
do pickle files exist? True
skipping trial
------------------------------ ('tales-emotion', 'grounded_emotions', 'tales', 'tweets') -------------------------------------------
Getting data
do pickle files exist? True
skipping trial
------------------------------ ('tales-emotion', 'emotion-cause', 'tales', 'paragraphs') -------------------------------------------
Getting data
do pickle files exist? True
skipping trial
------------------------------ ('tales-emotion', 'emoint', 'tales', 'tweets') ----------

------------------------------ ('dailydialog', 'affectivetext', 'conversations', 'headlines') -------------------------------------------
Getting data
do pickle files exist? True
skipping trial
------------------------------ ('affectivetext', 'affectivetext', 'headlines', 'headlines') -------------------------------------------
Getting data
do pickle files exist? True
skipping trial
------------------------------ ('crowdflower', 'crowdflower', 'tweets', 'tweets') -------------------------------------------
Getting data
do pickle files exist? True
skipping trial
------------------------------ ('dailydialog', 'dailydialog', 'conversations', 'conversations') -------------------------------------------
Getting data
do pickle files exist? True
skipping trial
------------------------------ ('emoint', 'emoint', 'tweets', 'tweets') -------------------------------------------
Getting data
do pickle files exist? True
skipping trial
------------------------------ ('emotion-cause', 'emotion-cause'

100%|████████████████████████████████████████████████████████████████████████| 181699/181699 [01:20<00:00, 2255.41it/s]
100%|██████████████████████████████████████████████████████████████████████████| 39740/39740 [00:15<00:00, 2557.75it/s]


train_x length  181699
train_x dimension of element  5000
train_x (text) size RAW: 1.485992 megabytes
train_y (labels) size RAW: 1.485992 megabytes
test_x (text) size RAW: 0.321096 megabytes
test_y (labels) size RAW: 0.321096 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 3633.98 megabytes
train_yNPSize (labels) size: 5.087572 megabytes
test_xNPSize (text) size: 794.8 megabytes
test_yNPSize (labels) size: 1.11272 megabytes
train_xNP length  181699
train_xNP dimension of element  2
train_xNP size  908495000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 3633.98 megabytes
train_y (labels) size: 5.087572 megabytes
train_x (text) length: 181699
train_y (labels) length: 181699
[

100%|████████████████████████████████████████████████████████████████████████| 118460/118460 [00:57<00:00, 2069.16it/s]
100%|████████████████████████████████████████████████████████████████████████| 102979/102979 [00:50<00:00, 2042.40it/s]


train_x length  118460
train_x dimension of element  5000
train_x (text) size RAW: 1.043552 megabytes
train_y (labels) size RAW: 1.043552 megabytes
test_x (text) size RAW: 0.824456 megabytes
test_y (labels) size RAW: 0.824456 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 2369.2 megabytes
train_yNPSize (labels) size: 3.31688 megabytes
test_xNPSize (text) size: 2059.58 megabytes
test_yNPSize (labels) size: 2.883412 megabytes
train_xNP length  118460
train_xNP dimension of element  2
train_xNP size  592300000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 2369.2 megabytes
train_y (labels) size: 3.31688 megabytes
train_x (text) length: 118460
train_y (labels) length: 118460
[[

100%|████████████████████████████████████████████████████████████████████████| 214337/214337 [01:36<00:00, 2223.07it/s]
100%|████████████████████████████████████████████████████████████████████████████| 7102/7102 [00:03<00:00, 2329.44it/s]


train_x length  214337
train_x dimension of element  5000
train_x (text) size RAW: 1.8808 megabytes
train_y (labels) size RAW: 1.8808 megabytes
test_x (text) size RAW: 0.061424 megabytes
test_y (labels) size RAW: 0.061424 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 4286.74 megabytes
train_yNPSize (labels) size: 3.429392 megabytes
test_xNPSize (text) size: 142.04 megabytes
test_yNPSize (labels) size: 0.113632 megabytes
train_xNP length  214337
train_xNP dimension of element  2
train_xNP size  1071685000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 4286.74 megabytes
train_y (labels) size: 3.429392 megabytes
train_x (text) length: 214337
train_y (labels) length: 214337
[[

100%|████████████████████████████████████████████████████████████████████████| 219025/219025 [01:37<00:00, 2247.74it/s]
100%|████████████████████████████████████████████████████████████████████████████| 2414/2414 [00:01<00:00, 2391.62it/s]


train_x length  219025
train_x dimension of element  5000
train_x (text) size RAW: 1.8808 megabytes
train_y (labels) size RAW: 1.8808 megabytes
test_x (text) size RAW: 0.02104 megabytes
test_y (labels) size RAW: 0.02104 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 4380.5 megabytes
train_yNPSize (labels) size: 5.2566 megabytes
test_xNPSize (text) size: 48.28 megabytes
test_yNPSize (labels) size: 0.057936 megabytes
train_xNP length  219025
train_xNP dimension of element  2
train_xNP size  1095125000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 4380.5 megabytes
train_y (labels) size: 5.2566 megabytes
train_x (text) length: 219025
train_y (labels) length: 219025
[[0 0 0 ...

100%|████████████████████████████████████████████████████████████████████████| 218854/218854 [01:35<00:00, 2301.26it/s]
100%|████████████████████████████████████████████████████████████████████████████| 2585/2585 [00:01<00:00, 2381.76it/s]


train_x length  218854
train_x dimension of element  5000
train_x (text) size RAW: 1.8808 megabytes
train_y (labels) size RAW: 1.8808 megabytes
test_x (text) size RAW: 0.02104 megabytes
test_y (labels) size RAW: 0.02104 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 4377.08 megabytes
train_yNPSize (labels) size: 1.750832 megabytes
test_xNPSize (text) size: 51.7 megabytes
test_yNPSize (labels) size: 0.02068 megabytes
train_xNP length  218854
train_xNP dimension of element  2
train_xNP size  1094270000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 4377.08 megabytes
train_y (labels) size: 1.750832 megabytes
train_x (text) length: 218854
train_y (labels) length: 218854
[[0 0 0

100%|████████████████████████████████████████████████████████████████████████| 213773/213773 [01:44<00:00, 2052.87it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 7666/7666 [00:17<00:00, 441.28it/s]


train_x length  213773
train_x dimension of element  5000
train_x (text) size RAW: 1.8808 megabytes
train_y (labels) size RAW: 1.8808 megabytes
test_x (text) size RAW: 0.061424 megabytes
test_y (labels) size RAW: 0.061424 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 4275.46 megabytes
train_yNPSize (labels) size: 4.27546 megabytes
test_xNPSize (text) size: 153.32 megabytes
test_yNPSize (labels) size: 0.15332 megabytes
train_xNP length  213773
train_xNP dimension of element  2
train_xNP size  1068865000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 4275.46 megabytes
train_y (labels) size: 4.27546 megabytes
train_x (text) length: 213773
train_y (labels) length: 213773
[[0 0

100%|████████████████████████████████████████████████████████████████████████| 216571/216571 [02:14<00:00, 1609.92it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 4868/4868 [00:08<00:00, 604.70it/s]


train_x length  216571
train_x dimension of element  5000
train_x (text) size RAW: 1.8808 megabytes
train_y (labels) size RAW: 1.8808 megabytes
test_x (text) size RAW: 0.043032 megabytes
test_y (labels) size RAW: 0.043032 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 4331.42 megabytes
train_yNPSize (labels) size: 6.063988 megabytes
test_xNPSize (text) size: 97.36 megabytes
test_yNPSize (labels) size: 0.136304 megabytes
train_xNP length  216571
train_xNP dimension of element  2
train_xNP size  1082855000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 4331.42 megabytes
train_y (labels) size: 6.063988 megabytes
train_x (text) length: 216571
train_y (labels) length: 216571
[[0

100%|████████████████████████████████████████████████████████████████████████| 206668/206668 [02:02<00:00, 1680.29it/s]
100%|██████████████████████████████████████████████████████████████████████████| 14771/14771 [00:08<00:00, 1830.05it/s]


train_x length  206668
train_x dimension of element  5000
train_x (text) size RAW: 1.671784 megabytes
train_y (labels) size RAW: 1.671784 megabytes
test_x (text) size RAW: 0.124912 megabytes
test_y (labels) size RAW: 0.124912 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 4133.36 megabytes
train_yNPSize (labels) size: 5.786704 megabytes
test_xNPSize (text) size: 295.42 megabytes
test_yNPSize (labels) size: 0.413588 megabytes
train_xNP length  206668
train_xNP dimension of element  2
train_xNP size  1033340000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 4133.36 megabytes
train_y (labels) size: 5.786704 megabytes
train_x (text) length: 206668
train_y (labels) length: 20666

100%|████████████████████████████████████████████████████████████████████████| 200388/200388 [01:40<00:00, 1987.63it/s]
100%|██████████████████████████████████████████████████████████████████████████| 21051/21051 [00:11<00:00, 1869.24it/s]


train_x length  200388
train_x dimension of element  5000
train_x (text) size RAW: 1.671784 megabytes
train_y (labels) size RAW: 1.671784 megabytes
test_x (text) size RAW: 0.178016 megabytes
test_y (labels) size RAW: 0.178016 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 4007.76 megabytes
train_yNPSize (labels) size: 4.809312 megabytes
test_xNPSize (text) size: 421.02 megabytes
test_yNPSize (labels) size: 0.505224 megabytes
train_xNP length  200388
train_xNP dimension of element  2
train_xNP size  1001940000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 4007.76 megabytes
train_y (labels) size: 4.809312 megabytes
train_x (text) length: 200388
train_y (labels) length: 20038