In [1]:
# Usage:
#     classify_xvsy_logreg.py [options] <first> <second>
#     classify_xvsy_logreg.py [options] --all-vs <second>

# Options:
#     -j --json=<JSONFILE>  Filename of the json file [default: unified-dataset.jsonl]
#     -a --all-vs=<dataset> Dataset name of the testing data
#     -d --debug            Use a small word list and a fast classifier
#     -o --output=<OUTPUT>  Output folder [default: .]
#     -m --force-multi      Force using multi-label classification
#     -k --keep-last        Quit immediately if results file found

import regex as re
import sys
import os
import json
import random
import math
import operator as op
import docopt
import numpy as np
import os.path
from os import path
from tqdm import tqdm

from collections import Counter, defaultdict, namedtuple

from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import classification_report
from scipy.spatial import distance
import joblib
import shutil

np.random.seed(0)
random.seed(0)

Report = namedtuple(
    "Report", ["precision", "recall", "accuracy", "f1", "tp", "tn", "fp", "fn"]
)

PATTERN_TOKENS = re.compile(r"[a-z]+")




In [2]:
def cheatydiv(x, y):
    return math.nan if y == 0 else x / y

In [3]:
def get_labels(train, test, operation=op.and_, mode="multi"):
    """Return a list of the emotional intersection of two sources."""
    emotions = set()
    if mode == "single":
        emotions.add("noemo")
    train_emotions = set(
        emotion
        for data in train
        for emotion in data["emotions"]
        if data["emotions"][emotion] is not None
    )
    # print(train_emotions)
    test_emotions = set(
        emotion
        for emotion in test[0]["emotions"]
        if test[0]["emotions"][emotion] is not None
    )
    # print(test_emotions)
    return list(emotions | operation(train_emotions, test_emotions))

In [4]:
def get_emotion(emovals, labels, emotions, mode="multi"):
#     print("get emotion mode ", mode)
#     print("emovals ",emovals)
#     print("labels ",labels)
#     print("emotions ",emotions)
    
    if mode == "single":
        truthy = len(list(filter(bool, emovals.values())))
        if truthy == 1:
            emotion = [v for v in emovals if emovals[v]][0]
        elif truthy == 0:
            emotion = "noemo"
        else:
            # emotion = sorted(
            #     ((k, v) for k, v in emovals.items() if v),
            #     key=lambda x: x[1],
            #     reverse=True,
            # )[0][0]
            raise ValueError("Dataset marked as 'single' contains multiple emotions")
        return emotions.get(emotion, emotions.get("noemo"))
    else:
        el = [int((emovals[label] or 0) > 0.1) for label in labels]
        return np.array(el)

In [5]:
def get_vector(text, wordlist):
    tokens = set(tokenize(text))
    return [1 if word in tokens else 0 for word in wordlist]

In [6]:
def make_arrays(train, test, words, labels, mode="multi", all_vs=False):
    emotions = {label: x for x, label in enumerate(labels)}
    print("emotions in make_arrays: ", emotions)
    train_x, train_y, test_x, test_y = [], [], [], []
    
    print("train super raw: ", sys.getsizeof(train)/1000000)
#     print("train:", train)

    for data in tqdm(train):
        # Discard examples where we don't have all selected emotions
        if (mode == "single" or all_vs or all(data["emotions"][emo] is not None for emo in labels)):
            train_y.append(get_emotion(data["emotions"], labels, emotions, mode))
            train_x.append(get_vector(data["text"], words))
    for data in tqdm(test):
        test_y.append(get_emotion(data["emotions"], labels, emotions, mode))
        test_x.append(get_vector(data["text"], words))
#     joblib.dump(train_x, train_xFileName)
#     joblib.dump(train_y, train_yFileName)
#     joblib.dump(test_x, test_xFileName)
#     joblib.dump(test_y, test_yFileName)
#     print("saved train and test data")
#     print("train_x item:", train_x[0])
    print("train_x length ", len(train_x))
    print("train_x dimension of element ", len(train_x[0]))
    train_xSize = sys.getsizeof(train_x)/1000000
    train_ySize = sys.getsizeof(train_y)/1000000
    train_xLength = len(train_x)
    train_yLength = len(train_y)
    print("train_x (text) size RAW:", train_xSize,"megabytes")
    print("train_y (labels) size RAW:", train_ySize,"megabytes")
    test_xSize = sys.getsizeof(test_x)/1000000
    test_ySize = sys.getsizeof(test_y)/1000000
    test_xLength = len(test_x)
    test_yLength = len(test_y)
    print("test_x (text) size RAW:", test_xSize,"megabytes")
    print("test_y (labels) size RAW:", test_ySize,"megabytes")
#     if(saveProcessedArrays):
#         print('saved train_x as', train_xFileName)
#         print('saved train_y as', train_yFileName)
#         print('saved test_x as', test_xFileName)
#         print('saved test_y as', test_yFileName)

    train_x = np.array(train_x)
    train_y = np.array(train_y)
    test_x = np.array(test_x)
    test_y = np.array(test_y)
    train_xNPSize = (train_x.nbytes)/1000000
    train_yNPSize = (train_y.nbytes)/1000000
    test_xNPSize = (test_x.nbytes)/1000000
    test_yNPSize = (test_y.nbytes)/1000000
    
#     train_xFileName = first + "_" + second + "train_x" +".pkl"
#     train_yFileName = first + "_" + second + "train_y" +".pkl"
#     test_xFileName = first + "_" + second + "test_x" +".pkl"
#     test_yFileName = first + "_" + second + "test_y" +".pkl"
#     if(path.exists(train_xFileName) 
#        and path.exists(train_yFileName)
#        and path.exists(test_xFileName)
#        and path.exists(test_yFileName)): #if processed arrays exist, load them in
#         print("successfully located paths for train and test data")
#         train_x1 = joblib.load(train_xFileName)
#         train_y1 = joblib.load(train_yFileName)
#         test_x1 = joblib.load(test_xFileName)
#         test_y1 = joblib.load(test_yFileName)
#         print("loaded train and test data")
#         train_x1 = np.array(train_x1)
#         train_y1 = np.array(train_y1)
#         test_x1 = np.array(test_x1)
#         test_y1 = np.array(test_y1)
#         train_x1NPSize = (train_x1.nbytes)/1000000
#         train_y1NPSize = (train_y1.nbytes)/1000000
#         test_x1NPSize = (test_x1.nbytes)/1000000
#         test_y1NPSize = (test_y1.nbytes)/1000000
#         print("train_x Size stays the same loaded", train_xSize == train_x1NPSize)
#         print("train_y Size stays the same loaded", train_ySize == train_y1NPSize)
#         print("test_x Size stays the same loaded", test_xSize == test_xNPSize)
#         print("test_y Size stays the same loaded", test_ySize == test_y1NPSize)
#         print("train_xNPSize (text) size loaded:", train_x1NPSize,"megabytes")
#         print("train_yNPSize (labels) size loaded:", train_y1NPSize,"megabytes")
#         print("test_xNPSize (text) size loaded:", test_x1NPSize,"megabytes")
#         print("test_yNPSize (labels) size loaded:", test_y1NPSize,"megabytes")
    print("saved test_y")
    print("train_x Size stays the same", train_xSize == train_xNPSize)
    print("train_y Size stays the same", train_ySize == train_yNPSize)
    print("test_x Size stays the same", test_xSize == test_xNPSize)
    print("test_y Size stays the same", test_ySize == test_yNPSize)
    print("train_xNPSize (text) size:", train_xNPSize,"megabytes")
    print("train_yNPSize (labels) size:", train_yNPSize,"megabytes")
    print("test_xNPSize (text) size:", test_xNPSize,"megabytes")
    print("test_yNPSize (labels) size:", test_yNPSize,"megabytes")
    print("train_xNP length ", len(train_x))
    print("train_xNP dimension of element ", train_x.ndim)
    print("train_xNP size ", train_x.size)
    sizes = train_xNPSize, train_yNPSize, test_xNPSize, test_yNPSize
    return train_x, train_y, test_x, test_y, sizes

In [7]:
def filtered_texts(filename, source):
    with open(filename) as f:
        for line in f:
            data = json.loads(line)
            if data["source"] == source:
                yield data["text"]

In [8]:
def classification_report_own_single(test_y, predict_y, labels):
    reports = {}
    num2emo = {i: label for i, label in enumerate(labels)}
    decisions = defaultdict(Counter)
    for t, p in zip(test_y, predict_y):
        decisions[t][p] += 1
    for label in decisions:
        tp = decisions[label][label]
        fp = sum(decisions[x][label] for x in decisions if x != label)
        tn = sum(
            decisions[x][y]
            for x in decisions
            for y in decisions[x]
            if x != label and y != label
        )
        fn = sum(decisions[label][y] for y in decisions[label] if y != label)
        precision = tp / (tp + fp) if tp + fp else math.nan
        recall = tp / (tp + fn) if tp + fn else math.nan
        f1 = 2 * cheatydiv((precision * recall), (precision + recall))
        accuracy = (tp + tn) / (tp + tn + fp + fn)
        reports[num2emo[label]] = Report(
            precision, recall, accuracy, f1, tp, tn, fp, fn
        )
    return reports

In [9]:
def classification_report_own_multi(test_y, predict_y, labels):
    reports = {}
    num2emo = {i: label for i, label in enumerate(labels)}
    emo2num = {label: i for i, label in enumerate(labels)}
    decisions = defaultdict(Counter)
    for label in labels:
        tp = fp = tn = fn = 0
        for t, p in zip(test_y, predict_y):
            # decisions[t][p] += 1
            tp += bool(t[emo2num[label]] and p[emo2num[label]])
            fp += bool(p[emo2num[label]] and not t[emo2num[label]])
            fn += bool(t[emo2num[label]] and not p[emo2num[label]])
            tn += bool(not t[emo2num[label]] and not p[emo2num[label]])
        precision = tp / (tp + fp) if tp + fp else math.nan
        recall = tp / (tp + fn) if tp + fn else math.nan
        f1 = 2 * cheatydiv((precision * recall), (precision + recall))
        accuracy = (tp + tn) / (tp + tn + fp + fn)
        reports[label] = Report(precision, recall, accuracy, f1, tp, tn, fp, fn)
    return reports

In [10]:
def tokenize(text):
    return re.findall(r"\p{L}+", text.lower())


In [11]:
# this is bad. memory error for all_vs (too many words...)
def get_wordlist(dataset):
    """Get a bag of words from a dataset."""
    bag = set()
    for data in dataset:
        bag.update({token for token in tokenize(data["text"])})
    return list(bag)

In [12]:
# ask roman what would be a good vocab here?
def get_wordlist_debug(dataset):
    """Get a bag of words from a dataset."""
    bag = Counter()
    for data in dataset:
        bag.update({token for token in tokenize(data["text"])})
    return list(map(op.itemgetter(0), bag.most_common(5000)))


In [13]:
def hacky_train_test_split(training, train_size=0.8, first=None, second=None):
    tra, tes = [], []
    for example in training:
        if example.get("split") == "train" or example["source"] != second:
            tra.append(example)
        elif example.get("split") == "test":
            tes.append(example)
        else:
            # don't try this at home
            [tes, tra][random.random()<train_size].append(example)
    return tra, tes

In [14]:
def get_train_test(jsonfile, train, test):
    print("get_train_test param:")
    print("json ", jsonfile)
    print("train ", train)
    print("test ", test)
#     same = test in train.split(",") #used if train and test corpus are same
    training, testing = [], []
    count1 = 0
    count2 = 0
    count3 = 0
    count4 = 0
    with open(jsonfile) as f:
        for line in f:
            data = json.loads(line)
            if(data["source"] == test):
                count1 += 1
            if(data["source"] != test):
                count2 += 1
            if(train == None and data["source"] != test):
                count3 += 1
                training.append(data)
            elif data["source"] == test:
                count4 += 1
                testing.append(data)
            elif(data["source"] in train.split(",")):
                count3 += 1
                training.append(data)
    print("there were ", count1, " entries that were in test and ", count2, "that were not in test",
          "and ", count3, " that were in train")
    print("test was appended ", count4, " times")
#     if same:
#         training, testing = hacky_train_test_split(training, train_size=0.8, first=train, second=test)
    return training, testing

In [15]:
def get_clf_mode(train, test):
    """ Detect whether we are in single-label to single-label mode or not. """
    first = "single"
    for example in train:
        if example.get("labeled", "multi") == "multi":
            first = "multi"
    print(first)
    for example in test:
        if example.get("labeled", "multi") == "multi":
            return first, "multi"
    print("oof")
    return first, "single"

In [16]:
def analyse_results(test_y, predict_y, labels, test, first, second, output, mode):
    print("analyse_results")
    prefix = f"{first}_vs_{second}_{mode}"
    fprefix = output + "/" + prefix
    with open(fprefix + ".txt", "w", encoding="utf-8") as f, open(fprefix + ".json", "w") as g:
        print("hello")
        # print(confusion_matrix(test_y, predict_y), file=f)
        prec, reca, f1, supp = precision_recall_fscore_support(
            test_y, predict_y, pos_label=None, average="micro"
        )
        accuracy = accuracy_score(test_y, predict_y)
        scoreNameArray = [(prec, "Precision"),(reca, "Recall"),(f1, "F1-score"),(accuracy, "Accuracy")]
        for score, name in scoreNameArray:
            print(name, score, sep="\t", file=f)
            print(name, score, sep="\t")

        # print("real:", Counter(test_y), file=f)
        # print("predicted:", Counter(predict_y), file=f)

        print(test_y[:10], predict_y[:10], file=f)
        emotions = {i: label for i, label in enumerate(labels)}
        for text, real, predicted, _ in zip(test, test_y, predict_y, range(20)):
            if mode == "multi" and np.array_equal(real, predicted):
                continue
            elif mode == "single" and real == predicted:
                continue
            print(text, "=> predicted:", predicted, ", truth:", real, file=f)
        if mode == "multi":
            results = classification_report_own_multi(test_y, predict_y, labels)
        elif mode == "single":
            results = classification_report_own_single(test_y, predict_y, labels)
        json.dump(
            {
                "precision": prec,
                "recall": reca,
                "f1": f1,
                "accuracy": accuracy,
                "name": prefix,
                **{
                    (emotion + "_" + metric): getattr(results[emotion], metric)
                    for emotion in results
                    for metric in Report._fields
                },
            },
            g,
        )
        g.write("\n")

In [17]:
def getPowerset(s):
    x = len(s)
    masks = [1 << i for i in range(x)]
    for i in range(1 << x):
        yield [ss for mask, ss in zip(masks, s) if i & mask]

In [18]:
import itertools
def getPermutations(s):
    subsets = set()
    for L in range(2, 3):
        for subset in itertools.permutations(s, L):
#             print(subset)
            subsets.add(subset)
    return subsets

In [19]:
def getHardDriveSpaceLeft():
    total, used, free = shutil.disk_usage("/")
    total = (total // (2**30))
    used = (used // (2**30))
    free = (free // (2**30))
    print("Total: %d GiB" % total)
    print("Used: %d GiB" % used)
    print("Free: %d GiB" % free)
    return total, used, free

In [23]:
if __name__ == "__main__":
#     args = docopt.docopt(__doc__, version="0.0.1")
#     args = {'--all-vs': True,
#      '--debug': True,
#      '--force-multi': False,
#      '--json': 'unified-dataset.jsonl',
#      '--output': '.',
#      '<first>': 'tec',
#      '<second>': 'affectivetext'}
#     print(args)
    
    possibleChoices = [('affectivetext','headlines'), ('crowdflower','tweets'), ('dailydialog','conversations'), 
                       ('emobank','headlines'), ('emoint','tweets'), 
                       ('emotion-cause','paragraphs'), ('grounded_emotions','tweets'), ('isear','descriptions'),
                       ('ssec','tweets'),('tales-emotion','tales'), ('tec','tweets')] #('electoraltweets','tweets') <-run trials with this later
    
    #     print(possibleChoices)
    permutations = list(getPermutations(possibleChoices))
    powerSet = list(getPowerset(possibleChoices))
#     print("powerset: ", powerSet)
    print("permutations length: ",len(permutations))
#     print(permutations)
    corporaSets = []
    for choice in permutations:
#         print("choice ", choice)
        if(len(choice) == 2):
#             print("pair")
            first, second = choice
            firstCorpus, domain1 = first
            secondCorpus, domain2 = second
#         else:
# #             print("less than 2")
#             firstCorpus, domain1 = choice[0]
#             secondCorpus, domain2 = choice[0] #repeat
        corpusPairData = (firstCorpus, secondCorpus, domain1, domain2)
        corporaSets.append(corpusPairData)
#     print(corporaSets)
    sortedPermutations = sorted(corporaSets, key = lambda x: (x[2], x[0], x[1]), reverse = True)
    for entry in possibleChoices:
        firstCorpus, domain1 = entry
        secondCorpus, domain2 = entry
        corpusPairData = (firstCorpus, secondCorpus, domain1, domain2)
        sortedPermutations.append(corpusPairData)
    for entry in possibleChoices:
        firstCorpus, domain1 = (None, None)
        secondCorpus, domain2 = entry
        corpusPairData = (firstCorpus, secondCorpus, domain1, domain2)
        sortedPermutations.append(corpusPairData)
    for entry in sortedPermutations:
        print(entry)
#     powerSetCondensedGood = []
#     powerSetCondensedBad = []
#     powerSetCondensed = []
#     for entry in powerSet:
# #         if len(entry) < 3 and len(entry) > 0:
#             if len(entry) == 2:
#                 domainMatch = entry[0][1]
#                 appendGood = True
#                 for corpus, domain in entry:
#                     if domain != domainMatch:
#                         appendGood = False
#                 if(appendGood):
#                     print("good entry",entry)
#                     powerSetCondensedGood.append(entry)
#                 else:
#                     print("bad entry",entry)
#                     powerSetCondensedBad.append(entry)
#             if len(entry) < 3 and len(entry) > 0:
#                 powerSetCondensed.append(entry)
#     print("powerSetCondensed length: ",len(powerSetCondensed))
#     print("powerSetCondensedGood length: ",len(powerSetCondensedGood))
#     print("powerSetCondensedGood: ", powerSetCondensedGood)
#     print("powerSetCondensedBad length: ",len(powerSetCondensedBad))
#     print("powerSetCondensedBad: ", powerSetCondensedBad)
#     print(powerSetCondensedGood[0])
    
#     example1 = ('ssec', 'tec', 'tweets', 'tweets')
#     example2 = (None, 'affectivetext', None, 'headlines')
    for entry in sortedPermutations:
        print(entry)
        (first, second, domain1, domain2) = entry
        print("Getting data")
        jsonfile = "unified-dataset.jsonl"
#         first = example2[0] #use first = None if you want to do ALl vs
#         second = example2[1]
    #     first = "isear" #use first = None if you want to do ALl vs
    #     second = "crowdflower"

        output = "."
        debug = True
        forceMulti = False
        isAllVS = False
        if first == None:
            isAllVS = True

        training_data, testing_data = get_train_test(jsonfile, first,second)
        firstCLF, secondCLF = (["multi", "multi"] if forceMulti else get_clf_mode(training_data, testing_data))
        mode = "multi" if "multi" in [firstCLF, secondCLF] else "single"

        print("Detected mode: {}...".format(mode))
        print(len(training_data), len(testing_data))
        print("Getting wordlist...")
        if debug:
            wordlist = get_wordlist_debug(training_data)
        else:
            wordlist = get_wordlist_debug(training_data)
            # wordlist = get_wordlist(training_data)
        print("Getting emotions")
        labels = get_labels(training_data, testing_data, mode=mode)
        print(labels)
        print("Making arrays")
        print("checking for save files")
        if(first == None):
            first = "all-vs"
        train_xNPFileName = first + "_" + second + "train_xNP" +".npy"
        train_yNPFileName = first + "_" + second + "train_yNP" +".npy"
        test_xNPFileName = first + "_" + second + "test_xNP" +".npy"
        test_yNPFileName = first + "_" + second + "test_yNP" +".npy"

        if(path.exists(train_xNPFileName) 
           and path.exists(train_yNPFileName)
           and path.exists(test_xNPFileName)
           and path.exists(test_yNPFileName)):
            print('saved train_xNP as', train_xNPFileName)
            print('saved train_yNP as', train_yNPFileName)
            print('saved test_xNP as', test_xNPFileName)
            print('saved test_yNP as', test_yNPFileName)
            print("loading from np")
            train_x = np.load(train_xNPFileName)
            train_y = np.load(train_yNPFileName)
            test_x = np.load(test_xNPFileName)
            test_y = np.load(test_yNPFileName)
            train_xNPSize = (train_x.nbytes)/1000000
            train_yNPSize = (train_y.nbytes)/1000000
            test_xNPSize = (test_x.nbytes)/1000000
            test_yNPSize = (test_y.nbytes)/1000000
            print("loaded directly from NP.load")
            print("train_xNPSize (text) size loaded:", train_xNPSize,"megabytes")
            print("train_yNPSize (labels) size loaded:", train_yNPSize,"megabytes")
            print("test_xNPSize (text) size loaded:", test_xNPSize,"megabytes")
            print("test_yNPSize (labels) size loaded:", test_yNPSize,"megabytes")
        else:
            train_x, train_y, test_x, test_y, sizes = make_arrays(training_data, testing_data, wordlist, labels, mode, isAllVS)
            train_xSize, train_ySize, test_xSize, test_ySize = sizes
            if any(not part.size for part in [train_x, train_y, test_x, test_y]):
                print("Train or test empty. Did you misspell the dataset name?")
                continue
            #             sys.exit(1)
            print("saving NP arrays")
            np.save(train_xNPFileName, train_x)
            np.save(train_yNPFileName, train_y)
            np.save(test_xNPFileName, test_x)
            np.save(test_yNPFileName, test_y)
    #         joblib.dump(train_x, train_xNPFileName)
    #         joblib.dump(train_y, train_yNPFileName)
    #         joblib.dump(test_x, test_xNPFileName)
    #         joblib.dump(test_y, test_yNPFileName)
            print("NP arrays saved")
        
        print("Initializing classifier")
        trainClassifier = True
        if debug:
            classifierName = "RandomForestClassifier"
            print("Searching for a ", classifierName)
            classiferSaveFile = first+"_"+second+classifierName+".pkl"
            print(path.exists(classiferSaveFile))
            if(path.exists(classiferSaveFile)):
                trainClassifier = False
                print("Loading classifier from file")
                classifier = joblib.load(classiferSaveFile)
                print("classifier loaded successfully")
            else:
                print("file not found, creating new classifier")
                classifier = RandomForestClassifier()
        elif mode == "single":
            classifierName = "LogisticRegressionCV"
            print("Searching for a ", classifierName)
            classiferSaveFile = first+"_"+second+classifierName+".pkl"
            print(path.exists(classiferSaveFile))
            if(path.exists(classiferSaveFile)):
                trainClassifier = False
                print("Loading classifier from file")
                classifier = joblib.load(classiferSaveFile)
                print("classifier loaded successfully")
            else:
                print("file not found, creating new classifier")
                classifier = LogisticRegressionCV(
                    cv=10,
                    penalty="l2",
                    fit_intercept=True,
                    solver="sag",
                    scoring="f1",
                    refit=True,
                    # n_jobs=-1,
                    class_weight="balanced",
                )
        else:
            classifierName = "OneVsRestClassifier"
            print("Searching for a ", classifierName)
            classiferSaveFile = first+"_"+second+classifierName+".pkl"
            print(path.exists(classiferSaveFile))
            if(path.exists(classiferSaveFile)):
                trainClassifier = False
                print("Loading classifier from file")
                classifier = joblib.load(classiferSaveFile)
                print("classifier loaded successfully")
            else:
                print("file not found, creating new classifier")
                classifier = OneVsRestClassifier(
                    LogisticRegressionCV(
                        cv=10,
                        penalty="l2",
                        fit_intercept=True,
                        solver="sag",
                        scoring="f1",
                        refit=True,
                        class_weight="balanced",
                        tol = 0.1,
                    ),
                    n_jobs=-1,
                )
        if(trainClassifier):
            print("this is the classifierName: ", classifierName)
            print("Training...")
            print("train_x (text) size:", (train_x.nbytes)/1000000,"megabytes")
            print("train_y (labels) size:", (train_y.nbytes)/1000000,"megabytes")
            print("train_x (text) length:", len(train_x))
            print("train_y (labels) length:", len(train_y))
            print(train_x[:5])
            print(train_y[:5])

            classifier.fit(train_x, train_y)
            print("finished training, classifier size:", sys.getsizeof(classifier)/1000000,"megabytes")
        print("Predicting...")
        if first == "multi" and second == "single":
            predict_y = classifier.predict_proba(test_x)
            helper = np.zeros_like(predict_y)
            helper[range(len(predict_y)), predict_y.argmax(1)] = 1
            predict_y = helper
        else:
            predict_y = classifier.predict(test_x)

        print("Analysing...")

        analyse_results(
            test_y,
            predict_y,
            labels,
            testing_data,
            first,
            second,
            output,
            mode,  # TODO
        )
        if(path.exists(classiferSaveFile)):
            print("classifier already saved")
        else:
    #         classiferSaveFile = first+"_"+second+classifierName+".pkl"
            print("classiferSaveFile: ", classiferSaveFile)
            joblib.dump(classifier, classiferSaveFile)
            print("Saved Successfully")
        total, used, free = getHardDriveSpaceLeft()
        if(free < 10):
            sys.exit("Error: less than 10 gb remaining on disk")
        print("-----------------------------------------------------------------------------------------")
print("End of program!")

permutations length:  110
('tec', 'tales-emotion', 'tweets', 'tales')
('tec', 'ssec', 'tweets', 'tweets')
('tec', 'isear', 'tweets', 'descriptions')
('tec', 'grounded_emotions', 'tweets', 'tweets')
('tec', 'emotion-cause', 'tweets', 'paragraphs')
('tec', 'emoint', 'tweets', 'tweets')
('tec', 'emobank', 'tweets', 'headlines')
('tec', 'dailydialog', 'tweets', 'conversations')
('tec', 'crowdflower', 'tweets', 'tweets')
('tec', 'affectivetext', 'tweets', 'headlines')
('ssec', 'tec', 'tweets', 'tweets')
('ssec', 'tales-emotion', 'tweets', 'tales')
('ssec', 'isear', 'tweets', 'descriptions')
('ssec', 'grounded_emotions', 'tweets', 'tweets')
('ssec', 'emotion-cause', 'tweets', 'paragraphs')
('ssec', 'emoint', 'tweets', 'tweets')
('ssec', 'emobank', 'tweets', 'headlines')
('ssec', 'dailydialog', 'tweets', 'conversations')
('ssec', 'crowdflower', 'tweets', 'tweets')
('ssec', 'affectivetext', 'tweets', 'headlines')
('grounded_emotions', 'tec', 'tweets', 'tweets')
('grounded_emotions', 'tales-emo

test  ssec
there were  4868  entries that were in test and  216571 that were not in test and  21051  that were in train
test was appended  4868  times
single
Detected mode: multi...
21051 4868
Getting wordlist...
Getting emotions
['anger', 'joy', 'surprise', 'disgust', 'sadness', 'fear']
Making arrays
checking for save files
saved train_xNP as tec_ssectrain_xNP.npy
saved train_yNP as tec_ssectrain_yNP.npy
saved test_xNP as tec_ssectest_xNP.npy
saved test_yNP as tec_ssectest_yNP.npy
loading from np
loaded directly from NP.load
train_xNPSize (text) size loaded: 421.02 megabytes
train_yNPSize (labels) size loaded: 0.505224 megabytes
test_xNPSize (text) size loaded: 97.36 megabytes
test_yNPSize (labels) size loaded: 0.116832 megabytes
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 421.02 megabytes
train_y (labels) size: 0.505224 megabytes
train

100%|██████████████████████████████████████████████████████████████████████████| 21051/21051 [00:09<00:00, 2291.43it/s]
100%|████████████████████████████████████████████████████████████████████████████| 7666/7666 [00:03<00:00, 2128.66it/s]


train_x length  21051
train_x dimension of element  5000
train_x (text) size RAW: 0.178016 megabytes
train_y (labels) size RAW: 0.178016 megabytes
test_x (text) size RAW: 0.061424 megabytes
test_y (labels) size RAW: 0.061424 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 421.02 megabytes
train_yNPSize (labels) size: 0.084204 megabytes
test_xNPSize (text) size: 153.32 megabytes
test_yNPSize (labels) size: 0.030664 megabytes
train_xNP length  21051
train_xNP dimension of element  2
train_xNP size  105255000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 421.02 megabytes
train_y (labels) size: 0.084204 megabytes
train_x (text) length: 21051
train_y (labels) length: 21051
[[0 0

100%|██████████████████████████████████████████████████████████████████████████| 21051/21051 [00:07<00:00, 2784.57it/s]
100%|████████████████████████████████████████████████████████████████████████████| 2585/2585 [00:00<00:00, 2920.78it/s]


train_x length  21051
train_x dimension of element  5000
train_x (text) size RAW: 0.178016 megabytes
train_y (labels) size RAW: 0.178016 megabytes
test_x (text) size RAW: 0.02104 megabytes
test_y (labels) size RAW: 0.02104 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 421.02 megabytes
train_yNPSize (labels) size: 0.084204 megabytes
test_xNPSize (text) size: 51.7 megabytes
test_yNPSize (labels) size: 0.01034 megabytes
train_xNP length  21051
train_xNP dimension of element  2
train_xNP size  105255000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 421.02 megabytes
train_y (labels) size: 0.084204 megabytes
train_x (text) length: 21051
train_y (labels) length: 21051
[[0 0 0 ..

100%|██████████████████████████████████████████████████████████████████████████| 21051/21051 [00:07<00:00, 2680.67it/s]
100%|████████████████████████████████████████████████████████████████████████████| 2414/2414 [00:00<00:00, 2828.38it/s]


train_x length  21051
train_x dimension of element  5000
train_x (text) size RAW: 0.178016 megabytes
train_y (labels) size RAW: 0.178016 megabytes
test_x (text) size RAW: 0.02104 megabytes
test_y (labels) size RAW: 0.02104 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 421.02 megabytes
train_yNPSize (labels) size: 0.084204 megabytes
test_xNPSize (text) size: 48.28 megabytes
test_yNPSize (labels) size: 0.009656 megabytes
train_xNP length  21051
train_xNP dimension of element  2
train_xNP size  105255000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 421.02 megabytes
train_y (labels) size: 0.084204 megabytes
train_x (text) length: 21051
train_y (labels) length: 21051
[[0 0 0 

100%|██████████████████████████████████████████████████████████████████████████| 21051/21051 [00:18<00:00, 1157.40it/s]
100%|████████████████████████████████████████████████████████████████████████████| 7102/7102 [00:06<00:00, 1022.73it/s]


train_x length  21051
train_x dimension of element  5000
train_x (text) size RAW: 0.178016 megabytes
train_y (labels) size RAW: 0.178016 megabytes
test_x (text) size RAW: 0.061424 megabytes
test_y (labels) size RAW: 0.061424 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 421.02 megabytes
train_yNPSize (labels) size: 0.084204 megabytes
test_xNPSize (text) size: 142.04 megabytes
test_yNPSize (labels) size: 0.028408 megabytes
train_xNP length  21051
train_xNP dimension of element  2
train_xNP size  105255000
saving NP arrays
NP arrays saved
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 421.02 megabytes
train_y (labels) size: 0.084204 megabytes
train_x (text) length: 21051
train_y (labels) length: 21051
[[0 0

100%|██████████████████████████████████████████████████████████████████████████| 21051/21051 [00:11<00:00, 1864.99it/s]
100%|██████████████████████████████████████████████████████████████████████████| 10062/10062 [00:05<00:00, 1924.87it/s]


train_x length  21051
train_x dimension of element  5000
train_x (text) size RAW: 0.178016 megabytes
train_y (labels) size RAW: 0.178016 megabytes
test_x (text) size RAW: 0.087616 megabytes
test_y (labels) size RAW: 0.087616 megabytes
saved test_y
train_x Size stays the same False
train_y Size stays the same False
test_x Size stays the same False
test_y Size stays the same False
train_xNPSize (text) size: 421.02 megabytes
train_yNPSize (labels) size: 0.0 megabytes
test_xNPSize (text) size: 201.24 megabytes
test_yNPSize (labels) size: 0.0 megabytes
train_xNP length  21051
train_xNP dimension of element  2
train_xNP size  105255000
Train or test empty. Did you misspell the dataset name?
End of program!


In [None]:
classifier = joblib.load("grounded_emotions_emointRandomForestClassifier.pkl")
stringEx = first+"_"+second+classifierName+".pkl"
print("there is a classifier: ", classifier)
analyse_results(
        test_y,
        predict_y,
        labels,
        testing_data,
        first,
        second,
        output,
        mode,  # TODO
    )

In [94]:
get_train_test param:
json  unified-dataset.jsonl
train  grounded_emotions
test  emoint
there were  7102  entries that were in test and  214337 that were not in test and  2585  that were in train
test was appended  7102  times
single
oof
Detected mode: single...
2585 7102
Getting wordlist...
Getting emotions
['joy', 'noemo', 'sadness']
Making arrays
emotions in make_arrays:  {'joy': 0, 'noemo': 1, 'sadness': 2}
train_x (text) size RAW: (0.02104, 'megabytes')
train_y (labels) size RAW: (0.02104, 'megabytes')
test_x (text) size RAW: (0.061424, 'megabytes')
test_y (labels) size RAW: (0.061424, 'megabytes')
Initializing classifier
Searching for a  RandomForestClassifier
Loading classifier from file
classifier loaded successfully
Predicting...
Analysing...
analyse_results
hello
Precision	0.5167558434243875
Recall	0.5167558434243875
F1-score	0.5167558434243875
Accuracy	0.5167558434243875

SyntaxError: invalid syntax (<ipython-input-94-c580fe045f86>, line 1)

In [None]:
there were  1250  entries that were in test and  220189 that were not in test and  2585  that were in train
test was appended  1250  times
single
Detected mode: multi...
2585 1250
Getting wordlist...
Getting emotions
['sadness', 'joy']
Making arrays
emotions in make_arrays:  {'sadness': 0, 'joy': 1}
train_x (text) size RAW: (0.02104, 'megabytes')
train_y (labels) size RAW: (0.02104, 'megabytes')
test_x (text) size RAW: (0.010192, 'megabytes')
test_y (labels) size RAW: (0.010192, 'megabytes')
Initializing classifier
Searching for a  RandomForestClassifier
False
file not found, creating new classifier
this is the classifierName:  RandomForestClassifier
Training...
train_x (text) size: 51.700112 megabytes
train_y (labels) size: 0.020792 megabytes
train_x (text) length: 2585
train_y (labels) length: 2585

In [None]:
train_x length 2585
train_x dimension of element 5000
train_x (text) size RAW: 0.02104 megabytes
train_y (labels) size RAW: 0.02104 megabytes
test_x (text) size RAW: 0.010192 megabytes
test_y (labels) size RAW: 0.010192 megabytes
train_x Size stays the same False
train_x Size stays the same False
train_x Size stays the same False
train_x Size stays the same False
train_xNPSize (text) size: 51.700112 megabytes
train_yNPSize (labels) size: 0.020792 megabytes
test_xNPSize (text) size: 25.000112 megabytes
test_yNPSize (labels) size: 0.010112 megabytes
Initializing classifier
Searching for a  RandomForestClassifier
True
Loading classifier from file
classifier loaded successfully
Predicting...
Analysing...
analyse_results
hello
Precision	0.4912
Recall	0.4838455476753349
F1-score	0.4874950377133783
Accuracy	0.3568
classifier already saved

In [61]:
print("hi")

hi
