In [1]:
from datasets import load_dataset
import datasets
import random
import os
import pandas as pd

In [2]:
task_to_keys = {
            "mnli": ("premise", "hypothesis"),
            "qqp": ("question1", "question2"),
            "rte": ("sentence1", "sentence2"),
            "sst2": ("sentence", None),
            "boolq": ("passage", "question"),
            "copa": ('choice1', 'choice2', 'premise', 'question'),
            "wic": ("start1", "end1", "sentence1", "start2", "end2", "sentence2", "word"),
            "wsc": ("span1_text", "span1_index", "span2_text", "span2_index", "text"),
            "wsc_bool": ("span1_text", "span1_index", "span2_text", "span2_index", "text"),
            "cb": ("premise", "hypothesis"),
            "record": ("passage", "query", "entities"),
            "multirc": ("question", "answer", "paragraph"),
            "rte_superglue": ("premise", "hypothesis"),
            "imdb": ("text", None),
            "ag_news": ("text", None),
            "yelp_review_full": ("text", None),
            "yahoo_answers_topics": ("question_content", "best_answer"),
            "dbpedia_14": ("title", "content"),
            "amazon": ("content", None)}

task_to_labels = {
            "mnli": {"label": ("entailment", "neutral", "contradiction"), "fold": "mnli_csv"},
            "qqp": {"label": ("not_duplicate", "duplicate"), "fold": "qqp_csv"},
            "rte": {"label": ("entailment", "not_entailment"), "fold": "rte_csv"},
            "sst2": { "label": ("negative", "positive"), "fold": "sst2_csv"},
            "boolq": {"label": ("false", "true"), "fold": "boolq_csv"},
            "copa": {"label": ("false", "true"), "fold": "copa_csv"},
            "wic": {"label": ("false", "true"), "fold": "wic_csv"},
            "cb": {"label": ("entailment", "contradiction", "neutral"), "fold": "cb_csv"},
            "multirc": {"label": ("false", "true"), "fold": "multirc_csv"},
            "imdb": {"label": ("negative", "positive"), "fold": "imdb_csv"},
            "ag_news": {"label": ("world", "sports", "business", "science"), "fold": "ag_news_csv"},
            "yelp_review_full": {"label":("terrible", "bad", "middle", "good", "wonderful"), "fold": "yelp_review_full_csv"},
            "yahoo_answers_topics":{"label" : ("society and culture", "science", "health", "education and reference",
                                     "computers and internet", "sports", "business", "entertainment and music",
                                     "family and relationships", "politics and government"), "fold" : "yahoo_answers_csv"},
            "dbpedia_14": {"label": ("company", "educationalinstitution", "artist", "athlete", "officeholder",
                           "meanoftransportation", "building", "naturalplace", "village", "animal",
                           "plant", "album", "film", "writtenwork"), "fold" : "dbpedia_csv"},
            "amazon": {"label": ("terrible", "bad", "middle", "good", "wonderful"), "fold": "amazon_csv"}
        }
glue_datasets = ['cola', 'sst2', 'mrpc', 'qqp', 'stsb', 'mnli', 'mnli_mismatched', 'mnli_matched', 'qnli', 'rte', 'wnli', 'ax']
superglue_datasets = ['copa', 'boolq', 'wic', 'wsc', 'cb', 'record', 'multirc', 'rte_superglue', 'wsc_bool']

In [3]:
# chia vao cac folder theo format run_seed/train(test/valid).txt
runs = [0, 1, 2]
seed = 0
fewshotnum = 10

In [4]:
def preprocess_function(examples, task, label_key):
    keys = task_to_keys[task]
    if keys[1]!=None:
        text = ''
        for key in keys:
            text += key + ': ' + str(examples[key]) + ' '
    else:
        text = examples[keys[0]]

    target = task_to_labels[task]['label'][examples[label_key]]
    return {"text": text, "target": target}

In [5]:
# load all datasets 
# for faster create datasets, change datasets outside the loop: 
dataset_names = [key for key in task_to_labels.keys()]
for run in runs:
    curr_seed = seed + run * 100
    for name in dataset_names:
        # if name != 'amazon':
        #     continue
        outpath = f"longseqtextclsdata/{fewshotnum}/" + task_to_labels[name]["fold"] + '/' + str(run) + '_' + str(curr_seed)
        print("out path: ", outpath)
        if not os.path.exists(outpath):
            os.makedirs(outpath)

        label_key = 'label' if 'yahoo_' not in name else 'topic'
        print("Task label key: ", label_key)
        
        if name == 'mnli':
            dataset_train = load_dataset("LysandreJik/glue-mnli-train", split='train')
            dataset_test = load_dataset("LysandreJik/glue-mnli-train", split='validation')
        elif name == 'amazon':
            # load train
            prefix_path = '/home/nguyen/projects/prompt_cl/datasets/src/data/amazon'
            df = pd.read_csv(os.path.join(prefix_path,"train.csv"), header=None)
            df = df.rename(columns={0: "label", 1: "title", 2: "content"})
            df['label'] = df['label'] - 1
            dataset_train = datasets.Dataset.from_pandas(df)
            
            # load test
            df = pd.read_csv(os.path.join(prefix_path,'test.csv'), header=None)
            df = df.rename(columns={0: "label", 1: "title", 2: "content"})
            df['label'] = df['label'] - 1
            dataset_test = datasets.Dataset.from_pandas(df)
        else:
            if name not in glue_datasets and name not in superglue_datasets:
                print("Task not glue", name)
                dataset_train = load_dataset(name, split='train')
                dataset_test = load_dataset(name, split='test')
            else:
                benchmark = 'glue' if name in glue_datasets else 'super_glue'
                dataset_train = load_dataset(benchmark,
                                             name.replace('_superglue', '').replace('_bool', ''),
                                             split="train")
                dataset_test = load_dataset(benchmark,
                                             name.replace('_superglue', '').replace('_bool', ''),
                                             split="validation")

        # processed datasets:
        dataset_train = dataset_train.map(lambda x: preprocess_function(x, name, label_key))
        dataset_test = dataset_test.map(lambda x: preprocess_function(x, name, label_key))

        all_label = []
        trainresult = {}
        for item in dataset_train:
            if item["target"] not in all_label:
                all_label.append(item["target"])
            # replace all \t by space
            text = item['text'].replace("\t", " ")
            if item["target"] not in trainresult.keys():
                trainresult[item["target"]] = [text]
            else:
                trainresult[item["target"]].append(text)
        
        testresult = {}
        for item in dataset_test:
            # replace all \t by space
            text = item['text'].replace("\t", " ")
            if item["target"] not in testresult.keys():
                testresult[item["target"]] = [text]
            else:
                testresult[item["target"]].append(text)
        
        fewtrainname = os.path.join(outpath, "train.txt")
        fewvalidname = os.path.join(outpath, "valid.txt")
        fewtestname = os.path.join(outpath, "test.txt")
        tousetres = {}
        for key in trainresult.keys():
            if 2 * fewshotnum < len(trainresult[key]):
                thisres = random.sample(trainresult[key], 2 * fewshotnum)
            else:
                thisres = trainresult[key]
            tousetres[key] = thisres

        sampletestres = {}
        for key in testresult.keys():
            sampletestnum = 500
            if sampletestnum < len(testresult[key]):
                thisres = random.sample(testresult[key], sampletestnum)
            else:
                thisres = testresult[key]
            sampletestres[key] = thisres

        tousetrainres = {}
        tousevalidres = {}
        for key in tousetres.keys():
            allres = tousetres[key]
            fortrain = allres[0:fewshotnum]
            forvalid = allres[fewshotnum:2 * fewshotnum]
            tousetrainres[key] = fortrain
            tousevalidres[key] = forvalid
        f = open(fewtrainname,'w')
        for key in tousetrainres.keys():
            for one in tousetrainres[key]:
                f.write(one+"\t"+key + "\n")
        f.close()

        f = open(fewvalidname, 'w')
        for key in tousevalidres.keys():
            for one in tousevalidres[key]:
                f.write(one + "\t" + key + "\n")
        f.close()
        ####test
        f = open(fewtestname, 'w')
        for key in sampletestres.keys():
            for one in sampletestres[key]:
                f.write(one + "\t" + key + "\n")
        f.close()

out path:  longseqtextclsdata/100/mnli_csv/0_0
Task label key:  label


Using custom data configuration LysandreJik--glue-mnli-train-aef4727e80ba5593
Reusing dataset parquet (/home/nguyen/.cache/huggingface/datasets/LysandreJik___parquet/LysandreJik--glue-mnli-train-aef4727e80ba5593/0.0.0/7328ef7ee03eaf3f86ae40594d46a1cec86161704e02dd19f232d81eee72ade8)
Using custom data configuration LysandreJik--glue-mnli-train-aef4727e80ba5593
Reusing dataset parquet (/home/nguyen/.cache/huggingface/datasets/LysandreJik___parquet/LysandreJik--glue-mnli-train-aef4727e80ba5593/0.0.0/7328ef7ee03eaf3f86ae40594d46a1cec86161704e02dd19f232d81eee72ade8)


  0%|          | 0/392702 [00:00<?, ?ex/s]

  0%|          | 0/19647 [00:00<?, ?ex/s]

out path:  longseqtextclsdata/100/qqp_csv/0_0
Task label key:  label


Reusing dataset glue (/home/nguyen/.cache/huggingface/datasets/glue/qqp/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
Reusing dataset glue (/home/nguyen/.cache/huggingface/datasets/glue/qqp/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/363846 [00:00<?, ?ex/s]

  0%|          | 0/40430 [00:00<?, ?ex/s]

out path:  longseqtextclsdata/100/rte_csv/0_0
Task label key:  label


Reusing dataset glue (/home/nguyen/.cache/huggingface/datasets/glue/rte/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
Reusing dataset glue (/home/nguyen/.cache/huggingface/datasets/glue/rte/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/2490 [00:00<?, ?ex/s]

  0%|          | 0/277 [00:00<?, ?ex/s]

out path:  longseqtextclsdata/100/sst2_csv/0_0
Task label key:  label


Reusing dataset glue (/home/nguyen/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
Reusing dataset glue (/home/nguyen/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/67349 [00:00<?, ?ex/s]

  0%|          | 0/872 [00:00<?, ?ex/s]

out path:  longseqtextclsdata/100/boolq_csv/0_0
Task label key:  label


Reusing dataset super_glue (/home/nguyen/.cache/huggingface/datasets/super_glue/boolq/1.0.2/d040c658e2ddef6934fdd97deb45c777b6ff50c524781ea434e7219b56a428a7)
Reusing dataset super_glue (/home/nguyen/.cache/huggingface/datasets/super_glue/boolq/1.0.2/d040c658e2ddef6934fdd97deb45c777b6ff50c524781ea434e7219b56a428a7)


  0%|          | 0/9427 [00:00<?, ?ex/s]

  0%|          | 0/3270 [00:00<?, ?ex/s]

out path:  longseqtextclsdata/100/copa_csv/0_0
Task label key:  label


Reusing dataset super_glue (/home/nguyen/.cache/huggingface/datasets/super_glue/copa/1.0.2/d040c658e2ddef6934fdd97deb45c777b6ff50c524781ea434e7219b56a428a7)
Reusing dataset super_glue (/home/nguyen/.cache/huggingface/datasets/super_glue/copa/1.0.2/d040c658e2ddef6934fdd97deb45c777b6ff50c524781ea434e7219b56a428a7)


  0%|          | 0/400 [00:00<?, ?ex/s]

  0%|          | 0/100 [00:00<?, ?ex/s]

out path:  longseqtextclsdata/100/wic_csv/0_0
Task label key:  label


Reusing dataset super_glue (/home/nguyen/.cache/huggingface/datasets/super_glue/wic/1.0.2/d040c658e2ddef6934fdd97deb45c777b6ff50c524781ea434e7219b56a428a7)
Reusing dataset super_glue (/home/nguyen/.cache/huggingface/datasets/super_glue/wic/1.0.2/d040c658e2ddef6934fdd97deb45c777b6ff50c524781ea434e7219b56a428a7)


  0%|          | 0/5428 [00:00<?, ?ex/s]

  0%|          | 0/638 [00:00<?, ?ex/s]

out path:  longseqtextclsdata/100/cb_csv/0_0
Task label key:  label


Reusing dataset super_glue (/home/nguyen/.cache/huggingface/datasets/super_glue/cb/1.0.2/d040c658e2ddef6934fdd97deb45c777b6ff50c524781ea434e7219b56a428a7)
Reusing dataset super_glue (/home/nguyen/.cache/huggingface/datasets/super_glue/cb/1.0.2/d040c658e2ddef6934fdd97deb45c777b6ff50c524781ea434e7219b56a428a7)


  0%|          | 0/250 [00:00<?, ?ex/s]

  0%|          | 0/56 [00:00<?, ?ex/s]

out path:  longseqtextclsdata/100/multirc_csv/0_0
Task label key:  label


Reusing dataset super_glue (/home/nguyen/.cache/huggingface/datasets/super_glue/multirc/1.0.2/d040c658e2ddef6934fdd97deb45c777b6ff50c524781ea434e7219b56a428a7)
Reusing dataset super_glue (/home/nguyen/.cache/huggingface/datasets/super_glue/multirc/1.0.2/d040c658e2ddef6934fdd97deb45c777b6ff50c524781ea434e7219b56a428a7)


  0%|          | 0/27243 [00:00<?, ?ex/s]

  0%|          | 0/4848 [00:00<?, ?ex/s]

out path:  longseqtextclsdata/100/imdb_csv/0_0
Task label key:  label
Task not glue imdb


Reusing dataset imdb (/home/nguyen/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)
Reusing dataset imdb (/home/nguyen/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/25000 [00:00<?, ?ex/s]

  0%|          | 0/25000 [00:00<?, ?ex/s]

out path:  longseqtextclsdata/100/ag_news_csv/0_0
Task label key:  label
Task not glue ag_news


Using custom data configuration default
Reusing dataset ag_news (/home/nguyen/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)
Using custom data configuration default
Reusing dataset ag_news (/home/nguyen/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)


  0%|          | 0/120000 [00:00<?, ?ex/s]

  0%|          | 0/7600 [00:00<?, ?ex/s]

out path:  longseqtextclsdata/100/yelp_review_full_csv/0_0
Task label key:  label
Task not glue yelp_review_full


Reusing dataset yelp_review_full (/home/nguyen/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf)
Reusing dataset yelp_review_full (/home/nguyen/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf)


  0%|          | 0/650000 [00:00<?, ?ex/s]

  0%|          | 0/50000 [00:00<?, ?ex/s]

out path:  longseqtextclsdata/100/yahoo_answers_csv/0_0
Task label key:  topic
Task not glue yahoo_answers_topics


Reusing dataset yahoo_answers_topics (/home/nguyen/.cache/huggingface/datasets/yahoo_answers_topics/yahoo_answers_topics/1.0.0/0edb353eefe79d9245d7bd7cac5ae6af19530439da520d6dde1c206ee38f4439)
Reusing dataset yahoo_answers_topics (/home/nguyen/.cache/huggingface/datasets/yahoo_answers_topics/yahoo_answers_topics/1.0.0/0edb353eefe79d9245d7bd7cac5ae6af19530439da520d6dde1c206ee38f4439)


  0%|          | 0/1400000 [00:00<?, ?ex/s]

  0%|          | 0/60000 [00:00<?, ?ex/s]

out path:  longseqtextclsdata/100/dbpedia_csv/0_0
Task label key:  label
Task not glue dbpedia_14


Reusing dataset dbpedia_14 (/home/nguyen/.cache/huggingface/datasets/dbpedia_14/dbpedia_14/2.0.0/01dab9e10d969eadcdbc918be5a09c9190a24caeae33b10eee8f367a1e3f1f0c)
Reusing dataset dbpedia_14 (/home/nguyen/.cache/huggingface/datasets/dbpedia_14/dbpedia_14/2.0.0/01dab9e10d969eadcdbc918be5a09c9190a24caeae33b10eee8f367a1e3f1f0c)


  0%|          | 0/560000 [00:00<?, ?ex/s]

  0%|          | 0/70000 [00:00<?, ?ex/s]

out path:  longseqtextclsdata/100/amazon_csv/0_0
Task label key:  label


  0%|          | 0/115000 [00:00<?, ?ex/s]

  0%|          | 0/7600 [00:00<?, ?ex/s]

out path:  longseqtextclsdata/100/mnli_csv/1_100
Task label key:  label


Using custom data configuration LysandreJik--glue-mnli-train-aef4727e80ba5593
Reusing dataset parquet (/home/nguyen/.cache/huggingface/datasets/LysandreJik___parquet/LysandreJik--glue-mnli-train-aef4727e80ba5593/0.0.0/7328ef7ee03eaf3f86ae40594d46a1cec86161704e02dd19f232d81eee72ade8)
Using custom data configuration LysandreJik--glue-mnli-train-aef4727e80ba5593
Reusing dataset parquet (/home/nguyen/.cache/huggingface/datasets/LysandreJik___parquet/LysandreJik--glue-mnli-train-aef4727e80ba5593/0.0.0/7328ef7ee03eaf3f86ae40594d46a1cec86161704e02dd19f232d81eee72ade8)


  0%|          | 0/392702 [00:00<?, ?ex/s]

  0%|          | 0/19647 [00:00<?, ?ex/s]

out path:  longseqtextclsdata/100/qqp_csv/1_100
Task label key:  label


Reusing dataset glue (/home/nguyen/.cache/huggingface/datasets/glue/qqp/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
Reusing dataset glue (/home/nguyen/.cache/huggingface/datasets/glue/qqp/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/363846 [00:00<?, ?ex/s]

  0%|          | 0/40430 [00:00<?, ?ex/s]

out path:  longseqtextclsdata/100/rte_csv/1_100
Task label key:  label


Reusing dataset glue (/home/nguyen/.cache/huggingface/datasets/glue/rte/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
Reusing dataset glue (/home/nguyen/.cache/huggingface/datasets/glue/rte/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/2490 [00:00<?, ?ex/s]

  0%|          | 0/277 [00:00<?, ?ex/s]

out path:  longseqtextclsdata/100/sst2_csv/1_100
Task label key:  label


Reusing dataset glue (/home/nguyen/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
Reusing dataset glue (/home/nguyen/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/67349 [00:00<?, ?ex/s]

  0%|          | 0/872 [00:00<?, ?ex/s]

out path:  longseqtextclsdata/100/boolq_csv/1_100
Task label key:  label


Reusing dataset super_glue (/home/nguyen/.cache/huggingface/datasets/super_glue/boolq/1.0.2/d040c658e2ddef6934fdd97deb45c777b6ff50c524781ea434e7219b56a428a7)
Reusing dataset super_glue (/home/nguyen/.cache/huggingface/datasets/super_glue/boolq/1.0.2/d040c658e2ddef6934fdd97deb45c777b6ff50c524781ea434e7219b56a428a7)


  0%|          | 0/9427 [00:00<?, ?ex/s]

  0%|          | 0/3270 [00:00<?, ?ex/s]

out path:  longseqtextclsdata/100/copa_csv/1_100
Task label key:  label


Reusing dataset super_glue (/home/nguyen/.cache/huggingface/datasets/super_glue/copa/1.0.2/d040c658e2ddef6934fdd97deb45c777b6ff50c524781ea434e7219b56a428a7)
Reusing dataset super_glue (/home/nguyen/.cache/huggingface/datasets/super_glue/copa/1.0.2/d040c658e2ddef6934fdd97deb45c777b6ff50c524781ea434e7219b56a428a7)


  0%|          | 0/400 [00:00<?, ?ex/s]

  0%|          | 0/100 [00:00<?, ?ex/s]

out path:  longseqtextclsdata/100/wic_csv/1_100
Task label key:  label


Reusing dataset super_glue (/home/nguyen/.cache/huggingface/datasets/super_glue/wic/1.0.2/d040c658e2ddef6934fdd97deb45c777b6ff50c524781ea434e7219b56a428a7)
Reusing dataset super_glue (/home/nguyen/.cache/huggingface/datasets/super_glue/wic/1.0.2/d040c658e2ddef6934fdd97deb45c777b6ff50c524781ea434e7219b56a428a7)


  0%|          | 0/5428 [00:00<?, ?ex/s]

  0%|          | 0/638 [00:00<?, ?ex/s]

out path:  longseqtextclsdata/100/cb_csv/1_100
Task label key:  label


Reusing dataset super_glue (/home/nguyen/.cache/huggingface/datasets/super_glue/cb/1.0.2/d040c658e2ddef6934fdd97deb45c777b6ff50c524781ea434e7219b56a428a7)
Reusing dataset super_glue (/home/nguyen/.cache/huggingface/datasets/super_glue/cb/1.0.2/d040c658e2ddef6934fdd97deb45c777b6ff50c524781ea434e7219b56a428a7)


  0%|          | 0/250 [00:00<?, ?ex/s]

  0%|          | 0/56 [00:00<?, ?ex/s]

out path:  longseqtextclsdata/100/multirc_csv/1_100
Task label key:  label


Reusing dataset super_glue (/home/nguyen/.cache/huggingface/datasets/super_glue/multirc/1.0.2/d040c658e2ddef6934fdd97deb45c777b6ff50c524781ea434e7219b56a428a7)
Reusing dataset super_glue (/home/nguyen/.cache/huggingface/datasets/super_glue/multirc/1.0.2/d040c658e2ddef6934fdd97deb45c777b6ff50c524781ea434e7219b56a428a7)


  0%|          | 0/27243 [00:00<?, ?ex/s]

  0%|          | 0/4848 [00:00<?, ?ex/s]

out path:  longseqtextclsdata/100/imdb_csv/1_100
Task label key:  label
Task not glue imdb


Reusing dataset imdb (/home/nguyen/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)
Reusing dataset imdb (/home/nguyen/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/25000 [00:00<?, ?ex/s]

  0%|          | 0/25000 [00:00<?, ?ex/s]

out path:  longseqtextclsdata/100/ag_news_csv/1_100
Task label key:  label
Task not glue ag_news


Using custom data configuration default
Reusing dataset ag_news (/home/nguyen/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)
Using custom data configuration default
Reusing dataset ag_news (/home/nguyen/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)


  0%|          | 0/120000 [00:00<?, ?ex/s]

  0%|          | 0/7600 [00:00<?, ?ex/s]

out path:  longseqtextclsdata/100/yelp_review_full_csv/1_100
Task label key:  label
Task not glue yelp_review_full


Reusing dataset yelp_review_full (/home/nguyen/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf)
Reusing dataset yelp_review_full (/home/nguyen/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf)


  0%|          | 0/650000 [00:00<?, ?ex/s]

  0%|          | 0/50000 [00:00<?, ?ex/s]

out path:  longseqtextclsdata/100/yahoo_answers_csv/1_100
Task label key:  topic
Task not glue yahoo_answers_topics


Reusing dataset yahoo_answers_topics (/home/nguyen/.cache/huggingface/datasets/yahoo_answers_topics/yahoo_answers_topics/1.0.0/0edb353eefe79d9245d7bd7cac5ae6af19530439da520d6dde1c206ee38f4439)
Reusing dataset yahoo_answers_topics (/home/nguyen/.cache/huggingface/datasets/yahoo_answers_topics/yahoo_answers_topics/1.0.0/0edb353eefe79d9245d7bd7cac5ae6af19530439da520d6dde1c206ee38f4439)


  0%|          | 0/1400000 [00:00<?, ?ex/s]

  0%|          | 0/60000 [00:00<?, ?ex/s]

out path:  longseqtextclsdata/100/dbpedia_csv/1_100
Task label key:  label
Task not glue dbpedia_14


Reusing dataset dbpedia_14 (/home/nguyen/.cache/huggingface/datasets/dbpedia_14/dbpedia_14/2.0.0/01dab9e10d969eadcdbc918be5a09c9190a24caeae33b10eee8f367a1e3f1f0c)
Reusing dataset dbpedia_14 (/home/nguyen/.cache/huggingface/datasets/dbpedia_14/dbpedia_14/2.0.0/01dab9e10d969eadcdbc918be5a09c9190a24caeae33b10eee8f367a1e3f1f0c)


  0%|          | 0/560000 [00:00<?, ?ex/s]

  0%|          | 0/70000 [00:00<?, ?ex/s]

out path:  longseqtextclsdata/100/amazon_csv/1_100
Task label key:  label


  0%|          | 0/115000 [00:00<?, ?ex/s]

  0%|          | 0/7600 [00:00<?, ?ex/s]

out path:  longseqtextclsdata/100/mnli_csv/2_200
Task label key:  label


Using custom data configuration LysandreJik--glue-mnli-train-aef4727e80ba5593
Reusing dataset parquet (/home/nguyen/.cache/huggingface/datasets/LysandreJik___parquet/LysandreJik--glue-mnli-train-aef4727e80ba5593/0.0.0/7328ef7ee03eaf3f86ae40594d46a1cec86161704e02dd19f232d81eee72ade8)
Using custom data configuration LysandreJik--glue-mnli-train-aef4727e80ba5593
Reusing dataset parquet (/home/nguyen/.cache/huggingface/datasets/LysandreJik___parquet/LysandreJik--glue-mnli-train-aef4727e80ba5593/0.0.0/7328ef7ee03eaf3f86ae40594d46a1cec86161704e02dd19f232d81eee72ade8)


  0%|          | 0/392702 [00:00<?, ?ex/s]

  0%|          | 0/19647 [00:00<?, ?ex/s]

out path:  longseqtextclsdata/100/qqp_csv/2_200
Task label key:  label


Reusing dataset glue (/home/nguyen/.cache/huggingface/datasets/glue/qqp/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
Reusing dataset glue (/home/nguyen/.cache/huggingface/datasets/glue/qqp/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/363846 [00:00<?, ?ex/s]

  0%|          | 0/40430 [00:00<?, ?ex/s]

out path:  longseqtextclsdata/100/rte_csv/2_200
Task label key:  label


Reusing dataset glue (/home/nguyen/.cache/huggingface/datasets/glue/rte/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
Reusing dataset glue (/home/nguyen/.cache/huggingface/datasets/glue/rte/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/2490 [00:00<?, ?ex/s]

  0%|          | 0/277 [00:00<?, ?ex/s]

out path:  longseqtextclsdata/100/sst2_csv/2_200
Task label key:  label


Reusing dataset glue (/home/nguyen/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
Reusing dataset glue (/home/nguyen/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/67349 [00:00<?, ?ex/s]

  0%|          | 0/872 [00:00<?, ?ex/s]

out path:  longseqtextclsdata/100/boolq_csv/2_200
Task label key:  label


Reusing dataset super_glue (/home/nguyen/.cache/huggingface/datasets/super_glue/boolq/1.0.2/d040c658e2ddef6934fdd97deb45c777b6ff50c524781ea434e7219b56a428a7)
Reusing dataset super_glue (/home/nguyen/.cache/huggingface/datasets/super_glue/boolq/1.0.2/d040c658e2ddef6934fdd97deb45c777b6ff50c524781ea434e7219b56a428a7)


  0%|          | 0/9427 [00:00<?, ?ex/s]

  0%|          | 0/3270 [00:00<?, ?ex/s]

out path:  longseqtextclsdata/100/copa_csv/2_200
Task label key:  label


Reusing dataset super_glue (/home/nguyen/.cache/huggingface/datasets/super_glue/copa/1.0.2/d040c658e2ddef6934fdd97deb45c777b6ff50c524781ea434e7219b56a428a7)
Reusing dataset super_glue (/home/nguyen/.cache/huggingface/datasets/super_glue/copa/1.0.2/d040c658e2ddef6934fdd97deb45c777b6ff50c524781ea434e7219b56a428a7)


  0%|          | 0/400 [00:00<?, ?ex/s]

  0%|          | 0/100 [00:00<?, ?ex/s]

out path:  longseqtextclsdata/100/wic_csv/2_200
Task label key:  label


Reusing dataset super_glue (/home/nguyen/.cache/huggingface/datasets/super_glue/wic/1.0.2/d040c658e2ddef6934fdd97deb45c777b6ff50c524781ea434e7219b56a428a7)
Reusing dataset super_glue (/home/nguyen/.cache/huggingface/datasets/super_glue/wic/1.0.2/d040c658e2ddef6934fdd97deb45c777b6ff50c524781ea434e7219b56a428a7)


  0%|          | 0/5428 [00:00<?, ?ex/s]

  0%|          | 0/638 [00:00<?, ?ex/s]

out path:  longseqtextclsdata/100/cb_csv/2_200
Task label key:  label


Reusing dataset super_glue (/home/nguyen/.cache/huggingface/datasets/super_glue/cb/1.0.2/d040c658e2ddef6934fdd97deb45c777b6ff50c524781ea434e7219b56a428a7)
Reusing dataset super_glue (/home/nguyen/.cache/huggingface/datasets/super_glue/cb/1.0.2/d040c658e2ddef6934fdd97deb45c777b6ff50c524781ea434e7219b56a428a7)


  0%|          | 0/250 [00:00<?, ?ex/s]

  0%|          | 0/56 [00:00<?, ?ex/s]

out path:  longseqtextclsdata/100/multirc_csv/2_200
Task label key:  label


Reusing dataset super_glue (/home/nguyen/.cache/huggingface/datasets/super_glue/multirc/1.0.2/d040c658e2ddef6934fdd97deb45c777b6ff50c524781ea434e7219b56a428a7)
Reusing dataset super_glue (/home/nguyen/.cache/huggingface/datasets/super_glue/multirc/1.0.2/d040c658e2ddef6934fdd97deb45c777b6ff50c524781ea434e7219b56a428a7)


  0%|          | 0/27243 [00:00<?, ?ex/s]

  0%|          | 0/4848 [00:00<?, ?ex/s]

out path:  longseqtextclsdata/100/imdb_csv/2_200
Task label key:  label
Task not glue imdb


Reusing dataset imdb (/home/nguyen/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)
Reusing dataset imdb (/home/nguyen/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/25000 [00:00<?, ?ex/s]

  0%|          | 0/25000 [00:00<?, ?ex/s]

out path:  longseqtextclsdata/100/ag_news_csv/2_200
Task label key:  label
Task not glue ag_news


Using custom data configuration default
Reusing dataset ag_news (/home/nguyen/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)
Using custom data configuration default
Reusing dataset ag_news (/home/nguyen/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)


  0%|          | 0/120000 [00:00<?, ?ex/s]

  0%|          | 0/7600 [00:00<?, ?ex/s]

out path:  longseqtextclsdata/100/yelp_review_full_csv/2_200
Task label key:  label
Task not glue yelp_review_full


Reusing dataset yelp_review_full (/home/nguyen/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf)
Reusing dataset yelp_review_full (/home/nguyen/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf)


  0%|          | 0/650000 [00:00<?, ?ex/s]

  0%|          | 0/50000 [00:00<?, ?ex/s]

out path:  longseqtextclsdata/100/yahoo_answers_csv/2_200
Task label key:  topic
Task not glue yahoo_answers_topics


Reusing dataset yahoo_answers_topics (/home/nguyen/.cache/huggingface/datasets/yahoo_answers_topics/yahoo_answers_topics/1.0.0/0edb353eefe79d9245d7bd7cac5ae6af19530439da520d6dde1c206ee38f4439)
Reusing dataset yahoo_answers_topics (/home/nguyen/.cache/huggingface/datasets/yahoo_answers_topics/yahoo_answers_topics/1.0.0/0edb353eefe79d9245d7bd7cac5ae6af19530439da520d6dde1c206ee38f4439)


  0%|          | 0/1400000 [00:00<?, ?ex/s]

  0%|          | 0/60000 [00:00<?, ?ex/s]

out path:  longseqtextclsdata/100/dbpedia_csv/2_200
Task label key:  label
Task not glue dbpedia_14


Reusing dataset dbpedia_14 (/home/nguyen/.cache/huggingface/datasets/dbpedia_14/dbpedia_14/2.0.0/01dab9e10d969eadcdbc918be5a09c9190a24caeae33b10eee8f367a1e3f1f0c)
Reusing dataset dbpedia_14 (/home/nguyen/.cache/huggingface/datasets/dbpedia_14/dbpedia_14/2.0.0/01dab9e10d969eadcdbc918be5a09c9190a24caeae33b10eee8f367a1e3f1f0c)


  0%|          | 0/560000 [00:00<?, ?ex/s]

  0%|          | 0/70000 [00:00<?, ?ex/s]

out path:  longseqtextclsdata/100/amazon_csv/2_200
Task label key:  label


  0%|          | 0/115000 [00:00<?, ?ex/s]

  0%|          | 0/7600 [00:00<?, ?ex/s]