In [72]:
# https://www.tensorflow.org/tutorials/load_data/csv

import pandas as pd
from typing import List
import csv

dataset = pd.read_csv(
    "memes900k/captions_val.txt",
    delimiter="\t",
    quoting=csv.QUOTE_NONE, # this is neccesary, because the original input is misquoted
    names=["label", "score", "text"]
)
dataset.drop("score", axis="columns", inplace=True)
dataset

import csv

def output_to_file(path: str, dataset, columns: List[str], header: bool = False):
    df = dataset.copy(deep=True)
    df["label"] = df["label"].apply(lambda x: x[:28])
    df["label"] = df["label"].apply(lambda x: x.encode('ascii', 'ignore').decode('ascii'))
#     df.drop_duplicates(subset=columns[0], keep=False, inplace=True)
    df.to_csv(path, index=False, quoting=csv.QUOTE_ALL, columns=columns, header=header, encoding='ascii')
    
# output_to_file("captions_test_tag_universal_sep.csv", dataset, ["text", "label"], header=True)
dataset

Unnamed: 0,label,text
0,Y U No,commercial <sep> y u no same volume as show!?
1,Y U No,TED <sep> y u no tell us how you met their mother
2,Y U No,INTERNET <sep> y u nO LET ME STUDY
3,Y U No,Bad grades <sep> y u no smoke weed and get high?
4,Y U No,Green land <sep> Y U No Green?
...,...,...
74995,Bane Permission to Die,YOU THINK YOU KNOW PHYSIO <sep> I WAS BORN IN ...
74996,Bane Permission to Die,You didnt read the customs disclaimer <sep> Yo...
74997,Bane Permission to Die,you may deny my word when <sep> i give you per...
74998,Bane Permission to Die,ONLY WHEN I HAVE YOUR NUMBER <sep> I WILL BUY ...


In [73]:
dataset = pd.read_csv(
    "captions_test_tag_universal_sep.csv",
    names=["text", "label"]
)
dataset

Unnamed: 0,text,label
0,ADV ADV NOUN SEP NOUN VERB DET NOUN NOUN,Y U No
1,NOUN SEP NOUN NOUN NOUN NOUN NOUN .,Y U No
2,NOUN SEP NOUN NOUN NOUN NOUN . . .,Y U No
3,PRON VERB . DET PRON NOUN SEP NOUN NOUN NOUN V...,Y U No
4,NOUN SEP NOUN ADJ DET NOUN VERB ADJ VERB . .,Y U No
...,...,...
74600,ADV NOUN PRT NOUN VERB NOUN SEP ADV PRON VERB ...,Bane Permission to Die
74601,NOUN NOUN . PRON VERB PRON SEP CONJ PRON VERB ...,Bane Permission to Die
74602,NOUN NOUN VERB . SEP ADV ADJ .,Bane Permission to Die
74603,ADV NOUN VERB NOUN SEP PRON VERB PRON NOUN PRT...,Bane Permission to Die


In [74]:
labels = list(set(dataset.label.tolist()))
# labels = [x.replace(" ", "_").encode('ascii', 'ignore').decode('ascii') for x in labels]
labels[:5]

['Anti Joke Chicken',
 'gordo granudo',
 'Chill Out Lemur',
 'American Pride Eagle',
 'Stoner Stanley']

In [75]:
import pathlib

folder_paths = {}

for label in labels:
    folder_name = label.replace(" ", "_").encode('ascii', 'ignore').decode('ascii')
    folder_name = ''.join(x for x in folder_name if x.isalpha() or x == "_")
    folder_paths[label] = folder_name


In [76]:
folder_paths

{'Anti Joke Chicken': 'Anti_Joke_Chicken',
 'gordo granudo': 'gordo_granudo',
 'Chill Out Lemur': 'Chill_Out_Lemur',
 'American Pride Eagle': 'American_Pride_Eagle',
 'Stoner Stanley': 'Stoner_Stanley',
 'obama laughing ': 'obama_laughing_',
 'say what one more time': 'say_what_one_more_time',
 'Donald Trump': 'Donald_Trump',
 'Futurama Fry': 'Futurama_Fry',
 'Overly Attached Girlfriend': 'Overly_Attached_Girlfriend',
 'skyrim stan': 'skyrim_stan',
 'Retail Robin': 'Retail_Robin',
 'mens wearhouse': 'mens_wearhouse',
 'That escalated quickly-Ron B': 'That_escalated_quicklyRon_B',
 'RomneyMakes.com': 'RomneyMakescom',
 'dolan meme': 'dolan_meme',
 'Nio Malvado - Evil Toddler': 'Nio_Malvado__Evil_Toddler',
 'dr. evil quotation marks': 'dr_evil_quotation_marks',
 'you mean to tell me black ki': 'you_mean_to_tell_me_black_ki',
 'Sunny Student': 'Sunny_Student',
 "I don't always guy meme": 'I_dont_always_guy_meme',
 'Technologically Impaired Duc': 'Technologically_Impaired_Duc',
 "And it's 

In [77]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(dataset, test_size=0.2)
train

Unnamed: 0,text,label
74050,NOUN NOUN NOUN NOUN SEP NOUN NOUN NOUN NOUN NOUN,men in black
55474,NOUN PRT ADV VERB . SEP PRON VERB VERB,Nick Cage
63630,PRON VERB NOUN NOUN NOUN NOUN NOUN NOUN NOUN ....,Not today arya
23946,NOUN NOUN VERB DET ADJ ADJ NOUN SEP DET NOUN V...,Unpopular Opinion Puffin
67995,NOUN SEP VERB ADV VERB NOUN PRT ADP NOUN,Grumpy Cat Santa Hat
...,...,...
60056,NOUN VERB SEP . NOUN .,dr. evil quotation marks
20051,ADV ADJ ADP VERB SEP CONJ VERB ADV ADJ,Not sure if troll
1776,NOUN ADP ADJ NOUN SEP ADV VERB,First World Problems
31579,PRON VERB ADP NOUN SEP PRON VERB ADV ADV NOUN ...,Hipster Ariel


In [78]:
from tqdm import tqdm
import shutil


def output_rows_to_folders(path_prefix: str, df):
    pathlib.Path(path_prefix).mkdir(parents=True, exist_ok=True)
    shutil.rmtree(path_prefix)

    for index, row in tqdm(df.iterrows()):
        text = row["text"]
        label = row["label"]

        folder_path = path_prefix + "/" + folder_paths[label]
        file_path = folder_path +"/"+ str(index).zfill(3)+".txt"

        pathlib.Path(folder_path).mkdir(parents=True, exist_ok=True)

        with open(file_path, "w", encoding="utf8") as f:
            f.write(text)


In [80]:
# output_rows_to_folders("custom_dataset/train", train)
# output_rows_to_folders("custom_dataset/test", test)
output_rows_to_folders("pos_dataset/train", train)
output_rows_to_folders("pos_dataset/test", test)

59684it [00:33, 1755.51it/s]
14921it [00:08, 1679.40it/s]


In [None]:
# import tarfile
# import os.path

# def make_tarfile(output_filename, source_dir):
#     with tarfile.open(output_filename, "w:gz") as tar:
#         tar.add(source_dir, arcname=os.path.basename(source_dir))
        
# make_tarfile("custom_dataset.tar.gz", "custom_dataset")