In [1]:
import pandas as pd
from typing import List, Optional
import nltk
from tqdm import tqdm
import csv

# Fetch the NLTK data packages this notebook relies on; one download call per package.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ondre\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ondre\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\ondre\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [2]:
# Load the tab-separated captions file; column names are supplied explicitly.
dataset = pd.read_csv(
    'memes900k/captions_test.txt',
    delimiter="\t",
    quoting=csv.QUOTE_NONE, # necessary: the raw input contains unescaped double quotes
    names=['MemeName', 'Score', 'Text']
)
# The score column is not used anywhere below, so drop it right away.
dataset = dataset.drop(columns=['Score'])
dataset.head(5)

Unnamed: 0,MemeName,Text
0,Y U No,Forever alone guy <sep> y u no get cat
1,Y U No,TEAMMATES <sep> Y U NO REVIVE ME?
2,Y U No,GIRLS <sep> Y U SO COMPLICATED ??!
3,Y U No,I 'like' all your pics <sep> Y U No have sex w...
4,Y U No,girls <sep> y u no stop making duck faces?!


In [3]:
# Sanity check for the QUOTE_NONE setting: print every row whose caption contains
# a phrase known to come from an input line with an unescaped quote.
for index, row in tqdm(dataset.iterrows()):
    if "here to offer you a great deal" not in row["Text"]: # skip all rows except the known unescaped-quote caption
        continue
    print(row)

7389it [00:00, 36605.87it/s]

MemeName                                        Grumpy Cat 
Text        "I'm here to offer you a great deal on <sep> no
Name: 2337, dtype: object


75000it [00:02, 37082.07it/s]


In [4]:
def split_memes(dataset, separator: str = "<sep>"):
    """Split each caption in ``dataset["Text"]`` into a prefix and a suffix.

    The caption is cut at the first occurrence of ``separator``. The text
    before it is stored in a new ``"prefix"`` column and the text after it in
    a new ``"suffix"`` column. Surrounding whitespace is kept as-is.

    Args:
        dataset: DataFrame with a ``"Text"`` column; modified in place.
        separator: Marker dividing the two caption parts. Defaults to the
            ``"<sep>"`` marker used in the captions file.

    Notes:
        * A caption without the separator yields the whole caption as the
          prefix and an empty suffix.
        * A missing caption (``None``/NaN) yields an empty prefix and an
          empty suffix instead of raising.
    """
    # Normalise missing values to "" so that str.partition is always applicable.
    captions = ["" if pd.isna(text) else str(text) for text in dataset["Text"]]
    pieces = [caption.partition(separator) for caption in captions]

    # Lists are assigned positionally, so the frame's index does not matter.
    dataset["prefix"] = [before for before, _, _ in pieces]
    dataset["suffix"] = [after for _, _, after in pieces]

    
# Add the "prefix" and "suffix" columns to `dataset` in place, then preview the result.
split_memes(dataset)
dataset.head(5)

100%|███████████████████████████████████████████████████████████████████████| 75000/75000 [00:00<00:00, 1933393.57it/s]


Unnamed: 0,MemeName,Text,prefix,suffix
0,Y U No,Forever alone guy <sep> y u no get cat,Forever alone guy,y u no get cat
1,Y U No,TEAMMATES <sep> Y U NO REVIVE ME?,TEAMMATES,Y U NO REVIVE ME?
2,Y U No,GIRLS <sep> Y U SO COMPLICATED ??!,GIRLS,Y U SO COMPLICATED ??!
3,Y U No,I 'like' all your pics <sep> Y U No have sex w...,I 'like' all your pics,Y U No have sex with me?
4,Y U No,girls <sep> y u no stop making duck faces?!,girls,y u no stop making duck faces?!


In [5]:
def tag_sentence(sentence: str, tagset=None) -> List[str]:
    """Return the part-of-speech tags of ``sentence``, one per token.

    The sentence is tokenized with ``nltk.word_tokenize`` and the tokens are
    tagged with ``nltk.pos_tag``; only the tags are returned, the tokens
    themselves are discarded.

    Args:
        sentence: Text to tag.
        tagset: Forwarded unchanged to ``nltk.pos_tag`` as its ``tagset``
            argument.

    Returns:
        The tags in token order.
    """
    tokens: List[str] = nltk.word_tokenize(sentence)
    # pos_tag yields (token, tag) pairs; keep the second element of each pair.
    return [tag for _token, tag in nltk.pos_tag(tokens, tagset=tagset)]

In [6]:
def tag_sentences_in_column(dataset, column: str, tagset: Optional[str] = None):
    """Tag every sentence of one column and store the tags in a new column.

    Each value of ``dataset[column]`` is run through ``tag_sentence`` and its
    tags are joined with single spaces. The results are written in place to a
    column named ``"<column>_tagged_<tagset>"``; with the default tagset that
    name literally ends in ``"_tagged_None"``.

    Args:
        dataset: DataFrame to read from and add the new column to.
        column: Name of the column holding the sentences.
        tagset: Tagset passed on to ``tag_sentence``.
    """
    sentences = dataset.loc[:, column]
    tag_strings = [
        " ".join(tag_sentence(text, tagset=tagset))
        for text in tqdm(sentences)
    ]
    dataset[f"{column}_tagged_{tagset}"] = tag_strings


In [10]:
# Tagsets to generate POS tags for. Each value is handed to nltk.pos_tag as `tagset`
# and also becomes part of the generated column names (e.g. "prefix_tagged_None").
tagsets = [None, 'universal'] # None selects NLTK's native tagset
# tagsets = ['universal']  # alternative: tag with the universal tagset only

In [11]:

# The separator-free caption does not depend on the tagset, so build it once
# before the loop. regex=False because "<sep>" is a literal marker, not a pattern.
dataset['TextWithoutSeparator'] = dataset['Text'].str.replace('<sep>', ' ', regex=False)

# Tag both caption parts and the whole caption once per tagset. Every call adds
# a "<column>_tagged_<tagset>" column to `dataset`.
for tagset in tagsets:
    tag_sentences_in_column(dataset, "prefix", tagset)
    tag_sentences_in_column(dataset, "suffix", tagset)
    tag_sentences_in_column(dataset, "TextWithoutSeparator", tagset)

dataset.head(5)

100%|███████████████████████████████████████████████████████████████████████████| 75000/75000 [01:41<00:00, 740.54it/s]
100%|███████████████████████████████████████████████████████████████████████████| 75000/75000 [01:41<00:00, 738.74it/s]
100%|███████████████████████████████████████████████████████████████████████████| 75000/75000 [01:56<00:00, 643.67it/s]
100%|███████████████████████████████████████████████████████████████████████████| 75000/75000 [01:40<00:00, 744.77it/s]
100%|███████████████████████████████████████████████████████████████████████████| 75000/75000 [01:39<00:00, 750.04it/s]
100%|███████████████████████████████████████████████████████████████████████████| 75000/75000 [01:53<00:00, 659.25it/s]


Unnamed: 0,MemeName,Text,prefix,suffix,TextWithoutSeparator,prefix_tagged_universal,prefix_tagged_None,suffix_tagged_None,TextWithoutSeparator_tagged_None,suffix_tagged_universal,TextWithoutSeparator_tagged_universal
0,Y U No,Forever alone guy <sep> y u no get cat,Forever alone guy,y u no get cat,Forever alone guy y u no get cat,ADV ADV NOUN,RB RB NN,NN VBZ DT NN NN,RB RB JJ NNS VBP DT NN NN,NOUN VERB DET NOUN NOUN,ADV ADV ADJ NOUN VERB DET NOUN NOUN
1,Y U No,TEAMMATES <sep> Y U NO REVIVE ME?,TEAMMATES,Y U NO REVIVE ME?,TEAMMATES Y U NO REVIVE ME?,NOUN,NNS,NNP NNP NNP NNP NNP .,NNP NNP NNP NNP NNP NNP .,NOUN NOUN NOUN NOUN NOUN .,NOUN NOUN NOUN NOUN NOUN NOUN .
2,Y U No,GIRLS <sep> Y U SO COMPLICATED ??!,GIRLS,Y U SO COMPLICATED ??!,GIRLS Y U SO COMPLICATED ??!,NOUN,NNS,NNP NNP NNP NNP . . .,NNP NNP NNP NNP NNP . . .,NOUN NOUN NOUN NOUN . . .,NOUN NOUN NOUN NOUN NOUN . . .
3,Y U No,I 'like' all your pics <sep> Y U No have sex w...,I 'like' all your pics,Y U No have sex with me?,I 'like' all your pics Y U No have sex with me?,PRON VERB . DET PRON NOUN,PRP MD '' DT PRP$ NNS,NNP NNP NNP VBP NN IN PRP .,PRP MD '' DT PRP$ NNS NNP NNP NNP VBP NN IN PRP .,NOUN NOUN NOUN VERB NOUN ADP PRON .,PRON VERB . DET PRON NOUN NOUN NOUN NOUN VERB ...
4,Y U No,girls <sep> y u no stop making duck faces?!,girls,y u no stop making duck faces?!,girls y u no stop making duck faces?!,NOUN,NNS,NN JJ DT NN VBG JJ VBZ . .,NNS VBP JJ DT NN VBG JJ VBZ . .,NOUN ADJ DET NOUN VERB ADJ VERB . .,NOUN VERB ADJ DET NOUN VERB ADJ VERB . .


In [25]:
def split_whole_sentence_tags_to_prefix_and_suffix(dataset, tagsets: List[Optional[str]]):
    """Replace per-part tags with tags cut out of the whole-caption tagging.

    For every tagset, the tag string of the whole caption
    (``"TextWithoutSeparator_tagged_<tagset>"``) is split at the position
    given by the number of tags in ``"prefix_tagged_<tagset>"``. The two
    pieces overwrite ``"prefix_tagged_<tagset>"`` and
    ``"suffix_tagged_<tagset>"`` in place.

    The split is only applied when the prefix and suffix tag counts add up to
    the whole-caption tag count. Otherwise the row keeps its separately
    computed prefix/suffix tags.

    Args:
        dataset: DataFrame holding the three tag columns for each tagset;
            modified in place.
        tagsets: Tagset names used as column-name suffixes (``None`` maps to
            the literal suffix ``"None"``).
    """
    for tagset in tagsets:
        prefix_column = f"prefix_tagged_{tagset}"
        suffix_column = f"suffix_tagged_{tagset}"
        whole_column = f"TextWithoutSeparator_tagged_{tagset}"

        new_prefixes = []
        new_suffixes = []

        # Iterate over the three columns directly; much cheaper than iterrows().
        rows = zip(dataset[prefix_column], dataset[suffix_column], dataset[whole_column])
        for prefix_tags, suffix_tags, whole_tags in tqdm(rows, total=len(dataset)):
            # split() without an argument returns [] for an empty string, so a
            # part without any tags counts as 0 tags (split(" ") would give 1).
            prefix_len = len(prefix_tags.split())
            suffix_len = len(suffix_tags.split())
            whole = whole_tags.split()

            if prefix_len + suffix_len == len(whole):
                new_prefixes.append(" ".join(whole[:prefix_len]))
                new_suffixes.append(" ".join(whole[prefix_len:]))
            else:
                # Tokenization of the parts and of the whole caption disagree:
                # fall back to the separately tagged parts.
                new_prefixes.append(prefix_tags)
                new_suffixes.append(suffix_tags)

        dataset[prefix_column] = new_prefixes
        dataset[suffix_column] = new_suffixes

# Overwrite the prefix/suffix tag columns of every tagset in place.
split_whole_sentence_tags_to_prefix_and_suffix(dataset, tagsets)

75000it [00:03, 24810.64it/s]
75000it [00:02, 25308.16it/s]


In [14]:
def cleanup_titles(df, max_length: int = 28):
    """Shorten the meme names in ``df["MemeName"]`` and strip non-ASCII characters.

    Each name is first truncated to ``max_length`` characters and then has
    every non-ASCII character removed. The result can therefore be shorter
    than ``max_length``. The column is overwritten in place.

    Args:
        df: DataFrame with a ``"MemeName"`` column of strings.
        max_length: Maximum number of characters kept before non-ASCII
            characters are removed. Defaults to 28.
            NOTE(review): the reason for 28 is not stated in this notebook.
    """
    def _clean(title: str) -> str:
        # Truncate first, then drop whatever cannot be encoded as ASCII.
        return title[:max_length].encode('ascii', 'ignore').decode('ascii')

    df["MemeName"] = df["MemeName"].apply(_clean)

# Truncate the meme names and strip their non-ASCII characters in place.
cleanup_titles(dataset)

In [19]:
def output_to_file(path: str, dataset, columns: List[str], header: bool = False, drop_duplicates: bool = False):
    """Write selected columns of ``dataset`` to an ASCII CSV file with all fields quoted.

    Args:
        path: Destination file path.
        dataset: DataFrame to export; it is not modified.
        columns: Columns to write, in this order.
        header: Whether to write the column names as the first line.
        drop_duplicates: When true, rows are compared on every column except
            the last one in ``columns``, and *all* rows sharing the same
            values are removed (``keep=False``), not just the repeats.
    """
    # drop_duplicates returns a new frame and to_csv does not modify its
    # input, so no defensive copy of the caller's data is needed.
    rows = dataset
    if drop_duplicates:
        rows = rows.drop_duplicates(subset=columns[:-1], keep=False)

    rows.to_csv(path, index=False, quoting=csv.QUOTE_ALL, columns=columns, header=header, encoding='ascii')


In [23]:
def count_parts(df, tagsets: List[Optional[str]]):
    """Add tag-count columns for the prefix and suffix tag strings.

    For every tagset, the number of whitespace-separated tags in
    ``"prefix_tagged_<tagset>"`` and ``"suffix_tagged_<tagset>"`` is stored
    in place in ``"prefix_tagged_<tagset>_len"`` and
    ``"suffix_tagged_<tagset>_len"``.

    Args:
        df: DataFrame holding the tag-string columns; modified in place.
        tagsets: Tagset names used as column-name suffixes (``None`` maps to
            the literal suffix ``"None"``).
    """
    def _tag_count(tags: str) -> int:
        # split() without an argument returns [] for an empty string, so a
        # part without tags is counted as 0 (split(" ") would report 1).
        return len(tags.split())

    for tagset in tagsets:
        for part in ("prefix", "suffix"):
            column = f"{part}_tagged_{tagset}"
            df[f"{column}_len"] = df[column].apply(_tag_count)


In [24]:
# Add the "<part>_tagged_<tagset>_len" tag-count columns for every tagset.
count_parts(dataset, tagsets)

In [26]:
# Preview the frame, including the tag columns and their *_len counts.
dataset.head(5)

Unnamed: 0,MemeName,Text,prefix,suffix,TextWithoutSeparator,prefix_tagged_universal,prefix_tagged_None,suffix_tagged_None,TextWithoutSeparator_tagged_None,suffix_tagged_universal,TextWithoutSeparator_tagged_universal,prefix_tagged_None_len,suffix_tagged_None_len,prefix_tagged_universal_len,suffix_tagged_universal_len
0,Y U No,Forever alone guy <sep> y u no get cat,Forever alone guy,y u no get cat,Forever alone guy y u no get cat,ADV ADV ADJ,RB RB JJ,NNS VBP DT NN NN,RB RB JJ NNS VBP DT NN NN,NOUN VERB DET NOUN NOUN,ADV ADV ADJ NOUN VERB DET NOUN NOUN,3,5,3,5
1,Y U No,TEAMMATES <sep> Y U NO REVIVE ME?,TEAMMATES,Y U NO REVIVE ME?,TEAMMATES Y U NO REVIVE ME?,NOUN,NNP,NNP NNP NNP NNP NNP .,NNP NNP NNP NNP NNP NNP .,NOUN NOUN NOUN NOUN NOUN .,NOUN NOUN NOUN NOUN NOUN NOUN .,1,6,1,6
2,Y U No,GIRLS <sep> Y U SO COMPLICATED ??!,GIRLS,Y U SO COMPLICATED ??!,GIRLS Y U SO COMPLICATED ??!,NOUN,NNP,NNP NNP NNP NNP . . .,NNP NNP NNP NNP NNP . . .,NOUN NOUN NOUN NOUN . . .,NOUN NOUN NOUN NOUN NOUN . . .,1,7,1,7
3,Y U No,I 'like' all your pics <sep> Y U No have sex w...,I 'like' all your pics,Y U No have sex with me?,I 'like' all your pics Y U No have sex with me?,PRON VERB . DET PRON NOUN,PRP MD '' DT PRP$ NNS,NNP NNP NNP VBP NN IN PRP .,PRP MD '' DT PRP$ NNS NNP NNP NNP VBP NN IN PRP .,NOUN NOUN NOUN VERB NOUN ADP PRON .,PRON VERB . DET PRON NOUN NOUN NOUN NOUN VERB ...,6,8,6,8
4,Y U No,girls <sep> y u no stop making duck faces?!,girls,y u no stop making duck faces?!,girls y u no stop making duck faces?!,NOUN,NNS,VBP JJ DT NN VBG JJ VBZ . .,NNS VBP JJ DT NN VBG JJ VBZ . .,VERB ADJ DET NOUN VERB ADJ VERB . .,NOUN VERB ADJ DET NOUN VERB ADJ VERB . .,1,9,1,9


In [32]:
# Per-representation exports: two feature columns followed by the MemeName label.
# With drop_duplicates=True, rows whose feature columns (everything but the last
# column) occur more than once are removed entirely before writing.
output_to_file("universal_tags_only_for_google.csv", dataset, ['prefix_tagged_universal', 'suffix_tagged_universal', 'MemeName'], header=False, drop_duplicates=True)
output_to_file("default_tags_only_for_google.csv", dataset, ['prefix_tagged_None', 'suffix_tagged_None', 'MemeName'], header=False, drop_duplicates=True)
output_to_file("prefix_suffix_for_google.csv", dataset, ['prefix', 'suffix', 'MemeName'], header=True, drop_duplicates=True)

# Full dump of every column currently in the frame, duplicates included.
output_to_file("all.csv", dataset, dataset.columns, header=True)

# Raw caption parts plus their universal tags and tag counts, with the label last.
selected_columns = [
    "prefix", "suffix",
    "prefix_tagged_universal", "suffix_tagged_universal",
    "prefix_tagged_universal_len", "suffix_tagged_universal_len",
    "MemeName"
]

output_to_file("selected_to_google.csv", dataset, selected_columns, header=True)


In [34]:
# Distinct meme names in order of first appearance. unique() is deterministic,
# whereas a set of strings has no stable iteration order across interpreter
# runs, which would make the 20 selected categories differ from run to run.
labels = dataset["MemeName"].unique().tolist()

# Keep only the rows belonging to the first 20 categories and export them.
dataset_20_labels = dataset[dataset["MemeName"].isin(labels[:20])]
output_to_file("selected_20_categories_to_google.csv", dataset_20_labels, selected_columns, header=True)