In [1]:
import pandas as pd
import numpy as np
import json
import re
import os

# --------- Pandas Settings ---------- #
# Show up to 200 rows and 200 columns before pandas truncates the display.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)
# `None` means "never truncate cell contents". The legacy sentinel -1 was
# deprecated in pandas 1.0 and is no longer accepted by recent releases.
pd.set_option('display.max_colwidth', None)

# Clean `Captions_new.txt` and save it as `cleaned_captions.txt`

In [2]:
# Directory (relative to the notebook) that holds the input and output files.
DATA_DIR = '../data'
# Name of the raw captions file inside DATA_DIR.
FILE_NAME = 'Captions_new.txt'

# FILE_PATH is the raw input; SAVE_FILE_PATH is where the cleaned copy is written.
FILE_PATH, SAVE_FILE_PATH = (
    os.path.join(DATA_DIR, name) for name in (FILE_NAME, 'cleaned_captions.txt')
)

In [3]:
def clean_txt_file(file_path, pattern_to_match, pattern_to_replace, encoding=None):
    """Rewrite the first occurrence of a pattern on every line of a text file.

    Each line is right-stripped. Lines in which ``pattern_to_match`` is found
    have their first match replaced by ``pattern_to_replace`` and are kept;
    lines without a match are printed with a ``[NON-MATCHING LINE]`` prefix and
    dropped. Line, match and non-match totals are printed at the end.

    Parameters
    ----------
    file_path : str
        Path of the text file to read.
    pattern_to_match : str or compiled pattern
        Regular expression searched for in every line.
    pattern_to_replace : str
        Replacement for the first match in a line.
    encoding : str, optional
        Encoding passed to ``open``; ``None`` keeps the platform default.

    Returns
    -------
    list of str
        The rewritten matching lines. An empty list is returned (instead of
        ``None``) when the file does not exist or cannot be processed, so the
        result can always be iterated.
    """
    if not os.path.exists(file_path):
        print(f'[FILE NOT FOUND]: {file_path}')
        return []

    data = []
    line_count = 0
    try:
        # Compile once; `subn` both replaces and reports whether anything
        # matched, so the line is scanned a single time.
        regex = re.compile(pattern_to_match)
        with open(file_path, 'r', encoding=encoding) as file_io:
            for line in file_io:
                line = line.rstrip()
                new_line, replaced = regex.subn(pattern_to_replace, line, count=1)
                if replaced:
                    data.append(new_line)
                else:
                    print(f'[NON-MATCHING LINE]: {line}')
                line_count += 1
    except Exception as e:
        # Best effort, as before: report the problem instead of raising, but
        # hand back an iterable so callers do not trip over ``None``.
        print(e)
        return []

    matching_count = len(data)
    print(f'Total Line count: {line_count}')
    print(f'Total Matching count: {matching_count}')
    print(f'Total Non-Matching count: {line_count - matching_count}')
    return data

In [4]:
# Turn the first " - " on each line into the unambiguous "-+-" field separator;
# lines without that separator are reported and dropped.
data = clean_txt_file(
    file_path=FILE_PATH,
    pattern_to_match=r'( - ){1}',
    pattern_to_replace='-+-',
)

[NON-MATCHING LINE]: Kermit The Frog Drinking Tea -
[NON-MATCHING LINE]: drunk baby 1 -
[NON-MATCHING LINE]: san juan cholo -
[NON-MATCHING LINE]: san juan cholo -
[NON-MATCHING LINE]: san juan cholo -
[NON-MATCHING LINE]: san juan cholo -
[NON-MATCHING LINE]: san juan cholo -
[NON-MATCHING LINE]: kill yourself guy -
[NON-MATCHING LINE]: kill yourself guy -
[NON-MATCHING LINE]: kill yourself guy -
[NON-MATCHING LINE]: kill yourself guy -
[NON-MATCHING LINE]: crying peter parker -
[NON-MATCHING LINE]: fat chinese kid -
[NON-MATCHING LINE]: Keep calm and -
[NON-MATCHING LINE]: Keep calm and -
[NON-MATCHING LINE]: Keep calm and -
[NON-MATCHING LINE]: Keep calm and -
[NON-MATCHING LINE]: Really Stoned Guy -
[NON-MATCHING LINE]: Socially Awesome Awkward Penguin -
[NON-MATCHING LINE]: really high guy -
[NON-MATCHING LINE]: african kids dancing -
[NON-MATCHING LINE]: african kids dancing -
[NON-MATCHING LINE]: Unlucky Brian Strikes Again -
[NON-MATCHING LINE]: Kermit the frog -
[NON-MATCHING 

[NON-MATCHING LINE]: Donkey Shrek -
[NON-MATCHING LINE]: potential emigrant -
[NON-MATCHING LINE]: potential emigrant -
[NON-MATCHING LINE]: potential emigrant -
[NON-MATCHING LINE]: potential emigrant -
[NON-MATCHING LINE]: Kawaii Uguu -
[NON-MATCHING LINE]: Guffdead -
[NON-MATCHING LINE]: Guffdead -
[NON-MATCHING LINE]: Guffdead -
[NON-MATCHING LINE]: Guffdead -
[NON-MATCHING LINE]: Guffdead -
[NON-MATCHING LINE]: Lumberg -
[NON-MATCHING LINE]: Lumberg -
[NON-MATCHING LINE]: tristolla  -
[NON-MATCHING LINE]: tristolla  -
[NON-MATCHING LINE]: tristolla  -
[NON-MATCHING LINE]: tristolla  -
[NON-MATCHING LINE]: tristolla  -
[NON-MATCHING LINE]: tristolla  -
[NON-MATCHING LINE]: Minecraft Creeper Meme -
[NON-MATCHING LINE]: Minecraft Creeper Meme -
[NON-MATCHING LINE]: Minecraft Creeper Meme -
[NON-MATCHING LINE]: Minecraft Creeper Meme -
[NON-MATCHING LINE]: Minecraft Creeper Meme -
[NON-MATCHING LINE]: Minecraft Creeper Meme -
[NON-MATCHING LINE]: Minecraft Creeper Meme -
[NON-MATCHING

[NON-MATCHING LINE]: Knapped  -
[NON-MATCHING LINE]: Knapped  -
[NON-MATCHING LINE]: Knapped  -
[NON-MATCHING LINE]: Knapped  -
[NON-MATCHING LINE]: Knapped  -
[NON-MATCHING LINE]: Knapped  -
[NON-MATCHING LINE]: Knapped  -
[NON-MATCHING LINE]: Evil Granny -
[NON-MATCHING LINE]: Skuchayuschiy Botan -
[NON-MATCHING LINE]: Cristiano -
[NON-MATCHING LINE]: Colibritany xD -
[NON-MATCHING LINE]: Colibritany xD -
[NON-MATCHING LINE]: Colibritany xD -
[NON-MATCHING LINE]: Colibritany xD -
[NON-MATCHING LINE]: Colibritany xD -
[NON-MATCHING LINE]: Colibritany xD -
[NON-MATCHING LINE]: Colibritany xD -
[NON-MATCHING LINE]: durant harden -
[NON-MATCHING LINE]: cap art -
[NON-MATCHING LINE]: cap art -
[NON-MATCHING LINE]: Kanye Finish -
[NON-MATCHING LINE]: Slash Chameleon -
[NON-MATCHING LINE]: Slash Chameleon -
[NON-MATCHING LINE]: Slash Chameleon -
[NON-MATCHING LINE]: Slash Chameleon -
[NON-MATCHING LINE]: but it is not this day -
[NON-MATCHING LINE]: but it is not this day -
[NON-MATCHING LI

[NON-MATCHING LINE]: NACHO VIDAL MEME -
[NON-MATCHING LINE]: NACHO VIDAL MEME -
[NON-MATCHING LINE]: NACHO VIDAL MEME -
[NON-MATCHING LINE]: NACHO VIDAL MEME -
[NON-MATCHING LINE]: NACHO VIDAL MEME -
[NON-MATCHING LINE]: Soccer Fan -
[NON-MATCHING LINE]: Soccer Fan -
[NON-MATCHING LINE]: Soccer Fan -
[NON-MATCHING LINE]: Soccer Fan -
[NON-MATCHING LINE]: Soccer Fan -
[NON-MATCHING LINE]: First World Metal Problems -
[NON-MATCHING LINE]: First World Metal Problems -
[NON-MATCHING LINE]: X The Musical Student X -
[NON-MATCHING LINE]: X The Musical Student X -
[NON-MATCHING LINE]: X The Musical Student X -
[NON-MATCHING LINE]: X The Musical Student X -
[NON-MATCHING LINE]: X The Musical Student X -
[NON-MATCHING LINE]: X The Musical Student X -
[NON-MATCHING LINE]: drevil -
[NON-MATCHING LINE]: drevil -
[NON-MATCHING LINE]: drevil -
[NON-MATCHING LINE]: Me iria demasiado -
[NON-MATCHING LINE]: Me iria demasiado -
[NON-MATCHING LINE]: Me iria demasiado -
[NON-MATCHING LINE]: Me iria demasi

[NON-MATCHING LINE]: Typical Trance Listener -
[NON-MATCHING LINE]: Typical Trance Listener -
[NON-MATCHING LINE]: Typical Trance Listener -
[NON-MATCHING LINE]: Typical-Krasnodar -
[NON-MATCHING LINE]: pep guardiola -
[NON-MATCHING LINE]: pep guardiola -
[NON-MATCHING LINE]: Alco-cat -
[NON-MATCHING LINE]: Alco-cat -
[NON-MATCHING LINE]: Alco-cat -
[NON-MATCHING LINE]: Soon -
[NON-MATCHING LINE]: Tolyatti -
[NON-MATCHING LINE]: Tolyatti -
[NON-MATCHING LINE]: NIgel Thornberry -
[NON-MATCHING LINE]: Call Center Craig  -
[NON-MATCHING LINE]: Typical student Ravenclaw -
[NON-MATCHING LINE]: Typical student Ravenclaw -
[NON-MATCHING LINE]: Putin Says -
[NON-MATCHING LINE]: Putin Says -
[NON-MATCHING LINE]: niall horan1 -
[NON-MATCHING LINE]: niall horan1 -
[NON-MATCHING LINE]: Inveterate otaku -
[NON-MATCHING LINE]: Synesthete Snail -
[NON-MATCHING LINE]: f yeah -
[NON-MATCHING LINE]: stupid sexy bianca -
[NON-MATCHING LINE]: stupid sexy bianca -
[NON-MATCHING LINE]: stupid sexy bianca -


[NON-MATCHING LINE]: Annoying Tumblr girls -
[NON-MATCHING LINE]: Annoying Tumblr girls -
[NON-MATCHING LINE]: Tongo Dice -
[NON-MATCHING LINE]: Tongo Dice -
[NON-MATCHING LINE]: Tongo Dice -
[NON-MATCHING LINE]: Scumbag Chick Fil A Cow -
[NON-MATCHING LINE]: Scumbag Chick Fil A Cow -
[NON-MATCHING LINE]: Scumbag Chick Fil A Cow -
[NON-MATCHING LINE]: Scumbag Chick Fil A Cow -
[NON-MATCHING LINE]: Scumbag Chick Fil A Cow -
[NON-MATCHING LINE]: Dinosaur Director -
[NON-MATCHING LINE]: Dani Filth -
[NON-MATCHING LINE]: Dani Filth -
[NON-MATCHING LINE]: AdviceCar -
[NON-MATCHING LINE]: Crybmxguy -
[NON-MATCHING LINE]: Crybmxguy -
[NON-MATCHING LINE]: Ah, Yes, Reapers -
[NON-MATCHING LINE]: Disgusted Caco Antibes -
[NON-MATCHING LINE]: Odintsovo -
[NON-MATCHING LINE]: vold -
[NON-MATCHING LINE]: vold -
[NON-MATCHING LINE]: vold -
[NON-MATCHING LINE]: vold -
[NON-MATCHING LINE]: Creeper -
[NON-MATCHING LINE]: Creeper -
[NON-MATCHING LINE]: Creeper -
[NON-MATCHING LINE]: Creeper -
[NON-MATCH

[NON-MATCHING LINE]: Stoner dogs concerned friend -
[NON-MATCHING LINE]: Stoner dogs concerned friend -
[NON-MATCHING LINE]: Stoner dogs concerned friend -
[NON-MATCHING LINE]: Stoner dogs concerned friend -
[NON-MATCHING LINE]: Vivo Al Limite -
[NON-MATCHING LINE]: Vivo Al Limite -
[NON-MATCHING LINE]: Vivo Al Limite -
[NON-MATCHING LINE]: Vivo Al Limite -
[NON-MATCHING LINE]: Vivo Al Limite -
[NON-MATCHING LINE]: Drunk mentor -
[NON-MATCHING LINE]: Drunk mentor -
[NON-MATCHING LINE]: Surprised Cenk -
[NON-MATCHING LINE]: Parkour Boy -
[NON-MATCHING LINE]: Parkour Boy -
[NON-MATCHING LINE]: Foul Bachelor Frog (Alcoholic Anon) -
[NON-MATCHING LINE]: Jesus Facepalm -
[NON-MATCHING LINE]: Jesus Facepalm -
[NON-MATCHING LINE]: Try Hard Emos -
[NON-MATCHING LINE]: Try Hard Emos -
[NON-MATCHING LINE]: wiz khalifa -
[NON-MATCHING LINE]: wiz khalifa -
[NON-MATCHING LINE]: wiz khalifa -
[NON-MATCHING LINE]: wiz khalifa -
[NON-MATCHING LINE]: wiz khalifa -
[NON-MATCHING LINE]: jbismyloveasdfghj

[NON-MATCHING LINE]: Bad Luck Drizzy -
[NON-MATCHING LINE]: Bad Luck Drizzy -
[NON-MATCHING LINE]: Bad Luck Drizzy -
[NON-MATCHING LINE]: Chico Xavequeiro -
[NON-MATCHING LINE]: Chico Xavequeiro -
[NON-MATCHING LINE]: Surprised Hipster -
[NON-MATCHING LINE]: Reel Big Fish! (band) -
[NON-MATCHING LINE]: Expert_girl -
[NON-MATCHING LINE]: DamonDepressao -
[NON-MATCHING LINE]: DamonDepressao -
[NON-MATCHING LINE]: DamonDepressao -
[NON-MATCHING LINE]: DamonDepressao -
[NON-MATCHING LINE]: Poland ball -
[NON-MATCHING LINE]: Sel Gomez -
[NON-MATCHING LINE]: tom delonge -
[NON-MATCHING LINE]: tom delonge -
[NON-MATCHING LINE]: tom delonge -
[NON-MATCHING LINE]: face off -
[NON-MATCHING LINE]: face off -
[NON-MATCHING LINE]: Chilled out Shepard -
[NON-MATCHING LINE]: Chilled out Shepard -
[NON-MATCHING LINE]: The Pit Polar Bear -
[NON-MATCHING LINE]: cool hipster girl -
[NON-MATCHING LINE]: Flamboyant Gay Man -
[NON-MATCHING LINE]: Flamboyant Gay Man -
[NON-MATCHING LINE]: Flamboyant Gay Man 

In [5]:
def write_txt_file(file_path, data, encoding=None):
    """Write an iterable of strings to a text file, one item per line.

    Parameters
    ----------
    file_path : str
        Destination path; an existing file is overwritten.
    data : iterable of str
        Lines to write; a newline is appended to each one.
    encoding : str, optional
        Encoding passed to ``open``; ``None`` keeps the platform default.

    Errors raised while writing are printed rather than propagated. When
    ``data`` is ``None`` nothing is opened, so an existing file is not
    truncated by a failed upstream step.
    """
    if data is None:
        print(f'Nothing to write to {file_path}: data is None')
        return

    try:
        with open(file_path, 'w', encoding=encoding) as file_io:
            print(f'Writing to file...{file_path}')
            file_io.writelines(line + '\n' for line in data)
            print('Completed writing to file :)')
    except Exception as e:
        # Best effort, as before: report the problem instead of raising.
        print(e)
        
# Persist the cleaned lines so they can be parsed by pandas in the next section.
write_txt_file(file_path=SAVE_FILE_PATH, data=data)

Writing to file...../data/cleaned_captions.txt
Completed writing to file :)


### Read `cleaned_captions.txt` into pandas dataframe

In [6]:
import warnings
# NOTE: this silences every warning for the rest of the kernel session, so
# warnings raised by later cells stay hidden as well.
warnings.filterwarnings("ignore")

# The separator is a regular expression, so it is written as a raw string
# (`\+` is not a valid escape in a plain string literal) and the python
# parsing engine, which is the one that supports regex separators, is
# requested explicitly instead of relying on pandas' implicit fallback.
df = pd.read_table(SAVE_FILE_PATH, sep=r"-\+-", header=None, engine="python")
df.columns = ['base_meme_name', 'base_meme_text']
df.head()

Unnamed: 0,base_meme_name,base_meme_text
0,Y U No,meme generator users y u no give me more upvotes?
1,Y U No,steve jobs y u no respawn?!
2,Y U No,commercial y u no same volume as show!?
3,Y U No,KONY Y u no take justin bieber
4,Y U No,Victoria y u no tell us your secret?!


### Drop Nulls

In [7]:
# Discard every row that has a missing value in any column, then show the
# per-column count of remaining nulls.
df = df.dropna(axis='index', how='any')
df.isna().sum()

base_meme_name    0
base_meme_text    0
dtype: int64

### Remove whitespaces

In [8]:
# Trim leading and trailing whitespace from both string columns.
for column in ('base_meme_text', 'base_meme_name'):
    df[column] = df[column].str.strip()
df.sample(10)

Unnamed: 0,base_meme_name,base_meme_text
195588,Immature high school kids,When the substitute takes roll lets switch names
299269,Neville Southall,well done you beat liverpool
365705,DOUG CEBOLUDO,OOOOOOO CE TA LOCONA CE TA MARCANO OOOOOOO
108355,novia pesada,i know we started dating yesterday but i love you!!!
119368,Patrick Star Instrument,Is faith's vagina an instrument
193588,sergio freire,mas pajero qe el ramos WENA
96198,The Dude,Am I the only one around here that prefers plain old ice water to kick cotton mouth's ass?
90985,crazy villain,ROBBIE ON A GOOD DAY
60919,dogeee,Such wow many sis
332626,Napoleon Moped Grom,buys lead additive


### Lowercase everything

In [9]:
# Normalise both string columns to lower case.
for column in ('base_meme_text', 'base_meme_name'):
    df[column] = df[column].str.lower()
df.sample(10)

Unnamed: 0,base_meme_name,base_meme_text
30001,skyrim stan,if i read another meme about taking an arrow to the knee no one will ever adventure again
276306,tyler durden 2,i want you to quack as hard as you can
395762,bandit keith,hmar? sorry i don't speak terrorist
289114,strict policeman,dodging american border patrol since 1996
105810,crazy girlfriend meme heh,yes
57119,uncle dolan pls,fort plz
171915,dubious history teacher,i only get paid.. what!? hollister payslip
7326,batman slap robin,mark zuckerberg is giving away... it's a hoax!
162436,beastguy,come down to dinner. now.
366499,musically diverse metalhead,listens to metal respects other people's tastes


### Set upper and lower bounds to meme text sentence length

In [10]:
# Keep only captions that are 5 to 134 characters long (both ends inclusive).
text_length = df['base_meme_text'].str.len()
df = df[text_length.between(5, 134)]
df.sample(10)

Unnamed: 0,base_meme_name,base_meme_text
134488,socially fed up penguin,jeans i wanted cost seven dollars less last week its world war 3
191103,retouched ecce homo,soy hijo de un dios del rey mono concretamente
73220,sad face guy,goodbye benevolent dictator
64231,horney samus,i'm not a prostitute if i do it for free
51569,never have i been so wrong,i thought people would find my post funny never have i been so wrong
290121,unhappy queen,what a fucking idiot
186039,joe rogan,dan henderson y u no kick leg?
131397,perfect driver,picks up his buddy at 7am lets buddy know he's arrived by honking his dukes of hazzard horn
387173,cool hipster girl,buy an iphone only using instagram
196917,willianss,ja rodam assassins creed sem serras enquanto no console de mesa parece ate o serrado nordestino


In [11]:
# Summary statistics of the caption lengths after filtering.
text_length = df['base_meme_text'].str.len()
text_length.describe()

count    403007.000000
mean     44.836539    
std      23.659660    
min      5.000000     
25%      27.000000    
50%      41.000000    
75%      58.000000    
max      134.000000   
Name: base_meme_text, dtype: float64

### Remove captions that start with nonsense punctuation

In [12]:
# Drop captions that begin with a run of four or more punctuation characters.
pattern = r'^[.,\/#!$%\^&\*;:{}=\-_`~()+]{4,}'
starts_with_punctuation = df['base_meme_text'].str.contains(pattern, regex=True)
df = df[~starts_with_punctuation]
print(df.shape)
df.sample(10)

(402860, 2)


Unnamed: 0,base_meme_name,base_meme_text
344262,trolltaleba,taco tuesday's the bomb!
205843,fry chilion,"i can eat all i want, and when i'm full, i use the bathroom then i can eat more"
271854,charizard,ash y u no pick me?
326746,soykimschmitz,refund or... i'll share your files in megaupload
71666,good guy greg's dog,you might have seen me in the streets... but shawty you don't know me
377738,oh itachi,mom and dad are so proud of me kill them
335965,queen elizabeth ii,let the war begin
220215,judge dredd,i'am the law
28033,i don't know who you are...,i don't know who you are but i will find you and i will thank you
294978,stalker ex-girlfriend,has shrine of you in closet


### Remove `HTTP` urls in `base_meme_name` and `base_meme_text`

In [13]:
# Drop rows whose caption or meme name begins with "http".
starts_with_http = (
    df['base_meme_text'].str.startswith('http')
    | df['base_meme_name'].str.startswith('http')
)
df = df[~starts_with_http]
print(df.shape)
df.sample(10)

(402688, 2)


Unnamed: 0,base_meme_name,base_meme_text
346522,gamer rage,jason
69392,look at all the things,look at all the places i've been fucked
333837,singing batman live,nobody knows what it's like to be adopted by a butler. now sing everyone!
11799,chill out lemur,justin bieber is a guy? puhleez
119537,meme tierno,ti plego ti plego aiutlami
95112,flip table meme,work all day long pc crash and dont save
207690,manelistul nicolae guta,la ce-ti trebuie swag candi ai shukarim3
94155,advice borat,mnb my name borat
390016,killer film student,aste un favor y... date un tiro
325708,totally looks like....,slender man anonymous


In [14]:
# Remove rows where an http(s) URL appears anywhere in the caption or the name.
#
# `https?://` matches "http://" and "https://". The previous pattern used a
# character class, which matches a single one of the characters ( ) h t p s
# followed by "://" rather than the words "http"/"https".
pattern = r'https?://'
contains_url = (
    df['base_meme_text'].str.contains(pattern, regex=True)
    | df['base_meme_name'].str.contains(pattern, regex=True)
)
df = df[~contains_url]
print(df.shape)
df.sample(10)

(402618, 2)


Unnamed: 0,base_meme_name,base_meme_text
204835,"really, really, really, really gay guy",oh really now?!? tell me more
250179,shipper squirrel,rjo says i stopped using netbeans when they dropped ruby support
140397,noob explorer dora,guess what... in the most annoying kid ever:)
54589,science cat,"one, particles are attracted to each other like boys and girls"
225816,comunist stupid facebook girl,what!! guns cost 5 billion$
114154,hey girl,hey girl stuff
110797,eating disorder owl,parents judge you for not eating parents judge you for eating normally
44650,bill murray caddyshack,amazing gf broke up with me today but i got my got my grom big bore kit in the mail which is nice.
69878,90s problems,i just cant believe it schlomo wont go down on me
141266,sexto sentido,josh chucky me habla


### Removing memegenerator trolls

In [15]:
# Remove troll captions: "memegenerator" followed, at least one character
# later, by the expletive.
#
# The words are matched without capture groups: the groups were never used,
# and `str.contains` warns about patterns that contain match groups.
pattern = r'memegenerator.+fuck'
is_troll = df['base_meme_text'].str.contains(pattern, regex=True)
df = df[~is_troll]
print(df.shape)
df.sample(10)

(402557, 2)


Unnamed: 0,base_meme_name,base_meme_text
268026,bodybuilder problems,did three hour chest workout yesterday soreness not achieved
78336,sparta kick,xbox one?! i game on the pc!!!
370212,advice matt berninger,so happy he was invited just gave him a chance to get out of the city
67526,maury pv,you claim you can fuck with frank write the lie detector said that was a lie
348883,llorona desesperante,i wanna be in jais army
64322,horney samus,you look like you could use a nose job... and i'm not talking about plastic surgery.
147614,towelie,hadid is a towel
16740,sunny student,ballin like a scrub
16806,bear grylls,it's sorta cold better climb inside this hollowed-out camel carcass
210412,bob esponja y la caracola magica,el caracol dice... feo


### Bleach the data

In [16]:
# Substitute the slur with a harmless stand-in word (plain substring replacement).
df['base_meme_text'] = df['base_meme_text'].str.replace(
    pat='nigga', repl='ninja', regex=False
)

In [17]:
# Inspect every caption that contains the stand-in word.
mentions_ninja = df['base_meme_text'].str.contains('ninja')
df[mentions_ninja]

Unnamed: 0,base_meme_name,base_meme_text
2991,conspiracy keanu,what if a ninja is pressing enter every time i type something on google?
3009,conspiracy keanu,what if random erections are actually ninja handjobs
3164,kermit the frog drinking tea,talkn bout ninjas aint loyal but when i zoom in on yo scalp..them edges aint loyal but that's none of my business
3166,kermit the frog drinking tea,you talk shit about strippers & prostitutes... but you be fuckin ninjas for free. #noneofmybusiness
3186,kermit the frog drinking tea,ninjas can make kids but cant be in his kids life but thats none of my business
...,...,...
410452,ranzigen peelaert,ninjas be like slippin
410467,ranzigen peelaert,ninjas be like al cien!
410470,ranzigen peelaert,ninjas be like where them hoes at
410494,ranzigen peelaert,ninjas be like fire up another one


In [18]:
# Drop every caption that contains this slur, then report the remaining size.
contains_slur = df['base_meme_text'].str.contains('nigger')
df = df[~contains_slur]
print(df.shape)
df.sample(10)

(401947, 2)


Unnamed: 0,base_meme_name,base_meme_text
360215,street-sensei,"sensei say, one who cheat must prepare to dog shit eat"
399899,facebook roleplay ocelot,picnik does not make your profile pic better i don't care who you are.
184946,pikachu enojado,samantha will never be as cute
315911,assburgerin superneuvo,forssan kirkko komein kirkko
169763,high five girl,up top ninja!
166148,skuchayuschiy botan,vater kommt ins buero feierabend
110201,unicorn man,pink fluffy unicorn dancin' on rainbows
141448,pedobear81,she's 18 months old
334714,advice hurt garrus,wear visor aim with other eye
244655,kollegah der boss,die digitalwaage zeigt an 81 gramm das wird auf 100 gestreckt


In [19]:
# Drop every caption that contains this substring, then report the remaining size.
contains_slur = df['base_meme_text'].str.contains('fag')
df = df[~contains_slur]
print(df.shape)
df.sample(10)

(400951, 2)


Unnamed: 0,base_meme_name,base_meme_text
208527,stereotypical canadian moose,canada here who is this andrew breitbart?
343923,success rdj,siri? fuck apple i got jarvis
240547,awesome advice,always smile :)
29537,es bakans,ooooooooooh i messed up
186789,pompous cyber cat,when ma ballz r sore i dont use rub a535 fgfc820
125177,family guy pepperidge farm,remember when you touched the mirror pools water pepperidge farm remembers
375519,asian guy,stupid tna assholes
248550,victory baby meme,dropped food on the floor no minni hair on it
383047,advice brick,there t lux redbrick are gunge
271036,scumbag terminator,young team yup there at shawlands too


In [20]:
# Drop every caption that contains this substring, then report the remaining size.
contains_slur = df['base_meme_text'].str.contains('cunt')
df = df[~contains_slur]
print(df.shape)
df.sample(10)

(399975, 2)


Unnamed: 0,base_meme_name,base_meme_text
144987,bender popular,"hey tm why you use ramare? everybody was doin' it, i just wanna be popular"
364766,roommate rabbit,only sleeps when you need to do work only works when you need to sleep
282777,herbert,jack pot
282187,funny asian guy,ching chong ping pong no white house for blackbama
104420,archaic rap,your clothes may have cost a copious amount of currency but look ghastly because vagrants of your kind partake in this i shall not
57406,went full retard,you mean to tell me ... that this isn't call of duty?
362811,army-dog,all of this is possible if you join the army
159358,j walter weatherman,and that's why you don't bring home strange boys you meet in bars
296831,psychology student platypus,"""i hate this class - psychology is all common sense."" if it's all common sense, then why are you failing?"
123502,rocky balboa,we beat the memory leak


# Save dataframe as json

In [23]:
# Write the cleaned frame to disk as an indented JSON array of row records.
data = os.path.join(DATA_DIR, 'bleached_data.json')
with open(data, mode='w') as file_object:
    # Round-trip through pandas' own JSON encoder to get plain Python records.
    records = json.loads(df.to_json(orient='records'))
    json.dump(records, file_object, indent=4)