# Clean r/foodporn Posts

In [1]:
# Imports

import numpy as np
import pandas as pd
import re 
import string
from googletrans import Translator
import nltk
from nltk.corpus import words, stopwords

# Setting options

pd.set_option('display.max_colwidth', -1)
nltk.download('stopwords')
nltk.download('words')

# Load stop words
stop_words = stopwords.words('english')
wordlist = words.words()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/danielwilentz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/danielwilentz/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [2]:
# Read in data
df = pd.read_pickle('../scrape/data/posts.pkl')

In [3]:
df.shape

(46739, 1)

In [4]:
# Take a look!
df = df.rename(columns = {0: 'title'})
df.sample(20)

Unnamed: 0,title
22624,[Home made] Finger Millet Porridge
19166,Let them eat cake!
39640,Pesto Pappardelle Recipe
46591,Lemon Poppy Seed Donuts (480p)
17840,"""Celebration Cake"" from Dandilion Chocolate in San Francisco"
28201,The way he topped my burger
14043,First attempt at grilled skirt stake!
25358,Korean Fried Wings With Homemade Gochujang Sauce [837 x 837]
41000,"Lamb chops with a caramel sauce, topped with nuts, with a side of broccoli and roasted potatoes"
22705,I bought two mountains of meat... And combined them.


In [5]:
# Structure analysis
num_words = df['title'].apply(lambda x: len(x.split()))
num_words_mean, num_words_std = np.mean(num_words), np.std(num_words)

In [6]:
print('Mean number of words per post: {} \n STD of words per post: {}'.format(num_words_mean, num_words_std))

Mean number of words per post: 8.77500588373735 
 STD of words per post: 6.085953651711066


# Define some cleaning functions

In [7]:
# Function for removing punctuation
def drop_punc(my_text):
    clean_text = re.sub('[%s]' % re.escape(string.punctuation), ' ', my_text)
    return clean_text

In [8]:
# Function for making all text lowercase
def lower(my_text):
    clean_text = my_text.lower()
    return clean_text

In [9]:
# Function for removing all numbers
def remove_numbers(my_text):
    clean_text = re.sub('\w*\d\w*', '', my_text)
    return clean_text

In [10]:
# Function for removing emojis
def deEmojify(inputString):
    return inputString.encode('ascii', 'ignore').decode('ascii')

In [11]:
# Function for removing stop words
def remove_stop(my_text):
    text_list = my_text.split()
    return ' '.join([word for word in text_list if word not in stop_words])

In [12]:
# Function for stripping whitespace
def my_strip(my_text):
    try: return my_text.strip()
    except Exception as e: return None

In [13]:
# Curated list of additional stop-words for this project
my_stop_words = ['oc', 'amp', 'homemade', 'food', 'foodporn', 'v', 'x', 'recipe', 'best', 'il', 'ever',
                'time', 'attempt', 'first', 'ate', 'made', 'home', 'today', 'make', 'friend', 'local',
                'new', 'day', 'birthday', 'like', 'amazing', 'de', 'happy', 'year', 'plate', 'video', 
                'cooked', 'dish', 'house', 'os', 'tried', 'super', 'perfect', 'free', 'way', 'delicious', 
                'good', 'night', 'last', 'porn', 'eating', 'got', 'know']

# Function for removing my stop words
def remove_my_stop(my_text):
    text_list = my_text.split()
    return ' '.join([word for word in text_list if word not in my_stop_words])

### Special function for removing posts that are ads:

In [14]:
# curated list for words pertaining to advertisements:
ad_words = ['coupon', 'coupons', 'promo', 'promos', 'code', 'https', 'http', 'com']
def catch_ads(my_text):
    text_list = my_text.split()
    ad = False
    for word in text_list:
        if word in ad_words:
            ad = True
            break
    if ad: return None
    else: return my_text

### Special function for translating to english

In [15]:
# Function to translate into english
translator = Translator()
def translate(my_text):
    try:
        translated = translator.translate(my_text, dest='en')
        return translated.text
    except Exception as e:
        return my_text
    

The above function takes a long time to run on the full DataFrame.

Alternatively, we can look into the nltk.corpus words of the english language to determine the language of a given post

In [16]:
# Function to detect english
def is_english(my_text):
    if my_text is None:
        return my_text
    text_list = my_text.split()
    english = 0
    non_english = 0
    for word in text_list:
        if word not in wordlist:
            non_english += 1
        else:
            english += 1
    if english > 0.25*non_english:
        return True
    else: return False
            

# Let's see how well our functions work!

In [17]:
test = df.sample(50)

In [18]:
for index, row in test.iterrows():
    text = row[0]
    print(text)
    cleaned = deEmojify(remove_numbers(drop_punc(lower(text))))
    print(cleaned)
    cleaned_no_punc = remove_my_stop(remove_stop(cleaned))
    print(cleaned_no_punc)
    no_ads = my_strip(catch_ads(cleaned_no_punc))
    print(no_ads)
    flag = is_english(no_ads)
    print(flag)
    print()

Yes please!
yes please 
yes please
yes please
True

I want to know how long it's been standard to take food photos from this very deliberate angle. I do it, you do it, and when someone does it with a 1' angle off, I get dizzy. What was the path of this photographic style and what has contributed to its success?
i want to know how long it s been standard to take food photos from this very deliberate angle  i do it  you do it  and when someone does it with a   angle off  i get dizzy  what was the path of this photographic style and what has contributed to its success 
want long standard take photos deliberate angle someone angle get dizzy path photographic style contributed success
want long standard take photos deliberate angle someone angle get dizzy path photographic style contributed success
True

Donuts thanksgiving Donuts t-shirt
donuts thanksgiving donuts t shirt
donuts thanksgiving donuts shirt
donuts thanksgiving donuts shirt
True

Cinnamoroll Pop-up Cafe at Kumoya | Halal Certi

True

Valentines Day Dinner
valentines day dinner
valentines dinner
valentines dinner
True

[Homemade] Andouille sausage gravy and biscuits
 homemade  andouille sausage gravy and biscuits
andouille sausage gravy biscuits
andouille sausage gravy biscuits
True

Sexy Pasta [OC] [3120x4160]
sexy pasta  oc    
sexy pasta
sexy pasta
True

Sushi anyone?🍱🤗
sushi anyone 
sushi anyone
sushi anyone
True



The cleaning looks good! Let's apply it to the whole dataframe:

In [19]:
df['cleaned_title'] = df['title'].apply(lower).apply(drop_punc).apply(remove_numbers).apply(deEmojify)
df['cleaned_title'] = df['cleaned_title'].apply(remove_stop).apply(remove_my_stop).apply(catch_ads)
df['cleaned_title'] = df['cleaned_title'].str.strip()

In [20]:
df.head(50)

Unnamed: 0,title,cleaned_title
0,"Bacon Lollipops from Bell's Eccentric Cafe in Grand Rapids, MI [OC] [2232x2232]",bacon lollipops bell eccentric cafe grand rapids mi
1,Pico de Gallo - Spend With Pennies,pico gallo spend pennies
2,Our birthday dinner at Elizabeth's Chop House in Marquette MI,dinner elizabeth chop marquette mi
3,Artisanal mezcal the best way to celebrate Cinco de Mayo,artisanal mezcal celebrate cinco mayo
4,"Veggie + eggie = ... er, veggie",veggie eggie er veggie
5,MONSTERS INC CUPCAKES - https://www.youtube.com/watch?v=QigrA_sSyeI,
6,Teppanyaki Caramel Popcorn (X-Post From r/damnthatsinteresting),teppanyaki caramel popcorn post r damnthatsinteresting
7,I made my own ratatouille. Here it is before baking,ratatouille baking
8,Homemade Cookie,cookie
9,Chicken and broccoli stir fry [homemade{,chicken broccoli stir fry


Now let's add a new column using the 'is_english' function to try to pick out which observations are in english and which aren't

In [21]:
df['in_english'] = df['cleaned_title'].apply(is_english)

In [22]:
df.sample(50)

Unnamed: 0,title,cleaned_title,in_english
34471,"Beef taco with black beans, cheddar cheese, and sour cream (and a brewski of course). [OC]",beef taco black beans cheddar cheese sour cream brewski course,True
14959,Croissant Dragons,croissant dragons,False
41837,"Slow cooker Italian beef with sautéd mushrooms, roasted bell peppers, pepperoncini and provolone cheese, on a toasted hoagie [960x632] [OC]",slow cooker italian beef sautd mushrooms roasted bell peppers pepperoncini provolone cheese toasted hoagie,True
36183,Chicken&Waffle Benedict! @Beachside Cafe in San Francisco. Delicious! [1440x1440],chicken waffle benedict beachside cafe san francisco,True
2556,Lemon and garlic flavored green beans topped with crushed almonds. Baked chicken thighs and Mac and cheese.This is what I cooked for dinner tonight.,lemon garlic flavored green beans topped crushed almonds baked chicken thighs mac cheese dinner tonight,True
14768,I made a Rose Apple Pie for my friend's birthday!,rose apple pie,True
1322,"(homemade) carbonara, the way it's supposed to be",carbonara supposed,True
22032,"Milk and Honey Dessert at The NoMad Restaurant, NYC! [OC]",milk honey dessert nomad restaurant nyc,True
33935,Chili's grill and bar[1280X1280],chili grill bar,True
11812,"Corn flan with smoked tomatoes and zucchini with a chili oil and zucchini sauce. Art of the Table, Seattle WA. [OC] [1000x667]",corn flan smoked tomatoes zucchini chili oil zucchini sauce art table seattle wa,True


In [23]:
# See how many data points will be lost

df['in_english'].value_counts()

True     42234
False    4211 
Name: in_english, dtype: int64

I'm not too stoked on this translator. It seems to be getting some points wrong, so by using it as a filter, I will be losing potentially useful information.

I may come back and revisit this step either with a stronger translator or by grabbing more data to compensate for the lost stuff.

In [28]:
df[df['in_english'] == False].sample(20)

Unnamed: 0,title,cleaned_title,in_english
27693,PanicSupper,panicsupper,False
36537,A Clqassic [568 x 776],clqassic,False
5276,Amazing Seviche [1024x768],seviche,False
24325,I4es,,False
4588,[I Ate] Kyushu Danji Ramen at Terakawa Ramen in Philadelphia,kyushu danji ramen terakawa ramen philadelphia,False
42617,Porksicles,porksicles,False
38656,NUTELLA[720x1280][OC],nutella,False
9010,Steamed Eggs [3456 × 4608],steamed eggs,False
30778,"The Ratatouille from ""Ratatouille""",ratatouille ratatouille,False
11463,#GARDENCATERING #Ct #Food #GoldenNuggets,gardencatering ct goldennuggets,False


### Grab only the cleaned titles of the dataframe that are in english

In [24]:
cleaned = pd.DataFrame(df[df['in_english']==True]['cleaned_title'])

In [25]:
cleaned.sample(50)

Unnamed: 0,cleaned_title
21454,prime rib sandwich pretzel bun fried egg blue cheese crumbles horseradish cream sauce
23910,pancake heart boyfriend
3830,vegetable cake bunnies
32381,blue collar eats ahi tuna
32213,turkey bacon avocado club plus easy egg
30665,churros grand mayan tequila chocolate sauce moe cantina chicago
30707,fried oysters remoulade
30757,prime rib roast yorkshire puddings pleased results
38995,sushi bento box
13456,thai coconut curried tuna strangewich


### Pickle the data:

In [26]:
cleaned.to_pickle('cleaned_data.pkl')