In [176]:
# Imports

import numpy as np
import pandas as pd
import re 
import string
from googletrans import Translator
import nltk
from nltk.corpus import stopwords

# Setting options

pd.set_option('display.max_colwidth', -1)
nltk.download('stopwords')

# Load stop words
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/danielwilentz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [146]:
# Read the data
df = pd.read_pickle('../scrape/data/posts.pkl')

In [147]:
# Take a look!
df = df.rename(columns = {0: 'title'})
df.sample(20)

Unnamed: 0,title
17503,"Now thats a calzone. Cleveland, Ohio.(Lime for scale)"
20554,Pates (Savory &amp; Deli Foods) Market in Saudi Arabia Analytics by Category &amp; Cost Type to 2021
16634,how to make friends (with baguette)
22894,Annihilation (2018) - Teaser Trailer
16638,Tiramisu ! I made it!
17855,Chickeeeeeeen and herbs to be grilled
10863,Honey drizzled figs from the backyard tree my grand-pappy planted.
4468,Some Freezing Cold Brew on a Hot Hot Day [3024 × 4032] [OS]
13382,Fun facts to learn about Pineapple for improving your health.
5864,Spinach and tomato dinner


In [108]:
# Structure analysis
num_words = df[0].apply(lambda x: len(x.split()))
num_words_mean, num_words_std = np.mean(num_words), np.std(num_words)

In [109]:
print('Mean number of words per post: {} \n STD of words per post: {}'.format(num_words_mean, num_words_std))

Mean number of words per post: 8.72114065180103 
 STD of words per post: 6.399156977264049


# Define some cleaning functions

In [157]:
# Function for removing punctuation
def drop_punc(my_text):
    clean_text = re.sub('[%s]' % re.escape(string.punctuation), ' ', my_text)
    return clean_text

In [158]:
# Function for making it all text lowercase
def lower(my_text):
    clean_text = my_text.lower()
    return clean_text

In [159]:
# Function for removing all numbers
def remove_numbers(my_text):
    clean_text = re.sub('\w*\d\w*', '', my_text)
    return clean_text

In [160]:
# Function for removing emojis
def deEmojify(inputString):
    return inputString.encode('ascii', 'ignore').decode('ascii')

In [161]:
# Function for removing stop words
def remove_stop(my_text):
    text_list = my_text.split()
    return ' '.join([word for word in text_list if word not in stop_words])

In [162]:
# Curated list of additional stop-words for this project
my_stop_words = ['oc', 'amp', 'homemade', 'food', 'foodporn', 'v', 'x', 'recipe']

# Function for removing my stop words
def remove_my_stop(my_text):
    text_list = my_text.split()
    return ' '.join([word for word in text_list if word not in my_stop_words])

### Special function for removing posts that are ads:

In [120]:
# curated list for words pertaining to advertisements:
ad_words = ['coupon', 'coupons', 'company', 'companies', 'promo', 'promos', 'code', 'https']
def catch_ads(my_text):
    text_list = my_text.split()
    ad = False
    for word in text_list:
        if word in ad_words:
            ad = True
            break
    if ad: return None
    else: return my_text

# Let's see how well our functions work!

In [135]:
test = df.sample(50)

In [136]:
for index, row in test.iterrows():
    text = row[0]
    print(text)
    cleaned = deEmojify(remove_numbers(drop_punc(lower(text))))
    print(cleaned)
    cleaned_no_punc = remove_my_stop(remove_stop(cleaned))
    print(cleaned_no_punc)
    no_ads = catch_ads(cleaned_no_punc)
    print(no_ads.strip())
    print()

My first go at Chicken and Waffles
my first go at chicken and waffles
first go chicken waffles
first go chicken waffles

smokes spare ribs
smokes spare ribs
smokes spare ribs
smokes spare ribs

"Mount Olympus" at Zeus in Canterbury, lovely veggie meal
 mount olympus  at zeus in canterbury  lovely veggie meal
mount olympus zeus canterbury lovely veggie meal
mount olympus zeus canterbury lovely veggie meal

Kroketten mit Kartoffel und Gouda – Patatesli Kaşarlı Kroket - Türkische Gerichte
kroketten mit kartoffel und gouda  patatesli kaarl kroket   trkische gerichte
kroketten mit kartoffel und gouda patatesli kaarl kroket trkische gerichte
kroketten mit kartoffel und gouda patatesli kaarl kroket trkische gerichte

Is that a cake!? CAKES SPECIALLY MADE FOR THE BAD &amp; BOUJEE. IG: @Dolce_byJess &amp; @yantaamu. Custom Gucci is possible! Anything is Possible!
is that a cake   cakes specially made for the bad  amp  boujee  ig   dolce byjess  amp   yantaamu  custom gucci is possible  anything

The cleaning looks good! Let's apply it to the whole dataframe:

In [167]:
df['cleaned_title'] = df['title'].apply(lower).apply(drop_punc).apply(remove_numbers).apply(deEmojify)
df['cleaned_title'] = df['cleaned_title'].apply(remove_stop).apply(remove_my_stop).apply(catch_ads)
df['cleaned_title'] = df['cleaned_title'].str.strip()

In [174]:
df.sample(50)

Unnamed: 0,title,cleaned_title
11684,SMITH-CELERY SALAD - the recipe is in the ebook that is given for free to anyone who submits the short survey,smith celery salad ebook given free anyone submits short survey
5707,[I ate] Large Doner kebab with broasted potatoes [OC] [1600 × 1200],ate large doner kebab broasted potatoes
21625,Un Kurabiyesi Tarifi,un kurabiyesi tarifi
22349,Happy Apples (1024×768),happy apples
10308,Tokyo Food Diary &amp; Sushi Eating Show,tokyo diary sushi eating show
16565,[OC] The perfect charcuterie,perfect charcuterie
20953,[Homemade] Peach Caramel Blondie Cheesecake,peach caramel blondie cheesecake
1662,[Homemade] Thai peanut ramen,thai peanut ramen
10817,Down 85lbs and keeping it off!,keeping
21463,"Stuffed mushrooms, the top batch has sausage and the bottom batch has crab meat and shrimp",stuffed mushrooms top batch sausage bottom batch crab meat shrimp


In [193]:
test = df.sample(50)

In [194]:
translator = Translator()

In [196]:
for index, row in test.iterrows():
    text = row['cleaned_title']
    print('raw: ', text)
    try:
        translated = translator.translate(text, dest='en')
        print('translated: ', translated.text)
    except Exception as e:
        print(str(e))
    print()

fianc crushed tagine benefits north african princess
fianc crushed tagine benefits north african princess

thanksgiving prepared oldest aunt mother days preparation finished eating minutes quiet minutes
thanksgiving prepared oldest aunt mother days preparation finished eating minutes quiet minutes

parmigiano bank italy
parmigiano bank italy

roomate made tomato stew eguse soup fried fish goat skin hen pork topped bitter leaf traditional nigerian seasonings one best thints eaten
roomate made tomato stew egusi soup fried fish goat skin hen pork topped bitter leaf traditional nigerian seasonings one best things eaten

first time home made pizza
first time home made pizza

trucks life
trucks life

barbacoa tacos consome right way eat
barbacoa tacos consome right way eat

pancake day england naturally decided make raspberry brownies instead
pancake day england naturally decided make raspberry brownies instead

snowflake dessert patbingsu koreans call
snowflake dessert patbingsu koreans cal