In [1]:
import zipfile
import pandas as pd
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import SparkSession
import boto3
import io

In [2]:
spark = SparkSession.builder.appName("CleanRecipeDataFrame").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/08/18 01:33:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Run Below Cell to Read Data In From Local "data" Repo

In [13]:
zip_data_path = 'data/recipes_data.csv.zip'

with zipfile.ZipFile(zip_data_path, 'r') as zip_ref:
    # Assuming there's only one file in the zip
    extracted_file = zip_ref.namelist()[0]
    print(zip_ref.namelist())
    with zip_ref.open(extracted_file) as extracted_data:
        # Read the first 50000 rows using pandas
        df_mini = pd.read_csv(extracted_data, nrows=50000)
df_mini.to_csv('data/recipes_mini.csv', index=False)
df_mini.head(50)

['recipes_data.csv']


Unnamed: 0,title,ingredients,directions,link,source,NER,site
0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",www.cookbooks.com/Recipe-Details.aspx?id=44874,Gathered,"[""bite size shredded rice biscuits"", ""vanilla""...",www.cookbooks.com
1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....",www.cookbooks.com/Recipe-Details.aspx?id=699419,Gathered,"[""cream of mushroom soup"", ""beef"", ""sour cream...",www.cookbooks.com
2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""In a slow cooker, combine all ingredients. C...",www.cookbooks.com/Recipe-Details.aspx?id=10570,Gathered,"[""frozen corn"", ""pepper"", ""cream cheese"", ""gar...",www.cookbooks.com
3,Chicken Funny,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...","[""Boil and debone chicken."", ""Put bite size pi...",www.cookbooks.com/Recipe-Details.aspx?id=897570,Gathered,"[""chicken gravy"", ""cream of mushroom soup"", ""c...",www.cookbooks.com
4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""Combine first four ingredients and press in ...",www.cookbooks.com/Recipe-Details.aspx?id=659239,Gathered,"[""graham cracker crumbs"", ""powdered sugar"", ""p...",www.cookbooks.com
5,Cheeseburger Potato Soup,"[""6 baking potatoes"", ""1 lb. of extra lean gro...","[""Wash potatoes; prick several times with a fo...",www.cookbooks.com/Recipe-Details.aspx?id=20115,Gathered,"[""sour cream"", ""bacon"", ""pepper"", ""extra lean ...",www.cookbooks.com
6,Rhubarb Coffee Cake,"[""1 1/2 c. sugar"", ""1/2 c. butter"", ""1 egg"", ""...","[""Cream sugar and butter."", ""Add egg and beat ...",www.cookbooks.com/Recipe-Details.aspx?id=210288,Gathered,"[""buttermilk"", ""egg"", ""sugar"", ""vanilla"", ""sod...",www.cookbooks.com
7,Scalloped Corn,"[""1 can cream-style corn"", ""1 can whole kernel...","[""Mix together both cans of corn, crackers, eg...",www.cookbooks.com/Recipe-Details.aspx?id=876969,Gathered,"[""egg"", ""pepper"", ""crackers"", ""cream-style cor...",www.cookbooks.com
8,Nolan'S Pepper Steak,"[""1 1/2 lb. round steak (1-inch thick), cut in...","[""Roll steak strips in flour."", ""Brown in skil...",www.cookbooks.com/Recipe-Details.aspx?id=375254,Gathered,"[""oil"", ""tomatoes"", ""green peppers"", ""water"", ...",www.cookbooks.com
9,Millionaire Pie,"[""1 large container Cool Whip"", ""1 large can c...","[""Empty Cool Whip into a bowl."", ""Drain juice ...",www.cookbooks.com/Recipe-Details.aspx?id=794547,Gathered,"[""condensed milk"", ""lemons"", ""graham cracker c...",www.cookbooks.com


## Run Below Cells to Read Data In From AWS S3 Instead

Run following shell script first
```
aws configure
```

In [14]:
mini_path = 'data/recipes_mini.csv'
df = spark.read.format('csv').option('header', True).load(mini_path)
df.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|               title|         ingredients|          directions|                link|              source|                 NER|                site|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
| No-Bake Nut Cookies|"[""1 c. firmly p...| ""1/2 c. evapora...| ""1/2 tsp. vanil...| ""1/2 c. broken ...| ""2 Tbsp. butter...| ""3 1/2 c. bite ...|
|Jewell Ball'S Chi...|"[""1 small jar c...|            cut up""| ""4 boned chicke...| ""1 can cream of...| ""1 carton sour ...|"[""Place chipped...|
|         Creamy Corn|"[""2 (16 oz.) pk...| ""1 (8 oz.) pkg....|             cubed""|     ""1/3 c. butter|             cubed""| ""1/2 tsp. garli...|
|       Chicken Funny|"[""1 large whole...| ""2 (10 1/2 oz.)...| ""1 (10 1/2 oz.)...| ""1 (6 oz.) box ...|

In [15]:
df_mini.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        50000 non-null  object
 1   ingredients  50000 non-null  object
 2   directions   50000 non-null  object
 3   link         50000 non-null  object
 4   source       50000 non-null  object
 5   NER          50000 non-null  object
 6   site         50000 non-null  object
dtypes: object(7)
memory usage: 2.7+ MB


In [16]:
df_mini.describe()

Unnamed: 0,title,ingredients,directions,link,source,NER,site
count,50000,50000,50000,50000,50000,50000,50000
unique,28273,49990,49882,50000,1,47375,1
top,Chicken Casserole,"[""1 lb. pecan halves"", ""1 Tbsp. seasoned salt""...","[""Mix together and chill.""]",www.cookbooks.com/Recipe-Details.aspx?id=44874,Gathered,"[""sugar"", ""flour"", ""milk"", ""peaches"", ""butter""]",www.cookbooks.com
freq,231,2,11,1,50000,20,50000


In [26]:
df_mini[df_mini['title'] == 'Chicken Casserole'].iloc[0, 1]

'["1 can cream of mushroom soup", "1 can cream of chicken soup", "1 can cream of celery soup", "1 cut up chicken", "1/2 c. melted butter or margarine", "1 1/2 c. uncooked rice"]'

In [27]:
df_mini[df_mini['title'] == 'Chicken Casserole'].iloc[1, 1]

'["1/2 c. raw rice", "1 can French onion soup", "1 can cream of chicken or chicken mushroom soup", "1 (4 oz.) can mushrooms", "bite size pieces of 4 to 6 chicken thighs or any other chicken parts you wish to use"]'

## Data Cleaning

1. Drop columns "link", "source", and "site"
2. Unnest columns "ingredients" and "directions"
3. Create ingredient embedding vector and instruction embedding vector?