# Data Loading
 1. Load all datasets
 2. Perform basic cleaning to make them all the same format
 3. Save them

### Libraries and such

In [1]:
import pandas as pd 
import numpy as np 
import os
import json
import string
import ast
import re
from nltk.corpus import stopwords
from collections import Counter
import utils.processing_functions as pf
stop_words = stopwords.words('english')
file_path = '../../Data/Raw Datasets/' # Adjust to your file path of choice, I have chosen a directory outside of the repo to save on space.

## Check Each Dataset
 - We will only be keeping the ingredients (as lists) and the ID (as an identifier)
 

### [Dataset 1 (set1-f)](https://www.kaggle.com/datasets/kaggle/recipe-ingredients-dataset)
 - This dataset was provided by [Yummly](http://www.yummly.com/) 
 - Viable


In [2]:
data1_train = pd.read_json(f'{file_path}set1-f/train.json')
data1_test = pd.read_json(f'{file_path}set1-f/train.json')

In [3]:
dataset1 = pd.concat([data1_test, data1_train], axis=0)
dataset1 = dataset1.drop(columns='cuisine')
dataset1.head()

Unnamed: 0,id,ingredients
0,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,22213,"[water, vegetable oil, wheat, salt]"
4,13162,"[black pepper, shallots, cornflour, cayenne pe..."


In [4]:
dataset1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 79548 entries, 0 to 39773
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           79548 non-null  int64 
 1   ingredients  79548 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.8+ MB


### [Dataset 2 (set2-f)](https://www.kaggle.com/datasets/shuyangli94/food-com-recipes-and-user-interactions)
 - Crawled data from Food.com (GeniusKitchen) online recipe aggregator
 - Viable

In [5]:
data2 = pd.read_csv(f'{file_path}set2-f/RAW_recipes.csv')
data2
columns_to_drop = ['name', 'minutes', 'contributor_id', 'submitted', 'tags', 'nutrition', 'n_steps', 'steps', 'description', 'n_ingredients']
dataset2 = data2.drop(columns= columns_to_drop)
dataset2.head()

Unnamed: 0,id,ingredients
0,137739,"['winter squash', 'mexican seasoning', 'mixed ..."
1,31490,"['prepared pizza crust', 'sausage patty', 'egg..."
2,112140,"['ground beef', 'yellow onions', 'diced tomato..."
3,59389,"['spreadable cheese with garlic and herbs', 'n..."
4,44061,"['tomato juice', 'apple cider vinegar', 'sugar..."


 - Our ingredients column is actually a string that looks like a list, we will convert it to a list and move on

In [6]:
print(dataset2['ingredients'].head(5).apply(type))

0    <class 'str'>
1    <class 'str'>
2    <class 'str'>
3    <class 'str'>
4    <class 'str'>
Name: ingredients, dtype: object


In [7]:
dataset2['ingredients'] = dataset2['ingredients'].apply(lambda x: pf.convert_to_list(x))

In [8]:
print(dataset2['ingredients'].head(5).apply(type))

0    <class 'list'>
1    <class 'list'>
2    <class 'list'>
3    <class 'list'>
4    <class 'list'>
Name: ingredients, dtype: object


In [9]:
display(dataset2.head())
dataset2.info()

Unnamed: 0,id,ingredients
0,137739,"[winter squash, mexican seasoning, mixed spice..."
1,31490,"[prepared pizza crust, sausage patty, eggs, mi..."
2,112140,"[ground beef, yellow onions, diced tomatoes, t..."
3,59389,"[spreadable cheese with garlic and herbs, new ..."
4,44061,"[tomato juice, apple cider vinegar, sugar, sal..."


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231637 entries, 0 to 231636
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   id           231637 non-null  int64 
 1   ingredients  231637 non-null  object
dtypes: int64(1), object(1)
memory usage: 3.5+ MB


### [Dataset 3 (set3-f)](https://www.kaggle.com/datasets/irkaal/foodcom-recipes-and-reviews)
 - Colaborator for data: Alvin
 - Viable

In [10]:
data3 = pd.read_csv(f'{file_path}set3-f/recipes.csv')
data3.head()

Unnamed: 0,RecipeId,Name,AuthorId,AuthorName,CookTime,PrepTime,TotalTime,DatePublished,Description,Images,...,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeServings,RecipeYield,RecipeInstructions
0,38,Low-Fat Berry Blue Frozen Dessert,1533,Dancer,PT24H,PT45M,PT24H45M,1999-08-09T21:46:00Z,Make and share this Low-Fat Berry Blue Frozen ...,"c(""https://img.sndimg.com/food/image/upload/w_...",...,1.3,8.0,29.8,37.1,3.6,30.2,3.2,4.0,,"c(""Toss 2 cups berries with sugar."", ""Let stan..."
1,39,Biryani,1567,elly9812,PT25M,PT4H,PT4H25M,1999-08-29T13:12:00Z,Make and share this Biryani recipe from Food.com.,"c(""https://img.sndimg.com/food/image/upload/w_...",...,16.6,372.8,368.4,84.4,9.0,20.4,63.4,6.0,,"c(""Soak saffron in warm milk for 5 minutes and..."
2,40,Best Lemonade,1566,Stephen Little,PT5M,PT30M,PT35M,1999-09-05T19:52:00Z,This is from one of my first Good House Keepi...,"c(""https://img.sndimg.com/food/image/upload/w_...",...,0.0,0.0,1.8,81.5,0.4,77.2,0.3,4.0,,"c(""Into a 1 quart Jar with tight fitting lid, ..."
3,41,Carina's Tofu-Vegetable Kebabs,1586,Cyclopz,PT20M,PT24H,PT24H20M,1999-09-03T14:54:00Z,This dish is best prepared a day in advance to...,"c(""https://img.sndimg.com/food/image/upload/w_...",...,3.8,0.0,1558.6,64.2,17.3,32.1,29.3,2.0,4 kebabs,"c(""Drain the tofu, carefully squeezing out exc..."
4,42,Cabbage Soup,1538,Duckie067,PT30M,PT20M,PT50M,1999-09-19T06:19:00Z,Make and share this Cabbage Soup recipe from F...,"""https://img.sndimg.com/food/image/upload/w_55...",...,0.1,0.0,959.3,25.1,4.8,17.7,4.3,4.0,,"c(""Mix everything together and bring to a boil..."


In [11]:
dataset3 = pd.DataFrame()
dataset3['id'] = data3['RecipeId']
dataset3['ingredients'] = data3['RecipeIngredientParts']
dataset3.head()

Unnamed: 0,id,ingredients
0,38,"c(""blueberries"", ""granulated sugar"", ""vanilla ..."
1,39,"c(""saffron"", ""milk"", ""hot green chili peppers""..."
2,40,"c(""sugar"", ""lemons, rind of"", ""lemon, zest of""..."
3,41,"c(""extra firm tofu"", ""eggplant"", ""zucchini"", ""..."
4,42,"c(""plain tomato juice"", ""cabbage"", ""onion"", ""c..."


 - need to do further editing to bring the ingredients list in line with the other datasets
  - remove any that are a single ingredient

In [12]:
dataset3 = dataset3[dataset3['ingredients'].str.startswith('c(') == True]
dataset3.head()

Unnamed: 0,id,ingredients
0,38,"c(""blueberries"", ""granulated sugar"", ""vanilla ..."
1,39,"c(""saffron"", ""milk"", ""hot green chili peppers""..."
2,40,"c(""sugar"", ""lemons, rind of"", ""lemon, zest of""..."
3,41,"c(""extra firm tofu"", ""eggplant"", ""zucchini"", ""..."
4,42,"c(""plain tomato juice"", ""cabbage"", ""onion"", ""c..."


 - convert to a list

In [13]:
dataset3['ingredients'] = dataset3['ingredients'].apply(lambda x: pf.convert_to_list(x, 1))
dataset3.head()

Unnamed: 0,id,ingredients
0,38,"[blueberries, granulated sugar, vanilla yogurt..."
1,39,"[saffron, milk, hot green chili peppers, onion..."
2,40,"[sugar, lemons, rind of, lemon, zest of, fresh..."
3,41,"[extra firm tofu, eggplant, zucchini, mushroom..."
4,42,"[plain tomato juice, cabbage, onion, carrots, ..."


In [14]:
dataset3.info()

<class 'pandas.core.frame.DataFrame'>
Index: 511626 entries, 0 to 522516
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   id           511626 non-null  int64 
 1   ingredients  511626 non-null  object
dtypes: int64(1), object(1)
memory usage: 11.7+ MB


### [Dataset 4 (set4-f)](https://www.kaggle.com/datasets/wilmerarltstrmberg/recipe-dataset-over-2m)
 - Collaborator for data: Wilmer Arlt Strömberg
 - Viable

In [15]:
data4 = pd.read_csv(f'{file_path}set4-f/recipes_data.csv')
data4.head()

Unnamed: 0,title,ingredients,directions,link,source,NER,site
0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",www.cookbooks.com/Recipe-Details.aspx?id=44874,Gathered,"[""bite size shredded rice biscuits"", ""vanilla""...",www.cookbooks.com
1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....",www.cookbooks.com/Recipe-Details.aspx?id=699419,Gathered,"[""cream of mushroom soup"", ""beef"", ""sour cream...",www.cookbooks.com
2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""In a slow cooker, combine all ingredients. C...",www.cookbooks.com/Recipe-Details.aspx?id=10570,Gathered,"[""frozen corn"", ""pepper"", ""cream cheese"", ""gar...",www.cookbooks.com
3,Chicken Funny,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...","[""Boil and debone chicken."", ""Put bite size pi...",www.cookbooks.com/Recipe-Details.aspx?id=897570,Gathered,"[""chicken gravy"", ""cream of mushroom soup"", ""c...",www.cookbooks.com
4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""Combine first four ingredients and press in ...",www.cookbooks.com/Recipe-Details.aspx?id=659239,Gathered,"[""graham cracker crumbs"", ""powdered sugar"", ""p...",www.cookbooks.com


In [16]:
dataset4 = pd.DataFrame()
dataset4['id'] = data4.index
dataset4['ingredients'] = data4['ingredients']

In [17]:
dataset4

Unnamed: 0,id,ingredients
0,0,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva..."
1,1,"[""1 small jar chipped beef, cut up"", ""4 boned ..."
2,2,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg..."
3,3,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans..."
4,4,"[""1 c. peanut butter"", ""3/4 c. graham cracker ..."
...,...,...
2231137,2231137,"[""1/2 cup chocolate hazelnut spread (recommend..."
2231138,2231138,"[""1 dozen eggs"", ""1 paprika"", ""1 salt and pepp..."
2231139,2231139,"[""150 grams Daikon radish"", ""1 tbsp Sesame oil..."
2231140,2231140,"[""1 cup apple cider"", ""6 tablespoons sugar"", ""..."


- This dataset has all the measurements in the ingredients lists, so will need to drop all of the numbers stop words

In [18]:

dataset4['ingredients'] = dataset4['ingredients'].apply(lambda x: pf.convert_to_list(x))

In [19]:
dataset4['ingredients'] = dataset4['ingredients'].apply(lambda x: pf.remove_stop_words(x, stop_words))
display(dataset4.head())
dataset4.info()

Unnamed: 0,id,ingredients
0,0,"[c firmly packed brown sugar, c evaporated mil..."
1,1,"[small jar chipped beef cut, boned chicken bre..."
2,2,"[oz pkg frozen corn, oz pkg cream cheese cubed..."
3,3,"[large whole chicken, oz cans chicken gravy, o..."
4,4,"[c peanut butter, c graham cracker crumbs, c m..."


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2231142 entries, 0 to 2231141
Data columns (total 2 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   id           int64 
 1   ingredients  object
dtypes: int64(1), object(1)
memory usage: 34.0+ MB


### [Dataset 5 (set5-f)](https://www.kaggle.com/datasets/sterby/german-recipes-dataset)
 - metadata crawled from chefkoch.de*.
 - Non-Viable


In [20]:
data5 = pd.read_json(f'{file_path}set5-f/recipes.json')
data5

Unnamed: 0,Url,Instructions,Ingredients,Day,Name,Year,Month,Weekday
0,https://www.chefkoch.de/rezepte/185441079701305/,Die Eier hart kochen. Dann pellen und mit eine...,"[600 g Hackfleisch, halb und halb, 800 g Sauer...",1,Gebratener Hasenrücken,2009,January,Thursday
1,https://www.chefkoch.de/rezepte/2718181424631245/,Vorab folgende Bemerkung: Alle Mengen sind Cir...,"[1 kg Strauchtomate(n), 1 Gemüsezwiebel(n), 1 ...",1,Pilz Stroganoff,2017,July,Saturday
2,https://www.chefkoch.de/rezepte/46341015661368/,"Die Kirschen abtropfen lassen, dabei den Saft ...","[1 Glas Kirsche(n), 1 Pck. Vanillepuddingpulve...",1,Kaninchen a la Gioff,2007,January,Monday
3,https://www.chefkoch.de/rezepte/51051018014178/,"Den Spargel säubern, die holzigen Enden abschn...","[500 g Spargel, grüner, 300 ml Brühe oder Fond...",1,Spanisches Knoblauch-Kaninchen,2013,April,Monday
4,https://www.chefkoch.de/rezepte/1555771262860408/,Kohlrabi schälen und klein würfeln. Mit der Br...,"[250 g Kohlrabi, 150 ml Gemüsebrühe, 150 ml Mi...",1,Gnocchi - Hack - Pfanne mit Basilikum,2017,August,Tuesday
...,...,...,...,...,...,...,...,...
12185,https://www.chefkoch.de/rezepte/1547251261244879/,In einem nicht allzu großen Topf die Butter er...,"[3 EL Butter, 800 g Rinderbraten (Schulter ode...",1,Herzhafte Pfannkuchen,2019,January,Tuesday
12186,https://www.chefkoch.de/rezepte/47411016028870/,Gurke waschen und in dünne Scheiben schneiden....,"[1/2 Salatgurke(n), 250 g Käse (Emmentaler), 1...",1,Tomaten-Zucchini-Spaghetti,2004,September,Wednesday
12187,https://www.chefkoch.de/rezepte/1102041216136522/,Die Gurken schälen und längs halbieren. Das In...,"[1 kg Schmorgurke(n), 1 kl. Dose/n Tomate(n), ...",1,Am Spieß gebratenes Schweinefleisch mit Malzex...,2012,September,Saturday
12188,https://www.chefkoch.de/rezepte/897988359371/,"Die Knochen mit der Hälfte des Wurzelwerkes, d...","[Für den Fond:, 500 g Knochen vom Wild, 1/2Bun...",1,Geflügelsalat,2003,February,Saturday


 - Dataset not in english, will skip as we are not currently looking to use a translator on everything

### [Dataset 6 (set6-f)](https://www.kaggle.com/datasets/canggih/indonesian-food-recipes)
 - Collaborator for data: Canggih P Wibowo
 - Non-Viable

In [21]:
data6 = pd.read_csv(f'{file_path}set6-f/dataset-ayam.csv')
data6.head()

Unnamed: 0,Title,Ingredients,Steps,Loves,URL
0,Ayam Woku Manado,1 Ekor Ayam Kampung (potong 12)--2 Buah Jeruk ...,Cuci bersih ayam dan tiriskan. Lalu peras jeru...,1,/id/resep/4473027-ayam-woku-manado
1,Ayam goreng tulang lunak,1 kg ayam (dipotong sesuai selera jangan kecil...,"Haluskan bumbu2nya (BaPut, ketumbar, kemiri, k...",1,/id/resep/4471956-ayam-goreng-tulang-lunak
2,Ayam cabai kawin,1/4 kg ayam--3 buah cabai hijau besar--7 buah ...,Panaskan minyak di dalam wajan. Setelah minyak...,2,/id/resep/4473057-ayam-cabai-kawin
3,Ayam Geprek,250 gr daging ayam (saya pakai fillet)--Secuku...,Goreng ayam seperti ayam krispi--Ulek semua ba...,10,/id/resep/4473023-ayam-geprek
4,Minyak Ayam,400 gr kulit ayam & lemaknya--8 siung bawang p...,Cuci bersih kulit ayam. Sisihkan--Ambil 50 ml ...,4,/id/resep/4427438-minyak-ayam


 - Dataset 6 also not in english, will skip this set

### [Dataset 7 (set7-f)](https://www.kaggle.com/datasets/bhavyadhingra00020/healthy-indian-recipes)
 - Collaborator for data: Bhavya Dhingra
 - Non-Viable


In [22]:
data7 = pd.read_csv(f'{file_path}set7-f/IndianHealthyRecipe.csv')
data7

Unnamed: 0,Dish Name,Description,Spice,Prep Time,Views,Rating,Number of Votes,Serves,Dietary Info,Cook Time,Ingredients,Instructions
0,Pistachio chicken,Mild chicken in a creamy pistachio sauce,mild,Prep 10 mins,11604,4.1,18.0,4.0,['CONTAINS-NUTS'],30 mins,"1) 600g chicken thighs, cut into chunks\n2) 10...","1) Boil enough water to cover the pistachios, ..."
1,Tangy Fried Okra,Bhindi with Amchoor,medium,Prep 10 mins,49899,4.6,35.0,4.0,['VEGETARIAN'],15 mins,1) 500g okra\n2) 3 tbsp mustard oil\n3) 1 onio...,1) Wash and thoroughly dry the okra (kitchen r...
2,Healthy Chicken Korma,Chicken in Creamy Almond Sauce,mild,Prep 10 mins,115569,3.6,163.0,4.0,['CONTAINS-NUTS'],20 mins,1) 6-8 tbsp natural yoghurt\n2) 1 tsp turmeric...,1) Grind your whole spices in a spice grinder....
3,Brown Lentil Dhal,Sabut Masoor di Dhal,medium,Prep 10 mins,146798,3.7,87.0,4.0,['VEGETARIAN'],20 mins,1) 200g brown lentils\n2) Approx. 1L of water\n,1) Place the lentils in the pressure cooker wi...
4,Thari Wala Chicken,Healthy Chicken Curry,medium,Prep 10 mins,262696,3.5,343.0,4.0,['LACTOSE-FREE'],40 mins,1) 8 pieces of chicken (4 legs cut into thigh ...,"1) Skin the chicken, removing any excess fat.\..."
...,...,...,...,...,...,...,...,...,...,...,...,...
144,"Vegan Wild Rice, Cumin & Chickpea Burgers",Channa Chaul Tikki,medium,Prep 10 mins,12641,4.2,9.0,4.0,['VEGETARIAN'],15 mins,1) 200g cooked and cooled wild rice\n2) 200g t...,1) Cook the rice as per the instructions on th...
145,Vegetable Pilau,Mixed Vegetable Rice,medium,Prep 10 mins,69937,4.0,15.0,4.0,['VEGETARIAN'],20 mins,1) 50g peas\n2) 1 carrot (diced small)\n3) 100...,1) Wash the rice until water runs clear and se...
146,Watermelon and Feta Chaat,Indian Style Watermelon and Feta Salad,medium,Prep 40 mins,16652,4.4,11.0,6.0,['VEGETARIAN'],15 mins,1) 250g atta\n2) salt pinch\n3) 1 tsp ajwain\n...,1) Add the flour to a bowl and mix in the salt...
147,Zambezi Baked Sea Bream,Shama with relish,medium,Prep 5 mins,11808,4.0,7.0,2.0,['LACTOSE-FREE'],20 mins,1) 2 whole sea bream (sea bass fillets if you ...,1) Heat your oven to 180ºC\n2) Remove any scal...


- Dataset 7 does not have ingredients, will skip this set

### [Dataset 8 (set8-d)](https://www.kaggle.com/datasets/ai-first/cocktail-ingredients)
 - Collaborators for data: AIFirst, K Scott Mader
 - Start of our cocktail sets
  - Viable

In [23]:
data8 = pd.read_csv(f'{file_path}set8-d/all_drinks.csv')
data8.head()


Unnamed: 0.1,Unnamed: 0,strDrink,dateModified,idDrink,strAlcoholic,strCategory,strDrinkThumb,strGlass,strIBA,strIngredient1,...,strMeasure15,strMeasure2,strMeasure3,strMeasure4,strMeasure5,strMeasure6,strMeasure7,strMeasure8,strMeasure9,strVideo
0,0,'57 Chevy with a White License Plate,2016-07-18 22:49:04,14029,Alcoholic,Cocktail,http://www.thecocktaildb.com/images/media/drin...,Highball glass,,Creme de Cacao,...,,1 oz,,,,,,,,
1,1,1-900-FUK-MEUP,2016-07-18 22:27:04,15395,Alcoholic,Shot,http://www.thecocktaildb.com/images/media/drin...,Old-fashioned glass,,Absolut Kurant,...,,1/4 oz,1/4 oz,1/4 oz,1/4 oz,1/4 oz,1/2 oz,1/4 oz,,
2,2,110 in the shade,2016-02-03 14:51:57,15423,Alcoholic,Beer,http://www.thecocktaildb.com/images/media/drin...,Beer Glass,,Lager,...,,1.5 oz,,,,,,,,
3,3,151 Florida Bushwacker,2016-07-18 22:28:43,14588,Alcoholic,Milk / Float / Shake,http://www.thecocktaildb.com/images/media/drin...,Beer mug,,Malibu rum,...,,1/2 oz,1/2 oz Bacardi,1 oz,1 oz,3 oz,1 oz,1 cup,,
4,4,155 Belmont,2016-10-05 12:36:28,15346,Alcoholic,Cocktail,http://www.thecocktaildb.com/images/media/drin...,White wine glass,,Dark rum,...,,2 shots,1 shot,1 shot,,,,,,


In [24]:
data8.columns

Index(['Unnamed: 0', 'strDrink', 'dateModified', 'idDrink', 'strAlcoholic',
       'strCategory', 'strDrinkThumb', 'strGlass', 'strIBA', 'strIngredient1',
       'strIngredient10', 'strIngredient11', 'strIngredient12',
       'strIngredient13', 'strIngredient14', 'strIngredient15',
       'strIngredient2', 'strIngredient3', 'strIngredient4', 'strIngredient5',
       'strIngredient6', 'strIngredient7', 'strIngredient8', 'strIngredient9',
       'strInstructions', 'strMeasure1', 'strMeasure10', 'strMeasure11',
       'strMeasure12', 'strMeasure13', 'strMeasure14', 'strMeasure15',
       'strMeasure2', 'strMeasure3', 'strMeasure4', 'strMeasure5',
       'strMeasure6', 'strMeasure7', 'strMeasure8', 'strMeasure9', 'strVideo'],
      dtype='object')

In [25]:
columns_to_combine = ['strIngredient1',
       'strIngredient10', 'strIngredient11', 'strIngredient12',
       'strIngredient13', 'strIngredient14', 'strIngredient15',
       'strIngredient2', 'strIngredient3', 'strIngredient4', 'strIngredient5',
       'strIngredient6', 'strIngredient7', 'strIngredient8', 'strIngredient9']
dataset8 = pd.DataFrame()
dataset8['id'] = data8['idDrink']
dataset8['ingredients'] = data8[columns_to_combine].apply(lambda row: row.tolist(), axis = 1)
dataset8.head()

Unnamed: 0,id,ingredients
0,14029,"[Creme de Cacao, nan, nan, nan, nan, nan, nan,..."
1,15395,"[Absolut Kurant, nan, nan, nan, nan, nan, nan,..."
2,15423,"[Lager, nan, nan, nan, nan, nan, nan, Tequila,..."
3,14588,"[Malibu rum, nan, nan, nan, nan, nan, nan, Lig..."
4,15346,"[Dark rum, nan, nan, nan, nan, nan, nan, Light..."


In [26]:
dataset8['ingredients'] = dataset8['ingredients'].apply(pf.remove_nans_from_list)
dataset8

Unnamed: 0,id,ingredients
0,14029,"[Creme de Cacao, Vodka]"
1,15395,"[Absolut Kurant, Grand Marnier, Chambord raspb..."
2,15423,"[Lager, Tequila]"
3,14588,"[Malibu rum, Light rum, 151 proof rum, Dark Cr..."
4,15346,"[Dark rum, Light rum, Vodka, Orange juice]"
...,...,...
541,14065,"[Amaretto, Rum, Kool-Aid]"
542,14594,"[Cointreau, Lemon juice, Ice, Lemon]"
543,15691,"[Beer, Root beer, Lemonade, Coca-Cola, 7-Up, C..."
544,16963,"[Vodka, Ouzo]"


### [Dataset 9 (set9-d)](https://www.kaggle.com/datasets/shuyangli94/cocktails-hotaling-co)
 - This dataset contains cocktails collected by alcohol importer and distiller Hotaling & Co.
 - Viable

In [31]:
data9 = pd.read_csv(f'{file_path}set9-d/hotaling_cocktails - Cocktails.csv')
data9.head()

Unnamed: 0,Cocktail Name,Bartender,Bar/Company,Location,Ingredients,Garnish,Glassware,Preparation,Notes
0,Flor de Amaras,Kelly McCarthy,,Boston,"1.5 oz Mezcal, 1 oz Hibiscus Simple Syrup*, .5...",Marigold Petals,,*Hibiscus Simple Syrup:\n1:1 w/ a cup of dried...,
1,The Happy Place,Elizabeth Montana,Forgery & Verso,San Francisco,"2 oz Junipero Gin, .75 oz House-made Cranberry...","Dehydrated Lemon Wheel, Sprig of Rosemary",,*House-made Cranberry syrup: \n-- 2 cups Fresh...,Junipero Gin 20th Anniversary Signature Cocktail
2,Bon Voyage Pisco Punch,Jon Morales,,San Francisco,"1500 ml BarSol Selecto Italia Pisco, 750 ml Le...",,Punch Bowl,*Pineapple Gomme: \nMix equal parts (1.5 cups)...,
3,Still Life of a Pineapple,Daniel Braganca,Backbar,Somerville,"1.5 oz BarSol Primero Quebranta Pisco, .75 oz ...",,,*Pineapple Syrup:\n<em>Equal parts pineapple b...,
4,The Bittered Valley,Nik Virrey,,Seattle,"1.25 oz Luxardo Maraschino Liqueur, 4 drops Ac...",,,"1st glass ingredients:\nLuxardo Maraschino, Ac...",


In [32]:
print(data9['Ingredients'].head(5).apply(type))

0    <class 'str'>
1    <class 'str'>
2    <class 'str'>
3    <class 'str'>
4    <class 'str'>
Name: Ingredients, dtype: object


In [33]:
dataset9 = pd.DataFrame()
dataset9['id'] = data9.index
dataset9['ingredients'] = data9['Ingredients'].apply(lambda x: x.split(", "))
display(dataset9)
dataset9['ingredients'] = dataset9['ingredients'].apply(lambda x: pf.remove_stop_words(x, stop_words))
display(dataset9.head())
dataset9.info()

Unnamed: 0,id,ingredients
0,0,"[1.5 oz Mezcal, 1 oz Hibiscus Simple Syrup*, ...."
1,1,"[2 oz Junipero Gin, .75 oz House-made Cranberr..."
2,2,"[1500 ml BarSol Selecto Italia Pisco, 750 ml L..."
3,3,"[1.5 oz BarSol Primero Quebranta Pisco, .75 oz..."
4,4,"[1.25 oz Luxardo Maraschino Liqueur, 4 drops A..."
...,...,...
682,682,"[1.5 oz Luxardo Bitter Bianco, .75 oz Tempus F..."
683,683,"[1.5 oz H by HINE, .5 oz Jamaican Rum, 1 tsp C..."
684,684,"[1 oz Luxardo Bitter Bianco, 1 oz Lustau Oloro..."
685,685,"[2 oz Rye Whiskey, 5 oz Punt e Mes, .5 oz Luxa..."


Unnamed: 0,id,ingredients
0,0,"[oz Mezcal, oz Hibiscus Simple Syrup, oz Lime ..."
1,1,"[oz Junipero Gin, oz Housemade Cranberry Syrup..."
2,2,"[ml BarSol Selecto Italia Pisco, ml Lemon Juic..."
3,3,"[oz BarSol Primero Quebranta Pisco, oz Dry Ver..."
4,4,"[oz Luxardo Maraschino Liqueur, drops Acid pho..."


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 687 entries, 0 to 686
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           687 non-null    int64 
 1   ingredients  687 non-null    object
dtypes: int64(1), object(1)
memory usage: 10.9+ KB


In [34]:
pf.counting(dataset9['ingredients'])

Counter({'oz Lemon Juice': 88,
         'oz Kings Ginger Liqueur': 78,
         'oz Luxardo Maraschino Liqueur': 77,
         'oz Junipero Gin': 75,
         'oz No London Dry Gin': 69,
         'dash Angostura Bitters': 55,
         'oz H HINE': 47,
         'oz Lime Juice': 46,
         'oz Fresh Lemon Juice': 44,
         'oz Pink Pigeon Rum': 35,
         'oz Simple Syrup': 35,
         'oz Simple syrup': 31,
         'oz Luxardo Bitter Bianco': 30,
         'oz Luxardo Sangue Morlacco Cherry Liqueur': 29,
         'oz Lemon juice': 29,
         'oz Luxardo Amaretto di Saschira': 28,
         'oz Mezcal': 25,
         'oz Fresh Lime Juice': 25,
         'oz Karlssons Gold Vodka': 25,
         'oz Luxardo Amaro Abano': 24,
         'oz Luxardo Triplum Triple Sec': 22,
         'oz Luxardo Apricot Liqueur': 21,
         'oz Hophead Vodka': 19,
         'dash Orange Bitters': 19,
         'oz Tempus Fugit Alessio Vermouth di Torino Rosso': 18,
         'oz Luxardo Bitter': 17,
       

### [Dataset 10 (set10-d)](https://www.kaggle.com/datasets/aadyasingh55/cocktails)
 - Collaborators for data: Aadya Singh
 - Viable

In [48]:
data10 = pd.read_csv(f'{file_path}set10-d/final_cocktails.csv')
data10.head()

Unnamed: 0.1,Unnamed: 0,id,name,alcoholic,category,glassType,instructions,drinkThumbnail,ingredients,ingredientMeasures,text
0,0,0,A1,Alcoholic,Cocktail,Cocktail glass,"Pour all ingredients into a cocktail shaker, m...",https://www.thecocktaildb.com/images/media/dri...,"['Gin', 'Grand Marnier', 'Lemon Juice', 'Grena...","['1 3/4 shot ', '1 Shot ', '1/4 Shot', '1/8 Sh...",question Generate a cocktail with Gin Grand Ma...
1,1,1,ABC,Alcoholic,Shot,Shot glass,Layered in a shot glass.,https://www.thecocktaildb.com/images/media/dri...,"['Amaretto', 'Baileys irish cream', 'Cognac']","['1/3 ', '1/3 ', '1/3 ']",question Generate a cocktail with Amaretto Bai...
2,2,2,Ace,Alcoholic,Cocktail,Martini Glass,Shake all the ingredients in a cocktail shaker...,https://www.thecocktaildb.com/images/media/dri...,"['Gin', 'Grenadine', 'Heavy cream', 'Milk', 'E...","['2 shots ', '1/2 shot ', '1/2 shot ', '1/2 sh...",question Generate a cocktail with Gin Grenadin...
3,3,3,ACID,Alcoholic,Shot,Shot glass,Poor in the 151 first followed by the 101 serv...,https://www.thecocktaildb.com/images/media/dri...,"['151 proof rum', 'Wild Turkey']","['1 oz Bacardi ', '1 oz ']",question Generate a cocktail with 151 proof ru...
4,4,4,Adam,Alcoholic,Ordinary Drink,Cocktail glass,"In a shaker half-filled with ice cubes, combin...",https://www.thecocktaildb.com/images/media/dri...,"['Dark rum', 'Lemon juice', 'Grenadine']","['2 oz ', '1 oz ', '1 tsp ']",question Generate a cocktail with Dark rum Lem...


In [49]:
dataset10 = pd.DataFrame()
dataset10['id'] = data10['id']
dataset10['ingredients'] = data10['ingredients'].apply(lambda x: pf.convert_to_list(x))
display(dataset10.head())
dataset10.info()

Unnamed: 0,id,ingredients
0,0,"[Gin, Grand Marnier, Lemon Juice, Grenadine]"
1,1,"[Amaretto, Baileys irish cream, Cognac]"
2,2,"[Gin, Grenadine, Heavy cream, Milk, Egg White]"
3,3,"[151 proof rum, Wild Turkey]"
4,4,"[Dark rum, Lemon juice, Grenadine]"


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 425 entries, 0 to 424
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           425 non-null    int64 
 1   ingredients  425 non-null    object
dtypes: int64(1), object(1)
memory usage: 6.8+ KB


### [Dataset 11 (set11-d)](https://www.kaggle.com/datasets/laurinbrechter/cocktail-popularity)
 - Collaborators for data: Laurin Brechter
 - Non-Viable

In [50]:
data11 = pd.read_csv(f'{file_path}set11-d/data.csv')
data11.head()

Unnamed: 0.1,Unnamed: 0,place,name,year,month,visitors
0,0,1,'57 Chevy,2001,total,1512
1,1,2,Caipirinha,2001,total,1301
2,2,3,Long Island Icetea,2001,total,903
3,3,4,Piña Colada,2001,total,601
4,4,5,Tall Blonde,2001,total,573


 - Dataset 11 has no relevant data

### [Dataset 12 (set12-d)](https://www.kaggle.com/datasets/jenlooper/mr-boston-cocktail-dataset)
 - Collaborators for data: Jen Looper 
 - Viable

In [51]:
data12 = pd.read_csv(f'{file_path}set12-d/mr-boston-flattened.csv')
data12.head()

Unnamed: 0,name,category,measurement-1,ingredient-1,measurement-2,ingredient-2,measurement-3,ingredient-3,measurement-4,ingredient-4,measurement-5,ingredient-5,measurement-6,ingredient-6,instructions,glass,glass-size
0,Gauguin,Cocktail Classics,2 oz,Light Rum,1 oz,Passion Fruit Syrup,1 oz,Lemon Juice,1 oz,Lime Juice,,,,,Combine ingredients with a cup of crushed ice ...,Old-Fashioned Glass,6 to 8 ounces
1,Fort Lauderdale,Cocktail Classics,1 1/2 oz,Light Rum,1/2 oz,Sweet Vermouth,1/4 oz,Juice of Orange,1/4 oz,Juice of a Lime,,,,,Shake with ice and strain into old-fashioned g...,Old-Fashioned Glass,6 to 8 ounces
2,Apple Pie,Cordials and Liqueurs,3 oz,Apple schnapps,1 oz,Cinnamon schnapps,,Apple slice,,,,,,,Pour into ice-filled old-fashioned glass. Garn...,Old-Fashioned Glass,6 to 8 ounces
3,Cuban Cocktail No. 1,Cocktail Classics,1/2 oz,Juice of a Lime,1/2 oz,Powdered Sugar,2 oz,Light Rum,,,,,,,Shake with ice and strain into cocktail glass.,Cocktail Glass,6 or more ounces
4,Cool Carlos,Cocktail Classics,1 1/2 oz,Dark rum,2 oz,Cranberry Juice,2 oz,Pineapple Juice,1 oz,Orange curacao,1 oz,Sour Mix,,,"Mix all ingredients except curacao with ice, s...",Collins Glass,14 to 16 ounces


In [52]:
display(data12.info())
data12.columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 990 entries, 0 to 989
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           990 non-null    object
 1   category       990 non-null    object
 2   measurement-1  980 non-null    object
 3   ingredient-1   989 non-null    object
 4   measurement-2  955 non-null    object
 5   ingredient-2   987 non-null    object
 6   measurement-3  847 non-null    object
 7   ingredient-3   894 non-null    object
 8   measurement-4  600 non-null    object
 9   ingredient-4   584 non-null    object
 10  measurement-5  307 non-null    object
 11  ingredient-5   321 non-null    object
 12  measurement-6  71 non-null     object
 13  ingredient-6   159 non-null    object
 14  instructions   990 non-null    object
 15  glass          985 non-null    object
 16  glass-size     979 non-null    object
dtypes: object(17)
memory usage: 131.6+ KB


None

Index(['name', 'category', 'measurement-1', 'ingredient-1', 'measurement-2',
       'ingredient-2', 'measurement-3', 'ingredient-3', 'measurement-4',
       'ingredient-4', 'measurement-5', 'ingredient-5', 'measurement-6',
       'ingredient-6', 'instructions', 'glass', 'glass-size'],
      dtype='object')

In [53]:
columns_to_combine = ['ingredient-1', 'ingredient-2', 'ingredient-3', 'ingredient-4', 'ingredient-5', 'ingredient-6']
dataset12 = pd.DataFrame()
dataset12['id'] = data12.index
dataset12['ingredients'] = data12[columns_to_combine].apply(lambda row: row.tolist(), axis = 1)
dataset12['ingredients'] = dataset12['ingredients'].apply(pf.remove_nans_from_list)
dataset12

Unnamed: 0,id,ingredients
0,0,"[ Light Rum, Passion Fruit Syrup, Lemon Juic..."
1,1,"[ Light Rum, Sweet Vermouth, Juice of Orange..."
2,2,"[ Apple schnapps, Cinnamon schnapps, Apple s..."
3,3,"[ Juice of a Lime, Powdered Sugar, Light Rum]"
4,4,"[ Dark rum, Cranberry Juice, Pineapple Juice..."
...,...,...
985,985,"[Lime wedge, superfine sugar, Gin, Triple Se..."
986,986,"[ Juice of Orange, Dry Vermouth, Sweet Vermo..."
987,987,"[ Gin, Dry Vermouth, Triple Sec]"
988,988,"[ Gin, Triple Sec, Pineapple Juice]"


### [Dataset 13 (set13-d)](https://www.kaggle.com/datasets/kashishparmar02/cocktailcrazedataset)
 - Collaborators for data: Kashish Parmar
 - Non-Viable

In [54]:
data13 = pd.read_csv(f'{file_path}set13-d/data (1).csv')
data13

Unnamed: 0.1,Unnamed: 0,idDrink,strDrink,strCategory,strAlcoholic,strGlass,strIngredient1,strIngredient2,initial
0,0,17222,A1,Cocktail,Alcoholic,Cocktail glass,Gin,Grand Marnier,A
1,1,13501,ABC,Shot,Alcoholic,Shot glass,Amaretto,Baileys irish cream,A
2,2,17225,Ace,Cocktail,Alcoholic,Martini Glass,Gin,Grenadine,A
3,3,14610,ACID,Shot,Alcoholic,Shot glass,151 proof rum,Wild Turkey,A
4,4,17837,Adam,Ordinary Drink,Alcoholic,Cocktail glass,Dark rum,Lemon juice,A
...,...,...,...,...,...,...,...,...,...
421,421,17027,Zima Blaster,Ordinary Drink,Alcoholic,Hurricane glass,Zima,Chambord raspberry liqueur,Z
422,422,14594,Zizi Coin-coin,Punch / Party Drink,Alcoholic,Margarita/Coupette glass,Cointreau,Lemon juice,Z
423,423,15801,Zimadori Zinger,Punch / Party Drink,Alcoholic,Collins glass,Midori melon liqueur,Zima,Z
424,424,14065,Zippy's Revenge,Cocktail,Alcoholic,Old-fashioned glass,Amaretto,Rum,Z


 - Dataset 13 is incomplete with only 2 ingredients per drink it does not have full recipes.

### [Dataset 14 (set14-d)](https://www.kaggle.com/datasets/svetlanagruzdeva/cocktails-data)
 - Collaborators: Svetlana Gruzdeva
 - Viable

In [55]:
data14 = pd.read_csv(f'{file_path}set14-d/data_cocktails.csv')
data14

Unnamed: 0.1,Unnamed: 0,strDrink,strCategory,strGlass,strIngredients,Alc_type,Basic_taste,strInstructions,strMeasures,Value_ml,Value_gr,Garnish_amount,Garnish_type
0,0,'57 Chevy with a White License Plate,Cocktail,Highball glass,Creme De Cacao White,Creamy Liqueur,,1. Fill a rocks glass with ice 2.add white cre...,1 oz white,30.0,,,
1,1,'57 Chevy with a White License Plate,Cocktail,Highball glass,Vodka,Vodka,,1. Fill a rocks glass with ice 2.add white cre...,1 oz,30.0,,,
2,2,1-900-FUK-MEUP,Shot,Old-fashioned glass,Grand Marnier,Triple Sec,,Shake ingredients in a mixing tin filled with ...,1/4 oz,7.5,,,
3,3,1-900-FUK-MEUP,Shot,Old-fashioned glass,Midori Melon Liqueur,Sweet Liqueur,,Shake ingredients in a mixing tin filled with ...,1/4 oz,7.5,,,
4,4,1-900-FUK-MEUP,Shot,Old-fashioned glass,Malibu Rum,Rum,,Shake ingredients in a mixing tin filled with ...,1/4 oz,7.5,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1776,1776,Zorbatini,Cocktail,Cocktail glass,Vodka,Vodka,,Prepare like a Martini. Garnish with a green o...,1 1/4 oz stoli,37.5,,,
1777,1777,Zorbatini,Cocktail,Cocktail glass,Ouzo,Ouzo,,Prepare like a Martini. Garnish with a green o...,1/4 oz,7.5,,,
1778,1778,Zorro,Coffee / Tea,Coffee Mug,Sambuca,Sambuca,,add all and pour black coffee and add whipped ...,2 cl,20.0,,,
1779,1779,Zorro,Coffee / Tea,Coffee Mug,Bailey'S Irish Cream,Creamy Liqueur,,add all and pour black coffee and add whipped ...,2 cl,20.0,,,


In [56]:
dataset14 = pd.DataFrame(data14.groupby('strDrink')['strIngredients'].apply(lambda x: x.tolist())).reset_index()
dataset14['id'] = dataset14.index
dataset14['ingredients'] = dataset14['strIngredients']
dataset14 = dataset14.drop(columns=['strDrink', 'strIngredients'])
display(dataset14.head())
dataset14.info()

Unnamed: 0,id,ingredients
0,0,"[Creme De Cacao White, Vodka]"
1,1,"[Grand Marnier, Midori Melon Liqueur, Malibu R..."
2,2,"[Tequila, Lager]"
3,3,"[Dark Creme De Cacao, Coconut Liqueur, Light R..."
4,4,"[Light Rum, Orange Juice, Vodka, Dark Rum]"


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 473 entries, 0 to 472
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           473 non-null    int64 
 1   ingredients  473 non-null    object
dtypes: int64(1), object(1)
memory usage: 7.5+ KB


## Save Datasets:
 - We will keep the following sets:
 - Food
    - Datasets 1/2/3/4
 - Drinks
    - Datasets 8/9/10/12/14

In [57]:
final_food = pd.concat([dataset1, dataset2, dataset3, dataset4], axis = 0)
final_food.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3053953 entries, 0 to 2231141
Data columns (total 2 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   id           int64 
 1   ingredients  object
dtypes: int64(1), object(1)
memory usage: 69.9+ MB


In [58]:
final_drink = pd.concat([dataset8, dataset9, dataset10, dataset12, dataset14], axis = 0)
final_drink.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3121 entries, 0 to 472
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           3121 non-null   int64 
 1   ingredients  3121 non-null   object
dtypes: int64(1), object(1)
memory usage: 73.1+ KB


 - With 3 million rows of data for food and only 3000 rows for drinks I expect to get wildly differing results in terms of effectiveness, we may end up combining the two together but I wanted them seperate for now


In [59]:
final_food.to_csv(f'{file_path}food_full.csv', header=True)
final_drink.to_csv(f'{file_path}drink_full.csv', header=True)

In [60]:
final_food.to_pickle(f'{file_path}food_full.pkl')
final_drink.to_pickle(f'{file_path}drink_full.pkl')