# Recipe Dataset (over 2M) Food

This dataset is a comprehensive collection of recipes from all around the world, ranging from simple dishes like bread to elaborate meals like Swedish midsummer smorgasbords. It is designed to facilitate projects that involve food analysis, recipe generation, or multimedia applications related to culinary arts.

## Used libraries

In [None]:
# General-purpose libraries
import pandas as pd
import numpy as np
import seaborn as sns

# Text-to-Video and NLP Libraries
from transformers import pipeline
from diffusers import StableDiffusionPipeline

# Video and Image Handling
from moviepy import ImageSequenceClip, AudioFileClip
import cv2  # OpenCV for image manipulation

# Text-to-Speech
from gtts import gTTS

# Miscellaneous
import os

## Constants

In [None]:
# Column names of the raw recipes CSV, as reported by df.info() further down.
# Declared explicitly because this cell runs before the dataset is read, so
# `df` does not exist yet on a fresh kernel. Keep in sync with the CSV schema.
df_columns = pd.Index(
    ["title", "ingredients", "directions", "link", "source", "NER", "site"]
)

## Helper methods

In [None]:
# Print a caption, then render the first rows of a DataFrame.
def print_dataset(text, df):
    """Print ``text`` as a caption line and display ``df.head()``.

    Args:
        text: Caption string; printed after a blank line and followed by ':'.
        df: Object exposing ``head()`` (a DataFrame in this notebook).
    """
    caption = "".join(("\n", text, ":"))
    print(caption)
    preview = df.head()
    display(preview)

# Check for noisy data (e.g., special characters or unnecessary brackets)
def find_noisy_data(column, frame=None):
    """Return the values of ``column`` that contain bracket/escape residue.

    A value is considered noisy when it contains a square bracket, a
    backslash, or a backslash-escaped double quote.

    Args:
        column: Name of the text column to inspect.
        frame: DataFrame to search. Defaults to the notebook-level ``df``.

    Returns:
        A Series holding the noisy values of ``column`` (original index kept).
        Missing values are never reported as noisy.
    """
    if frame is None:
        frame = df  # fall back to the notebook's working DataFrame
    # na=False keeps the mask strictly boolean even when the column has NaN,
    # which would otherwise make boolean indexing raise.
    noisy_mask = frame[column].str.contains(r'[\[\]\\]|\\"', regex=True, na=False)
    return frame.loc[noisy_mask, column]

## Read Dataset

In [None]:
# Load the raw recipes CSV into the working DataFrame.
# Alternative input kept from the original notebook: "../files/processed_data.csv"
df = pd.read_csv(filepath_or_buffer="../files/recipes_data.csv")

## Model preparation

### Dataset structure

In [6]:
# Preview the first rows of the freshly loaded dataset.
print_dataset(text="Dataset", df=df)


Dataset:


Unnamed: 0,title,ingredients,directions,link,source,NER,site
0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",www.cookbooks.com/Recipe-Details.aspx?id=44874,Gathered,"[""bite size shredded rice biscuits"", ""vanilla""...",www.cookbooks.com
1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....",www.cookbooks.com/Recipe-Details.aspx?id=699419,Gathered,"[""cream of mushroom soup"", ""beef"", ""sour cream...",www.cookbooks.com
2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""In a slow cooker, combine all ingredients. C...",www.cookbooks.com/Recipe-Details.aspx?id=10570,Gathered,"[""frozen corn"", ""pepper"", ""cream cheese"", ""gar...",www.cookbooks.com
3,Chicken Funny,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...","[""Boil and debone chicken."", ""Put bite size pi...",www.cookbooks.com/Recipe-Details.aspx?id=897570,Gathered,"[""chicken gravy"", ""cream of mushroom soup"", ""c...",www.cookbooks.com
4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""Combine first four ingredients and press in ...",www.cookbooks.com/Recipe-Details.aspx?id=659239,Gathered,"[""graham cracker crumbs"", ""powdered sugar"", ""p...",www.cookbooks.com


### Data types

In [7]:
# Summarise the DataFrame: index range, per-column dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2231142 entries, 0 to 2231141
Data columns (total 7 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   title        object
 1   ingredients  object
 2   directions   object
 3   link         object
 4   source       object
 5   NER          object
 6   site         object
dtypes: object(7)
memory usage: 119.2+ MB


### NULL values

In [None]:
# Per-column count of missing values (isna is the canonical name of isnull).
df.isna().sum()

title          1
ingredients    0
directions     0
link           0
source         0
NER            0
site           0
dtype: int64

- Handling null values

In [None]:
# Drop every row that has a null value in any of the columns in df_columns
df = df.dropna(axis=0, how='any', subset=df_columns)

# Show the per-column null counts of the modified dataset
print(df.isna().sum())

### Duplicate values

In [None]:
- Duplicate values in dataset

# Count rows that are exact copies of an earlier row
print(f"Duplicates: {df.duplicated().sum()}")

- Duplicate values in title column

In [None]:
# Every row whose title occurs more than once (keep=False flags all occurrences)
duplicates = df.loc[df['title'].duplicated(keep=False)]
print(duplicates)

- Filter rows by title

In [None]:
# Select the rows titled exactly "Cherry Nut Bars"
cherry_nut_bars = df.loc[df['title'].eq("Cherry Nut Bars")]

# Print them in full (to_string avoids pandas' column/row truncation)
print(cherry_nut_bars.to_string())


- Find duplicates in NER column

In [None]:
# Every row whose NER value occurs more than once (keep=False flags all occurrences)
duplicates = df.loc[df['NER'].duplicated(keep=False)]
print(duplicates)


- Removing near-duplicate values

In [None]:
# Number of rows before removing duplicates
rows_before = len(df)

# Rows count as near-duplicates when they share both 'title' and 'NER'.
# One key list drives both the verification log and the removal, so the log
# shows exactly the groups from which rows are about to be dropped.
dedup_keys = ['title', 'NER']

# Identify every row that belongs to a duplicated (title, NER) group
duplicates = df[df.duplicated(subset=dedup_keys, keep=False)]

# Log duplicate rows for verification
print("Duplicate Rows:")
print(duplicates)

# Remove duplicates, keeping only the first occurrence of each group
df_cleaned = df.drop_duplicates(subset=dedup_keys, keep='first')

# Number of rows after removing duplicates
rows_after = len(df_cleaned)

# Calculate the number of deleted rows
deleted_rows = rows_before - rows_after

# Continue the notebook with the de-duplicated frame
df = df_cleaned

# Logs
print(f"\nRows before removing duplicates: {rows_before}")
print(f"Rows after removing duplicates: {rows_after}")
print(f"Number of rows deleted: {deleted_rows}")


- Rows after removing near-duplicates

In [None]:
# Select the rows titled exactly "Cherry Nut Bars"
cherry_nut_bars = df.loc[df['title'].eq("Cherry Nut Bars")]

# Print them in full (to_string avoids pandas' column/row truncation)
print(cherry_nut_bars.to_string())

### NaN values

In [None]:
# Report, column by column, how many entries are missing (NaN)
for column in df_columns:
    # isnull() is an alias of isna(); summing the boolean mask counts the NaNs
    nan_count = df.loc[:, column].isnull().sum()
    print(f"The number of missing values detected in the column '{column}' is: {nan_count}")

- Handling NaN values

In [None]:
# Drop every row containing at least one NaN (dropna defaults: axis=0, how='any')
df = df.dropna()

### Drop columns

In [None]:
# Drop the 'link', 'source' and 'site' columns
columns_to_delete = ['link', 'source', 'site']
# Reassign instead of mutating in place, and ignore columns that are already
# gone, so re-running this cell does not raise KeyError.
df = df.drop(columns=columns_to_delete, errors='ignore')

In [None]:
### Special characters

In [None]:
# List rows whose title has any character outside word characters and whitespace
print("Titles with special characters:")
print(df.loc[df['title'].str.contains(pat=r'[^\w\s]', regex=True)])

- Handling special characters

In [None]:
# Strip from each title every character that is neither a word character nor whitespace
df['title'] = df['title'].str.replace(pat=r'[^\w\s]', repl='', regex=True)

### Noisy data

In [None]:
# Collect and print the noisy entries of the 'ingredients' column
noisy_data = find_noisy_data('ingredients')
print("Noisy Data in 'ingredients':", noisy_data, sep="\n")

In [None]:
# Collect and print the noisy entries of the 'directions' column
noisy_data = find_noisy_data('directions')
print("Noisy Data in 'directions':", noisy_data, sep="\n")

#### Handling Noisy Data

In [None]:
# Normalise the 'ingredients' text: unescape quotes, then remove brackets and backslashes
df['ingredients'] = (
    df['ingredients']
    .str.replace(r'\\"', '"', regex=True)    # backslash-escaped quote -> plain quote
    .str.replace(r'[\[\]]', '', regex=True)  # remove square brackets
    .str.replace(r'\\', '', regex=True)      # remove any remaining backslashes
    .str.strip()                             # trim surrounding whitespace
)

# Show the first cleaned values as a sanity check
print("Cleaned Ingredients Column:", df['ingredients'].head(), sep="\n")

In [None]:
# Clean noisy data in the 'directions' column
df['directions'] = df['directions'] \
    .str.replace(r'\\"', '"', regex=True) \
    .str.replace(r'[\[\]]', '', regex=True) \
    .str.replace(r'\\', '', regex=True) \
    .str.strip()

# Verify the cleaned column (label now names the column actually printed)
print("Cleaned Directions Column:")
print(df['directions'].head())