# Recipe Dataset (over 2M) Food

This dataset is a comprehensive collection of recipes from all around the world, ranging from simple dishes like bread to elaborate meals like Swedish midsummer smorgasbords. It is designed to facilitate projects that involve food analysis, recipe generation, or multimedia applications related to culinary arts.

## Used libraries

In [None]:
# General-purpose libraries
import pandas as pd
import numpy as np
import seaborn as sns

# Text-to-Video and NLP Libraries
from transformers import pipeline
from diffusers import StableDiffusionPipeline

# Video and Image Handling
from moviepy import ImageSequenceClip, AudioFileClip
import cv2  # OpenCV for image manipulation

# Text-to-Speech
from gtts import gTTS

# Miscellaneous
import os

## Constants

In [None]:
# Column names of the raw recipes CSV, written out explicitly so this cell does
# not read `df`, which is only loaded further down in the "Read Dataset" cell.
df_columns = ["title", "ingredients", "directions", "link", "source", "NER", "site"]

## Helper methods

In [None]:
# Show a titled preview of a DataFrame.
def print_dataset(text, df):
    """Print ``text`` as a heading, then render the first rows of ``df``.

    The heading is a blank line followed by ``text`` and a colon; the preview
    is ``df.head()`` handed to the notebook's ``display``.
    """
    heading = "".join(("\n", text, ":"))
    print(heading)
    preview = df.head()
    display(preview)

# Check for noisy data (e.g., special characters or unnecessary brackets)
def find_noisy_data(column):
    """Return the values of ``df[column]`` that still carry serialization noise.

    Args:
        column: Name of a text column of the notebook-level ``df``.

    Returns:
        The subset of ``df[column]`` whose text contains ``[``, ``]``, a
        backslash, or a backslash-escaped double quote. Missing values are
        never reported.
    """
    # Every backslash used to be doubled inside the raw string. That closed the
    # character class early and demanded a literal "\\]" sequence, so plain
    # brackets were never matched.
    noise_pattern = r'[\[\]\\]|\\"'
    # na=False maps missing values to False; a mask containing NaN cannot be
    # used for boolean indexing.
    is_noisy = df[column].str.contains(noise_pattern, regex=True, na=False)
    noisy_rows = df[column][is_noisy]
    return noisy_rows

## Read Dataset

In [None]:
# Load the recipes CSV into `df`; the commented path is an alternative input file.
dataset_path = "../files/recipes_data.csv"
# dataset_path = "../files/processed_data.csv"
df = pd.read_csv(dataset_path)

## Model preparation

### Database structure

In [6]:
# Preview the first rows of the loaded dataset under the heading "Dataset".
print_dataset("Dataset", df)


Dataset:


Unnamed: 0,title,ingredients,directions,link,source,NER,site
0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",www.cookbooks.com/Recipe-Details.aspx?id=44874,Gathered,"[""bite size shredded rice biscuits"", ""vanilla""...",www.cookbooks.com
1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....",www.cookbooks.com/Recipe-Details.aspx?id=699419,Gathered,"[""cream of mushroom soup"", ""beef"", ""sour cream...",www.cookbooks.com
2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""In a slow cooker, combine all ingredients. C...",www.cookbooks.com/Recipe-Details.aspx?id=10570,Gathered,"[""frozen corn"", ""pepper"", ""cream cheese"", ""gar...",www.cookbooks.com
3,Chicken Funny,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...","[""Boil and debone chicken."", ""Put bite size pi...",www.cookbooks.com/Recipe-Details.aspx?id=897570,Gathered,"[""chicken gravy"", ""cream of mushroom soup"", ""c...",www.cookbooks.com
4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""Combine first four ingredients and press in ...",www.cookbooks.com/Recipe-Details.aspx?id=659239,Gathered,"[""graham cracker crumbs"", ""powdered sugar"", ""p...",www.cookbooks.com


### Data types

In [7]:
# Print a summary of the frame: index range, column names, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2231142 entries, 0 to 2231141
Data columns (total 7 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   title        object
 1   ingredients  object
 2   directions   object
 3   link         object
 4   source       object
 5   NER          object
 6   site         object
dtypes: object(7)
memory usage: 119.2+ MB


### NULL values

In [None]:
# Count the missing values in each column (shown as the cell's output).
df.isnull().sum()

title          1
ingredients    0
directions     0
link           0
source         0
NER            0
site           0
dtype: int64

- Handling null values

In [None]:
# Drop every row that has a missing value in any of the tracked columns
df = df.dropna(subset=df_columns)

# Print the per-column null counts of the modified dataset
remaining_nulls = df.isna().sum()
print(remaining_nulls)

### Duplicate values

- Duplicate values in dataset

In [None]:
# Count the rows that are exact copies of an earlier row
duplicate_count = df.duplicated().sum()
print(f"Duplicates: {duplicate_count}")

- Duplicate values in title column

In [None]:
# Select every row whose title occurs more than once (all occurrences are kept)
has_repeated_title = df['title'].duplicated(keep=False)
duplicates = df[has_repeated_title]
print(duplicates)

- Rows filter based on the title

In [None]:
# Select the rows whose title is exactly "Cherry Nut Bars"
is_cherry_nut_bars = df['title'].eq("Cherry Nut Bars")
cherry_nut_bars = df.loc[is_cherry_nut_bars]

# Print the matching rows in full, without pandas' column truncation
print(cherry_nut_bars.to_string())


- Find duplicates in NER column

In [None]:
# Select every row whose NER value occurs more than once (all occurrences are kept)
has_repeated_ner = df['NER'].duplicated(keep=False)
duplicates = df[has_repeated_ner]
print(duplicates)


- Removing nearly duplicate values

In [None]:
# Columns that define a "near duplicate": two rows are treated as the same
# recipe when both 'title' and 'NER' match.
dedup_keys = ['title', 'NER']

# Number of rows before removing duplicates
rows_before = len(df)

# Log exactly the rows that take part in the de-duplication. The log used to
# be built from a narrower key ('title', 'NER', 'ingredients') than the drop
# below, so rows were removed that never appeared in the verification log.
duplicates = df[df.duplicated(subset=dedup_keys, keep=False)]
print("Duplicate Rows:")
print(duplicates)

# Remove duplicates, keeping only the first occurrence of each key
df_cleaned = df.drop_duplicates(subset=dedup_keys, keep='first')

# Number of rows after removing duplicates
rows_after = len(df_cleaned)

# Calculate the number of deleted rows
deleted_rows = rows_before - rows_after

df = df_cleaned

# Logs
print(f"\nRows before removing duplicates: {rows_before}")
print(f"Rows after removing duplicates: {rows_after}")
print(f"Number of rows deleted: {deleted_rows}")


- Rows after removing nearly duplicates

In [None]:
# Re-select the "Cherry Nut Bars" rows to see what is left after de-duplication
is_cherry_nut_bars = df['title'].eq("Cherry Nut Bars")
cherry_nut_bars = df.loc[is_cherry_nut_bars]

# Print the matching rows in full, without pandas' column truncation
print(cherry_nut_bars.to_string())

### NaN values

In [None]:
# Count the missing (NaN) values of every tracked column, then report them
nan_counts = {name: df[name].isna().sum() for name in df_columns}
for column, nan_count in nan_counts.items():
    print(f"The number of missing values detected in the column '{column}' is: {nan_count}")

- Handling NaN values

In [None]:
# Drop every row that contains at least one missing value in any column.
df = df.dropna(axis=0, how='any')

### Drop columns

In [None]:
# Delete the 'link', 'source' and 'site' columns; no later cell in this
# notebook reads them.
columns_to_delete = ['link', 'source', 'site']
# Rebind instead of mutating in place, and ignore columns that are already
# gone, so re-running this cell does not raise a KeyError.
df = df.drop(columns=columns_to_delete, errors='ignore')

In [None]:
### Special characters

In [None]:
# List the rows whose title contains anything other than word characters or whitespace
print("Titles with special characters:")
has_special_chars = df['title'].str.contains(r'[^\w\s]', regex=True)
print(df[has_special_chars])

- Handling special characters

In [None]:
# Strip every character that is neither a word character nor whitespace from the titles
special_chars = r'[^\w\s]'
df['title'] = df['title'].str.replace(special_chars, '', regex=True)

### Noisy data

In [None]:
# Report the 'ingredients' values that find_noisy_data flags
noisy_data = find_noisy_data('ingredients')
print("Noisy Data in 'ingredients':", noisy_data, sep="\n")

In [None]:
# Report the 'directions' values that find_noisy_data flags
noisy_data = find_noisy_data('directions')
print("Noisy Data in 'directions':", noisy_data, sep="\n")

#### Handling Noisy Data

In [None]:
# Clean noisy data in the 'ingredients' column:
#   1. turn backslash-escaped quotes (\") into plain quotes,
#   2. drop square brackets,
#   3. drop any remaining backslashes,
#   4. trim surrounding whitespace.
# NOTE(review): once the brackets are removed the text is no longer a list
# literal, which the later `ast.literal_eval` conversions appear to rely on -
# confirm this is intended.
df['ingredients'] = (
    df['ingredients']
    .str.replace(r'\\"', '"', regex=True)
    .str.replace(r'[\[\]]', '', regex=True)
    .str.replace(r'\\', '', regex=True)
    .str.strip()
)

# Show the first cleaned values for verification
print("Cleaned Ingredients Column:", df['ingredients'].head(), sep="\n")

In [None]:
# Clean noisy data in the 'directions' column:
#   1. turn backslash-escaped quotes (\") into plain quotes,
#   2. drop square brackets,
#   3. drop any remaining backslashes,
#   4. trim surrounding whitespace.
# NOTE(review): once the brackets are removed the text is no longer a list
# literal, which the later `ast.literal_eval` conversions appear to rely on -
# confirm this is intended.
df['directions'] = df['directions'] \
    .str.replace(r'\\"', '"', regex=True) \
    .str.replace(r'[\[\]]', '', regex=True) \
    .str.replace(r'\\', '', regex=True) \
    .str.strip()

# Verify the cleaned column. The label used to say "Ingredients", copied from
# the cell above, although this cell cleans and prints 'directions'.
print("Cleaned Directions Column:")
print(df['directions'].head())

### Outliers

In [None]:
# How often does each distinct title occur?
frequency_counts = df['title'].value_counts()

# Titles that occur exactly once are treated as the rare cases ("outliers")
occurs_once = frequency_counts.eq(1)
outliers = frequency_counts[occurs_once]

# Display outliers
print("Outliers based on frequency:", outliers, sep="\n")

- Find groups data

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Build a TF-IDF representation of the recipe titles
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['title'])

# Cluster the titles with K-Means
num_clusters = 3  # Number of groups
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(X)

# Cluster label assigned to each title, in row order
clusters = kmeans.labels_

# Show the assignment for a small preview only; the loop used to print one
# line for every row of the dataset. zip() stops at the shorter input, so the
# preview is limited by head(). (The message reads "is in group" in Albanian.)
preview_limit = 20
for title, cluster in zip(df['title'].head(preview_limit), clusters):
    print(f"'{title}' është në grupin {cluster}")

# Summarise the complete assignment as the number of titles per cluster
print("\nTitles per cluster:")
print(pd.Series(clusters).value_counts().sort_index())

- Clustering recipe titles by category

In [None]:
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import pandas as pd

# Work on a reproducible 50% sample of the data.
# NOTE: this rebinds `df`, so later cells only see the sampled rows, and
# re-running this cell halves the data again.
df = df.sample(frac=0.5, random_state=42)

# Reset the index so row labels match the row positions of the TF-IDF / PCA output
df = df.reset_index(drop=True)

# Handle missing values
df['title'] = df['title'].fillna('')
df['ingredients'] = df['ingredients'].fillna('')

# TF-IDF vectorization of the titles, capped at 1000 features
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
X = vectorizer.fit_transform(df['title'])

print(f"TF-IDF Matrix Shape: {X.shape}")

# K-Means clustering
num_clusters = 3
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(X)

# Assign cluster labels
clusters = kmeans.labels_
df['Cluster'] = clusters

# Hand-written display names for the cluster ids.
# NOTE(review): nothing in this cell ties a cluster id to a meal type, so
# these names should be checked against the printed titles - confirm.
cluster_names = {
    0: 'Dessert Recipes',
    1: 'Dinner Recipes',
    2: 'Breakfast Recipes'
}

# Assign category names
df['Category'] = df['Cluster'].map(cluster_names)

# Print a short preview of the titles in each cluster instead of every title
titles_per_cluster = 10
print("\n--- Titles Grouped by Cluster ---")
for cluster, category in cluster_names.items():
    print(f"\n{category}:")
    titles_in_group = df[df['Cluster'] == cluster]['title'].tolist()
    for title in titles_in_group[:titles_per_cluster]:
        print(f"- {title}")
    hidden_titles = len(titles_in_group) - titles_per_cluster
    if hidden_titles > 0:
        print(f"... and {hidden_titles} more")

# Dimensionality reduction with PCA for visualization.
# NOTE(review): X.toarray() materialises the whole TF-IDF matrix as a dense
# array (rows x 1000 features), which may not fit in memory for large samples.
pca = PCA(n_components=2, random_state=42)
X_reduced = pca.fit_transform(X.toarray())
print(f"PCA Reduced Shape: {X_reduced.shape}")

# Scatter plot of the two PCA components, one colour per category
fig, ax = plt.subplots(figsize=(10, 7))
for cluster, category in cluster_names.items():
    in_cluster = (df['Cluster'] == cluster).to_numpy()
    points = X_reduced[in_cluster]
    ax.scatter(points[:, 0], points[:, 1], label=category)

ax.set_title("Clustering of Recipe Titles by Category")
# The axes are the two PCA components, not titles and categories
ax.set_xlabel("PCA component 1")
ax.set_ylabel("PCA component 2")
ax.legend()
plt.show()

### NLP Video Generation

In [None]:
from diffusers import StableDiffusionPipeline
import os

# Load the Stable Diffusion pipeline and place it on the CPU rather than a GPU
model_id = "runwayml/stable-diffusion-v1-5"
pipe = StableDiffusionPipeline.from_pretrained(model_id)
pipe = pipe.to("cpu")

# Make sure the folder for the generated images exists
frames_dir = "generated_frames"
os.makedirs(frames_dir, exist_ok=True)

print("Stable Diffusion model loaded successfully in CPU mode!")

#### Generate Images for Recipe Steps

In [None]:
import ast

# `sample_df` is never created anywhere in this notebook, so on a fresh kernel
# it is derived from `df` here; an existing `sample_df` is left untouched.
# Only the first row is used below.
try:
    sample_df
except NameError:
    sample_df = df.head(1)

# Reset the index so label 0 is the first row
sample_df = sample_df.reset_index(drop=True)

# Select the first recipe
recipe_title = sample_df['title'][0]
recipe_steps = sample_df['directions'][0]

# After the cleaning cells `directions` holds text, and looping over a string
# would generate one image per character. Turn it into a list of steps first.
if isinstance(recipe_steps, str):
    steps_text = recipe_steps.strip()
    # The cleaning cell strips the surrounding brackets; restore them so the
    # text parses as a list even when it contains a single step.
    literal = steps_text if steps_text.startswith("[") else f"[{steps_text}]"
    try:
        recipe_steps = [str(step) for step in ast.literal_eval(literal)]
    except (ValueError, SyntaxError):
        # Unescaped quotes inside a step make the text unparseable; fall back
        # to splitting on the separator between quoted steps.
        recipe_steps = [part for part in steps_text.strip('[]" ').split('", "') if part]

frames = []  # To store generated image paths
for i, step in enumerate(recipe_steps):
    prompt = f"Artistic representation of: {step}"
    image = pipe(prompt).images[0]
    frame_path = f"generated_frames/{recipe_title.replace(' ', '_')}_step_{i+1}.png"
    image.save(frame_path)
    frames.append(frame_path)

print(f"Generated {len(frames)} images for '{recipe_title}' recipe.")

#### Combine Frames into a Video

In [None]:
import os
import shutil
import imageio_ffmpeg

# Binary that imageio-ffmpeg resolves before any override
ffmpeg_path = imageio_ffmpeg.get_ffmpeg_exe()
print("FFmpeg binary path:", ffmpeg_path)

# Choose an override without assuming one particular machine: keep an existing
# IMAGEIO_FFMPEG_EXE, else use an ffmpeg found on PATH, else the Homebrew
# location this notebook used to hard-code. If none of them exists, the
# variable is left unset and the binary printed above stays in effect.
ffmpeg_candidates = [
    os.environ.get("IMAGEIO_FFMPEG_EXE"),
    shutil.which("ffmpeg"),
    "/opt/homebrew/bin/ffmpeg",
]
ffmpeg_override = next(
    (candidate for candidate in ffmpeg_candidates if candidate and os.path.isfile(candidate)),
    None,
)
if ffmpeg_override is not None:
    os.environ["IMAGEIO_FFMPEG_EXE"] = ffmpeg_override

# Verify the ffmpeg path
print("Using FFmpeg binary at:", imageio_ffmpeg.get_ffmpeg_exe())

In [None]:
import re

# Directory containing frames
frame_directory = "generated_frames"


def _frame_sort_key(filename):
    """Order frame files by their prefix, then by numeric step number."""
    step_match = re.search(r"_step_(\d+)\.png$", filename)
    step_number = int(step_match.group(1)) if step_match else 0
    return ("_".join(filename.split("_")[:-2]), step_number)


# Get all image files in the directory. A plain alphabetical sort would place
# "step_10" before "step_2", so frames are ordered by their numeric step.
all_frames = sorted(
    (f for f in os.listdir(frame_directory) if f.endswith(".png")),
    key=_frame_sort_key,
)

# Group frames by their prefix (everything before the trailing "step_<n>.png")
frame_groups = {}
for frame in all_frames:
    prefix = "_".join(frame.split("_")[:-2])
    frame_groups.setdefault(prefix, []).append(os.path.join(frame_directory, frame))

# Create a video for each group
fps = 1  # Frames per second: each image is shown for one second
output_directory = "../videos/"
os.makedirs(output_directory, exist_ok=True)

for prefix, group_frames in frame_groups.items():
    video_path = os.path.join(output_directory, f"{prefix}.mp4")
    clip = ImageSequenceClip(group_frames, fps=fps)
    try:
        clip.write_videofile(video_path, codec="libx264")
    finally:
        # Release the clip's resources even when encoding fails
        clip.close()
    print(f"Video saved: {video_path}")

## NER Analysis

### Segmentation

Segmentation in NLP involves breaking down a larger piece of text into smaller, meaningful units such as sentences or paragraphs.

In [None]:
import ast


def _parse_steps(value):
    """Return the recipe steps held in ``value`` as a list of strings.

    Accepts an existing list/tuple, a list literal, or the bracket-stripped
    text produced by the cleaning cell ('"step 1", "step 2"'). Missing or
    empty values yield an empty list.
    """
    if isinstance(value, (list, tuple)):
        return [str(step) for step in value]
    if pd.isna(value):
        return []
    text = str(value).strip()
    if not text:
        return []
    # Restore the brackets when they were stripped, so a single step still
    # parses as a one-element list rather than a bare string.
    literal = text if text.startswith("[") else f"[{text}]"
    try:
        return [str(step) for step in ast.literal_eval(literal)]
    except (ValueError, SyntaxError):
        # Unescaped quotes inside a step make the text unparseable; fall back
        # to splitting on the separator between quoted steps.
        return [part for part in text.strip('[]" ').split('", "') if part]


# Check the type of the `directions` column
print(type(df['directions'].iloc[0]))

# Flatten and segment the directions into smaller pieces. Each value is parsed
# into a list of steps first: iterating a raw string would yield characters.
segmented_directions = []
for recipe_title, raw_directions in zip(df['title'], df['directions']):
    for step in _parse_steps(raw_directions):
        # Split steps into smaller segments using `.split('. ')`
        for segment in step.split('. '):
            clean_segment = segment.strip()
            # Avoid adding empty strings
            if clean_segment:
                segmented_directions.append({
                    "title": recipe_title,
                    "direction_segment": clean_segment
                })

# Convert the segmented directions into a new DataFrame
segmented_df = pd.DataFrame(segmented_directions)

# Export the segmented DataFrame to a CSV file
segmented_df.to_csv("../files/1_segmentation.csv", index=False)

# Print the first few rows for verification
print(segmented_df.head())

### Tokenization

Tokenization is used in NLP to split paragraphs and sentences into smaller units (tokens) that can be more easily assigned meaning.

In [None]:
# `nltk` and `word_tokenize` were used here before any cell imported them
# (the imports only appear in the later "Stop words" cell).
import ast

import nltk
from nltk.tokenize import word_tokenize

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('punkt_tab')

# Parse `directions` into Python objects when the column still holds text
if isinstance(df['directions'][0], str):
    df['directions'] = df['directions'].apply(ast.literal_eval)

# Segment and tokenize the directions
segmented_tokenized_directions = []
for index, row in df.iterrows():
    for step in row['directions']:
        # Segment the step into smaller parts
        segments = step.split('. ')
        for segment in segments:
            # Clean whitespace and ensure the segment is not empty
            clean_segment = segment.strip()
            if clean_segment:
                try:
                    # Tokenize the segment into words
                    tokens = word_tokenize(clean_segment)
                    segmented_tokenized_directions.append({
                        "title": row["title"],
                        "direction_segment": clean_segment,
                        "tokens": tokens
                    })
                except Exception as e:
                    # Best effort: report the failing segment and keep going
                    print(f"Error tokenizing: {clean_segment}. Error: {e}")

# Convert the results into a DataFrame
segmented_tokenized_df = pd.DataFrame(segmented_tokenized_directions)

# Convert the tokens column to a string for proper CSV export
segmented_tokenized_df['tokens'] = segmented_tokenized_df['tokens'].apply(lambda x: ' '.join(x))

# Export the segmented and tokenized DataFrame to a CSV file
segmented_tokenized_df.to_csv("../files/2_tokenization.csv", index=False)

# Print the first few rows for verification
print(segmented_tokenized_df.head())

### Stop words

Stop words are a set of commonly used words in a language. Examples of stop words in English are “a,” “the,” “is,” “are,” etc. Stop words are commonly used in Text Mining and Natural Language Processing (NLP) to eliminate words that are so widely used that they carry very little useful information.

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Fetch the NLTK resources used below: the tokenizer, then the stop-word list
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

# Parse `directions` into Python objects when the column still holds text
if isinstance(df['directions'][0], str):
    import ast
    df['directions'] = df['directions'].apply(ast.literal_eval)

# English stop words, kept in a set for membership tests
stop_words = set(stopwords.words('english'))

# Split every step into segments, tokenize each segment and drop the stop words
segmented_tokenized_directions = []
for recipe_title, steps in zip(df['title'], df['directions']):
    for step in steps:
        for raw_segment in step.split('. '):
            clean_segment = raw_segment.strip()
            if not clean_segment:
                continue  # nothing left once whitespace is trimmed
            try:
                tokens = word_tokenize(clean_segment)
                # Compare case-insensitively, but keep the token's original casing
                filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
            except Exception as e:
                print(f"Error tokenizing: {clean_segment}. Error: {e}")
                continue
            segmented_tokenized_directions.append({
                "title": recipe_title,
                "direction_segment": clean_segment,
                "tokens": filtered_tokens
            })

# Collect the records into a DataFrame
segmented_tokenized_df = pd.DataFrame(segmented_tokenized_directions)

# Store the tokens as one space-separated string per row for the CSV export
segmented_tokenized_df['tokens'] = segmented_tokenized_df['tokens'].map(' '.join)

# Write the result to disk
segmented_tokenized_df.to_csv("../files/3_stopwords.csv", index=False)

# Show the first rows for verification
print(segmented_tokenized_df.head())

### Stemming

Stemming is a text preprocessing technique in natural language processing (NLP). Specifically, it is the process of reducing the inflected forms of a word to a single so-called “stem,” or root form. Unlike a “lemma” in linguistics, a stem is obtained by stripping affixes and does not have to be a dictionary word.

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Fetch the NLTK resources used below: the tokenizer, then the stop-word list
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

# Porter stemmer used to reduce each token to its stem
stemmer = PorterStemmer()

# Parse `directions` into Python objects when the column still holds text
if isinstance(df['directions'][0], str):
    import ast
    df['directions'] = df['directions'].apply(ast.literal_eval)

# English stop words, kept in a set for membership tests
stop_words = set(stopwords.words('english'))

# Split every step into segments, tokenize, drop stop words, then stem
segmented_tokenized_directions = []
for recipe_title, steps in zip(df['title'], df['directions']):
    for step in steps:
        for raw_segment in step.split('. '):
            clean_segment = raw_segment.strip()
            if not clean_segment:
                continue  # nothing left once whitespace is trimmed
            try:
                tokens = word_tokenize(clean_segment)
                filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
                stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
            except Exception as e:
                print(f"Error tokenizing: {clean_segment}. Error: {e}")
                continue
            segmented_tokenized_directions.append({
                "title": recipe_title,
                "direction_segment": clean_segment,
                "tokens": stemmed_tokens
            })

# Collect the records into a DataFrame
segmented_tokenized_df = pd.DataFrame(segmented_tokenized_directions)

# Store the tokens as one space-separated string per row for the CSV export
segmented_tokenized_df['tokens'] = segmented_tokenized_df['tokens'].map(' '.join)

# Write the result to disk
segmented_tokenized_df.to_csv("../files/4_stemming.csv", index=False)

# Show the first rows for verification
print(segmented_tokenized_df.head())

### Lemmatization

Lemmatization is a text pre-processing technique used in natural language processing (NLP) models to reduce a word to its dictionary form so that related words can be identified as the same term. For example, a lemmatization algorithm would reduce the word “better” to its root word, or lemma, “good”.

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Fetch the NLTK resources used below: tokenizer, stop words, WordNet and the
# 'omw-1.4' package downloaded alongside it
for resource in ('punkt', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

# Lemmatizer and stemmer applied to every token
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Parse `directions` into Python objects when the column still holds text
if isinstance(df['directions'][0], str):
    import ast
    df['directions'] = df['directions'].apply(ast.literal_eval)

# English stop words, kept in a set for membership tests
stop_words = set(stopwords.words('english'))

# Split every step into segments, tokenize, drop stop words, lemmatize the
# lower-cased tokens and finally stem the lemmas
segmented_tokenized_directions = []
for recipe_title, steps in zip(df['title'], df['directions']):
    for step in steps:
        for raw_segment in step.split('. '):
            clean_segment = raw_segment.strip()
            if not clean_segment:
                continue  # nothing left once whitespace is trimmed
            try:
                tokens = word_tokenize(clean_segment)
                filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
                lemmatized_tokens = [lemmatizer.lemmatize(word.lower()) for word in filtered_tokens]
                stemmed_tokens = [stemmer.stem(word) for word in lemmatized_tokens]
            except Exception as e:
                print(f"Error processing: {clean_segment}. Error: {e}")
                continue
            segmented_tokenized_directions.append({
                "title": recipe_title,
                "direction_segment": clean_segment,
                "tokens_after_lemmatization": lemmatized_tokens,
                "tokens_after_stemming": stemmed_tokens
            })

# Collect the records into a DataFrame
segmented_tokenized_df = pd.DataFrame(segmented_tokenized_directions)

# Store both token columns as space-separated strings for the CSV export
for token_column in ('tokens_after_lemmatization', 'tokens_after_stemming'):
    segmented_tokenized_df[token_column] = segmented_tokenized_df[token_column].map(' '.join)

# Write the result to disk
segmented_tokenized_df.to_csv("../files/5_lemmatization.csv", index=False)

# Show the first rows for verification
print(segmented_tokenized_df.head())

### POS Tagging

Part-of-speech (POS) tagging is the process of labeling words in a text with their corresponding parts of speech in natural language processing (NLP). It helps algorithms understand the grammatical structure and meaning of a text.

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk import pos_tag
import ast

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
# Newer NLTK releases (the ones that also need 'punkt_tab') load the tagger
# from this resource name; without it every pos_tag call fails and is only
# reported by the except branch below.
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('omw-1.4')

# Initialize the WordNetLemmatizer and PorterStemmer
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Verify the directions column
print("Sample directions column:")
print(df['directions'].iloc[0])
print(type(df['directions'].iloc[0]))

# Convert directions to lists if necessary
if isinstance(df['directions'][0], str):
    try:
        df['directions'] = df['directions'].apply(ast.literal_eval)
    except Exception as e:
        # NOTE(review): when parsing fails the column stays text and the loop
        # below iterates characters - confirm that continuing is intended.
        print(f"Error converting directions: {e}")

# Get the list of English stop words
stop_words = set(stopwords.words('english'))

# Process directions: segment, tokenize, drop stop words, lemmatize, stem and
# tag. The two debug prints per segment were removed because they emitted two
# lines for every segment of the dataset.
segmented_tokenized_directions = []
for index, row in df.iterrows():
    for step in row['directions']:
        segments = step.split('. ')
        for segment in segments:
            clean_segment = segment.strip()
            if clean_segment:
                try:
                    tokens = word_tokenize(clean_segment)

                    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
                    lemmatized_tokens = [lemmatizer.lemmatize(word.lower()) for word in filtered_tokens]
                    stemmed_tokens = [stemmer.stem(word) for word in lemmatized_tokens]
                    # Tags are computed on the lemmatized tokens
                    pos_tags = pos_tag(lemmatized_tokens)

                    segmented_tokenized_directions.append({
                        "title": row["title"],
                        "direction_segment": clean_segment,
                        "tokens_after_lemmatization": lemmatized_tokens,
                        "tokens_after_stemming": stemmed_tokens,
                        "POS_tags": pos_tags
                    })

                except Exception as e:
                    print(f"Error processing row {index} segment: '{clean_segment}'. Error: {e}")

# Check if any data was generated
if not segmented_tokenized_directions:
    raise ValueError("No valid data was generated. Please check your input data or processing logic.")

# Convert to DataFrame and export
segmented_tokenized_df = pd.DataFrame(segmented_tokenized_directions)
segmented_tokenized_df.to_csv("../files/6_pos_tagging.csv", index=False)
print(segmented_tokenized_df.head())

### Feature Extraction

In natural language processing (NLP), feature extraction is a fundamental task that involves converting raw text data into a format that can be easily processed by machine learning algorithms. There are various techniques available for feature extraction in NLP, each with its own strengths and weaknesses.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Join all "directions" segments of each recipe into one text
df['all_directions'] = df['directions'].map(' '.join)

# Extract TF-IDF features, limited to 10 terms
vectorizer = TfidfVectorizer(stop_words='english', max_features=10)
tfidf_matrix = vectorizer.fit_transform(df['all_directions'])

# Turn the TF-IDF matrix into a labelled DataFrame for easier reading
feature_names = vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
tfidf_df['title'] = df['title']

# Print the results
print(tfidf_df)

### NER

Named entity recognition (NER) is a natural language processing (NLP) method that extracts information from text. NER involves detecting and categorizing important information in text known as named entities.

In [None]:
# Everything this cell uses is imported here: `spacy`, `Language`, `Span` and
# `ast` were never imported anywhere in the notebook.
import ast

import nltk
import spacy
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from spacy.language import Language
from spacy.tokens import Span
from spacy.util import filter_spans

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize NLP tools
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Load SpaCy model
nlp = spacy.load("en_core_web_sm")

# Custom NER component
@Language.component("custom_ner")
def custom_ner(doc):
    """Label every token found in the configured ingredient vocabulary as INGREDIENT.

    The vocabulary is read from the ``cfg`` attribute attached to the pipe
    after it is added to ``nlp``. Entities already on ``doc`` are kept; where
    spans overlap, ``filter_spans`` keeps the longest one.
    """
    ingredient_terms = nlp.get_pipe("custom_ner").cfg["ingredient_list"]
    spans = [
        Span(doc, token.i, token.i + 1, label="INGREDIENT")
        for token in doc
        if token.text.lower() in ingredient_terms
    ]
    # doc.ents rejects overlapping spans, so a token that lies inside an
    # existing entity must not simply be appended.
    doc.ents = filter_spans(list(doc.ents) + spans)
    return doc

# Convert `ingredients` and `directions` columns to lists
for col in ['ingredients', 'directions']:
    if isinstance(df[col][0], str):
        df[col] = df[col].apply(ast.literal_eval)

# Dynamically generate the ingredient vocabulary: every alphabetic token that
# appears in any ingredient line
ingredient_terms = set()
for ingredients in df['ingredients']:
    for ingredient in ingredients:
        tokens = word_tokenize(ingredient.lower())  # Tokenize each ingredient
        ingredient_terms.update(word for word in tokens if word.isalpha())

ingredient_list = sorted(ingredient_terms)  # Stable, list-typed view of the vocabulary

# Print the size and a sample instead of the complete vocabulary
print(f"Generated Ingredient List: {len(ingredient_list)} terms, e.g. {ingredient_list[:20]}")

# Add custom NER to SpaCy pipeline
nlp.add_pipe("custom_ner", last=True)
# Store the vocabulary as a set so the per-token lookup is a hash lookup
nlp.get_pipe("custom_ner").cfg = {"ingredient_list": ingredient_terms}

# NLP Processing and NER Extraction: all steps of a recipe are joined into one
# text, and the texts are streamed through nlp.pipe in batches
recipe_texts = (' '.join(steps) for steps in df['directions'])
ner_results = []
for doc in nlp.pipe(recipe_texts):
    ner_list = [f"{ent.text} ({ent.label_})" for ent in doc.ents]
    ner_results.append(', '.join(ner_list))

# Add the NER results to the dataset
df['NLP_NER'] = ner_results

# Export to CSV
df[['title', 'ingredients', 'NER', 'NLP_NER']].to_csv("../files/final_nlp_ner_results.csv", index=False)

# Print the results
print(df[['title', 'NLP_NER']])