## NER Analysis

In [16]:
import pandas as pd
from nltk.tokenize import word_tokenize
from spacy.language import Language
from spacy.tokens import Span
import nltk
nltk.download('punkt')  # Only needed the first time
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/bleronaidrizi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/bleronaidrizi/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bleronaidrizi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/bleronaidrizi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
df = pd.read_csv("../files/recipes_data1.csv")

### Segmentation

Segmentation in NLP involves breaking down a larger piece of text into smaller, meaningful units such as sentences or paragraphs.

In [5]:
# Check the type of the `directions` column
print(type(df['directions'][0]))

# If `directions` is already a list, skip the parsing step
# Flatten and segment the directions into smaller pieces
segmented_directions = []
for index, row in df.iterrows():
    for step in row['directions']:
        # Split steps into smaller segments using `.split('. ')`
        segments = step.split('. ')
        for segment in segments:
            # Avoid adding empty strings
            if segment.strip():
                segmented_directions.append({
                    "title": row["title"],
                    "direction_segment": segment.strip()  # Clean whitespace
                })

# Convert the segmented directions into a new DataFrame
segmented_df = pd.DataFrame(segmented_directions)

# Export the segmented DataFrame to a CSV file
segmented_df.to_csv("../files/1_segmentation.csv", index=False)

# Print the first few rows for verification
print(segmented_df.head())

<class 'str'>
                 title direction_segment
0  No-Bake Nut Cookies                 [
1  No-Bake Nut Cookies                 "
2  No-Bake Nut Cookies                 I
3  No-Bake Nut Cookies                 n
4  No-Bake Nut Cookies                 a


### Tokenization

Tokenization is used in MLP to split paragraphs and sentences into smaller units that can be more easily assigned meaning.

In [9]:
# Check if `directions` is a list or a string
if isinstance(df['directions'][0], str):
    import ast
    df['directions'] = df['directions'].apply(ast.literal_eval)

# Segment and tokenize the directions
segmented_tokenized_directions = []
for index, row in df.iterrows():
    for step in row['directions']:
        # Segment the step into smaller parts
        segments = step.split('. ')
        for segment in segments:
            # Clean whitespace and ensure the segment is not empty
            clean_segment = segment.strip()
            if clean_segment:
                try:
                    # Tokenize the segment into words
                    tokens = word_tokenize(clean_segment)
                    segmented_tokenized_directions.append({
                        "title": row["title"],
                        "direction_segment": clean_segment,
                        "tokens": tokens
                    })
                except Exception as e:
                    print(f"Error tokenizing: {clean_segment}. Error: {e}")

# Convert the results into a DataFrame
segmented_tokenized_df = pd.DataFrame(segmented_tokenized_directions)

# Convert the tokens column to a string for proper CSV export
segmented_tokenized_df['tokens'] = segmented_tokenized_df['tokens'].apply(lambda x: ' '.join(x))

# Export the segmented and tokenized DataFrame to a CSV file
segmented_tokenized_df.to_csv("../files/2_tokenization.csv", index=False)

# Print the first few rows for verification
print(segmented_tokenized_df.head())

                 title                                  direction_segment  \
0  No-Bake Nut Cookies  In a heavy 2-quart saucepan, mix brown sugar, ...   
1  No-Bake Nut Cookies  Stir over medium heat until mixture bubbles al...   
2  No-Bake Nut Cookies                       Boil and stir 5 minutes more   
3  No-Bake Nut Cookies                                     Take off heat.   
4  No-Bake Nut Cookies              Stir in vanilla and cereal; mix well.   

                                              tokens  
0  In a heavy 2-quart saucepan , mix brown sugar ...  
1  Stir over medium heat until mixture bubbles al...  
2                       Boil and stir 5 minutes more  
3                                    Take off heat .  
4            Stir in vanilla and cereal ; mix well .  


### Stop words

Stop words are a set of commonly used words in a language. Examples of stop words in English are “a,” “the,” “is,” “are,” etc. Stop words are commonly used in Text Mining and Natural Language Processing (NLP) to eliminate words that are so widely used that they carry very little useful information.

In [10]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Download necessary NLTK resources
nltk.download('punkt')       # Tokenizer
nltk.download('stopwords')   # Stop words list

# Ensure `directions` is a list
if isinstance(df['directions'][0], str):
    import ast
    df['directions'] = df['directions'].apply(ast.literal_eval)

# Get the list of English stop words
stop_words = set(stopwords.words('english'))

# Segment, tokenize, and remove stop words from directions
segmented_tokenized_directions = []
for index, row in df.iterrows():
    for step in row['directions']:
        # Segment the step into smaller parts
        segments = step.split('. ')
        for segment in segments:
            # Clean whitespace and ensure the segment is not empty
            clean_segment = segment.strip()
            if clean_segment:
                try:
                    # Tokenize the segment into words
                    tokens = word_tokenize(clean_segment)
                    
                    # Remove stop words from the token list
                    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
                    
                    segmented_tokenized_directions.append({
                        "title": row["title"],
                        "direction_segment": clean_segment,
                        "tokens": filtered_tokens
                    })
                except Exception as e:
                    print(f"Error tokenizing: {clean_segment}. Error: {e}")

# Convert the results into a DataFrame
segmented_tokenized_df = pd.DataFrame(segmented_tokenized_directions)

# Convert the tokens column to a string for proper CSV export
segmented_tokenized_df['tokens'] = segmented_tokenized_df['tokens'].apply(lambda x: ' '.join(x))

# Export the segmented and tokenized DataFrame to a CSV file
segmented_tokenized_df.to_csv("../files/3_stopwords.csv", index=False)

# Print the first few rows for verification
print(segmented_tokenized_df.head())

                 title                                  direction_segment  \
0  No-Bake Nut Cookies  In a heavy 2-quart saucepan, mix brown sugar, ...   
1  No-Bake Nut Cookies  Stir over medium heat until mixture bubbles al...   
2  No-Bake Nut Cookies                       Boil and stir 5 minutes more   
3  No-Bake Nut Cookies                                     Take off heat.   
4  No-Bake Nut Cookies              Stir in vanilla and cereal; mix well.   

                                              tokens  
0  heavy 2-quart saucepan , mix brown sugar , nut...  
1             Stir medium heat mixture bubbles top .  
2                                Boil stir 5 minutes  
3                                        Take heat .  
4                   Stir vanilla cereal ; mix well .  


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/bleronaidrizi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bleronaidrizi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Stemming

Stemming is a text preprocessing technique in natural language processing (NLP). Specifically, it is the process of reducing inflected form of a word to one so-called “stem,” or root form, also known as a “lemma” in linguistics.

In [11]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Download necessary NLTK resources
nltk.download('punkt')       # Tokenizer
nltk.download('stopwords')   # Stop words list

# Initialize the PorterStemmer
stemmer = PorterStemmer()

# Ensure `directions` is a list
if isinstance(df['directions'][0], str):
    import ast
    df['directions'] = df['directions'].apply(ast.literal_eval)

# Get the list of English stop words
stop_words = set(stopwords.words('english'))

# Segment, tokenize, remove stop words, and apply stemming
segmented_tokenized_directions = []
for index, row in df.iterrows():
    for step in row['directions']:
        # Segment the step into smaller parts
        segments = step.split('. ')
        for segment in segments:
            # Clean whitespace and ensure the segment is not empty
            clean_segment = segment.strip()
            if clean_segment:
                try:
                    # Tokenize the segment into words
                    tokens = word_tokenize(clean_segment)
                    
                    # Remove stop words from the token list
                    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
                    
                    # Apply stemming to the filtered tokens
                    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
                    
                    segmented_tokenized_directions.append({
                        "title": row["title"],
                        "direction_segment": clean_segment,
                        "tokens": stemmed_tokens
                    })
                except Exception as e:
                    print(f"Error tokenizing: {clean_segment}. Error: {e}")

# Convert the results into a DataFrame
segmented_tokenized_df = pd.DataFrame(segmented_tokenized_directions)

# Convert the tokens column to a string for proper CSV export
segmented_tokenized_df['tokens'] = segmented_tokenized_df['tokens'].apply(lambda x: ' '.join(x))

# Export the segmented and tokenized DataFrame to a CSV file
segmented_tokenized_df.to_csv("../files/4_stemming.csv", index=False)

# Print the first few rows for verification
print(segmented_tokenized_df.head())

                 title                                  direction_segment  \
0  No-Bake Nut Cookies  In a heavy 2-quart saucepan, mix brown sugar, ...   
1  No-Bake Nut Cookies  Stir over medium heat until mixture bubbles al...   
2  No-Bake Nut Cookies                       Boil and stir 5 minutes more   
3  No-Bake Nut Cookies                                     Take off heat.   
4  No-Bake Nut Cookies              Stir in vanilla and cereal; mix well.   

                                              tokens  
0  heavi 2-quart saucepan , mix brown sugar , nut...  
1                stir medium heat mixtur bubbl top .  
2                                  boil stir 5 minut  
3                                        take heat .  
4                   stir vanilla cereal ; mix well .  


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/bleronaidrizi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bleronaidrizi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Lemmatization

Lemmatization is a text pre-processing technique used in natural language processing (NLP) models to break a word down to its root meaning to identify similarities. For example, a lemmatization algorithm would reduce the word better to its root word, or lemme, good.

In [12]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Download necessary NLTK resources
nltk.download('punkt')       # Tokenizer
nltk.download('stopwords')   # Stop words list
nltk.download('wordnet')     # WordNet for lemmatization
nltk.download('omw-1.4')     # WordNet lemmatizer's dependency

# Initialize the WordNetLemmatizer and PorterStemmer
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Ensure `directions` is a list
if isinstance(df['directions'][0], str):
    import ast
    df['directions'] = df['directions'].apply(ast.literal_eval)

# Get the list of English stop words
stop_words = set(stopwords.words('english'))

# Segment, tokenize, remove stop words, and apply both stemming and lemmatization
segmented_tokenized_directions = []
for index, row in df.iterrows():
    for step in row['directions']:
        # Segment the step into smaller parts
        segments = step.split('. ')
        for segment in segments:
            # Clean whitespace and ensure the segment is not empty
            clean_segment = segment.strip()
            if clean_segment:
                try:
                    # Tokenize the segment into words
                    tokens = word_tokenize(clean_segment)
                    
                    # Remove stop words from the token list
                    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
                    
                    # Apply lemmatization first
                    lemmatized_tokens = [lemmatizer.lemmatize(word.lower()) for word in filtered_tokens]
                    
                    # Apply stemming to lemmatized tokens
                    stemmed_tokens = [stemmer.stem(word) for word in lemmatized_tokens]
                    
                    segmented_tokenized_directions.append({
                        "title": row["title"],
                        "direction_segment": clean_segment,
                        "tokens_after_lemmatization": lemmatized_tokens,
                        "tokens_after_stemming": stemmed_tokens
                    })
                except Exception as e:
                    print(f"Error processing: {clean_segment}. Error: {e}")

# Convert the results into a DataFrame
segmented_tokenized_df = pd.DataFrame(segmented_tokenized_directions)

# Convert the token columns to strings for proper CSV export
segmented_tokenized_df['tokens_after_lemmatization'] = segmented_tokenized_df['tokens_after_lemmatization'].apply(lambda x: ' '.join(x))
segmented_tokenized_df['tokens_after_stemming'] = segmented_tokenized_df['tokens_after_stemming'].apply(lambda x: ' '.join(x))

# Export the segmented, tokenized, and processed DataFrame to a CSV file
segmented_tokenized_df.to_csv("../files/5_lemmatization.csv", index=False)

# Print the first few rows for verification
print(segmented_tokenized_df.head())

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/bleronaidrizi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bleronaidrizi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/bleronaidrizi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/bleronaidrizi/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


                 title                                  direction_segment  \
0  No-Bake Nut Cookies  In a heavy 2-quart saucepan, mix brown sugar, ...   
1  No-Bake Nut Cookies  Stir over medium heat until mixture bubbles al...   
2  No-Bake Nut Cookies                       Boil and stir 5 minutes more   
3  No-Bake Nut Cookies                                     Take off heat.   
4  No-Bake Nut Cookies              Stir in vanilla and cereal; mix well.   

                          tokens_after_lemmatization  \
0  heavy 2-quart saucepan , mix brown sugar , nut...   
1              stir medium heat mixture bubble top .   
2                                 boil stir 5 minute   
3                                        take heat .   
4                   stir vanilla cereal ; mix well .   

                               tokens_after_stemming  
0  heavi 2-quart saucepan , mix brown sugar , nut...  
1                stir medium heat mixtur bubbl top .  
2                                  

### POS Tagging

Part-of-speech (POS) tagging is the process of labeling words in a text with their corresponding parts of speech in natural language processing (NLP). It helps algorithms understand the grammatical structure and meaning of a text.

In [13]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk import pos_tag
import ast

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')

# Initialize the WordNetLemmatizer and PorterStemmer
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Verify the directions column
print("Sample directions column:")
print(df['directions'].iloc[0])
print(type(df['directions'].iloc[0]))

# Convert directions to lists if necessary
if isinstance(df['directions'][0], str):
    try:
        df['directions'] = df['directions'].apply(ast.literal_eval)
    except Exception as e:
        print(f"Error converting directions: {e}")

# Get the list of English stop words
stop_words = set(stopwords.words('english'))

# Process directions
segmented_tokenized_directions = []
for index, row in df.iterrows():
    for step in row['directions']:
        segments = step.split('. ')
        for segment in segments:
            clean_segment = segment.strip()
            print(f"Processing segment: '{clean_segment}'")
            if clean_segment:
                try:
                    tokens = word_tokenize(clean_segment)
                    print(f"Tokens: {tokens}")

                    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
                    lemmatized_tokens = [lemmatizer.lemmatize(word.lower()) for word in filtered_tokens]
                    stemmed_tokens = [stemmer.stem(word) for word in lemmatized_tokens]
                    pos_tags = pos_tag(lemmatized_tokens)

                    segmented_tokenized_directions.append({
                        "title": row["title"],
                        "direction_segment": clean_segment,
                        "tokens_after_lemmatization": lemmatized_tokens,
                        "tokens_after_stemming": stemmed_tokens,
                        "POS_tags": pos_tags
                    })

                except Exception as e:
                    print(f"Error processing row {index} segment: '{clean_segment}'. Error: {e}")

# Check if any data was generated
if not segmented_tokenized_directions:
    raise ValueError("No valid data was generated. Please check your input data or processing logic.")

# Convert to DataFrame and export
segmented_tokenized_df = pd.DataFrame(segmented_tokenized_directions)
segmented_tokenized_df.to_csv("../files/6_pos_tagging.csv", index=False)
print(segmented_tokenized_df.head())

Sample directions column:
['In a heavy 2-quart saucepan, mix brown sugar, nuts, evaporated milk and butter or margarine.', 'Stir over medium heat until mixture bubbles all over top.', 'Boil and stir 5 minutes more. Take off heat.', 'Stir in vanilla and cereal; mix well.', 'Using 2 teaspoons, drop and shape into 30 clusters on wax paper.', 'Let stand until firm, about 30 minutes.']
<class 'list'>
Processing segment: 'In a heavy 2-quart saucepan, mix brown sugar, nuts, evaporated milk and butter or margarine.'
Tokens: ['In', 'a', 'heavy', '2-quart', 'saucepan', ',', 'mix', 'brown', 'sugar', ',', 'nuts', ',', 'evaporated', 'milk', 'and', 'butter', 'or', 'margarine', '.']
Processing segment: 'Stir over medium heat until mixture bubbles all over top.'
Tokens: ['Stir', 'over', 'medium', 'heat', 'until', 'mixture', 'bubbles', 'all', 'over', 'top', '.']
Processing segment: 'Boil and stir 5 minutes more'
Tokens: ['Boil', 'and', 'stir', '5', 'minutes', 'more']
Processing segment: 'Take off heat.

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/bleronaidrizi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bleronaidrizi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/bleronaidrizi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/bleronaidrizi/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/bleronaidrizi/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


### Feature Extraction

In natural language processing (NLP), feature extraction is a fundamental task that involves converting raw text data into a format that can be easily processed by machine learning algorithms. There are various techniques available for feature extraction in NLP, each with its own strengths and weaknesses.

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Kombino të gjitha segmentet e "directions" për çdo recetë
df['all_directions'] = df['directions'].apply(lambda x: ' '.join(x))

# Përdor TF-IDF për ekstraktim të veçorive
vectorizer = TfidfVectorizer(stop_words='english', max_features=10)  # Top 10 fjalë
tfidf_matrix = vectorizer.fit_transform(df['all_directions'])

# Konverto TF-IDF në DataFrame për lexim të lehtë
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
tfidf_df['title'] = df['title']

# Printo rezultatet
print(tfidf_df)

    chicken  chocolate     cream      dish  ingredients   minutes       mix  \
0  0.000000   0.000000  0.000000  0.000000     0.000000  0.418244  0.503853   
1  0.707107   0.000000  0.353553  0.353553     0.000000  0.000000  0.353553   
2  0.000000   0.000000  0.000000  0.000000     0.707107  0.000000  0.000000   
3  0.691269   0.000000  0.230423  0.230423     0.000000  0.191272  0.000000   
4  0.000000   0.885664  0.000000  0.000000     0.357274  0.296570  0.000000   

       pour      size      stir                  title  
0  0.000000  0.000000  0.755780    No-Bake Nut Cookies  
1  0.353553  0.000000  0.000000  Jewell Ball'S Chicken  
2  0.000000  0.000000  0.707107            Creamy Corn  
3  0.230423  0.571207  0.000000          Chicken Funny  
4  0.000000  0.000000  0.000000   Reeses Cups(Candy)    


### NER

Named entity recognition (NER) is a natural language processing (NLP) method that extracts information from text. NER involves detecting and categorizing important information in text known as named entities.

In [None]:
# Initialize NLP tools
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Load SpaCy model
nlp = spacy.load("en_core_web_sm")

# Custom NER component
@Language.component("custom_ner")
def custom_ner(doc):
    ingredient_list = nlp.get_pipe("custom_ner").cfg["ingredient_list"]
    spans = []
    for token in doc:
        if token.text.lower() in ingredient_list:
            spans.append(Span(doc, token.i, token.i + 1, label="INGREDIENT"))
    doc.ents = list(doc.ents) + spans  # Add custom entities to SpaCy's entities
    return doc

# Convert `ingredients` and `directions` columns to lists
for col in ['ingredients', 'directions']:
    if isinstance(df[col][0], str):
        df[col] = df[col].apply(ast.literal_eval)

# Dynamically generate the ingredient list
ingredient_list = set()
for ingredients in df['ingredients']:
    for ingredient in ingredients:
        tokens = word_tokenize(ingredient.lower())  # Tokenize each ingredient
        filtered_tokens = [word for word in tokens if word.isalpha()]  # Keep only alphabetic words
        ingredient_list.update(filtered_tokens)  # Add to the ingredient list

ingredient_list = list(ingredient_list)  # Convert to a list

print("Generated Ingredient List:", ingredient_list)

# Add custom NER to SpaCy pipeline
nlp.add_pipe("custom_ner", last=True)
nlp.get_pipe("custom_ner").cfg = {"ingredient_list": ingredient_list}  # Add ingredient list to the pipe config

# NLP Processing and NER Extraction
ner_results = []
for index, row in df.iterrows():
    combined_text = ' '.join(row['directions'])  # Combine all steps into one string
    doc = nlp(combined_text)  # Process text using SpaCy
    
    # Extract entities
    ner_list = []
    for ent in doc.ents:
        ner_list.append(f"{ent.text} ({ent.label_})")
    
    ner_results.append(', '.join(ner_list))

# Add the NER results to the dataset
df['NLP_NER'] = ner_results

# Export to CSV
df[['title', 'ingredients', 'NER', 'NLP_NER']].to_csv("../files/final_nlp_ner_results.csv", index=False)

# Print the results
print(df[['title', 'NLP_NER']])