In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
from transformers import pipeline
import json
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


def parse_ingredient_list(ingredient_string):
    """Convert a string representation of a list into an actual Python list."""
    return json.loads(ingredient_string)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
print("GPU Available:", torch.cuda.is_available())
print("CUDA Device:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "None")

Using device: cuda
GPU Available: True
CUDA Device: NVIDIA GeForce RTX 4060


In [3]:
resep_df = pd.read_csv('../dataset/dataset-resep/recipes_sample.csv')
resep_df = resep_df.drop(columns=['link', 'source', 'site'])
resep_df['ingredients'] = resep_df['ingredients'].apply(parse_ingredient_list)
resep_df['directions'] = resep_df['directions'].apply(parse_ingredient_list)
resep_df['NER'] = resep_df['NER'].apply(parse_ingredient_list)
resep_df = resep_df.drop(columns=['title', 'directions'])
# Keep only rows where len(ingredients) == len(NER)
resep_df = resep_df[resep_df['ingredients'].apply(len) == resep_df['NER'].apply(len)].reset_index(drop=True)


In [4]:
def sort_ner_by_similarity(ingredients, ner_list):
    """
    Sorts the NER list to align with the ingredients list using TF-IDF and cosine similarity.

    Parameters:
    - ingredients: List of ingredient descriptions.
    - ner_list: List of Named Entity Recognitions (NER).

    Returns:
    - Sorted NER list matching the order of ingredients.
    """
    if len(ingredients) != len(ner_list):
        return None  # Return None if lengths do not match

    # Initialize TF-IDF vectorizer
    vectorizer = TfidfVectorizer()

    # Compute TF-IDF for both lists
    tfidf_matrix = vectorizer.fit_transform(ingredients + ner_list)

    # Split the TF-IDF matrix
    tfidf_ingredients = tfidf_matrix[:len(ingredients)]
    tfidf_ner = tfidf_matrix[len(ingredients):]

    # Compute cosine similarity between each ingredient and all NER labels
    similarity_matrix = cosine_similarity(tfidf_ingredients, tfidf_ner)

    # Find the best matching NER for each ingredient
    sorted_ner = []
    used_indices = set()

    for i in range(len(ingredients)):
        # Find the best match that hasn't been used yet
        best_match_idx = np.argmax(similarity_matrix[i])
        while best_match_idx in used_indices:
            similarity_matrix[i][best_match_idx] = -1  # Ignore already used matches
            best_match_idx = np.argmax(similarity_matrix[i])
        
        sorted_ner.append(ner_list[best_match_idx])
        used_indices.add(best_match_idx)

    return sorted_ner

# Apply the function to each row of the dataframe
resep_df['NER'] = resep_df.apply(lambda row: sort_ner_by_similarity(row['ingredients'], row['NER']), axis=1)


In [5]:
resep_df

Unnamed: 0,ingredients,NER
0,"[1 1/2 pound flank steak, 1/2 c. finely minced...","[flank steak, green onions, red wine, soy sauc..."
1,"[3 to 4 carrots, 1 1/2 Tbsp. butter, 1/3 c. br...","[carrots, butter, brown sugar, lemon rind]"
2,"[4.5 Cups Flour, 1.5 Tsp Salt, Pinch Baking Po...","[flour, salt, baking powder, sugar, crisco, eg..."
3,"[2 c. crushed small thin pretzels (sticks), 3/...","[thin pretzels, margarine]"
4,"[3/4 cup sugar, 1/2 cup fresh orange juice, 1/...","[sugar, orange juice, lemon juice]"
...,...,...
6975,[1 (6 ounce) package STOVE TOP Stuffing Mix fo...,"[stove, turkey, carrots, mayonnaise, leftover ..."
6976,"[3 cups rolled oats, 3/4 cup fresh orange juic...","[rolled oats, fresh orange juice, milk, sugar,..."
6977,"[4 summer squash, 2 cups orange marmalade, 1 c...","[summer, orange marmalade, shredded coconut, m..."
6978,[2 medium butternut squash baked and cut into ...,"[butternut, russet potatoes, fingerling potato..."


In [7]:
def explode_ingredients(df):
    """
    Transforms the dataset so that each row contains only one ingredient and its corresponding NER tag.
    """
    # Create a new dataframe by exploding the ingredients and NER columns
    df_exploded = df.explode(["ingredients", "NER"], ignore_index=True)
    
    return df_exploded

# Apply the function to transform the dataset
resep_df = explode_ingredients(resep_df)

# Display the first few rows of the transformed dataframe
resep_df


Unnamed: 0,ingredients,NER
0,1 1/2 pound flank steak,flank steak
1,1/2 c. finely minced green onions (scallions),green onions
2,1/2 c. dry red wine,red wine
3,1/4 c. soy sauce,soy sauce
4,3 tbsp. salad oil,salad oil
...,...,...
55013,1/2 c. chopped celery,celery
55014,1 tsp. salt,salt
55015,1 tsp. black pepper,black pepper
55016,1/2 c. cooking oil,cooking oil


In [78]:
resep_df.to_csv('resep.csv')

In [8]:
tokenizer = AutoTokenizer.from_pretrained("dslim/distilbert-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/distilbert-NER")

model.to(device)

nlp = pipeline("ner", model=model, tokenizer=tokenizer)

Device set to use cuda:0


In [77]:
resep_df

Unnamed: 0,ingredients,NER
0,1 1/2 pound flank steak,flank steak
1,1/2 c. finely minced green onions (scallions),green onions
2,1/2 c. dry red wine,red wine
3,1/4 c. soy sauce,soy sauce
4,3 tbsp. salad oil,salad oil
...,...,...
55013,1/2 c. chopped celery,celery
55014,1 tsp. salt,salt
55015,1 tsp. black pepper,black pepper
55016,1/2 c. cooking oil,cooking oil


In [28]:
import torch


def tokenize_ner(examples):
    """
    Tokenizes the ingredients text and labels NER tokens.
    
    Args:
    examples (dict): A batch of examples containing 'ingredients' and 'NER'.

    Returns:
    dict: A dictionary with input_ids, attention_mask, and labels.
    """
    texts = examples["ingredients"]
    ner_entities = examples["NER"]
    
    # Tokenize the batch of texts
    encodings = tokenizer(texts, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
    
    labels_batch = []
    
    for text, ner_entity, input_ids in zip(texts, ner_entities, encodings["input_ids"]):
        # Initialize labels with 0 (non-NER)
        labels = [0] * len(input_ids)
        
        # Tokenize the NER entity separately
        ner_tokens = tokenizer.tokenize(ner_entity)
        
        # Tokenize the full text
        input_tokens = tokenizer.tokenize(text)
        
        # Find and mark NER tokens
        for i in range(len(input_tokens) - len(ner_tokens) + 1):
            if input_tokens[i : i + len(ner_tokens)] == ner_tokens:
                token_start = i + 1  # Adjust for [CLS] token at the start
                for j in range(len(ner_tokens)):
                    labels[token_start + j] = 1  # Mark as NER
                break
        
        labels_batch.append(labels)
    
    return {
        "input_ids": encodings["input_ids"].tolist(),
        "attention_mask": encodings["attention_mask"].tolist(),
        "labels": labels_batch
    }

# Convert DataFrame to Hugging Face Dataset
dataset = Dataset.from_pandas(resep_df)

# Process dataset and remove unused columns
dataset = dataset.map(tokenize_ner, batched=True, remove_columns=["ingredients", "NER"])

# Split dataset into train and test
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]

Map: 100%|██████████| 55018/55018 [00:06<00:00, 8468.55 examples/s]


In [29]:
print((train_dataset['input_ids'][0]))
print((train_dataset['attention_mask'][0]))
print((train_dataset['labels'][0]))

[101, 124, 6471, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [30]:
# Training arguments
training_args = TrainingArguments(
    fp16=True,
    output_dir="./ner_finetuned",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
)

data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train the model
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.0044,0.003663
2,0.0029,0.003521
3,0.0017,0.003604


TrainOutput(global_step=4128, training_loss=0.02516202608036787, metrics={'train_runtime': 419.625, 'train_samples_per_second': 314.667, 'train_steps_per_second': 9.837, 'total_flos': 4313470039248384.0, 'train_loss': 0.02516202608036787, 'epoch': 3.0})

In [31]:
# Save the fine-tuned model
model.save_pretrained("./ner_finetuned")
tokenizer.save_pretrained("./ner_finetuned")

('./ner_finetuned\\tokenizer_config.json',
 './ner_finetuned\\special_tokens_map.json',
 './ner_finetuned\\vocab.txt',
 './ner_finetuned\\added_tokens.json',
 './ner_finetuned\\tokenizer.json')

In [32]:
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

# Load the fine-tuned model and tokenizer
model_path = "./ner_finetuned"  # Path where the trained model is saved
model = AutoModelForTokenClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Create an NER pipeline
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")


Device set to use cuda:0


In [50]:
# Example input string
ingredient_text = "green meat"

# Get NER predictions
ner_results = ner_pipeline(ingredient_text)

# Print results
for entity in ner_results:
    print(f"Entity: {entity['word']}, Label: {entity['entity_group']}, Score: {entity['score']:.4f}")


Entity: green, Label: PER, Score: 0.9989
Entity: meat, Label: PER, Score: 0.9989
