<a href="https://colab.research.google.com/github/DefinitelyKev/nlp_bert_ml_recipe_models/blob/main/Segementation_model_new_version.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Import Libraries**
---



In [None]:
import os
import re
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.nn.functional import softmax
from torch.optim import AdamW
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from tqdm.auto import tqdm
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import get_linear_schedule_with_warmup

warnings.filterwarnings("ignore", message="The secret `HF_TOKEN` does not exist in your Colab secrets.")

In [None]:
# Colab-only: mount Google Drive at /content/drive. Every dataset and model
# path used further down in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Dataset Preprocessing**

---



In [None]:
# Labelled line-level exports; they overlap, so exact duplicate rows are dropped below.
raw_csv_paths = [
    '/content/drive/MyDrive/raw_data/name_ingredient_step_tag.csv',
    '/content/drive/MyDrive/raw_data/nist_verified.csv',
    '/content/drive/MyDrive/raw_data/nist_verified1.csv',
]
df_1, df_2, df_3 = (pd.read_csv(csv_path) for csv_path in raw_csv_paths)

# Stack the three frames, remove rows that are identical in every column,
# and renumber the index from zero.
combined_df = pd.concat([df_1, df_2, df_3], ignore_index=True)
combined_df_no_duplicates = combined_df.drop_duplicates()
df = combined_df_no_duplicates.reset_index(drop=True)

print(f"Total number of rows after removing duplicates: {len(df)}")

Total number of rows after removing duplicates: 28573


In [None]:
# Normalise a single line of recipe text (lower-case, split glued digits,
# ASCII-only, single-spaced).
def clean_text(text):
    """Normalise one line of text before it is given to the tokenizer.

    Steps, in order:
      1. lower-case the text;
      2. insert a space after a run of digits that is glued to the following
         character ("2cups" -> "2 cups"); digits followed by another digit,
         whitespace, "/", "-", "," or "." are left alone, so "1/2", "9-11",
         "1,000" and "1.5" survive unchanged;
      3. delete every non-ASCII character (e.g. the degree sign);
      4. collapse whitespace runs into one space and strip both ends.

    Args:
        text: The raw line. ``None`` and float NaN (what pandas uses for a
            missing cell) are treated as an empty line; any other non-string
            value is converted with ``str()``.

    Returns:
        The cleaned string (possibly empty).
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    text = re.sub(r"(\d+)(?=[^\d\s/\-,.])", r"\1 ", text)
    # text = re.sub(r"(?<!\w)-", "", text)
    text = re.sub(r"[^\x00-\x7F]+", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [None]:
def balance_dataset(df, majority_tags=('ingredient', 'step', 'other'), cap_ratio=1.50, random_state=42):
    """Down-sample the dominant tags so they do not swamp the rarer ones.

    Each tag listed in ``majority_tags`` is randomly sampled down to at most
    ``cap_ratio`` times the size of the largest *other* tag. Rows of every
    other tag are kept untouched. The new tag counts and percentages are
    printed, and the result is shuffled.

    Args:
        df: DataFrame with a ``'tag'`` column.
        majority_tags: Tags that are candidates for down-sampling. Tags that
            do not occur in ``df`` are ignored.
        cap_ratio: Multiplier applied to the largest non-majority tag count to
            obtain the per-tag cap.
        random_state: Seed used for both the sampling and the final shuffle.

    Returns:
        A new shuffled DataFrame with a fresh 0..n-1 index.
    """
    majority_tags = list(majority_tags)
    label_counts = df['tag'].value_counts()
    # errors='ignore': a majority tag missing from the data must not raise KeyError.
    minority_counts = label_counts.drop(majority_tags, errors='ignore')

    # With no minority tag there is no reference size, so nothing is capped.
    cap = None if minority_counts.empty else int(minority_counts.max() * cap_ratio)

    parts = []
    for tag in majority_tags:
        tag_rows = df[df['tag'] == tag]
        if tag_rows.empty:
            continue
        n_keep = len(tag_rows) if cap is None else min(len(tag_rows), cap)
        parts.append(tag_rows.sample(n=n_keep, random_state=random_state))
    parts.append(df[~df['tag'].isin(majority_tags)])
    balanced_df = pd.concat(parts)

    # Get new label counts
    new_label_counts = balanced_df['tag'].value_counts()

    print("New label counts:")
    print(new_label_counts)

    new_label_percentages = (new_label_counts / len(balanced_df)) * 100
    print("\nNew percentage distribution:")
    for label, percentage in new_label_percentages.items():
        print(f"{label}: {percentage:.2f}%")

    return balanced_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

In [None]:
def plot_tag_distribution(df, title):
    """Show a bar chart with the number of rows per value of the 'tag' column.

    Args:
        df: DataFrame containing a 'tag' column.
        title: Text placed above the chart.
    """
    fig, ax = plt.subplots(figsize=(12, 6))
    df['tag'].value_counts().plot(kind='bar', ax=ax)

    ax.set_title(title)
    ax.set_xlabel('Tag')
    ax.set_ylabel('Count')
    # Tilt the category labels so longer tag names stay readable.
    plt.setp(ax.get_xticklabels(), rotation=45)

    fig.tight_layout()
    plt.show()

In [None]:
# Preprocess the data
# Rows tagged 'notes' are excluded from the training set.
df = df[df['tag'] != 'notes'].reset_index(drop=True)

# Fill missing cells with "" in all three text columns before cleaning.
# clean_text calls .lower(), which fails on the float NaN pandas uses for
# empty cells, so current_line needs the same treatment as its neighbours.
for text_column in ('previous_line', 'current_line', 'next_line'):
    df[text_column] = df[text_column].fillna("").apply(clean_text)

# line_number is concatenated into the model input below, so it must be a string.
df['line_number'] = df['line_number'].astype(str)

df = balance_dataset(df)

# Modify data preparation to include context
df['text'] = df['line_number'] + '[SEP]' + '[PREV] ' + df['previous_line'] + ' [CURR] ' + df['current_line'] + ' [NEXT] ' + df['next_line']

New label counts:
tag
ingredient             2689
step                   2689
other                  2689
sub-headings           1921
recipe-sub-headings    1147
name                   1041
tags                    887
servings                420
Name: count, dtype: int64

New percentage distribution:
ingredient: 19.94%
step: 19.94%
other: 19.94%
sub-headings: 14.25%
recipe-sub-headings: 8.51%
name: 7.72%
tags: 6.58%
servings: 3.12%


# **Model Definition**

---



In [None]:
# Hyperparameters and Settings
BATCH_SIZE = 32
EPOCHS = 3
MAX_LENGTH = 512
LEARNING_RATE = 2e-5
EPSILON = 1e-8

# Hugging Face access token. It was referenced below (and again when the model
# is loaded) without ever being defined, which breaks a fresh "Run all".
# Read it from the environment instead of hard-coding it; when HF_TOKEN is not
# set this is None, i.e. no explicit token is passed to from_pretrained.
access_token = os.environ.get("HF_TOKEN")

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased", token=access_token)

# Add special tokens
# [PREV] / [CURR] / [NEXT] mark the three context slots in the model input and
# are registered as special tokens so each is kept as one unit.
special_tokens_dict = {'additional_special_tokens': ['[PREV]', '[CURR]', '[NEXT]']}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]



In [None]:
def encode_texts(texts, tokenizer, max_length=None, padding="max_length"):
    """Tokenize a batch of strings into PyTorch tensors.

    Args:
        texts: Iterable of strings (list, pandas Series, generator, ...). A
            single string is treated as a batch of one.
        tokenizer: Callable Hugging Face tokenizer.
        max_length: Truncation/padding length. Defaults to the notebook-wide
            ``MAX_LENGTH`` when omitted.
        padding: Padding strategy forwarded to the tokenizer. The default,
            ``"max_length"``, pads every row to ``max_length``.

    Returns:
        Whatever the tokenizer returns for ``return_tensors="pt"``; the rest of
        the notebook reads its ``input_ids`` and ``attention_mask`` entries.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(texts, str):
        texts = [texts]

    # Calling the tokenizer directly replaces the deprecated batch_encode_plus.
    # The input is materialised as a plain list so a Series or generator is
    # passed on in the list-of-strings form.
    return tokenizer(
        list(texts),
        add_special_tokens=True,
        max_length=MAX_LENGTH if max_length is None else max_length,
        padding=padding,
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",
    )

In [None]:
# Map the string tags to integer class ids; `le` keeps the mapping so that
# predictions can be turned back into tag names later.
# NOTE: this overwrites df['tag'] in place, so re-running the cell re-fits the
# encoder on integers -- restart from the preprocessing cell instead.
le = LabelEncoder()
df['tag'] = le.fit_transform(df['tag'])

# Hold out 10% of the rows for validation (fixed seed).
X, y = df['text'], df['tag']
train_texts, val_texts, train_labels, val_labels = train_test_split(
    X, y, test_size=0.1, random_state=42
)

train_encodings = encode_texts(train_texts, tokenizer)
val_encodings = encode_texts(val_texts, tokenizer)


def _as_tensor_dataset(encodings, labels):
    """Bundle token ids, attention masks and integer labels into one TensorDataset."""
    label_tensor = torch.tensor(labels.values, dtype=torch.long)
    return TensorDataset(encodings['input_ids'], encodings['attention_mask'], label_tensor)


train_dataset = _as_tensor_dataset(train_encodings, train_labels)
val_dataset = _as_tensor_dataset(val_encodings, val_labels)

# Training batches are drawn in random order, validation batches in dataset order.
train_dataloader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE, sampler=RandomSampler(train_dataset)
)
validation_dataloader = DataLoader(
    val_dataset, batch_size=BATCH_SIZE, sampler=SequentialSampler(val_dataset)
)

In [None]:
# Load pretrained DistilBERT with a classification head that has one output per tag.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    token=access_token,
    num_labels=len(le.classes_),
)
# The tokenizer was extended with extra special tokens, so the embedding
# matrix is resized to the tokenizer's current vocabulary size.
model.resize_token_embeddings(len(tokenizer))

# Use the GPU when one is available.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Optimizer and scheduler: AdamW with a linear schedule and no warm-up steps,
# spanning every optimisation step of the run.
total_steps = EPOCHS * len(train_dataloader)
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, eps=EPSILON)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_training_steps=total_steps,
    num_warmup_steps=0,
)

# Helper function
def compute_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    Args:
        preds: 2-D array of per-class scores, one row per example.
        labels: NumPy array of integer class ids (flattened before comparing).
    """
    top_class = np.asarray(preds).argmax(axis=1).ravel()
    is_hit = top_class == labels.ravel()
    return np.mean(is_hit)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# **Training Model**

---



In [None]:
# Training loop
# One pass over the training data followed by one pass over the validation
# data per epoch; metrics are printed after each pass.
for epoch in range(EPOCHS):
    model.train()
    total_train_loss = 0
    for step, batch in enumerate(tqdm(train_dataloader, desc=f"Epoch {epoch+1} Training")):
        batch = tuple(t.to(device) for t in batch)
        inputs, masks, labels = batch
        model.zero_grad()

        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_train_loss += loss.item()
        loss.backward()

        # Clip the global gradient norm to 1.0 before the optimizer update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        if step % 10 == 0:
            print(f"Epoch: {epoch + 1}, Step: {step}, Loss: {loss.item()}")

    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f"Average training loss: {avg_train_loss:.4f}")

    # Validation loop
    model.eval()
    total_val_loss = 0
    all_labels = []
    all_preds = []
    for batch in tqdm(validation_dataloader, desc="Validating"):
        batch = tuple(t.to(device) for t in batch)
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)

        # outputs.loss is a per-batch mean. Weight it by the batch size so the
        # final, usually smaller, batch is not over-represented in the average.
        total_val_loss += outputs.loss.item() * labels.size(0)

        logits = outputs.logits.detach().cpu().numpy()
        label_ids = labels.to("cpu").numpy()

        all_labels.extend(label_ids)
        all_preds.extend(np.argmax(logits, axis=1))

    # Loss and accuracy are computed per example over the whole validation set.
    # Averaging per-batch accuracies gave a slightly different number whenever
    # the last batch was short, which is why accuracy used to disagree with
    # the weighted recall printed below.
    num_val_examples = len(all_labels)
    avg_val_loss = total_val_loss / num_val_examples
    avg_val_accuracy = float(np.mean(np.array(all_preds) == np.array(all_labels)))

    print(f"Validation Loss: {avg_val_loss:.4f}")
    print(f"Validation Accuracy: {avg_val_accuracy:.4f}")

    # Calculate other metrics
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted', zero_division=1)
    print(f"Validation Precision: {precision:.4f}")
    print(f"Validation Recall: {recall:.4f}")
    print(f"Validation F1 Score: {f1:.4f}")

Epoch 1 Training:   0%|          | 0/380 [00:00<?, ?it/s]

Epoch: 1, Step: 0, Loss: 2.144796371459961
Epoch: 1, Step: 10, Loss: 1.9044201374053955
Epoch: 1, Step: 20, Loss: 1.5412940979003906
Epoch: 1, Step: 30, Loss: 1.5996307134628296
Epoch: 1, Step: 40, Loss: 1.3039884567260742
Epoch: 1, Step: 50, Loss: 1.2020869255065918
Epoch: 1, Step: 60, Loss: 0.9764518141746521
Epoch: 1, Step: 70, Loss: 1.0949344635009766
Epoch: 1, Step: 80, Loss: 0.8602173328399658
Epoch: 1, Step: 90, Loss: 0.7120237946510315
Epoch: 1, Step: 100, Loss: 0.7021281123161316
Epoch: 1, Step: 110, Loss: 0.5336941480636597
Epoch: 1, Step: 120, Loss: 0.7163440585136414
Epoch: 1, Step: 130, Loss: 0.688702404499054
Epoch: 1, Step: 140, Loss: 0.3355995714664459
Epoch: 1, Step: 150, Loss: 0.4099465608596802
Epoch: 1, Step: 160, Loss: 0.49045687913894653
Epoch: 1, Step: 170, Loss: 0.27999699115753174
Epoch: 1, Step: 180, Loss: 0.3561209440231323
Epoch: 1, Step: 190, Loss: 0.5054215788841248
Epoch: 1, Step: 200, Loss: 0.14687801897525787
Epoch: 1, Step: 210, Loss: 0.336225658655166

Validating:   0%|          | 0/43 [00:00<?, ?it/s]

Validation Loss: 0.1598
Validation Accuracy: 0.9622
Validation Precision: 0.9614
Validation Recall: 0.9615
Validation F1 Score: 0.9611


Epoch 2 Training:   0%|          | 0/380 [00:00<?, ?it/s]

Epoch: 2, Step: 0, Loss: 0.05422581732273102
Epoch: 2, Step: 10, Loss: 0.1992984265089035
Epoch: 2, Step: 20, Loss: 0.14500024914741516
Epoch: 2, Step: 30, Loss: 0.14236392080783844
Epoch: 2, Step: 40, Loss: 0.37014642357826233
Epoch: 2, Step: 50, Loss: 0.14101721346378326
Epoch: 2, Step: 60, Loss: 0.20360511541366577
Epoch: 2, Step: 70, Loss: 0.06495071947574615
Epoch: 2, Step: 80, Loss: 0.26638758182525635
Epoch: 2, Step: 90, Loss: 0.27076053619384766
Epoch: 2, Step: 100, Loss: 0.04185399413108826
Epoch: 2, Step: 110, Loss: 0.29496055841445923
Epoch: 2, Step: 120, Loss: 0.0399206206202507
Epoch: 2, Step: 130, Loss: 0.18034005165100098
Epoch: 2, Step: 140, Loss: 0.21254287660121918
Epoch: 2, Step: 150, Loss: 0.025819871574640274
Epoch: 2, Step: 160, Loss: 0.06128700450062752
Epoch: 2, Step: 170, Loss: 0.21080708503723145
Epoch: 2, Step: 180, Loss: 0.1532842516899109
Epoch: 2, Step: 190, Loss: 0.02796051651239395
Epoch: 2, Step: 200, Loss: 0.34044569730758667
Epoch: 2, Step: 210, Loss:

Validating:   0%|          | 0/43 [00:00<?, ?it/s]

Validation Loss: 0.1195
Validation Accuracy: 0.9709
Validation Precision: 0.9704
Validation Recall: 0.9703
Validation F1 Score: 0.9703


Epoch 3 Training:   0%|          | 0/380 [00:00<?, ?it/s]

Epoch: 3, Step: 0, Loss: 0.05225297063589096
Epoch: 3, Step: 10, Loss: 0.021008934825658798
Epoch: 3, Step: 20, Loss: 0.052595119923353195
Epoch: 3, Step: 30, Loss: 0.10757499933242798
Epoch: 3, Step: 40, Loss: 0.022568179294466972
Epoch: 3, Step: 50, Loss: 0.09405261278152466
Epoch: 3, Step: 60, Loss: 0.018312791362404823
Epoch: 3, Step: 70, Loss: 0.35049331188201904
Epoch: 3, Step: 80, Loss: 0.022846810519695282
Epoch: 3, Step: 90, Loss: 0.014734281226992607
Epoch: 3, Step: 100, Loss: 0.09493360668420792
Epoch: 3, Step: 110, Loss: 0.024385705590248108
Epoch: 3, Step: 120, Loss: 0.0983184352517128
Epoch: 3, Step: 130, Loss: 0.024860385805368423
Epoch: 3, Step: 140, Loss: 0.15372876822948456
Epoch: 3, Step: 150, Loss: 0.2726602852344513
Epoch: 3, Step: 160, Loss: 0.2577438950538635
Epoch: 3, Step: 170, Loss: 0.19628998637199402
Epoch: 3, Step: 180, Loss: 0.38934746384620667
Epoch: 3, Step: 190, Loss: 0.02483525313436985
Epoch: 3, Step: 200, Loss: 0.027280259877443314
Epoch: 3, Step: 21

Validating:   0%|          | 0/43 [00:00<?, ?it/s]

Validation Loss: 0.1222
Validation Accuracy: 0.9724
Validation Precision: 0.9719
Validation Recall: 0.9718
Validation F1 Score: 0.9717


# **Saving Model**

---



In [None]:
import os
import pickle

# function to save the model weights and the label encoder
def save_model_scaler(model, le, model_path, le_path, tokenizer=None):
    """Write the model weights and the fitted label encoder to disk.

    Despite the historical name, no scaler is involved: ``le`` is the object
    that maps predicted class ids back to tag names.

    Args:
        model: Object exposing ``state_dict()``; the state dict is written to
            ``<model_path>/model_state.bin`` with ``torch.save``.
        le: Label encoder, pickled to ``<le_path>/le.pkl``.
        model_path: Directory for the weights (created if missing).
        le_path: Directory for the label encoder (created if missing).
        tokenizer: Optional tokenizer. When given, it is saved into
            ``model_path`` via ``save_pretrained`` so that any tokens added
            to it are stored next to the weights.
    """
    os.makedirs(model_path, exist_ok=True)
    os.makedirs(le_path, exist_ok=True)

    # Save the model weights and the label encoder
    torch.save(model.state_dict(), os.path.join(model_path, 'model_state.bin'))
    with open(os.path.join(le_path, 'le.pkl'), 'wb') as le_file:
        pickle.dump(le, le_file)

    print(f"Model saved to {model_path}")
    print(f"Label encoder saved to {le_path}")

    if tokenizer is not None:
        tokenizer.save_pretrained(model_path)
        print(f"Tokenizer saved to {model_path}")

# Define paths for model
model_path = '/content/drive/MyDrive/new_segmentation_model_v2/model'
le_path = '/content/drive/MyDrive/new_segmentation_model_v2/model_le'

# Assuming model training and other setups are done, and model is ready to be saved
save_model_scaler(model, le, model_path, le_path)

# Also persist the tokenizer next to the weights. It was extended with the
# [PREV]/[CURR]/[NEXT] special tokens and the model's embedding matrix was
# resized to match, so reloading the weights needs this exact tokenizer.
tokenizer.save_pretrained(model_path)
print(f"Tokenizer saved to {model_path}")

Model saved to /content/drive/MyDrive/new_segmentation_model_v2/model
Scaler saved to /content/drive/MyDrive/new_segmentation_model_v2/model_le


# **Loading Model**

---



# **Making Predictions**

---



In [None]:
def predict(text):
    """Classify every non-empty line of a recipe and print the predicted tag.

    Uses the notebook-level ``model``, ``tokenizer``, ``le`` and ``device``.

    Args:
        text: Raw multi-line recipe text. Blank lines are ignored, and the
            remaining lines are normalised with ``clean_text``.
    """
    texts = [clean_text(line) for line in text.splitlines() if line.strip()]
    if not texts:
        # Nothing to encode; avoid handing the tokenizer an empty batch.
        print("No non-empty lines to classify.")
        return

    contextualized_texts = []
    for i, curr_line in enumerate(texts):
        prev_line = texts[i - 1] if i > 0 else ""
        next_line = texts[i + 1] if i < len(texts) - 1 else ""
        # Training inputs were built as "<line_number>[SEP][PREV] ... [CURR] ...
        # [NEXT] ...", so the same prefix is needed here to match them.
        # NOTE(review): assumes the training line_number is the 1-based position
        # among non-empty lines, as written by predict_and_save -- confirm.
        contextualized_texts.append(
            f"{i + 1}[SEP][PREV] {prev_line} [CURR] {curr_line} [NEXT] {next_line}"
        )

    encoded_input = encode_texts(contextualized_texts, tokenizer)
    encoded_input = {key: val.to(device) for key, val in encoded_input.items()}

    model.eval()
    with torch.no_grad():
        outputs = model(**encoded_input)

    # Softmax keeps the ordering of the logits, so argmax on the raw logits
    # picks the same class.
    predicted_class = outputs.logits.argmax(dim=1)
    predicted_label = le.inverse_transform(predicted_class.cpu().numpy())

    # Loop variable is named `line` so it no longer shadows the `text` argument.
    for i, (line, label) in enumerate(zip(texts, predicted_label)):
        print(f"Line {i+1}: {line} : Label: {label}")

In [None]:
# Hand-written sample recipe used as a smoke test: it contains a title, a
# description, a servings line, two section headings, ingredients, numbered
# steps and a hashtag line.
example_text = """
CHOCOLATE CHIP COOKIES
These are a quick, easy, wholesome and delicious muffin. So wholesome it would even be suitable for breakfast.
Serves 3
Ingredients:
2 1/4 cups all-purpose flour
1 tsp baking soda
1 cup butter, softened
3/4 cup granulated sugar
2 large eggs
2 cups semisweet chocolate chips
Instructions:
1. Preheat oven to 375°F (190°C).
2. Mix flour and baking soda in a bowl.
3. Cream butter and sugar until light and fluffy.
4. Beat in eggs.
5. Gradually stir in flour mixture.
6. Fold in chocolate chips.
7. Drop by rounded tablespoons onto baking sheets.
8. Bake for 9 to 11 minutes or until golden brown.
9. Cool on baking sheets for 2 minutes; remove to wire racks to cool completely.
#dessert#nice&easy#GoodforMe
"""

# Prints one "Line N: <cleaned text> : Label: <tag>" row per non-empty line.
predict(example_text)

Line 1: chocolate chip cookies : Label: name
Line 2: these are a quick, easy, wholesome and delicious muffin. so wholesome it would even be suitable for breakfast. : Label: other
Line 3: serves 3 : Label: servings
Line 4: ingredients: : Label: sub-headings
Line 5: 2 1/4 cups all-purpose flour : Label: ingredient
Line 6: 1 tsp baking soda : Label: ingredient
Line 7: 1 cup butter, softened : Label: ingredient
Line 8: 3/4 cup granulated sugar : Label: ingredient
Line 9: 2 large eggs : Label: ingredient
Line 10: 2 cups semisweet chocolate chips : Label: ingredient
Line 11: instructions: : Label: sub-headings
Line 12: 1. preheat oven to 375 f (190 c). : Label: step
Line 13: 2. mix flour and baking soda in a bowl. : Label: step
Line 14: 3. cream butter and sugar until light and fluffy. : Label: step
Line 15: 4. beat in eggs. : Label: step
Line 16: 5. gradually stir in flour mixture. : Label: step
Line 17: 6. fold in chocolate chips. : Label: step
Line 18: 7. drop by rounded tablespoons onto 

In [None]:
import os
import csv
def predict_and_save(text, output_file, model, tokenizer, le, device, batch_size=32):
    """Classify each non-empty line of one recipe and append the rows to a CSV.

    The rows have the columns ``line_number``, ``previous_line``,
    ``current_line``, ``next_line`` and ``tag``. The header is written only
    when ``output_file`` does not exist yet.

    Args:
        text: Raw multi-line recipe text; blank lines are ignored and the rest
            are normalised with ``clean_text``.
        output_file: CSV path to append to.
        model: Classifier; called with the encoded batch moved to ``device``.
        tokenizer: Tokenizer forwarded to ``encode_texts``.
        le: Fitted label encoder used to map class ids back to tag names.
        device: Torch device the encoded tensors are moved to.
        batch_size: Number of lines sent through the model per forward pass.
    """
    texts = [clean_text(line) for line in text.splitlines() if line.strip()]
    if not texts:
        # Without rows the frame has no columns, so appending it would only add
        # a junk line to the CSV.
        print(f"No non-empty lines found; nothing appended to {output_file}")
        return

    rows = []
    contextualized_texts = []
    for i, curr_line in enumerate(texts):
        prev_line = texts[i - 1] if i > 0 else ""
        next_line = texts[i + 1] if i < len(texts) - 1 else ""
        # Same "<line_number>[SEP]" prefix the training text was built with.
        # NOTE(review): assumes the training line_number matches the 1-based
        # value written to the CSV below -- confirm against the raw data.
        contextualized_texts.append(
            f"{i + 1}[SEP][PREV] {prev_line} [CURR] {curr_line} [NEXT] {next_line}"
        )
        rows.append({
            "line_number": i + 1,
            "previous_line": prev_line,
            "current_line": curr_line,
            "next_line": next_line,
        })

    # Run the model in batches instead of one forward pass per line.
    model.eval()
    predicted_labels = []
    for start in range(0, len(contextualized_texts), batch_size):
        batch_texts = contextualized_texts[start:start + batch_size]
        encoded_input = encode_texts(batch_texts, tokenizer)
        encoded_input = {key: val.to(device) for key, val in encoded_input.items()}

        with torch.no_grad():
            outputs = model(**encoded_input)

        # Softmax keeps the ordering of the logits, so argmax on the raw
        # logits picks the same class.
        predicted_class = outputs.logits.argmax(dim=1)
        predicted_labels.extend(le.inverse_transform(predicted_class.cpu().numpy()))

    # "tag" is added last so the column order stays the same as before.
    for row, label in zip(rows, predicted_labels):
        row["tag"] = label

    df = pd.DataFrame(rows)

    # If the file doesn't exist, write header. If it exists, append without writing the header.
    df.to_csv(output_file, mode='a', header=not os.path.exists(output_file), index=False, quoting=csv.QUOTE_ALL)

    print(f"Predictions for this recipe have been appended to {output_file}")

In [None]:
# output_file = '/content/drive/MyDrive/predicted_recipes.csv'

# df_recipe = pd.read_csv('/content/drive/MyDrive/raw_data/recipe_or_not.csv')
# df_recipe = df_recipe[df_recipe['tag'] == "recipe"]

# # print(f"Total number of recipes: {len(df_recipe)}")

# # df_sample = df_recipe.sample(n=1200, random_state=42)
# # df_sample = df_sample.reset_index(drop=True)

# for recipe in df_recipe['caption']:
#     predict_and_save(recipe, output_file, model, tokenizer, le, device)

# print(f"All predictions have been saved to {output_file}")