<a href="https://colab.research.google.com/github/DefinitelyKev/nlp_bert_ml_recipe_models/blob/main/Tagging_model_tags.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Import Libraries**
---



In [None]:
# %pip (instead of !pip) installs into the environment of the running kernel.
%pip install -q scikit-multilearn



In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.metrics import f1_score, recall_score, precision_score, hamming_loss
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import DistilBertTokenizer, DistilBertModel, get_linear_schedule_with_warmup
from torch.optim import AdamW
import torch.nn as nn
import torch.nn.functional as F
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from torch.utils.data import DataLoader, RandomSampler, TensorDataset
from torch.nn import BCEWithLogitsLoss, GELU, Dropout, Linear, Module
from tqdm.auto import tqdm
from skmultilearn.model_selection import iterative_train_test_split
import ast
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Dataset Preprocessing**

---



In [None]:
# Upper bound on how many CSV rows are loaded (passed to read_csv as `nrows`).
NUMBER_OF_ROWS = 80000
# Read the recipe dataset from the Google Drive mount at /content/drive.
df = pd.read_csv('/content/drive/MyDrive/recipes_w_search_terms.csv', nrows=NUMBER_OF_ROWS)

In [None]:
# Unwrap HTML anchor tags and lowercase the result
def remove_html_tags(text):
    """Replace each ``<a href="...">label</a>`` with its label and lowercase the text.

    Only anchors of exactly that shape (double-quoted href, no nested markup in
    the label) are unwrapped; everything else is left in place.
    """
    unwrapped = re.sub(r'<a href="[^"]*">([^<]*)</a>', r"\1", text)
    return unwrapped.lower()

def clean_text(text):
    """Clean one string, or every element of a list of strings.

    A list is cleaned element by element with ``clean_text_single`` and a new
    list is returned; any other value is passed to ``clean_text_single`` as is.
    """
    if not isinstance(text, list):
        return clean_text_single(text)
    return list(map(clean_text_single, text))

def clean_text_single(text):
    """Lowercase ``text`` and normalise it with a fixed sequence of regex rewrites.

    The substitutions run in the order listed below; the result is stripped of
    leading/trailing whitespace before being returned.
    """
    # (pattern, replacement) pairs, applied top to bottom.
    # Note: general punctuation stripping is intentionally not part of this list.
    substitutions = (
        (r"#\S+", ""),                       # drop hashtags
        (r"^\d+\.\s+", ""),                  # drop a leading "12. " style step number
        (r"(\d+)(?=[^\d\s/\-,.])", r"\1 "),  # "200g" -> "200 g"; untouched before / - , . or space
        (r"(?<!\w)-", ""),                   # drop any '-' that does not follow a word character
        (r"[^\x00-\x7F]+", ""),              # drop non-ASCII characters (e.g. emojis)
        (r"\s+", " "),                       # collapse whitespace runs to one space
    )

    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

def preprocess_dataset(dataframe, text_columns):
    """Run ``clean_text`` over every value of the given columns.

    The columns are overwritten on ``dataframe`` itself, and that same object
    is returned for convenience.
    """
    for column in text_columns:
        cleaned_column = dataframe[column].apply(clean_text)
        dataframe[column] = cleaned_column
    return dataframe

# Drop unwanted tags from a tag list
def remove_specific_tags(
    tags_list,
    tags_to_remove={
        "preparation",
        "time-to-make",
        "course",
        "cuisine",
        "easy",
        "main-ingredient",
        "low-in-something",
        "equipment",
        "taste-mood",
        "occasion",
        "number-of-servings",
        "dietary",
        "oven",
        "healthy-2",
        "technique",
        "high-in-something",
        "free-of-something",
        "small-appliance",
        "southern-united-states",
        "heirloom-historical",
        "southwestern-united-states",
        "novelty",
    },
):
    """Return the tags of ``tags_list`` that are not in ``tags_to_remove``.

    Order and duplicates of the surviving tags are preserved. The default
    exclusion set is only read, never modified.
    """
    kept = []
    for tag in tags_list:
        if tag in tags_to_remove:
            continue
        kept.append(tag)
    return kept

In [None]:
# Stem alphabetic words and drop stop words from a text
def process_text(text):
    """Tokenize ``text``, drop stop words, stem the rest, and re-join with spaces.

    Purely alphabetic tokens are removed when they appear in the module-level
    ``stop_words`` set and are otherwise stemmed with the module-level
    ``stemmer``. Tokens that are not purely alphabetic (numbers, punctuation,
    mixed tokens) are kept unchanged.
    """
    kept = []
    for token in nltk.word_tokenize(text):
        if not token.isalpha():
            # Numbers / punctuation / mixed tokens pass through untouched.
            kept.append(token)
        elif token not in stop_words:
            kept.append(stemmer.stem(token))
    return ' '.join(kept)

# Apply process_text to each string of a list (e.g. ingredients or steps)
def process_list(lst):
    """Return a new list holding ``process_text(item)`` for every item of ``lst``."""
    return list(map(process_text, lst))

In [None]:
# Fetch the NLTK resources needed by process_text(): the English stop-word list
# and the Punkt tokenizer data used by nltk.word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the tokenizer from 'punkt_tab' rather than 'punkt';
# without it word_tokenize raises LookupError. On older releases this download
# only reports that the resource is unknown, so requesting it is harmless.
nltk.download('punkt_tab')

# Shared by process_text(); both must exist before that function is first called.
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Preprocessing steps
# Drop the columns this notebook does not use, then discard rows with missing values.
df.drop(columns=["id", "description", "ingredients", "serving_size", "servings", "search_terms"], inplace=True)
df.rename(columns={"ingredients_raw_str": "ingredients_units"}, inplace=True)
df.dropna(inplace=True)

# The list-valued columns are parsed from their string form with ast.literal_eval.
df['steps'] = df['steps'].apply(ast.literal_eval)

# Collapse whitespace runs before parsing, then unwrap anchor tags and drop blank items.
df["ingredients_units"] = df["ingredients_units"].apply(lambda x: " ".join(x.split()))
df["ingredients_units"] = df["ingredients_units"].apply(ast.literal_eval)
df["ingredients_units"] = df["ingredients_units"].apply(lambda lst: [remove_html_tags(item) for item in lst if item.strip()])

df["tags"] = df["tags"].apply(ast.literal_eval)
df["tags"] = df["tags"].apply(remove_specific_tags)

df = preprocess_dataset(df, ['name', 'steps', 'ingredients_units'])

# Stemmed, stop-word-free versions of the list columns. `ingredients_stemmed`
# is read when the model input text is assembled further down the notebook,
# so it has to be created here or that cell fails with a KeyError.
df['ingredients_stemmed'] = df['ingredients_units'].apply(process_list)
df['steps_stemmed'] = df['steps'].apply(process_list)


In [None]:
df.head()

Unnamed: 0,name,ingredients_units,steps,tags,ingredients_stemmed,steps_stemmed
0,grilled garlic cheese grits,"[4 cups water, 1 cup uncooked old fashion grit...",[i a sauce pan bring water to a boil slowly ad...,"[side-dishes, eggs-dairy, refrigerator, diabet...","[4 cup water, 1 cup uncook old fashion grit, 1...",[sauc pan bring water boil slowli add grit sal...
1,simple shrimp and andouille jambalaya,"[1 medium onion chopped coarse, 1 medium red b...",[in a food processor pulse the onion red peppe...,"[60-minutes-or-less, main-dish, pork, seafood,...","[1 medium onion chop coars, 1 medium red bell ...",[food processor puls onion red pepper garlic c...
2,black-and-white bean salad,"[1 cup canned white beans rinsed and drained, ...",[in a large bowl combine beans tomato onion an...,"[15-minutes-or-less, north-american, salads, s...","[1 cup can white bean rins drain, 1 cup can bl...","[larg bowl combin bean tomato onion celeri, ge..."
3,crock pot italian zucchini,"[2 zucchini sliced, 2 small yellow squash slic...",[put all ingredients in the crock pot and cook...,"[weeknight, side-dishes, vegetables, beginner-...","[2 zucchini slice, 2 small yellow squash slice...",[put ingredi crock pot cook low 6-8 hour]
4,beef stew with dried cherries,"[3 lbs beef stew meat, 3 tablespoons flour, 1 ...","[preheat oven to 350 f, cut beef into 1 inch c...","[main-dish, beef, meat, 4-hours-or-less]","[3 lb beef stew meat, 3 tablespoon flour, 1 te...","[preheat oven 350 f, cut beef 1 inch cube comb..."


In [None]:
# Filter out tags
# Number of most frequent tags kept as classification labels.
TOP_N_TAGS = 120
df_exploded = df.explode("tags")
tag_counts = df_exploded["tags"].value_counts().nlargest(TOP_N_TAGS)

# Keep only the tags that are among the TOP_N_TAGS most frequent ones
tags_to_keep = tag_counts.index.tolist()
# Set copy for O(1) membership tests; `tags_to_keep` stays a list in frequency order.
tags_to_keep_lookup = set(tags_to_keep)
df["tags"] = df["tags"].apply(lambda x: [tag for tag in x if tag in tags_to_keep_lookup])

# Remove rows with empty tags list. `.copy()` makes the result an independent
# frame so later column assignments do not act on a slice of the old one.
df = df[df["tags"].map(len) > 0].copy()

In [None]:
print(tag_counts[0:50])

tags
main-dish                26947
60-minutes-or-less       25343
4-hours-or-less          21638
meat                     20466
vegetables               18771
desserts                 17959
north-american           17840
30-minutes-or-less       16832
dinner-party             15949
low-sodium               15073
holiday-event            14444
low-carb                 14095
healthy                  13385
3-steps-or-less          12063
vegetarian               12005
low-calorie              11818
low-cholesterol          11720
american                 11666
fruit                    11452
eggs-dairy               11179
15-minutes-or-less       11091
inexpensive              10836
beginner-cook            10598
low-protein              10210
low-saturated-fat        10125
european                  9960
comfort-food              9914
presentation              9847
pasta-rice-and-grains     9744
stove-top                 9537
kid-friendly              9292
poultry                   8981
5-i

# **Model Definition**

---



In [None]:
# Hyperparameters and Settings
BATCH_SIZE = 16              # samples per DataLoader batch
EPOCHS = 5                   # full passes over the training set
MAX_LENGTH = 512             # token length every training text is padded/truncated to
# Same value as the previous spelling `10e-5`, written unambiguously.
# NOTE(review): if 1e-5 was the intended learning rate, change it here.
LEARNING_RATE = 1e-4
EPSILON = 1e-8               # used as AdamW eps and as the FocalLoss default clamp floor
DROPOUT_RATE = 0.3           # default dropout of the classification head
FOCAL_LOSS_GAMMA = 2.5       # default focusing exponent of FocalLoss
FOCAL_LOSS_REDUCTION = 'mean'  # default FocalLoss reduction: 'mean', 'sum', or anything else for none


In [None]:
# Model Definition
class DistilBertForMultiLabelClassification(torch.nn.Module):
    """DistilBERT encoder followed by a three-layer MLP head for multi-label tagging.

    The head widens the first-token embedding to ``2 * dim``, keeps that width
    for a second layer, narrows back to ``dim`` and finally projects to
    ``num_labels`` raw logits (no sigmoid is applied here).

    Args:
        num_labels: Number of output logits (one per tag).
        dropout_rate: Dropout probability applied to the pooled embedding
            before the first dense layer.
    """

    def __init__(self, num_labels, dropout_rate=DROPOUT_RATE):
        super().__init__()
        self.num_labels = num_labels
        # "distilbert-base-uncased" is loaded without an auth token: the
        # previous `use_auth_token=access_token` argument referred to a name
        # that is never defined in this notebook (NameError on a fresh kernel).
        self.distilbert = DistilBertModel.from_pretrained("distilbert-base-uncased")

        hidden_dim = self.distilbert.config.dim
        wide_dim = hidden_dim * 2

        # Expanded complex structure
        self.dropout = nn.Dropout(dropout_rate)
        self.dense1 = nn.Linear(hidden_dim, wide_dim)
        self.activation1 = nn.GELU()
        self.norm1 = nn.LayerNorm(wide_dim)
        self.dense2 = nn.Linear(wide_dim, wide_dim)
        self.activation2 = nn.GELU()
        self.norm2 = nn.LayerNorm(wide_dim)
        self.dense3 = nn.Linear(wide_dim, hidden_dim)  # bottleneck layer
        self.activation3 = nn.GELU()
        self.norm3 = nn.LayerNorm(hidden_dim)

        # Skip paths. These are learned linear projections added to the dense
        # outputs, not identity residuals.
        self.residual1 = nn.Linear(hidden_dim, wide_dim)
        self.residual2 = nn.Linear(wide_dim, wide_dim)

        # Final classifier layer
        self.classifier = nn.Linear(hidden_dim, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return raw logits of shape (batch, num_labels) for the given token batch."""
        distilbert_output = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = distilbert_output[0]  # (bs, seq_length, dim)
        pooled_output = hidden_state[:, 0]  # use the first token, typically [CLS]

        # Layer 1: dropout only feeds the dense path; the skip path sees the
        # un-dropped pooled embedding.
        x = self.dropout(pooled_output)
        x = self.dense1(x) + self.residual1(pooled_output)
        x = self.activation1(x)
        x = self.norm1(x)

        # Layer 2: both paths read the same layer-1 output.
        x = self.dense2(x) + self.residual2(x)
        x = self.activation2(x)
        x = self.norm2(x)

        # Layer 3: narrow back to the encoder width (no skip path).
        x = self.dense3(x)
        x = self.activation3(x)
        x = self.norm3(x)

        # Classifier
        logits = self.classifier(x)
        return logits

In [None]:
# Loss Definition
class FocalLoss(nn.Module):
    """Element-wise focal loss on logits for multi-label targets.

    Each element's binary cross-entropy is scaled by ``(1 - pt) ** gamma`` and,
    when ``alpha`` is given, by ``alpha`` (broadcast against the loss tensor).
    ``pt`` is the predicted probability of the target value, clamped from below
    at ``epsilon``.

    Args:
        alpha: Weights convertible by ``torch.tensor`` (stored as float), or
            ``None`` for no weighting.
        gamma: Focusing exponent.
        reduction: ``'mean'`` or ``'sum'``; any other value returns the
            unreduced loss tensor.
        epsilon: Lower clamp bound applied to ``pt``.
    """

    def __init__(self, alpha, gamma=FOCAL_LOSS_GAMMA, reduction=FOCAL_LOSS_REDUCTION, epsilon=EPSILON):
        super().__init__()
        self.gamma = gamma
        self.reduction = reduction
        self.epsilon = epsilon
        self.alpha = None if alpha is None else torch.tensor(alpha, dtype=torch.float)

    def forward(self, inputs, targets):
        """Return the focal loss of logits ``inputs`` against ``targets``."""
        # Per-element binary cross-entropy, left unreduced so it can be reweighted.
        bce = F.binary_cross_entropy_with_logits(inputs, targets, reduction='none')

        # Probability assigned to the target value of each element.
        prob = torch.sigmoid(inputs)
        pt = prob * targets + (1 - prob) * (1 - targets)
        pt = pt.clamp(min=self.epsilon)

        # alpha is moved to the logits' device on every call.
        if self.alpha is None:
            alpha_factor = 1.
        else:
            alpha_factor = self.alpha.to(inputs.device)
        focal = alpha_factor * (1 - pt) ** self.gamma * bce

        if self.reduction == 'mean':
            return torch.mean(focal)
        if self.reduction == 'sum':
            return torch.sum(focal)
        return focal

In [None]:
def encode_texts(texts, tokenizer, max_length=None):
    """Tokenize a batch of texts into fixed-length PyTorch tensors.

    Args:
        texts: Iterable of strings (list, tuple, pandas Series, ...). A single
            string is treated as a batch of one.
        tokenizer: Object exposing ``batch_encode_plus``.
        max_length: Length every sequence is padded/truncated to. Defaults to
            the module-level ``MAX_LENGTH`` when omitted.

    Returns:
        Tuple ``(input_ids, attention_mask)`` taken from the tokenizer output.
    """
    if max_length is None:
        max_length = MAX_LENGTH
    if isinstance(texts, str):
        texts = [texts]

    # batch_encode_plus is documented for lists/tuples; materialise other
    # iterables (e.g. a pandas Series) so every tokenizer accepts the batch.
    encoding = tokenizer.batch_encode_plus(
        list(texts),
        add_special_tokens=True,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",
    )
    return encoding["input_ids"], encoding["attention_mask"]

# **Training Model**

---



In [None]:
def find_optimal_threshold(true_labels, pred_probs):
    """Pick the probability cut-off that maximises micro-averaged F1.

    Candidates run from 0.1 up to (but excluding) 1 in steps of 0.01. The
    lowest candidate reaching the best F1 is returned; if no candidate scores
    above 0, the default 0.5 is returned.
    """
    candidates = np.arange(0.1, 1, 0.01)
    scores = [
        f1_score(true_labels, (pred_probs > cutoff).astype(int), average='micro', zero_division=0)
        for cutoff in candidates
    ]
    # argmax returns the first maximum, i.e. the lowest threshold among ties.
    top = int(np.argmax(scores))
    if scores[top] > 0.0:
        return candidates[top]
    return 0.5  # Default threshold

In [None]:
# Preprocessing tags for multi-label classification
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['tags'])
num_labels_cols = len(mlb.classes_)
class_counts = np.sum(y, axis=0)  # positives per tag, used later for the loss weights

# Model input: recipe name, stemmed ingredients and stemmed steps joined by [SEP].
df.loc[:, 'combined_text'] = df.apply(lambda row: f"{row['name']} [SEP] {' '.join(row['ingredients_stemmed'])} [SEP] {' '.join(row['steps_stemmed'])}", axis=1)
texts = df['combined_text']

# Tokenize text
# No auth token is passed: the previous `use_auth_token=access_token` argument
# referred to a name that is never defined in this notebook.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
inputs, masks = encode_texts(texts, tokenizer)

# Reshape index to fit the expected shape for iterative_train_test_split
# (row positions are split instead of the token tensors themselves).
X_indices = np.arange(inputs.shape[0]).reshape(-1, 1)

# Splitting data into train and temporary (validation + test) sets
X_train_indices, y_train, X_temp_indices, y_temp = iterative_train_test_split(X_indices, y, test_size=0.4)

# Splitting temporary into validation and test sets equally
# NOTE(review): X_test_indices / y_test are not turned into tensors or used in this cell.
X_val_indices, y_val, X_test_indices, y_test = iterative_train_test_split(X_temp_indices, y_temp, test_size=0.5)

# Convert arrays to tensors using clone().detach()
train_inputs = inputs[X_train_indices.flatten()].clone().detach()
train_masks = masks[X_train_indices.flatten()].clone().detach()
train_labels = torch.tensor(y_train, dtype=torch.float)

validation_inputs = inputs[X_val_indices.flatten()].clone().detach()
validation_masks = masks[X_val_indices.flatten()].clone().detach()
validation_labels = torch.tensor(y_val, dtype=torch.float)

# DataLoader setup: training batches are shuffled, validation batches keep their order.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=BATCH_SIZE)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_dataloader = DataLoader(validation_data, batch_size=BATCH_SIZE)

print("Training set size:", len(train_inputs))
print("Validation set size:", len(validation_inputs))

def check_distribution(y, mlb):
    """Map every class in ``mlb.classes_`` to its column sum in the label matrix ``y``."""
    totals = np.sum(y, axis=0)
    return {label: totals[position] for position, label in enumerate(mlb.classes_)}




tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Training set size: 47972
Validation set size: 16052


In [None]:
# Model initialization
# One output logit per tag known to the fitted MultiLabelBinarizer.
model = DistilBertForMultiLabelClassification(num_labels=num_labels_cols)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# EPSILON is passed as AdamW's `eps` here.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, eps=EPSILON, weight_decay=0.01)

# Per-class focal-loss weights: inverse tag frequency, rescaled so the weights
# sum to the number of classes (i.e. average 1).
# NOTE(review): a class with zero positives would divide by zero here.
inverse_frequency = class_counts.sum() / class_counts
alpha = len(class_counts) * inverse_frequency / inverse_frequency.sum()
loss_fn = FocalLoss(alpha)

# Linear schedule without warmup, spanning every optimizer step of all EPOCHS
# (one step per training batch).
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * EPOCHS)



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [None]:
def train_and_evaluate(model, optimizer, scheduler, train_dataloader, validation_dataloader):
    """Train ``model`` for ``EPOCHS`` epochs, validating and checkpointing after each.

    Relies on the module-level ``EPOCHS``, ``device`` and ``loss_fn``. After
    every epoch the decision threshold maximising micro-F1 on the validation
    set is searched and the metrics at that threshold are printed. Because the
    threshold is tuned on the same data it is scored on, the reported
    validation metrics are optimistic. Whenever the validation F1 improves,
    the model weights are written to ``best_model_state.bin``; the in-memory
    ``model`` is NOT rolled back to that checkpoint when training ends.

    Args:
        model: Module called as ``model(input_ids, attention_mask=mask)``.
        optimizer: Optimizer over ``model``'s parameters.
        scheduler: LR scheduler stepped once per training batch.
        train_dataloader: Yields ``(input_ids, attention_mask, labels)`` batches.
        validation_dataloader: Same batch layout, used for evaluation only.

    Returns:
        dict with ``"best_val_f1"`` (float) and ``"best_threshold"`` (float, or
        ``None`` if the F1 never rose above 0): the values belonging to the
        checkpoint saved in ``best_model_state.bin``.
    """
    best_val_f1 = 0.0
    best_threshold = None

    for epoch in range(EPOCHS):
        # ---- training ----
        model.train()
        total_loss = 0.0
        progress = tqdm(train_dataloader, desc=f"Epoch {epoch+1} Training")
        for step, batch in enumerate(progress):
            inputs, masks, labels = (t.to(device) for t in batch)
            model.zero_grad()
            logits = model(inputs, attention_mask=masks)
            loss = loss_fn(logits, labels)
            loss_value = loss.item()
            total_loss += loss_value
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            # Show the running loss on the progress bar instead of printing a
            # line every few steps, which floods the cell output.
            if step % 10 == 0:
                progress.set_postfix(loss=f"{loss_value:.4f}")

        avg_train_loss = total_loss / len(train_dataloader)
        print(f"Epoch {epoch+1} average training loss: {avg_train_loss}")

        # ---- validation ----
        model.eval()
        val_loss = 0.0
        all_probs = []   # prediction probabilities per batch
        all_labels = []  # true labels per batch
        with torch.no_grad():
            for batch in tqdm(validation_dataloader, desc="Validating"):
                inputs, masks, labels = (t.to(device) for t in batch)
                logits = model(inputs, attention_mask=masks)
                val_loss += loss_fn(logits, labels).item()
                all_probs.append(torch.sigmoid(logits).cpu().numpy())
                all_labels.append(labels.cpu().numpy())

        # Concatenate all batches
        all_probs = np.concatenate(all_probs, axis=0)
        all_labels = np.concatenate(all_labels, axis=0)

        # Find the optimal threshold after processing all validation data
        optimal_threshold = find_optimal_threshold(all_labels, all_probs)
        print(f"Optimal threshold found: {optimal_threshold}")

        # Recalculate metrics using the optimal threshold
        predictions = all_probs > optimal_threshold
        avg_val_loss = val_loss / len(validation_dataloader)
        avg_val_f1 = f1_score(all_labels, predictions, average='micro', zero_division=0)
        avg_val_recall = recall_score(all_labels, predictions, average='micro', zero_division=0)
        avg_val_precision = precision_score(all_labels, predictions, average='micro', zero_division=0)
        avg_hamming_loss = hamming_loss(all_labels, predictions)

        print(f"Validation loss: {avg_val_loss}, F1: {avg_val_f1}, Recall: {avg_val_recall}, Precision: {avg_val_precision}, Hamming Loss: {avg_hamming_loss}")

        # Checkpoint on improved F1 and remember the threshold that produced it,
        # so inference can reuse it instead of a fixed 0.5.
        if avg_val_f1 > best_val_f1:
            best_val_f1 = avg_val_f1
            best_threshold = float(optimal_threshold)
            torch.save(model.state_dict(), 'best_model_state.bin')

    print("Training complete.")
    return {"best_val_f1": float(best_val_f1), "best_threshold": best_threshold}

In [None]:
train_and_evaluate(model, optimizer, scheduler, train_dataloader, validation_dataloader)

Epoch 1 Training:   0%|          | 0/2999 [00:00<?, ?it/s]

Epoch 1, Step 0, Loss: 0.1866951733827591
Epoch 1, Step 10, Loss: 0.04455225169658661
Epoch 1, Step 20, Loss: 0.04060039669275284
Epoch 1, Step 30, Loss: 0.02642030082643032
Epoch 1, Step 40, Loss: 0.038938187062740326
Epoch 1, Step 50, Loss: 0.03366520628333092
Epoch 1, Step 60, Loss: 0.03663862124085426
Epoch 1, Step 70, Loss: 0.02628658339381218
Epoch 1, Step 80, Loss: 0.0310320146381855
Epoch 1, Step 90, Loss: 0.038019899278879166
Epoch 1, Step 100, Loss: 0.03228829428553581
Epoch 1, Step 110, Loss: 0.03562776371836662
Epoch 1, Step 120, Loss: 0.03588177636265755
Epoch 1, Step 130, Loss: 0.04351431503891945
Epoch 1, Step 140, Loss: 0.035847414284944534
Epoch 1, Step 150, Loss: 0.033898260444402695
Epoch 1, Step 160, Loss: 0.036876045167446136
Epoch 1, Step 170, Loss: 0.03442072868347168
Epoch 1, Step 180, Loss: 0.03657296299934387
Epoch 1, Step 190, Loss: 0.03915422037243843
Epoch 1, Step 200, Loss: 0.03894985839724541
Epoch 1, Step 210, Loss: 0.02568887174129486
Epoch 1, Step 220,

Validating:   0%|          | 0/1004 [00:00<?, ?it/s]

Optimal threshold found: 0.4099999999999998
Validation loss: 0.022743121795059437, F1: 0.4891788719966349, Recall: 0.5117048447073458, Precision: 0.46855252555239835, Hamming Loss: 0.0838498837112717


Epoch 2 Training:   0%|          | 0/2999 [00:00<?, ?it/s]

Epoch 2, Step 0, Loss: 0.023373208940029144
Epoch 2, Step 10, Loss: 0.023190809413790703
Epoch 2, Step 20, Loss: 0.02716526947915554
Epoch 2, Step 30, Loss: 0.02004840411245823
Epoch 2, Step 40, Loss: 0.02541869692504406
Epoch 2, Step 50, Loss: 0.019572235643863678
Epoch 2, Step 60, Loss: 0.019725803285837173
Epoch 2, Step 70, Loss: 0.017994532361626625
Epoch 2, Step 80, Loss: 0.023926736786961555
Epoch 2, Step 90, Loss: 0.027419721707701683
Epoch 2, Step 100, Loss: 0.020536428317427635
Epoch 2, Step 110, Loss: 0.023836689069867134
Epoch 2, Step 120, Loss: 0.02300076000392437
Epoch 2, Step 130, Loss: 0.019648250192403793
Epoch 2, Step 140, Loss: 0.017949046567082405
Epoch 2, Step 150, Loss: 0.01828138902783394
Epoch 2, Step 160, Loss: 0.01860474795103073
Epoch 2, Step 170, Loss: 0.021080344915390015
Epoch 2, Step 180, Loss: 0.021429475396871567
Epoch 2, Step 190, Loss: 0.022451389580965042
Epoch 2, Step 200, Loss: 0.023402847349643707
Epoch 2, Step 210, Loss: 0.01711593195796013
Epoch 

Validating:   0%|          | 0/1004 [00:00<?, ?it/s]

Optimal threshold found: 0.4299999999999998
Validation loss: 0.020706766428782112, F1: 0.5250580167401111, Recall: 0.532944274617227, Precision: 0.5174017498105015, Hamming Loss: 0.075647894343384


Epoch 3 Training:   0%|          | 0/2999 [00:00<?, ?it/s]

Epoch 3, Step 0, Loss: 0.013922175392508507
Epoch 3, Step 10, Loss: 0.018136976286768913
Epoch 3, Step 20, Loss: 0.024194754660129547
Epoch 3, Step 30, Loss: 0.01939256303012371
Epoch 3, Step 40, Loss: 0.016327813267707825
Epoch 3, Step 50, Loss: 0.02343185804784298
Epoch 3, Step 60, Loss: 0.01984674111008644
Epoch 3, Step 70, Loss: 0.019200632348656654
Epoch 3, Step 80, Loss: 0.020945221185684204
Epoch 3, Step 90, Loss: 0.021214859560132027
Epoch 3, Step 100, Loss: 0.024383051320910454
Epoch 3, Step 110, Loss: 0.020610295236110687
Epoch 3, Step 120, Loss: 0.020574159920215607
Epoch 3, Step 130, Loss: 0.022254349663853645
Epoch 3, Step 140, Loss: 0.026330742985010147
Epoch 3, Step 150, Loss: 0.019934063777327538
Epoch 3, Step 160, Loss: 0.018037879839539528
Epoch 3, Step 170, Loss: 0.021347172558307648
Epoch 3, Step 180, Loss: 0.01803411729633808
Epoch 3, Step 190, Loss: 0.02046641707420349
Epoch 3, Step 200, Loss: 0.016007088124752045
Epoch 3, Step 210, Loss: 0.0220181904733181
Epoch 

Validating:   0%|          | 0/1004 [00:00<?, ?it/s]

Optimal threshold found: 0.4199999999999998
Validation loss: 0.020058373020835726, F1: 0.5400811496085204, Recall: 0.5447285190625537, Precision: 0.5355124077145738, Hamming Loss: 0.07279207160063128


Epoch 4 Training:   0%|          | 0/2999 [00:00<?, ?it/s]

Epoch 4, Step 0, Loss: 0.019140664488077164
Epoch 4, Step 10, Loss: 0.016025930643081665
Epoch 4, Step 20, Loss: 0.01602640561759472
Epoch 4, Step 30, Loss: 0.013169710524380207
Epoch 4, Step 40, Loss: 0.017380330711603165
Epoch 4, Step 50, Loss: 0.016171716153621674
Epoch 4, Step 60, Loss: 0.020328184589743614
Epoch 4, Step 70, Loss: 0.01585022732615471
Epoch 4, Step 80, Loss: 0.012846216559410095
Epoch 4, Step 90, Loss: 0.016686884686350822
Epoch 4, Step 100, Loss: 0.018363283947110176
Epoch 4, Step 110, Loss: 0.018057607114315033
Epoch 4, Step 120, Loss: 0.01759321056306362
Epoch 4, Step 130, Loss: 0.01794441230595112
Epoch 4, Step 140, Loss: 0.01919514499604702
Epoch 4, Step 150, Loss: 0.0164966881275177
Epoch 4, Step 160, Loss: 0.01539037749171257
Epoch 4, Step 170, Loss: 0.01840042881667614
Epoch 4, Step 180, Loss: 0.016902465373277664
Epoch 4, Step 190, Loss: 0.01536879688501358
Epoch 4, Step 200, Loss: 0.015901459380984306
Epoch 4, Step 210, Loss: 0.020957987755537033
Epoch 4, 

# **Saving Model**

---



In [None]:
import os
import pickle

# function to save the model and the fitted label binarizer
def save_model_mlb(model, mlb, model_path, mlb_path):
    """Persist the model weights and the label binarizer to disk.

    Args:
        model: Module whose ``state_dict()`` is written to
            ``<model_path>/model_state.bin``.
        mlb: Picklable label binarizer, written to ``<mlb_path>/mlb.pkl``.
        model_path: Directory for the weights (created if missing).
        mlb_path: Directory for the pickle (created if missing).
    """
    os.makedirs(model_path, exist_ok=True)
    os.makedirs(mlb_path, exist_ok=True)

    # Save the model weights and the label binarizer
    torch.save(model.state_dict(), os.path.join(model_path, 'model_state.bin'))
    with open(os.path.join(mlb_path, 'mlb.pkl'), 'wb') as mlb_file:
        pickle.dump(mlb, mlb_file)

    print(f"Model saved to {model_path}")
    # The second artifact is the label binarizer, not a scaler.
    print(f"Label binarizer saved to {mlb_path}")

# Define paths for model
# Both directories live on the mounted Google Drive.
model_path = '/content/drive/MyDrive/tagging_model_tags/tags_model_v1'
mlb_path = '/content/drive/MyDrive/tagging_model_tags/tags_mlb_v1'

# Assuming model training and other setups are done, and model is ready to be saved
# NOTE(review): this saves the in-memory `model`, i.e. the weights after the
# final epoch. train_and_evaluate writes its best-F1 checkpoint to
# 'best_model_state.bin' but does not load it back - confirm which set of
# weights is meant to be exported.
save_model_mlb(model, mlb, model_path, mlb_path)

In [None]:
!zip -r ./tagging_model_tags_v1.zip /content/drive/MyDrive/tagging_model_tags

# **Loading Model**

---



In [None]:
# from transformers import DistilBertModel
# import torch

# class DistilBertForMultiLabelClassification(torch.nn.Module):
#     def __init__(self, num_labels):
#         super().__init__()
#         self.num_labels = num_labels
#         self.distilbert = DistilBertModel.from_pretrained("distilbert-base-uncased")
#         self.pre_classifier = torch.nn.Linear(self.distilbert.config.dim, self.distilbert.config.dim)
#         self.classifier = torch.nn.Linear(self.distilbert.config.dim, num_labels)
#         self.dropout = torch.nn.Dropout(0.2)

#     def forward(self, input_ids, attention_mask=None):
#         distilbert_output = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
#         hidden_state = distilbert_output[0]
#         pooled_output = hidden_state[:, 0]
#         pooled_output = self.pre_classifier(pooled_output)
#         pooled_output = torch.nn.ReLU()(pooled_output)
#         pooled_output = self.dropout(pooled_output)
#         logits = self.classifier(pooled_output)
#         return logits

In [None]:
# num_labels = len(mlb.classes_)
# loaded_model = DistilBertForMultiLabelClassification(num_labels=num_labels)

# # Load the saved model state into the newly instantiated model
# loaded_model.load_state_dict(torch.load('./tagging_model_tag_v1/distilbert_multilabel_classification_model.pth'))

# # Don't forget to move the model to the correct device
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# loaded_model = loaded_model.to(device)

In [None]:
example_text = "CHICKEN & BROCCOLI NOODLE STIR FRY ['2 tsp olive oil', '2 chicken breast, sliced', '1 broccoli, cut into florets', '200g egg noodles', 'Salt & pepper, to taste', '2 tbsp peanut butter', '2 tbsp soy sauce', '1 tsp rice vinegar', '1 tbsp sriracha', '1 tbsp oyster sauce', '2 tsp garlic powder', '1 tsp sesame oil', '2 tbsp water', 'Spring onion, to top', 'Sesame seeds, to top', 'Chilli flakes, to top'] ['1. Whisk together your sauce ingredients in a bowl. If your sauce is too thick, add more water.', '2. Cook your chicken in a fry pan with 1 tsp olive oil until cooked through. Set aside on a plate.', '3. In the same pan, add your broccoli with another 1 tsp of olive oil and 1 tbsp soy sauce until soft. Set aside on the same plate as the chicken.', '4. Cook your egg noodles according to the package instructions.', '5. In the same fry pan, pour in your sauce, stirring frequently for 30 seconds.', '6. Pour in your noodles and stir.', '7. Mix your chicken and broccoli back in, allowing all of the sauce to soak through.', '8. Scoop evenly into 3 plates, and top with fresh spring onion, sesame seeds and chilli flakes.']"

# **Making Predictions**

---



In [None]:
def encode_text_for_prediction(text, tokenizer, max_length=256):
    """Encode a single text for prediction using the provided tokenizer.

    Args:
        text: The string to encode.
        tokenizer: Object exposing ``encode_plus``.
        max_length: Length the sequence is padded or truncated to.
            NOTE(review): training encodes with MAX_LENGTH (512); confirm the
            shorter default here is intended.

    Returns:
        Tuple ``(input_ids, attention_mask)`` taken from the tokenizer output.
    """
    encoded_dict = tokenizer.encode_plus(
        text,  # Text to encode.
        add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
        max_length=max_length,  # Target length for padding/truncation.
        padding="max_length",  # Replaces the deprecated pad_to_max_length=True.
        truncation=True,  # Explicit, so over-long texts are cut without a warning.
        return_attention_mask=True,  # Include attention masks.
        return_tensors='pt',  # Return PyTorch tensors.
    )

    return encoded_dict['input_ids'], encoded_dict['attention_mask']

def predict_tags(model, tokenizer, mlb, text, device, threshold=0.5):
    """Predict tags for a given text using the trained model.

    Args:
        model: Module returning one logit per tag for ``(input_ids, attention_mask)``.
        tokenizer: Tokenizer passed on to ``encode_text_for_prediction``.
        mlb: Fitted label binarizer used to map the 0/1 row back to tag names.
        text: The recipe text to tag.
        device: Device the input tensors are moved to (the model is expected
            to already be on it).
        threshold: Probability above which a tag counts as predicted. Defaults
            to 0.5; pass the threshold tuned on validation data to match the
            metrics reported during training.

    Returns:
        The result of ``mlb.inverse_transform`` for the thresholded predictions.
    """
    # Prepare the text
    input_ids, attention_mask = encode_text_for_prediction(text, tokenizer)

    # Move tensors to the configured device
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)

    model.eval()  # Evaluation mode (disables dropout)
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    # Apply sigmoid to get independent per-tag probabilities, then threshold
    probabilities = torch.sigmoid(outputs).cpu().numpy()
    predictions = (probabilities > threshold).astype(int)

    # Decode the 0/1 indicator row back into tag names
    return mlb.inverse_transform(predictions)

# Example usage
# Placeholder only: `text` is not used by the call below, which predicts on `example_text`.
text = "Your new recipe text here"
# Predict tags
# NOTE(review): `example_text` is passed raw, whereas the training text was
# built from cleaned/stemmed fields joined with [SEP] - confirm whether the
# same preprocessing should be applied before predicting.
predicted_tags = predict_tags(model, tokenizer, mlb, example_text, device)
print("Predicted Tags:", predicted_tags)

