In [3]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.nn.utils.rnn import pad_sequence
from sklearn.preprocessing import LabelEncoder
from transformers.modeling_outputs import SequenceClassifierOutput
from torch.nn import CrossEntropyLoss, MSELoss
import time

# Load dataset
df = pd.read_csv('output.csv')
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)


# Tokenize the ingredients and instructions
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

MAX_SEQ_LENGTH = 512

def truncate_sequences(sequence):
    if len(sequence) > MAX_SEQ_LENGTH:
        return sequence[:MAX_SEQ_LENGTH]
    return sequence

df['tokenized_ingredients'] = df['RecipeIngredientParts'].apply(lambda x: truncate_sequences(tokenizer.encode(x, add_special_tokens=True)))
df['tokenized_instructions'] = df['RecipeInstructions'].apply(lambda x: truncate_sequences(tokenizer.encode(x, add_special_tokens=True)))

# Pad sequences
def pad_sequences(sequences):
    return pad_sequence([torch.tensor(seq, dtype=torch.long) for seq in sequences], batch_first=True, padding_value=0)

padded_ingredients = pad_sequences(df['tokenized_ingredients'].tolist())
padded_instructions = pad_sequences(df['tokenized_instructions'].tolist())

# Encode labels
label_encoder = LabelEncoder()
df['encoded_labels'] = label_encoder.fit_transform(df['RecipeId'])

# Dataset class
class RecipeDataset(Dataset):
    def __init__(self, ingredients, instructions, labels):
        self.ingredients = ingredients
        self.instructions = instructions
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'ingredients': self.ingredients[idx],
            'instructions': self.instructions[idx],
            'labels': self.labels[idx]
        }

# Collate function
def collate_fn(batch):
    ingredients = [item['ingredients'] for item in batch]
    instructions = [item['instructions'] for item in batch]
    labels = torch.tensor([item['labels'] for item in batch], dtype=torch.long)

    padded_ingredients = pad_sequence(ingredients, batch_first=True, padding_value=0)
    padded_instructions = pad_sequence(instructions, batch_first=True, padding_value=0)

    return {
        'ingredients': padded_ingredients,
        'instructions': padded_instructions,
        'labels': labels
    }

# Prepare dataset
dataset = RecipeDataset(
    ingredients=padded_ingredients,
    instructions=padded_instructions,
    labels=df['encoded_labels'].tolist()
)

# DataLoader with custom collate function
dataloader = DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)

# Load pre-trained BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))

# Optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)

# Train the model
model.train()

# Number of epochs
num_epochs = 3

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    print('-' * 10)
    
    epoch_loss = 0.0
    
    # Iterate over batches
    for batch_index, batch in enumerate(dataloader):
        # Clear gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(
            input_ids=batch['ingredients'],
            attention_mask=(batch['ingredients'] != 0).long(),
            labels=batch['labels']
        )
        
        # Loss
        loss = outputs.loss
        epoch_loss += loss.item()
        
        # Backward pass
        loss.backward()
        
        # Update weights
        optimizer.step()
        
        # Print batch status
        print(f"Batch {batch_index + 1}/{len(dataloader)} - Loss: {loss.item():.4f}", end='\r')
    
    # Print epoch loss
    print(f"\nEpoch {epoch + 1} Loss: {epoch_loss / len(dataloader):.4f}")
    print()

print('Training complete!')


2024-06-07 21:42:25.912341: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-06-07 21:42:30.555434: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-06-07 21:42:30.555468: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2024-06-07 21:42:38.989264: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directo

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (934 > 512). Running this sequence through the model will result in indexing errors


Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch 1/3
----------
Batch 140/1751 - Loss: 10.2306

In [4]:
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    print('-' * 10)
    
    epoch_loss = 0.0
    
    # Iterate over batches
    for batch_index, batch in enumerate(dataloader):
        # Clear gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(
            input_ids=batch['ingredients'],
            attention_mask=(batch['ingredients'] != 0).long(),
            labels=batch['labels']
        )
        
        # Loss
        loss = outputs.loss
        epoch_loss += loss.item()
        
        # Backward pass
        loss.backward()
        
        # Update weights
        optimizer.step()
        
        # Print batch status
        print(f"Batch {batch_index + 1}/{len(dataloader)} - Loss: {loss.item():.4f}", end='\r')
    
    # Print epoch loss
    print(f"\nEpoch {epoch + 1} Loss: {epoch_loss / len(dataloader):.4f}")
    print()

print('Training complete!')

Epoch 1/3
----------
Batch 3/1751 - Loss: 10.3011

KeyboardInterrupt: 