In [1]:
%%capture
!pip install torch transformers datasets pandas scikit-learn
!pip install imbalanced-learn

In [None]:
# incidents_rest.csv is the dataset besides the trial
# incidents_sample.csv is the trial dataset

In [6]:
from transformers import BertTokenizer
import pandas as pd
# data = pd.read_csv('incidents_rest.csv', index_col=0)
data = pd.read_csv('incidents_train.csv', index_col=0)
data.sample()

Unnamed: 0,year,month,day,country,title,text,hazard-category,product-category,hazard,product
3681,2019,8,9,us,Dole Fresh Vegetables Announces Precautionary ...,"null Dole Fresh Vegetables, Inc. is voluntaril...",biological,fruits and vegetables,salmonella,baby spinach


In [5]:
import pandas as pd
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_scheduler, DataCollatorWithPadding
from torch.utils.data import DataLoader
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm.auto import tqdm
from sklearn.metrics import classification_report

In [7]:
data.title.str.split().apply(len).describe() # title length

Unnamed: 0,title
count,5082.0
mean,13.282369
std,5.229355
min,1.0
25%,10.0
50%,13.0
75%,16.0
max,44.0


In [8]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
    return tokenizer(examples['title'], padding=True, truncation=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



# Label: `Hazard Category`

* Choose your target

In [9]:
label = 'hazard-category' # change this to: 'product-category', 'hazard', 'product' to alter the ground truth
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data[label])

* Data preprocessing

In [10]:

import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# !yes A | unzip /usr/share/nltk_data/corpora/wordnet.zip

In [11]:


# Get English stopwords
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word.lower() not in stop_words]
    return ' '.join(filtered_text)

def remove_numbers(text):
    text_without_numbers = re.sub(r'\d+', '', text)
    return text_without_numbers

def lemmatize_text(text):
    word_tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    lemmatized = [lemmatizer.lemmatize(word) for word in word_tokens]
    return ' '.join(lemmatized)


# Apply the remove_stopwords function to the 'title' column

data['title'] = data['title'].apply(remove_stopwords)



# Apply the remove_numbers function to the 'title' column

data['title'] = data['title'].apply(remove_numbers)

# Apply lemmatization to the 'title' column
data['title'] = data['title'].apply(lemmatize_text)

In [12]:
filter_data = data[data['title'].str.split().apply(len) > 8 ]
# filter_data = filter_data[filter_data['title'].str.split().apply(len) < 19 ]

In [15]:
# Split the data into training and testing sets
# train_df, test_df = train_test_split(data, test_size=0.2, random_state=42)
# train_df, test_df = train_test_split(filter_data, test_size=0.2, random_state=42)


# Oversampling
import numpy as np
from imblearn.over_sampling import RandomOverSampler
from torch.nn.utils.rnn import pad_sequence
import torch

# Step 1: Split the dataset into training and testing sets
train_df, test_df = train_test_split(filter_data, test_size=0.2, random_state=42)

# Step 2: Separate features and labels for training data
X_train = train_df['title']
y_train = train_df['label']

# Step 3: Tokenize the training data
X_train_tokenized = [tokenizer(title, padding=True, truncation=True, return_tensors='pt')['input_ids'][0] for title in X_train]

# Step 4: Pad the tokenized sequences to a fixed length
X_train_padded = pad_sequence(X_train_tokenized, batch_first=True, padding_value=0)

# Step 5: Convert padded sequences to a NumPy array for oversampling
X_train_np = X_train_padded.numpy()

# Step 6: Apply oversampling to the tokenized data
oversampler = RandomOverSampler(random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X_train_np, y_train)

# Step 7: Create a new dataset with the oversampled data
resampled_dataset = Dataset.from_dict({'input_ids': [torch.tensor(x) for x in X_resampled], 'label': y_resampled})
resampled_dataset.set_format(type='torch', columns=['input_ids', 'label'])

# Step 8: Convert the test DataFrame to Hugging Face Dataset
test_dataset = Dataset.from_pandas(test_df)
test_dataset = test_dataset.map(tokenize_function, batched=True)
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])



# Convert DataFrame to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Apply the tokenizer to the dataset
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Create DataCollator to handle padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True, max_length=16)

# Convert dataset to PyTorch format
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Create DataLoader objects
# train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8, collate_fn=data_collator)
# test_dataloader = DataLoader(test_dataset, batch_size=8, collate_fn=data_collator)

# Create DataLoader for the oversampled training dataset
train_dataloader = DataLoader(resampled_dataset, shuffle=True, batch_size=8, collate_fn=data_collator)

# Create DataLoader for the test dataset (remains unchanged)
test_dataloader = DataLoader(test_dataset, batch_size=8, collate_fn=data_collator)


Map:   0%|          | 0/799 [00:00<?, ? examples/s]

Map:   0%|          | 0/3194 [00:00<?, ? examples/s]

Map:   0%|          | 0/799 [00:00<?, ? examples/s]

* Choose your model

In [16]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(data[label].unique()))
model.to('cuda')  # Move model to GPU if available

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

* Train it

In [17]:
optimizer = AdamW(model.parameters(), lr=5e-5)

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

model.train()

progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to('cuda') for k, v in batch.items()}  # Move batch to GPU if available
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)



  0%|          | 0/4500 [00:00<?, ?it/s]



* Assess it

In [18]:
model.eval()
total_predictions = []
with torch.no_grad():
    for batch in test_dataloader:
        batch = {k: v.to('cuda') for k, v in batch.items()}  # Move batch to GPU if available
        outputs = model(**batch)
        predictions = torch.argmax(outputs.logits, dim=-1)
        total_predictions.extend([p.item() for p in predictions])

#print(classification_report(test_df.label, total_predictions))
predicted_labels = label_encoder.inverse_transform(total_predictions)
gold_labels = label_encoder.inverse_transform(test_df.label.values)
print(classification_report(gold_labels, predicted_labels, zero_division=0))

                                precision    recall  f1-score   support

                     allergens       0.90      0.96      0.93       298
                    biological       0.89      0.97      0.93       272
                      chemical       0.89      0.75      0.81        52
food additives and flavourings       0.60      0.50      0.55         6
                foreign bodies       0.86      0.82      0.84        77
                         fraud       0.89      0.65      0.75        62
                     migration       0.00      0.00      0.00         1
          organoleptic aspects       1.00      0.40      0.57         5
                  other hazard       0.85      0.55      0.67        20
              packaging defect       1.00      0.50      0.67         6

                      accuracy                           0.89       799
                     macro avg       0.79      0.61      0.67       799
                  weighted avg       0.89      0.89      0.88 

In [19]:
model.save_pretrained("bert_hazard_category")

# Label: `Product Category`

remember to load dataset again, since in previous we may modified the column

In [20]:
from transformers import BertTokenizer
import pandas as pd
# data = pd.read_csv('incidents_rest.csv', index_col=0)
data = pd.read_csv('incidents_train.csv', index_col=0)
data.sample()

Unnamed: 0,year,month,day,country,title,text,hazard-category,product-category,hazard,product
4635,2020,11,28,ca,Certain Metro brand products recalled due to S...,Food Recall Warning - Certain Metro brand prod...,biological,prepared dishes and snacks,salmonella,prepared dishes and snacks


In [21]:
label = 'product-category'
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data[label])

In [22]:
# Apply the remove_stopwords function to the 'title' column

data['title'] = data['title'].apply(remove_stopwords)



# Apply the remove_numbers function to the 'title' column

data['title'] = data['title'].apply(remove_numbers)

# Apply lemmatization to the 'title' column
data['title'] = data['title'].apply(lemmatize_text)

In [None]:
filter_data = data[data['title'].str.split().apply(len) > 8 ]
filter_data = filter_data[filter_data['title'].str.split().apply(len) < 24 ]

In [23]:


# Split the data into training and testing sets
# train_df, test_df = train_test_split(data, test_size=0.2, random_state=42)
# train_df, test_df = train_test_split(filter_data, test_size=0.2, random_state=42)


# Oversampling
import numpy as np
from imblearn.over_sampling import RandomOverSampler
from torch.nn.utils.rnn import pad_sequence
import torch

# Step 1: Split the dataset into training and testing sets
train_df, test_df = train_test_split(filter_data, test_size=0.2, random_state=42)

# Step 2: Separate features and labels for training data
X_train = train_df['title']
y_train = train_df['label']

# Step 3: Tokenize the training data
X_train_tokenized = [tokenizer(title, padding=True, truncation=True, return_tensors='pt')['input_ids'][0] for title in X_train]

# Step 4: Pad the tokenized sequences to a fixed length
X_train_padded = pad_sequence(X_train_tokenized, batch_first=True, padding_value=0)

# Step 5: Convert padded sequences to a NumPy array for oversampling
X_train_np = X_train_padded.numpy()

# Step 6: Apply oversampling to the tokenized data
oversampler = RandomOverSampler(random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X_train_np, y_train)

# Step 7: Create a new dataset with the oversampled data
resampled_dataset = Dataset.from_dict({'input_ids': [torch.tensor(x) for x in X_resampled], 'label': y_resampled})
resampled_dataset.set_format(type='torch', columns=['input_ids', 'label'])

# Step 8: Convert the test DataFrame to Hugging Face Dataset
test_dataset = Dataset.from_pandas(test_df)
test_dataset = test_dataset.map(tokenize_function, batched=True)
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Convert DataFrame to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Apply the tokenizer to the dataset
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Create DataCollator to handle padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True, max_length=16)

# Convert dataset to PyTorch format
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Create DataLoader objects
# train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8, collate_fn=data_collator)
# test_dataloader = DataLoader(test_dataset, batch_size=8, collate_fn=data_collator)

# Create DataLoader for the oversampled training dataset
train_dataloader = DataLoader(resampled_dataset, shuffle=True, batch_size=8, collate_fn=data_collator)

# Create DataLoader for the test dataset (remains unchanged)
test_dataloader = DataLoader(test_dataset, batch_size=8, collate_fn=data_collator)

Map:   0%|          | 0/799 [00:00<?, ? examples/s]

Map:   0%|          | 0/3194 [00:00<?, ? examples/s]

Map:   0%|          | 0/799 [00:00<?, ? examples/s]

* Train

In [24]:
model_product_category = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(data[label].unique()))
model_product_category.to('cuda')  # Move model to GPU if available

optimizer = AdamW(model_product_category.parameters(), lr=5e-5)
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

model_product_category.train()
progress_bar = tqdm(range(num_training_steps))
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to('cuda') for k, v in batch.items()}  # Move batch to GPU if available
        outputs = model_product_category(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/4500 [00:00<?, ?it/s]



* Test

In [25]:
model_product_category.eval()
total_predictions = []
with torch.no_grad():
    for batch in test_dataloader:
        batch = {k: v.to('cuda') for k, v in batch.items()}  # Move batch to GPU if available
        outputs = model_product_category(**batch)
        predictions = torch.argmax(outputs.logits, dim=-1)
        total_predictions.extend([p.item() for p in predictions])

#print(classification_report(test_df.label, total_predictions, zero_division=0))
predicted_labels = label_encoder.inverse_transform(total_predictions)
gold_labels = label_encoder.inverse_transform(test_df.label.values)
print(classification_report(gold_labels, predicted_labels, zero_division=0))

                                                   precision    recall  f1-score   support

                              alcoholic beverages       0.89      0.93      0.91       298
                      cereals and bakery products       0.88      0.96      0.92       272
     cocoa and cocoa preparations, coffee and tea       0.89      0.79      0.84        52
                                    confectionery       0.50      0.33      0.40         6
dietetic foods, food supplements, fortified foods       0.92      0.84      0.88        77
                                    fats and oils       0.87      0.66      0.75        62
                                   feed materials       0.00      0.00      0.00         1
                   food additives and flavourings       0.67      0.40      0.50         5
                           food contact materials       0.91      0.50      0.65        20
                            fruits and vegetables       0.80      0.67      0.73         

In [26]:
model_product_category.save_pretrained("bert_product_category")

# Label: `Hazard`

In [27]:
label = 'hazard'
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data[label])

# Split the data into training and testing sets
# train_df, test_df = train_test_split(data, test_size=0.2, random_state=42)
# Oversampling
import numpy as np
from imblearn.over_sampling import RandomOverSampler
from torch.nn.utils.rnn import pad_sequence
import torch

# Step 1: Split the dataset into training and testing sets
train_df, test_df = train_test_split(filter_data, test_size=0.2, random_state=42)

# Step 2: Separate features and labels for training data
X_train = train_df['title']
y_train = train_df['label']

# Step 3: Tokenize the training data
X_train_tokenized = [tokenizer(title, padding=True, truncation=True, return_tensors='pt')['input_ids'][0] for title in X_train]

# Step 4: Pad the tokenized sequences to a fixed length
X_train_padded = pad_sequence(X_train_tokenized, batch_first=True, padding_value=0)

# Step 5: Convert padded sequences to a NumPy array for oversampling
X_train_np = X_train_padded.numpy()

# Step 6: Apply oversampling to the tokenized data
oversampler = RandomOverSampler(random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X_train_np, y_train)

# Step 7: Create a new dataset with the oversampled data
resampled_dataset = Dataset.from_dict({'input_ids': [torch.tensor(x) for x in X_resampled], 'label': y_resampled})
resampled_dataset.set_format(type='torch', columns=['input_ids', 'label'])

# Step 8: Convert the test DataFrame to Hugging Face Dataset
test_dataset = Dataset.from_pandas(test_df)
test_dataset = test_dataset.map(tokenize_function, batched=True)
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Convert DataFrame to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Apply the tokenizer to the dataset
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Create DataCollator to handle padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True, max_length=16)

# Convert dataset to PyTorch format
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Create DataLoader objects
# train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8, collate_fn=data_collator)
# test_dataloader = DataLoader(test_dataset, batch_size=8, collate_fn=data_collator)

# Create DataLoader for the oversampled training dataset
train_dataloader = DataLoader(resampled_dataset, shuffle=True, batch_size=8, collate_fn=data_collator)

# Create DataLoader for the test dataset (remains unchanged)
test_dataloader = DataLoader(test_dataset, batch_size=8, collate_fn=data_collator)

Map:   0%|          | 0/799 [00:00<?, ? examples/s]

Map:   0%|          | 0/3194 [00:00<?, ? examples/s]

Map:   0%|          | 0/799 [00:00<?, ? examples/s]

In [28]:
model_hazard = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(data[label].unique()))
model_hazard.to('cuda')  # Move model to GPU if available

optimizer = AdamW(model_hazard.parameters(), lr=5e-5)

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

model_hazard.train()

progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to('cuda') for k, v in batch.items()}  # Move batch to GPU if available
        outputs = model_hazard(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/4500 [00:00<?, ?it/s]



In [29]:
model_hazard.eval()
total_predictions = []
with torch.no_grad():
    for batch in test_dataloader:
        batch = {k: v.to('cuda') for k, v in batch.items()}  # Move batch to GPU if available
        outputs = model_hazard(**batch)
        predictions = torch.argmax(outputs.logits, dim=-1)
        total_predictions.extend([p.item() for p in predictions])

#print(classification_report(test_df.label, total_predictions, zero_division=0))
predicted_labels = label_encoder.inverse_transform(total_predictions)
gold_labels = label_encoder.inverse_transform(test_df.label.values)
print(classification_report(gold_labels, predicted_labels, zero_division=0))

                                      precision    recall  f1-score   support

                           Aflatoxin       0.90      0.93      0.92       298
                      abnormal smell       0.91      0.96      0.93       272
                     alcohol content       0.89      0.77      0.82        52
                           alkaloids       0.75      0.50      0.60         6
                           allergens       0.91      0.88      0.89        77
                              almond       0.82      0.65      0.72        62
altered organoleptic characteristics       0.00      0.00      0.00         1
                           amygdalin       1.00      0.60      0.75         5
              antibiotics, vet drugs       0.70      0.70      0.70        20
                       bacillus spp.       0.83      0.83      0.83         6

                            accuracy                           0.89       799
                           macro avg       0.77      0.68     

In [30]:
model_hazard.save_pretrained("bert_hazard")

# Label: `product`

In [31]:
label = 'product'
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data[label])

# Split the data into training and testing sets
# train_df, test_df = train_test_split(data, test_size=0.2, random_state=42)
import numpy as np
from imblearn.over_sampling import RandomOverSampler
from torch.nn.utils.rnn import pad_sequence
import torch

# Step 1: Split the dataset into training and testing sets
train_df, test_df = train_test_split(filter_data, test_size=0.2, random_state=42)

# Step 2: Separate features and labels for training data
X_train = train_df['title']
y_train = train_df['label']

# Step 3: Tokenize the training data
X_train_tokenized = [tokenizer(title, padding=True, truncation=True, return_tensors='pt')['input_ids'][0] for title in X_train]

# Step 4: Pad the tokenized sequences to a fixed length
X_train_padded = pad_sequence(X_train_tokenized, batch_first=True, padding_value=0)

# Step 5: Convert padded sequences to a NumPy array for oversampling
X_train_np = X_train_padded.numpy()

# Step 6: Apply oversampling to the tokenized data
oversampler = RandomOverSampler(random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X_train_np, y_train)

# Step 7: Create a new dataset with the oversampled data
resampled_dataset = Dataset.from_dict({'input_ids': [torch.tensor(x) for x in X_resampled], 'label': y_resampled})
resampled_dataset.set_format(type='torch', columns=['input_ids', 'label'])

# Step 8: Convert the test DataFrame to Hugging Face Dataset
test_dataset = Dataset.from_pandas(test_df)
test_dataset = test_dataset.map(tokenize_function, batched=True)
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Convert DataFrame to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Apply the tokenizer to the dataset
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Create DataCollator to handle padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True, max_length=16)

# Convert dataset to PyTorch format
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# # Create DataLoader objects
# train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8, collate_fn=data_collator)
# test_dataloader = DataLoader(test_dataset, batch_size=8, collate_fn=data_collator)

# Create DataLoader for the oversampled training dataset
train_dataloader = DataLoader(resampled_dataset, shuffle=True, batch_size=8, collate_fn=data_collator)

# Create DataLoader for the test dataset (remains unchanged)
test_dataloader = DataLoader(test_dataset, batch_size=8, collate_fn=data_collator)

Map:   0%|          | 0/799 [00:00<?, ? examples/s]

Map:   0%|          | 0/3194 [00:00<?, ? examples/s]

Map:   0%|          | 0/799 [00:00<?, ? examples/s]

In [32]:
model_product = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(data[label].unique()))
model_product.to('cuda')  # Move model to GPU if available

optimizer = AdamW(model_product.parameters(), lr=5e-5)
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
model_product.train()
progress_bar = tqdm(range(num_training_steps))
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to('cuda') for k, v in batch.items()}  # Move batch to GPU if available
        outputs = model_product(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/4500 [00:00<?, ?it/s]



In [34]:
model_product.eval()
total_predictions = []
with torch.no_grad():
    for batch in test_dataloader:
        batch = {k: v.to('cuda') for k, v in batch.items()}  # Move batch to GPU if available
        outputs = model_product(**batch)
        predictions = torch.argmax(outputs.logits, dim=-1)
        total_predictions.extend([p.item() for p in predictions])

#print(classification_report(test_df.label, total_predictions, zero_division=0))
predicted_labels = label_encoder.inverse_transform(total_predictions)
gold_labels = label_encoder.inverse_transform(test_df.label.values)
print(classification_report(gold_labels, predicted_labels, zero_division=0))



                                     precision    recall  f1-score   support

             Catfishes (freshwater)       0.90      0.96      0.93       298
                    Dried pork meat       0.94      0.96      0.95       272
              Fishes not identified       0.86      0.83      0.84        52
                 Groupers (generic)       0.60      0.50      0.55         6
           Not classified pork meat       0.88      0.87      0.88        77
         Pangas catfishes (generic)       0.81      0.68      0.74        62
Precooked cooked pork meat products       0.00      0.00      0.00         1
 Torpedo-shaped catfishes (generic)       1.00      0.60      0.75         5
                      Veggie Burger       0.73      0.55      0.63        20
                    adobo seasoning       1.00      0.67      0.80         6

                           accuracy                           0.90       799
                          macro avg       0.77      0.66      0.71       7

In [35]:
model_product.save_pretrained("bert_product")
tokenizer.save_pretrained("bert_tokenizer")

('bert_tokenizer/tokenizer_config.json',
 'bert_tokenizer/special_tokens_map.json',
 'bert_tokenizer/vocab.txt',
 'bert_tokenizer/added_tokens.json')

In [36]:
!zip bert_baseline.zip bert_*

  adding: bert_hazard/ (stored 0%)
  adding: bert_hazard_category/ (stored 0%)
  adding: bert_product/ (stored 0%)
  adding: bert_product_category/ (stored 0%)
  adding: bert_tokenizer/ (stored 0%)


# Loading a trained baseline

In [37]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

def predict(texts, model_path, tokenizer_path='bert-base-uncased'):
    # Load the saved tokenizer
    tokenizer = BertTokenizer.from_pretrained(tokenizer_path)

    # Load the saved model
    model = BertForSequenceClassification.from_pretrained(model_path)

    # Move model to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Tokenize the input texts
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

    # Move inputs to the same device as the model
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Set the model to evaluation mode
    model.eval()

    # Make predictions
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

    return predictions

In [38]:
from sklearn.metrics import f1_score

def compute_score(hazards_true, products_true, hazards_pred, products_pred):
  # compute f1 for hazards:
  f1_hazards = f1_score(
    hazards_true,
    hazards_pred,
    average='macro'
  )

  # compute f1 for products:
  f1_products = f1_score(
    products_true[hazards_pred == hazards_true],
    products_pred[hazards_pred == hazards_true],
    average='macro'
  )

  return (f1_hazards + f1_products) / 2.

### Sub-Task 1:

In [39]:
trial = pd.read_csv('incidents_train.csv', index_col=0)

In [1]:
predictions = predict(trial.title.to_list(), "/content/bert_hazard_category")

# Decode predictions back to string labels
label_encoder = LabelEncoder()
label_encoder.fit(data['hazard-category'])

trial['bert-hazard-category'] = predict(trial.title.to_list(), "bert_hazard_category")
print(classification_report(trial['hazard-category'], trial['bert-hazard-category'], zero_division=0))

NameError: name 'predict' is not defined

In [None]:
predictions = predict(trial.title.to_list(), "bert_product_category")

# Decode predictions back to string labels
label_encoder = LabelEncoder()
label_encoder.fit(data['product-category'])

trial['bert-product-category'] = predict(trial.title.to_list(), "bert_product_category")
print(classification_report(trial['product-category'], trial['bert-product-category'], zero_division=0))

In [None]:
print('Score Sub-Task 1:', compute_score(trial['hazard-category'], trial['product-category'], trial['bert-hazard-category'], trial['bert-product-category']))

### Sub-Task 2:

In [None]:
predictions = predict(trial.title.to_list(), "bert_hazard")

# Decode predictions back to string labels
label_encoder = LabelEncoder()
label_encoder.fit(data['hazard'])

trial['bert-hazard'] = predict(trial.title.to_list(), "bert_hazard")
print(classification_report(trial['hazard'], trial['bert-hazard'], zero_division=0))

In [None]:
predictions = predict(trial.title.to_list(), "bert_product")

# Decode predictions back to string labels
label_encoder = LabelEncoder()
label_encoder.fit(data['product'])

trial['bert-product'] = predict(trial.title.to_list(), "bert_product")
print(classification_report(trial['product'], trial['bert-product'], zero_division=0))

In [None]:
print('Score Sub-Task 2:', compute_score(trial['hazard'], trial['product'], trial['bert-hazard'], trial['bert-product']))