##### Classifying our 1000 rows of data using the BERT transformer model

In [None]:
import pandas as pd
pd.read_csv('processed_data.csv').head()

Unnamed: 0,username,processed_comments,standardized_label,reason
0,LoveAGoodTwist,"Female, Kentucky. 4 years out. Work equine on...",Veterinarian,The statement mentions working in equine-only...
1,wahznooski,"As a woman of reproductive age, fuck Texas",Others,The statement does not specifically mention an...
2,Churro_The_fish_Girl,what makes you want to become a vet?,Veterinarian,The question specifically mentions becoming a ...
3,abarthch,"I see of course there are changing variables, ...",Others,The statement does not relate to medical or ve...
4,VoodooKing,I have 412+ and faced issues because wireguard...,Others,The statement provided does not indicate any d...


#### Install the required libraries

In [2]:
!pip install accelerate>=0.21.0

In [3]:
!pip install transformers[torch]



In [None]:
# import transformers
# import torch
# import accelerate

# print(transformers.__version__)
# print(torch.__version__)
# print(accelerate.__version__)


4.40.2
2.2.1+cu121
0.30.1


In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Load the data
data = pd.read_csv('processed_data.csv')

# Extract features and labels
X = data['processed_comments']
y = data['standardized_label']

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the data
train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(list(X_test), truncation=True, padding=True, max_length=512)

# Define the custom dataset class
class RedditDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Load the respective dataset objects
train_dataset = RedditDataset(train_encodings, y_train)
test_dataset = RedditDataset(test_encodings, y_test)

# Load pre-trained BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))

# Define compute_metrics function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch"
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Evaluate the model
trainer.evaluate()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2015,0.218995,0.945,0.937354,0.93575,0.945
2,0.1309,0.145143,0.945,0.940566,0.936217,0.945
3,0.1913,0.179502,0.955,0.952322,0.953085,0.955


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.1795024871826172,
 'eval_accuracy': 0.955,
 'eval_f1': 0.9523223232962007,
 'eval_precision': 0.9530846917080086,
 'eval_recall': 0.955,
 'eval_runtime': 6.2123,
 'eval_samples_per_second': 32.194,
 'eval_steps_per_second': 4.024,
 'epoch': 3.0}

#### Saving the fine-tuned transformer model

In [None]:
model.save_pretrained('./fine-tuned-bert')
tokenizer.save_pretrained('./fine-tuned-bert')


('./fine-tuned-bert/tokenizer_config.json',
 './fine-tuned-bert/special_tokens_map.json',
 './fine-tuned-bert/vocab.txt',
 './fine-tuned-bert/added_tokens.json')

In [None]:
# Load the saved model
model = BertForSequenceClassification.from_pretrained('./fine-tuned-bert')
tokenizer = BertTokenizer.from_pretrained('./fine-tuned-bert')

# Function to classify new comments
def classify_comment(comment):
    inputs = tokenizer(comment, return_tensors='pt', truncation=True, padding=True, max_length=512)
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=-1)
    return label_encoder.inverse_transform(predictions.cpu().numpy())[0]

new_comment = "I'm planning to become a veterinarian because I love animals."
classification = classify_comment(new_comment)
print(f'Classification: {classification}')


Classification: Veterinarian


In [None]:
new_comment = "I'm in love with animals."
classification = classify_comment(new_comment)
print(f'Classification: {classification}')


Classification: Others


In [None]:
new_comment = "I work with animals."
classification = classify_comment(new_comment)
print(f'Classification: {classification}')


Classification: Others


In [None]:
new_comment = "I'm a medical doctor"
classification = classify_comment(new_comment)
print(f'Classification: {classification}')


Classification: Veterinarian


#### Predictions from our model are not spot on. Perhaps, due to the imbalanced dataset. Revert to the `train.ipynb` notebook to generate synthetic data.