<div style="background-image: url(https://wallpaperaccess.com/full/1614776.jpg); height: 500px; width: 100%; border: 1px solid black;   background-size: cover; color:white;"> 
<br>
<br>
<h1 style="text-align:center;">BERT - TEXT CLASSIFICATION</h1>
<br>
<h2 style='text-align: center;'> Mercado Libre
<h4 style='text-align: center;'> Andrés Felipe Téllez
<h4 style='text-align: center;'>October 2024</h4>
</div>

Resources: 
- https://medium.com/@khang.pham.exxact/text-classification-with-bert-7afaacc5e49b
- https://huggingface.co/docs/transformers/training


### **Import libraries**

In [75]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import jsonlines

### **Loading data**

In [94]:
def read_jsonline(file_name: str) -> pd.DataFrame:
    '''Read jsonline files as a pandas data frame.'''

    with jsonlines.open(f'../data/raw/{file_name}.jsonlines') as reader:
        
        data: list = []
        for json_line in reader:
            data.append(json_line)

    df = pd.json_normalize(data, sep='.') 
    
    return df

In [95]:
FILE_NAME = 'MLA_100k'
df = read_jsonline(FILE_NAME)

### **Selecting text features**

In [98]:
cols = ['condition', 'title', 'permalink']

### **Merging text to one text feature**

In [99]:
df['text'] = df.apply(lambda row: {'title': row['title'], 'permalink': row['permalink']}, axis=1)

### **Import dataset**

In [100]:
texts = df['text'].tolist()
labels = (df['condition'].apply(lambda x: 1 if x == 'new' else 0)).tolist()

### **Tokenize dataset**

In [56]:
# Create a custom dataset class for text classification
class TextClassificationDataset(Dataset):
        
    def __init__(self, texts, labels, tokenizer, max_length):
            self.texts = texts
            self.labels = labels
            self.tokenizer = tokenizer
            self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(text, return_tensors='pt', max_length=self.max_length, padding='max_length', truncation=True)
        return {'input_ids': encoding['input_ids'].flatten(), 'attention_mask': encoding['attention_mask'].flatten(), 'label': torch.tensor(label)}

### **Build model**

In [57]:
# Build our customer BERT classifier + dropout layer to prever overfitting
class BERTClassifier(nn.Module):
    
    def __init__(self, bert_model_name, num_classes):
        super(BERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
            outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
            pooled_output = outputs.pooler_output
            x = self.dropout(pooled_output)
            logits = self.fc(x)
            return logits

### **Training model**

In [58]:
def train(model, data_loader, optimizer, scheduler, device):
    model.train()
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = nn.CrossEntropyLoss()(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

### **Evaluation**

In [59]:
def evaluate(model, data_loader, device):
    model.eval()
    predictions = []
    actual_labels = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
            predictions.extend(preds.cpu().tolist())
            actual_labels.extend(labels.cpu().tolist())
    return accuracy_score(actual_labels, predictions), classification_report(actual_labels, predictions)

### **Predictions**

In [60]:
def predict_usage(text, model, tokenizer, device, max_length=128):
    model.eval()
    encoding = tokenizer(text, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs, dim=1)

        return "nuevo" if preds.item() == 1 else "usado"

In [61]:
# Parameters
bert_model_name = 'bert-base-uncased'
num_classes = 2
max_length = 128
batch_size = 16
num_epochs = 4
learning_rate = 2e-4

### **Loading and splitting the data.**

In [62]:
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2)

### **Initialize tokenizer, dataset, and data loader**

In [63]:
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

### **Set up the device and model**

In [64]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BERTClassifier(bert_model_name, num_classes).to(device)

### **Set up optimizer and learning rate scheduler**

In [65]:
optimizer = AdamW(model.parameters(), lr=learning_rate)
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

### **Training the model**

In [66]:
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train(model, train_dataloader, optimizer, scheduler, device)
    accuracy, report = evaluate(model, val_dataloader, device)
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(report)

Epoch 1/4
Validation Accuracy: 0.8250
              precision    recall  f1-score   support

           0       0.80      0.83      0.82      9283
           1       0.85      0.82      0.83     10717

    accuracy                           0.82     20000
   macro avg       0.82      0.83      0.82     20000
weighted avg       0.83      0.82      0.83     20000

Epoch 2/4
Validation Accuracy: 0.8337
              precision    recall  f1-score   support

           0       0.78      0.90      0.83      9283
           1       0.90      0.78      0.83     10717

    accuracy                           0.83     20000
   macro avg       0.84      0.84      0.83     20000
weighted avg       0.84      0.83      0.83     20000

Epoch 3/4
Validation Accuracy: 0.8386
              precision    recall  f1-score   support

           0       0.80      0.86      0.83      9283
           1       0.87      0.82      0.84     10717

    accuracy                           0.84     20000
   macro avg  

In [67]:
# Saving the final model
torch.save(model.state_dict(), "bert_classifier.pth")

### **Evaluate**

In [None]:
# Test sentiment prediction
test_text = "iphone sin uso"
sentiment = predict_usage(test_text, model, tokenizer, device)

print("Auriculares nuevos")
print(f"Predicted sentiment: {sentiment}")