# Fine-tuning a pretrained BERT model for emotion classification

In [42]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import BertTokenizer
import torch

# ---------------------------------------------------------------------------
# Data preparation: load the labelled text, split it into train/validation,
# tokenize both splits with the BERT tokenizer and wrap them as `datasets`
# objects that the training cell consumes (`train_dataset`, `val_dataset`).
# ---------------------------------------------------------------------------

# Tunables for this cell.
DATA_PATH = '../../data/text/combined_cleaned.csv'
SEED = 42               # fixes the train/validation split across kernel restarts
DEBUG_SAMPLE_SIZE = 5   # rows kept per split for a quick smoke test; None keeps every row


def _to_dataset(encodings, labels):
    """Bundle tokenizer output and integer labels into a `datasets.Dataset`.

    Args:
        encodings: mapping returned by the tokenizer; only 'input_ids' and
            'attention_mask' are read.
        labels: list of integer class ids, one per encoded text.

    Returns:
        A Dataset with the columns 'input_ids', 'attention_mask' and 'labels'.
    """
    return Dataset.from_dict({
        'input_ids': encodings['input_ids'],
        'attention_mask': encodings['attention_mask'],
        'labels': labels,
    })


# Rows with a missing text or emotion are dropped up front: the tokenizer only
# accepts strings (a missing cell is read as float NaN), and factorize() marks
# a missing emotion with -1, which is not a valid class index and would also
# inflate the class count later derived from df['label'].
df = (
    pd.read_csv(DATA_PATH)
    .dropna(subset=['text', 'emotion'])
    .reset_index(drop=True)
)

# Encode each distinct emotion as an integer id (0..n_classes-1).
df['label'] = df['emotion'].factorize()[0]

# Seeded so that re-running the notebook yields the same split.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=SEED
)

# The tokenizer expects plain Python lists of str; the labels become lists of
# int. Slicing with None is a no-op, so DEBUG_SAMPLE_SIZE = None uses all rows.
train_texts = train_texts.astype(str).tolist()[:DEBUG_SAMPLE_SIZE]
val_texts = val_texts.astype(str).tolist()[:DEBUG_SAMPLE_SIZE]
train_labels = train_labels.tolist()[:DEBUG_SAMPLE_SIZE]
val_labels = val_labels.tolist()[:DEBUG_SAMPLE_SIZE]

assert len(train_texts) == len(train_labels), "train texts/labels are misaligned"
assert len(val_texts) == len(val_labels), "validation texts/labels are misaligned"

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# padding=True pads every sequence to the longest one in the same call, so all
# rows within a split share one length; truncation=True caps over-long texts.
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

# Printing the raw encodings is only readable for a handful of rows, so the
# dump is limited to debug-sized runs.
if DEBUG_SAMPLE_SIZE is not None:
    print(train_encodings)

train_dataset = _to_dataset(train_encodings, train_labels)
val_dataset = _to_dataset(val_encodings, val_labels)


# from transformers import BertTokenizer
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# text_batch = ["I love Pixar.", "I don't care for Pixar."]
# encoding = tokenizer(text_batch, return_tensors='pt', padding=True, truncation=True)
# input_ids = encoding['input_ids']
# attention_mask = encoding['attention_mask']

{'input_ids': [[101, 2748, 2672, 2411, 4687, 2374, 5114, 2172, 6217, 2855, 2327, 14999, 2941, 2191, 3807, 2172, 2327, 2374, 2732, 2051, 102, 0], [101, 3198, 2113, 2518, 2673, 2689, 2514, 2978, 15311, 3046, 2562, 2599, 3341, 4121, 3815, 2689, 13260, 3499, 2272, 12139, 4608, 102], [101, 21271, 3676, 16665, 3711, 3185, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 9788, 2204, 2146, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 4067, 29337, 2569, 27576, 2191, 2154, 3524, 2733, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0], [1, 



In [None]:
from transformers import BertForSequenceClassification
from transformers import Trainer, TrainingArguments

# ---------------------------------------------------------------------------
# Model training: fine-tune `bert-base-uncased` as a sequence classifier on
# `train_dataset` and report the evaluation metrics on `val_dataset`.
# ---------------------------------------------------------------------------

# The classification head gets one output per distinct encoded label.
num_emotions = df['label'].nunique()
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=num_emotions,
)

training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir='./results',
    logging_dir='./logs',
    # Schedule: three passes over the training data, evaluating after each.
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` - confirm against the installed version.
    num_train_epochs=3,
    evaluation_strategy='epoch',
    # Batch sizes per device.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # Optimizer schedule / regularization.
    # NOTE(review): 500 warmup steps is more than the total number of
    # optimizer steps when the training set is only a handful of rows, so the
    # learning rate would never finish warming up - confirm this is intended.
    warmup_steps=500,
    weight_decay=0.01,
)

# The Trainer ties the model, its hyperparameters and both datasets together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Fine-tune, then evaluate on the validation split; the evaluation result is
# the cell's last expression, so the notebook displays it.
trainer.train()
trainer.evaluate()
