In [None]:
import os
import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
from transformers import DistilBertTokenizer
from transformers import DistilBertForSequenceClassification
from transformers import TrainingArguments
from datasets import Dataset, DatasetDict

In [None]:
CONFIG_FILE = "./config/bert_config.yml"

# Step up one directory so the relative config path and the `src` package
# resolve (presumably the notebook lives one level below the project root --
# TODO confirm). The guard keeps this cell idempotent: once the config file is
# reachable from the working directory, re-running the cell no longer climbs
# further up the directory tree.
if not os.path.exists(CONFIG_FILE):
    os.chdir('..')

from src.utils.get_config import get_config

# Values read from the config file:
#   DATA_PATH  - CSV file loaded with pd.read_csv
#   SAVE_MODEL - target passed to save_pretrained for model and tokenizer
#   MODEL      - identifier passed to from_pretrained for tokenizer and model
DATA_PATH = get_config(CONFIG_FILE, "DATA_PATH")
SAVE_MODEL = get_config(CONFIG_FILE, "SAVE_MODEL")
MODEL = get_config(CONFIG_FILE, "MODEL")

In [None]:
# Load the CSV configured as DATA_PATH and preview the first rows.
data = pd.read_csv(DATA_PATH)
data.head()

In [None]:
# Keep only the text and the three label columns; everything else is dropped.
columns_to_keep = ["text", "fear", "optimism", "neutral"]
data = data.loc[:, columns_to_keep]
data.head()

In [None]:
def get_category(row):
    """Collapse the one-hot label columns of a row into a single category name.

    The columns are checked in priority order ``fear`` -> ``optimism`` ->
    ``neutral``; the first one equal to 1 wins. Note that ``optimism`` is
    reported as ``"hope"``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexable by column name.

    Returns:
        ``"fear"``, ``"hope"`` or ``"neutral"``, or ``None`` when none of the
        checked columns equals 1.
    """
    priority = (
        ('fear', "fear"),
        ('optimism', "hope"),
        ('neutral', "neutral"),
    )
    for column, category in priority:
        # Columns are read lazily, so later ones are only touched if needed.
        if row[column] == 1:
            return category
    return None

In [None]:
# Derive one category per row, then reduce the frame to text + category.
data = data.assign(category=data.apply(get_category, axis=1))
data = data[['text', 'category']]
data.head()

In [None]:
# Count of non-null texts per category, to inspect class balance.
data.groupby('category')['text'].count()

In [None]:
# Integer class id for every category name.
label_map = {"neutral": 0, "hope": 1, "fear": 2}
data["labels"] = data["category"].map(label_map)

# Rows whose category is missing or not in label_map are mapped to NaN, which
# would silently turn the whole `labels` column into floats. Drop those rows
# and force an integer dtype so only valid class ids reach training.
temp = (
    data
    .dropna(subset=["labels"])
    .drop(columns=["category"])
    .astype({"labels": "int64"})
)

# 80% train; the remaining 20% is halved into validation and test (10% each).
train_data, temp_data = train_test_split(temp, test_size=0.2, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

In [None]:
tokenizer = DistilBertTokenizer.from_pretrained(MODEL)

# The split frames keep their original (shuffled) row index. Without
# preserve_index=False, from_pandas stores that index as an extra
# `__index_level_0__` column in every dataset.
dataset_tr = Dataset.from_pandas(train_data, preserve_index=False)
dataset_val = Dataset.from_pandas(val_data, preserve_index=False)
dataset_test = Dataset.from_pandas(test_data, preserve_index=False)

dataset = DatasetDict({
    "train": dataset_tr,
    "validation": dataset_val,
    "test": dataset_test
})

def preprocess_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Every sequence is padded to the tokenizer's maximum length and truncated
    if longer, so all encoded examples end up with the same length.

    Args:
        examples: Batch from ``Dataset.map(..., batched=True)``; must contain
            a ``'text'`` entry.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(examples['text'], padding="max_length", truncation=True)

encoded_dataset = dataset.map(preprocess_function, batched=True)

# Sanity check: show one fully encoded training example.
print(encoded_dataset['train'][0])

In [None]:
# Derive the classification head size and the label names from label_map so
# they cannot drift apart from the label encoding, and so the saved model
# config carries readable class names instead of generic LABEL_n.
id2label = {class_id: name for name, class_id in label_map.items()}
model = DistilBertForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=len(label_map),
    id2label=id2label,
    label2id=label_map,
)

# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version before upgrading.
training_args = TrainingArguments(
    output_dir="./results",          # Directory for saving results
    evaluation_strategy="epoch",     # Evaluate at the end of each epoch
    learning_rate=5e-5,              # Initial learning rate
    per_device_train_batch_size=16,  # Batch size per GPU
    num_train_epochs=1,              # Number of epochs
    weight_decay=0.01,               # Regularization
    logging_dir="./logs/bert_logs",  # Directory for logs
    logging_steps=10                 # Log every 10 steps
)

In [None]:
from transformers import Trainer


def compute_metrics(eval_pred):
    """Compute accuracy from the logits and labels collected during evaluation.

    Args:
        eval_pred: Evaluation result exposing ``predictions`` (logits) and
            ``label_ids`` (true class ids).

    Returns:
        Dict with a single ``"accuracy"`` entry.
    """
    predicted = np.argmax(eval_pred.predictions, axis=-1)
    return {"accuracy": accuracy_score(eval_pred.label_ids, predicted)}


# Without compute_metrics the per-epoch evaluation would only report the loss.
trainer = Trainer(
    model=model,                                 # The DistilBERT model
    args=training_args,                          # Training arguments
    train_dataset=encoded_dataset['train'],      # Training data
    eval_dataset=encoded_dataset['validation'],  # Validation data
    compute_metrics=compute_metrics              # Accuracy on each evaluation
)

# Start training
trainer.train()

In [None]:
# Get predictions on the held-out test split (logits -> predicted class id).
predictions = trainer.predict(encoded_dataset['test'])
predicted_labels = np.argmax(predictions.predictions, axis=1)
true_labels = encoded_dataset['test']['labels']

# Report per-class metrics with readable names. Passing `labels` explicitly
# keeps the report aligned with label_map even if a class is absent from the
# test split.
class_names = sorted(label_map, key=label_map.get)
class_ids = [label_map[name] for name in class_names]
print(classification_report(
    true_labels,
    predicted_labels,
    labels=class_ids,
    target_names=class_names,
))

In [None]:
# Reuses `predicted_labels` and `true_labels` computed in the previous cell
# instead of running a second, identical prediction pass over the test split.
class_names = sorted(label_map, key=label_map.get)
class_ids = [label_map[name] for name in class_names]

print("Accuracy:")
print(accuracy_score(true_labels, predicted_labels))

print("\nClassification Report:")
print(classification_report(
    true_labels,
    predicted_labels,
    labels=class_ids,
    target_names=class_names,
))

# Fixing `labels` guarantees one row/column per class, in label_map id order,
# even when a class never occurs in the test split.
cm = confusion_matrix(true_labels, predicted_labels, labels=class_ids)
print("\nConfusion Matrix:")
print(cm)

In [None]:
# Persist the fine-tuned model and its tokenizer side by side under SAVE_MODEL.
for artifact in (model, tokenizer):
    artifact.save_pretrained(SAVE_MODEL)