#  Fine-tuning a BERT-based model with a classification layer: Polarity detection in Mental Health and Social Networks

The main idea is to tokenise the texts with a BERT tokeniser and classify them with a new neural-network layer placed on top of the pre-trained BERT model. This "specialises" the classifier on the given task, in this case, classifying Twitch comments.

In [None]:
!pip install transformers
!pip install torch torchvision
!pip install scikit-learn
!pip install matplotlib
!pip install tqdm
!pip install pandas

In [None]:
# Only needed on Google Colab: mount Google Drive at /content/drive so that
# the files referenced under /content/drive/MyDrive can be read.
from google.colab import drive
drive.mount('/content/drive')

## Preparing the database: Mental Health Corpus

Ideally a classifier should have data associated with labels. As machines do not understand words directly, it is best to use numeric labels, such as natural numbers or one-hot encoding. The following code maps each unique value in the `Polarity` column to an integer and stores the result in a new `Numeric_Label` column.

In [None]:
import pandas as pd

# Read the raw corpus (semicolon-separated, UTF-8) from Google Drive
df = pd.read_csv(
    '/content/drive/MyDrive/BERTAttentMask_SM/CorpusSaludMentalCompleto.csv',
    sep=';',
    encoding='utf-8',
)

# Polarity annotation of every row
labels = df['Polarity']

# Give each distinct label an integer code: 0, 1, 2, ...
unique_labels = labels.unique()
label_to_numeric = dict(zip(unique_labels, range(len(unique_labels))))

# Store the integer code of each row's label in a new column
df['Numeric_Label'] = labels.map(label_to_numeric)

#Label Positivo is Positive
#Label Negativo is Negative
#Label Indeterminado is Neutral

# Function to convert numeric label back to text
def numeric_to_text(numeric_label):
    """Return the text label whose numeric code is `numeric_label`.

    The lookup uses the inverse of the module-level `label_to_numeric`
    mapping; a code that is not in the mapping raises KeyError.
    """
    code_to_label = dict(zip(label_to_numeric.values(), label_to_numeric.keys()))
    return code_to_label[numeric_label]

# Example of use of the function: print the text label behind every numeric
# code. Iterating over the mapping (instead of hard-coding 0, 1 and 2) keeps
# this working when the corpus has fewer or more than three distinct labels.
for numeric_label in label_to_numeric.values():
    print(numeric_to_text(numeric_label))


# Save the modified DataFrame in a new CSV file
output_csv = 'data_numeric_label_sm_polaridad.csv'
df.to_csv(output_csv, index=False)

# Report the file that was actually written (the message used to name a
# different file than the one passed to to_csv).
print("A new column with numeric values has been created and the modified DataFrame has been saved in '{}'.".format(output_csv))




# Fine-tuning BERT model with classification layer
We load the database created above. This example was written against a recent PyTorch release; version 2 or later is recommended.

In [None]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm

# Step 1: Load data from a CSV file
def load_data(file_path):
    """Read the CSV at `file_path` and return `(texts, labels)`.

    `texts` is the "Text" column and `labels` the "Numeric_Label" column,
    both converted to plain Python lists.
    """
    frame = pd.read_csv(file_path)
    return frame['Text'].tolist(), frame['Numeric_Label'].tolist()

# CSV with the "Text" and "Numeric_Label" columns
file_path = 'data_numeric_label_sm_polaridad.csv'  # Replace with the path to your CSV file

# Read the sentences and their integer labels
texts, labels = load_data(file_path)

# Step 2: Load the tokeniser of the pre-trained 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Step 3: Tokenise and encode the texts (with truncation and padding enabled)
encodings = tokenizer(texts, padding=True, truncation=True)

# Step 4: Bundle token ids, attention masks and labels into one PyTorch dataset
dataset = TensorDataset(
    *(
        torch.tensor(column)
        for column in (encodings['input_ids'], encodings['attention_mask'], labels)
    )
)

## Model Training
A model based on `BertForSequenceClassification` is trained so that the pipeline for the tokenised data is quicker to implement. This model can be fine-tuned using classical methods (hyperparameter settings).

In [None]:
# Step 5: Configure k-fold cross-validation
kf = KFold(n_splits=8, shuffle=True, random_state=42)  # We change the K-fold value according to our model

# Lists for storing metrics (one entry per fold)
accuracies = []
precisions = []
recalls = []
f1_scores = []
confusion_matrices = []

# Step 6: Train and evaluate the model using k-fold cross-validation
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Numeric labels start at 0, so the number of classes is the maximum value + 1.
num_labels = max(labels) + 1
# Fixed class order so every fold yields a confusion matrix of the same shape,
# even when a class is missing from that fold's test split.
class_ids = list(range(num_labels))

for fold, (train_indices, test_indices) in enumerate(kf.split(dataset)):
    # Release the previous fold's model/optimizer before building new ones so
    # that two copies are never held in memory at the same time.
    model = optimizer = None

    # Start every fold from the pre-trained weights with a fresh optimizer.
    # Re-using one model across folds would let it train on samples that later
    # folds use as "unseen" test data, which inflates the reported metrics.
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels).to(device)
    optimizer = AdamW(model.parameters(), lr=1e-6)

    train_dataset = torch.utils.data.Subset(dataset, train_indices)
    test_dataset = torch.utils.data.Subset(dataset, test_indices)

    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

    # ---- Training ----
    model.train()
    for epoch in range(8):  # 8 epoch in this case
        for batch in tqdm(train_loader, desc="Fold {} - Epoch {}".format(fold+1, epoch + 1)):
            optimizer.zero_grad()
            # Named batch_labels so the module-level `labels` list is not overwritten
            input_ids, attention_mask, batch_labels = (t.to(device) for t in batch)
            outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

    # ---- Evaluation on the held-out split of this fold ----
    model.eval()
    correct = 0
    total = 0
    predicted_labels = []
    true_labels = []
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Fold {} - Evaluation".format(fold+1)):
            input_ids, attention_mask, batch_labels = (t.to(device) for t in batch)
            outputs = model(input_ids, attention_mask=attention_mask)
            # Predicted class = index of the largest logit
            _, predicted = torch.max(outputs.logits, 1)
            total += batch_labels.size(0)
            correct += (predicted == batch_labels).sum().item()
            predicted_labels.extend(predicted.cpu().numpy())
            true_labels.extend(batch_labels.cpu().numpy())

    accuracy = correct / total
    accuracies.append(accuracy)
    precision = precision_score(true_labels, predicted_labels, average='weighted')
    precisions.append(precision)
    recall = recall_score(true_labels, predicted_labels, average='weighted')
    recalls.append(recall)
    f1 = f1_score(true_labels, predicted_labels, average='weighted')
    f1_scores.append(f1)

    print("Metrics in the test set (Fold {}):".format(fold+1))
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-score:", f1)

    # Calculating and storing the confusion matrix (same shape for every fold)
    cm = confusion_matrix(true_labels, predicted_labels, labels=class_ids)
    confusion_matrices.append(cm)

    # Save this fold's trained weights to be used in the future
    torch.save(model.state_dict(), 'bert_model_SM_Polaridad_fold_{}.pth'.format(fold+1))

## Evaluation of the trained BERT model with the classification layer
The evaluation uses the data that was held out from training in each fold.

In [None]:
# Element-wise mean of the per-fold confusion matrices, rounded to whole counts
average_cm = np.round(np.mean(confusion_matrices, axis=0)).astype(int)

# Class names shown on both axes
#   Positivo      -> Positive
#   Negativo      -> Negative
#   Indeterminado -> Neutral
# NOTE(review): assumes numeric codes 0, 1, 2 correspond to these names in
# this order - confirm against the label mapping used to build the CSV.
labels_names = ["Positivo", "Negativo", "Indeterminado"]

# Heat map of the averaged matrix
plt.figure(figsize=(8, 6))
plt.imshow(average_cm, interpolation='nearest', cmap='Blues')
plt.title('Confusion Matrix Average')
plt.colorbar()
tick_marks = np.arange(len(labels_names))
plt.xticks(tick_marks, labels_names, rotation=45)
plt.yticks(tick_marks, labels_names)
plt.tight_layout()
plt.ylabel('Real')
plt.xlabel('Predicted')

# Write each count in the centre of its cell (x position = column, y position = row)
for (row, col), count in np.ndenumerate(average_cm):
    plt.annotate(str(count), xy=(col, row), horizontalalignment='center', verticalalignment='center')

plt.show()

# Mean of every metric over the folds
print("\nAverage metrics across all folds:")
for caption, fold_values in (
    ("Average accuracy:", accuracies),
    ("Average Precision:", precisions),
    ("Average Recall:", recalls),
    ("Average F1-score:", f1_scores),
):
    print(caption, sum(fold_values) / len(fold_values))


## Emotional response prediction of the trained BERT model
Enter a few sentences and see how the new model classifies them.

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Load the fine-tuned weights from a saved checkpoint
model_path = 'bert_model_V9_Polaridad_fold_6.pth'  # The already-trained model is loaded from here
# The number of labels must be the same as the number used during training
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
# map_location lets a checkpoint that was saved from a GPU run load on a
# CPU-only machine. The model is never moved off the CPU in this cell, so the
# weights are mapped to the CPU.
model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
model.eval()

# Initialising the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Text classification function
def classify_text(text):
    """Return the index of the class that the model predicts for `text`.

    Uses the module-level `tokenizer` and `model`. `text` is expected to be a
    single sentence, because the result is converted with `.item()`.
    """
    # Tokenise and encode text as PyTorch tensors
    encoded_text = tokenizer(text, truncation=True, padding=True, return_tensors="pt")

    # Inference only: disable autograd so no computation graph is built or kept
    with torch.no_grad():
        outputs = model(**encoded_text)

    # The prediction is the position of the largest logit
    _, predicted_class = torch.max(outputs.logits, 1)

    return predicted_class.item()

# Interactive demo: classify every phrase typed by the user until 'exit' is entered
while True:
    user_input = input("Enter a phrase to classify (or 'exit' to exit): ")
    if user_input.lower() == 'exit':
        break

    # Translate the predicted class index into its name and show it
    label_id = classify_text(user_input)
    labels_names = ["Positivo", "Negativo", "Indeterminado"]
    print("The sentece '{}' is classified as: {}".format(user_input, labels_names[label_id]))

## Complete training code in one step for 3 polarities: Positive, Negative, Neutral
All the above steps in one unified notebook


In [None]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm

# Step 1: Load data from a CSV file
def load_data(file_path):
    """Return the "Text" and "Numeric_Label" columns of a CSV file as two lists."""
    frame = pd.read_csv(file_path)
    texts, labels = (frame[column].tolist() for column in ('Text', 'Numeric_Label'))
    return texts, labels

# CSV with the "Text" and "Numeric_Label" columns
file_path = 'data_numeric_label_v3_polaridad.csv'  # Replace with the path to your CSV file

# Read the sentences and their integer labels
texts, labels = load_data(file_path)

# Step 2: Load the tokeniser of the pre-trained 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Step 3: Tokenise and encode the texts (with truncation and padding enabled)
encodings = tokenizer(texts, padding=True, truncation=True)

# Step 4: Bundle token ids, attention masks and labels into one PyTorch dataset
dataset = TensorDataset(
    *(
        torch.tensor(column)
        for column in (encodings['input_ids'], encodings['attention_mask'], labels)
    )
)

# Step 5: Configure k-fold cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)  # we can change the value of K-fold

# Lists for storing metrics (one entry per fold)
accuracies = []
precisions = []
recalls = []
f1_scores = []
confusion_matrices = []

# Step 6: Train and evaluate the model using k-fold cross-validation
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Numeric labels start at 0, so the number of classes is the maximum value + 1.
num_labels = max(labels) + 1
# Fixed class order so every fold yields a confusion matrix of the same shape,
# even when a class is missing from that fold's test split.
class_ids = list(range(num_labels))

for fold, (train_indices, test_indices) in enumerate(kf.split(dataset)):
    # Release the previous fold's model/optimizer before building new ones so
    # that two copies are never held in memory at the same time.
    model = optimizer = None

    # Start every fold from the pre-trained weights with a fresh optimizer.
    # Re-using one model across folds would let it train on samples that later
    # folds use as "unseen" test data, which inflates the reported metrics.
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels).to(device)
    optimizer = AdamW(model.parameters(), lr=1e-6)

    train_dataset = torch.utils.data.Subset(dataset, train_indices)
    test_dataset = torch.utils.data.Subset(dataset, test_indices)

    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

    # ---- Training ----
    model.train()
    for epoch in range(8):  # 8 epoch in this case
        for batch in tqdm(train_loader, desc="Fold {} - Epoch {}".format(fold+1, epoch + 1)):
            optimizer.zero_grad()
            # Named batch_labels so the module-level `labels` list is not overwritten
            input_ids, attention_mask, batch_labels = (t.to(device) for t in batch)
            outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

    # ---- Evaluation on the held-out split of this fold ----
    model.eval()
    correct = 0
    total = 0
    predicted_labels = []
    true_labels = []
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Fold {} - Evaluation".format(fold+1)):
            input_ids, attention_mask, batch_labels = (t.to(device) for t in batch)
            outputs = model(input_ids, attention_mask=attention_mask)
            # Predicted class = index of the largest logit
            _, predicted = torch.max(outputs.logits, 1)
            total += batch_labels.size(0)
            correct += (predicted == batch_labels).sum().item()
            predicted_labels.extend(predicted.cpu().numpy())
            true_labels.extend(batch_labels.cpu().numpy())

    accuracy = correct / total
    accuracies.append(accuracy)
    precision = precision_score(true_labels, predicted_labels, average='weighted')
    precisions.append(precision)
    recall = recall_score(true_labels, predicted_labels, average='weighted')
    recalls.append(recall)
    f1 = f1_score(true_labels, predicted_labels, average='weighted')
    f1_scores.append(f1)

    print("Metrics in the test set(Fold {}):".format(fold+1))
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-score:", f1)

    # Calculating and storing the confusion matrix (same shape for every fold)
    cm = confusion_matrix(true_labels, predicted_labels, labels=class_ids)
    confusion_matrices.append(cm)

    # Save this fold's trained weights
    torch.save(model.state_dict(), 'bert_model_V9_Polaridad_fold_{}.pth'.format(fold+1))

# Element-wise mean of the per-fold confusion matrices, rounded to whole counts
average_cm = np.round(np.mean(confusion_matrices, axis=0)).astype(int)

# Class names shown on both axes
#   Positivo      -> Positive
#   Negativo      -> Negative
#   Indeterminado -> Neutral
# NOTE(review): assumes numeric codes 0, 1, 2 correspond to these names in
# this order - confirm against the label mapping used to build the CSV.
labels_names = ["Positivo", "Negativo", "Indeterminado"]

# Heat map of the averaged matrix
plt.figure(figsize=(8, 6))
plt.imshow(average_cm, interpolation='nearest', cmap='Blues')
plt.title('Average Confusion Matrix')
plt.colorbar()
tick_marks = np.arange(len(labels_names))
plt.xticks(tick_marks, labels_names, rotation=45)
plt.yticks(tick_marks, labels_names)
plt.tight_layout()
plt.ylabel('Real')
plt.xlabel('Predicted')

# Write each count in the centre of its cell (x position = column, y position = row)
for (row, col), count in np.ndenumerate(average_cm):
    plt.annotate(str(count), xy=(col, row), horizontalalignment='center', verticalalignment='center')

plt.show()

# Mean of every metric over the folds
print("\nAverage metrics across all folds:")
for caption, fold_values in (
    ("Average accuracy:", accuracies),
    ("Average Precision:", precisions),
    ("Average Recall:", recalls),
    ("Average F1-score:", f1_scores),
):
    print(caption, sum(fold_values) / len(fold_values))

# Model metrics, both globally and by class



In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns


# Define the device to be used (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Load the fine-tuned weights from a saved checkpoint
model_path = '/content/drive/MyDrive/BERTAttentMask_SM/bert_model_SM_Polaridad_fold_7.pth'
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
# map_location=device lets a checkpoint saved from a GPU run load on a
# CPU-only machine, and places the weights on the device chosen above.
model.load_state_dict(torch.load(model_path, map_location=device))
model.to(device)
model.eval()

# Read the numerically-labelled corpus
# Replace with the actual path of your CSV file
data = pd.read_csv("/content/drive/MyDrive/BERTAttentMask_SM/data_numeric_label_sm_polaridad.csv")

# Features (X) are the sentences in "Text"; targets (y) are the integer
# classes in "Numeric_Label"
X, y = data['Text'], data['Numeric_Label']

# Hold out 30% of the rows as the test set (fixed seed for a reproducible split)
# NOTE(review): the checkpoint being evaluated may have been trained on some of
# these rows, in which case the scores below are optimistic - confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.3)

# Tokenise both splits and turn inputs and labels into tensors
train_inputs = tokenizer(list(X_train), truncation=True, padding=True, return_tensors="pt")
train_labels = torch.tensor(y_train.values)
test_inputs = tokenizer(list(X_test), truncation=True, padding=True, return_tensors="pt")
test_labels = torch.tensor(y_test.values)


# Wrap each split in a dataset and a batch-of-8 DataLoader
from torch.utils.data import TensorDataset, DataLoader

train_dataset, test_dataset = (
    TensorDataset(inputs['input_ids'], inputs['attention_mask'], targets)
    for inputs, targets in ((train_inputs, train_labels), (test_inputs, test_labels))
)
train_dataloader, test_dataloader = (
    DataLoader(split, batch_size=8) for split in (train_dataset, test_dataset)
)

# Ground-truth and predicted class ids, accumulated batch by batch
true_labels, predicted_labels = [], []

# Run the model over the test set, one batch at a time
for batch in tqdm(test_dataloader, desc="Inference on test set"):
    # Each batch holds (input_ids, attention_mask, labels); move all to the device
    input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

    # No gradients are needed for inference
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    true_labels.extend(labels.cpu().numpy())
    # Predicted class = index of the largest logit in each row
    predicted_labels.extend(outputs.logits.argmax(dim=1).cpu().numpy())

# Class names, listed in numeric-label order (0, 1, 2)
class_names = ["Positivo", "Negativo", "Indeterminado"]
# Pinning the label ids keeps one row per class in the report and in the
# confusion matrix even when a class is missing from this split. Without it,
# classification_report raises ValueError because target_names would not match
# the labels actually found, and the matrix would not fit the tick labels.
class_ids = list(range(len(class_names)))

# Calculate metrics
accuracy = accuracy_score(true_labels, predicted_labels)
report = classification_report(true_labels, predicted_labels, labels=class_ids, target_names=class_names)

# Calculate the confusion matrix by class
conf_matrix = confusion_matrix(true_labels, predicted_labels, labels=class_ids)

# Calculate precision, recall and F1-score for each class (as a dictionary)
class_metrics = classification_report(true_labels, predicted_labels, labels=class_ids, target_names=class_names, output_dict=True)

# Print results
print("Accuracy:", accuracy)
print("Classification report:")
print(report)
print("Confusion Matrix:")
print(conf_matrix)
print("Class metrics:")
for clase, metrics in class_metrics.items():
    if clase != 'accuracy':  # Exclude the global accuracy metric (a float, not a dict)
        print(f"Class {clase}:")
        print(f"  Precision: {metrics['precision']}")
        print(f"  Recall: {metrics['recall']}")
        print(f"  F1-score: {metrics['f1-score']}")

# Visualising the confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='YlGnBu', xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()


## Emotional response prediction of the trained BERT model with polarity
Enter a few sentences and see how the new model classifies them.

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Pick the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Checkpoint with the fine-tuned weights
model_path = 'bert_model_V10_Polaridad_fold_7.pth'

# The number of labels must be the same as the number used during training
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# map_location maps the stored weights onto the CPU, so the checkpoint also
# loads on a machine without a GPU
state_dict = torch.load(model_path, map_location=torch.device('cpu'))
model.load_state_dict(state_dict)
model.eval()

# Tokeniser of the same pre-trained checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


# Text classification function
def classify_text(text):
    """Return the index of the class that the model predicts for `text`.

    Uses the module-level `tokenizer` and `model`. `text` is expected to be a
    single sentence, because the result is converted with `.item()`.
    """
    # Tokenise and encode text as PyTorch tensors
    encoded_text = tokenizer(text, truncation=True, padding=True, return_tensors="pt")

    # Inference only: disable autograd so no computation graph is built or kept
    with torch.no_grad():
        outputs = model(**encoded_text)

    # The prediction is the position of the largest logit
    _, predicted_class = torch.max(outputs.logits, 1)

    return predicted_class.item()

# Interactive demo: report the polarity of every sentence typed by the user
# until 'exit' is entered
while True:
    user_input = input("Enter a sentence to classify (or 'exit' to exit):: ")
    if user_input.lower() == 'exit':
        break

    # Translate the predicted class index into its name and show it
    label_id = classify_text(user_input)
    labels_names = ["Negativo", "Positivo"]
    print("The sentences '{}' is classified as: {}".format(user_input, labels_names[label_id]))