In [1]:
import pandas as pd
import numpy as np
import random
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from datasets import Dataset
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer
import warnings

# Suppress every Python warning for the rest of the session so cell outputs stay clean.
# NOTE(review): this is a blanket filter, so any warning raised by later cells
# (library deprecations, pandas/torch usage hints, sklearn metric notices) will
# not be shown — consider narrowing it once the notebook is stable.
warnings.filterwarnings("ignore")

In [2]:
# Fix one seed and push it into every random number generator the notebook
# touches (Python stdlib, NumPy, PyTorch CPU, PyTorch CUDA on all devices).
seed = 42
for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seeder(seed)

# Ask cuDNN to use deterministic algorithms so repeated runs can match.
torch.backends.cudnn.deterministic = True

In [3]:
# Training configuration
learning_rate = 2e-5  # step size handed to the optimizer
epochs = 3            # number of full passes over the training loader
batch_size = 16       # examples per DataLoader batch

# Label vocabulary for multi-label classification. The position of each entry
# fixes its column in the binary label vectors; "Other" is kept last because
# the labelling code treats the final entry as the fallback category.
valid_labels = [
    "דיכאון ועצבות קשה",  # depression and severe sadness
    "פציעה עצמית",        # self-injury
    "טראומה מינית",       # sexual trauma
    "Other",
]

In [4]:
# Pretrained checkpoint shared by the tokenizer and the classifier
model_name = 'onlplab/alephbert-base'

# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Sequence-classification head with one output logit per entry of valid_labels,
# placed on the selected device straight after loading
bert_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(valid_labels),
).to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at onlplab/alephbert-base and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
## Loading Data
# NOTE(review): these are absolute, machine-specific paths; the notebook only
# runs where this directory layout exists.
conv_info_path = '/home/astrin/Projects/Sahar/Dataset/conv_info.csv'
messages_path = '/home/astrin/Projects/Sahar/Dataset/messages_anonymized.csv'

conv_info_df = pd.read_csv(conv_info_path)
messages_df = pd.read_csv(messages_path)

# Normalise the join key to strings in both frames so the later merge on
# 'engagement_id' compares like with like.
conv_info_df['engagement_id'] = conv_info_df['engagement_id'].astype(str)
messages_df['engagement_id'] = messages_df['engagement_id'].astype(str)

# Drop messages without text. The explicit .copy() makes the filtered frame an
# independent object, so the column assignment below is not a chained
# assignment on a slice of the original frame (SettingWithCopyWarning).
messages_df = messages_df[messages_df['text'].notna()].copy()

# Replace missing sender names with a '-' placeholder.
messages_df['name'] = messages_df['name'].fillna('-')

In [6]:
# Collect the list of label names that apply to one conversation record
def assign_multi_labels(row):
    """Return the labels from ``valid_labels`` that appear in the row's subjects.

    Every entry of ``valid_labels`` except the last one is checked, in order,
    against the values of ``row['subject_1']``, ``row['subject_2']`` and
    ``row['subject_3']``. If none of them matches, the result is ``["Other"]``.

    Args:
        row: Mapping-like record exposing the three ``subject_*`` keys.

    Returns:
        list[str]: Matching labels in ``valid_labels`` order, or ``["Other"]``.
    """
    subjects = (row['subject_1'], row['subject_2'], row['subject_3'])
    matched = [candidate for candidate in valid_labels[:-1] if candidate in subjects]
    return matched if matched else ["Other"]

In [7]:
# Attach the list of applicable label names to every conversation record
conv_info_df['labels'] = conv_info_df.apply(assign_multi_labels, axis=1)

# Join messages with their conversation metadata, keep only the help seeker's
# messages, then collapse each conversation into one row: its messages are
# concatenated with '[SEP]' and the first 'labels' value of the group is kept.
merged_df = (
    messages_df
    .merge(conv_info_df, on='engagement_id')
    .loc[lambda frame: frame['seeker'] == True]
    .groupby('engagement_id')
    .agg({'text': '[SEP]'.join, 'labels': 'first'})
    .reset_index()
)

# Encode the label lists as binary indicator vectors whose column order
# follows valid_labels
mlb = MultiLabelBinarizer(classes=valid_labels)
merged_df['label'] = mlb.fit_transform(merged_df['labels']).tolist()

# Hold out 20% of the conversations for testing; the split is seeded
train_df, test_df = train_test_split(merged_df, test_size=0.2, random_state=seed)


In [8]:
# Number of aggregated conversations (one row per engagement_id)
print(merged_df.shape[0])

30232


In [9]:
## Tokenization
def tokenize(batch):
    """Tokenize the ``text`` entries of a batch with the notebook's tokenizer.

    Every sequence is truncated to, and padded up to, 512 tokens.

    Args:
        batch: Mapping with a ``'text'`` entry holding the texts to encode.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    encoding_options = {'padding': 'max_length', 'truncation': True, 'max_length': 512}
    return tokenizer(batch['text'], **encoding_options)

In [10]:
def _to_torch_dataset(frame):
    """Turn a split DataFrame into a tokenized dataset that yields torch tensors.

    The frame is converted to a ``Dataset``, tokenized with ``tokenize`` in
    batches of 16 rows, and restricted to the columns the model loops read.
    """
    dataset = Dataset.from_pandas(frame).map(tokenize, batched=True, batch_size=16)
    dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
    return dataset


train_dataset = _to_torch_dataset(train_df)
test_dataset = _to_torch_dataset(test_df)

# Shuffle only the training batches; the test loader keeps dataset order
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

Map:   0%|          | 0/24185 [00:00<?, ? examples/s]

Map:   0%|          | 0/6047 [00:00<?, ? examples/s]

In [11]:
# Report how many conversations ended up in each split
print(f"Train set size: {len(train_df)}")
print(f"Test set size: {len(test_df)}")

# Count how often each binary label vector occurs in the test split; the vector
# positions follow the order of valid_labels.
# NOTE(review): the 'label' cells are Python lists, which are unhashable —
# value_counts over such a column may not work on every pandas version; confirm.
print(test_df['label'].value_counts())

Train set size: 24185
Test set size: 6047
label
[0, 0, 0, 1]    3784
[1, 0, 0, 0]    1620
[0, 0, 1, 0]     282
[0, 1, 0, 0]     161
[1, 1, 0, 0]     110
[1, 0, 1, 0]      73
[0, 1, 1, 0]      15
[1, 1, 1, 0]       2
Name: count, dtype: int64


In [12]:
import torch.nn as nn

# Multi-label objective: BCEWithLogitsLoss applies a sigmoid to each logit and
# scores every label as its own binary decision.
criterion = nn.BCEWithLogitsLoss()

# Model Training
optimizer = torch.optim.AdamW(bert_model.parameters(), lr=learning_rate)
bert_model.train()

# One progress-bar tick per optimizer step across all epochs
progress_bar = tqdm(range(epochs * len(train_loader)), desc="Training")

for epoch in range(epochs):
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # as_tensor converts the targets to float32 on the target device without
        # re-wrapping an existing tensor through torch.tensor (which emits a
        # copy-construct warning); it still accepts plain nested lists.
        # Presumably batch['label'] is already a (batch, num_labels) tensor
        # because the dataset uses the 'torch' format — TODO confirm.
        labels = torch.as_tensor(batch['label'], dtype=torch.float32, device=device)

        outputs = bert_model(input_ids, attention_mask=attention_mask)

        logits = outputs.logits  # raw, pre-sigmoid scores from the classification head

        # The loss is computed here rather than by the model, since no labels
        # are passed into the forward call.
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        # Surface the current epoch and step loss so training progress is observable
        progress_bar.set_postfix(epoch=epoch + 1, loss=f"{loss.item():.4f}")
        progress_bar.update(1)

progress_bar.close()

Training:   0%|          | 0/4536 [00:00<?, ?it/s]

In [13]:
## Evaluating the Model
bert_model.eval()  # switch the model to evaluation mode for inference
labels = []
preds = []

for batch in tqdm(test_loader):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    # The ground truth is only collected for the metrics, so it is not moved to
    # the model device and back. as_tensor avoids re-wrapping an existing
    # tensor through torch.tensor (copy-construct warning).
    label = torch.as_tensor(batch['label'], dtype=torch.float32)

    # No gradients are needed for inference
    with torch.no_grad():
        outputs = bert_model(input_ids, attention_mask=attention_mask)

    logits = outputs.logits
    predictions = torch.sigmoid(logits)  # independent per-label probabilities

    # A label is predicted when its probability exceeds 0.5
    predictions = (predictions > 0.5).int()

    # extend() appends one row (one example) at a time
    labels.extend(label.cpu().numpy())
    preds.extend(predictions.cpu().numpy())

# Stack the per-example rows into 2-D arrays for the metrics cell
labels = np.array(labels)
preds = np.array(preds)

  0%|          | 0/378 [00:00<?, ?it/s]

In [15]:
# Build the multi-label classification report as a dict, then lay it out as a
# table with one row per label / average and one column per metric
report_dict = classification_report(
    labels,
    preds,
    target_names=valid_labels,
    output_dict=True,
)
report_df = pd.DataFrame(report_dict).T

# Save the report table to an Excel workbook in the working directory
file_path = "multilabel_classification_with_other_report.xlsx"
report_df.to_excel(file_path)

# Leave the DataFrame as the last expression so the notebook renders it
report_df

Unnamed: 0,precision,recall,f1-score,support
דיכאון ועצבות קשה,0.636364,0.61662,0.626337,1805.0
פציעה עצמית,0.622581,0.670139,0.645485,288.0
טראומה מינית,0.845833,0.545699,0.663399,372.0
Other,0.822128,0.820825,0.821476,3784.0
micro avg,0.759421,0.738518,0.748824,6249.0
macro avg,0.731726,0.663321,0.689174,6249.0
weighted avg,0.760685,0.738518,0.747589,6249.0
samples avg,0.743895,0.743592,0.740985,6249.0
