In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm_pandas
from tqdm.notebook import tqdm
from transformers import BertModel, BertTokenizerFast, Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer, BertForSequenceClassification, BertTokenizer
import torch
from datasets import Dataset
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, precision_recall_fscore_support,  roc_auc_score, fbeta_score

import warnings
# Installs a blanket "ignore" filter: from this point on every warning category
# is hidden for the rest of the kernel session.
# NOTE(review): this also hides pandas/transformers/sklearn warnings that could
# point at real problems — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

1. Loading the pretrained model
2. Loading the data
3. Preprocessing the data
4. Training the model
5. Evaluating the model

In [None]:
# hyperparameters
batch_size = 16        # batch size used by both the train and the test DataLoader
epochs = 3             # number of full passes over the training loader
learning_rate = 2e-5   # learning rate handed to the AdamW optimizer

# Seed the global RNGs up front so that everything created afterwards
# (randomly initialised weights, DataLoader shuffling, dropout) draws from a
# fixed random stream and a Restart & Run All reproduces the same run.
# GPU kernels may still introduce small nondeterminism.
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)

## Creating Model and Tokenizer

In [None]:
# Checkpoint identifier shared by the tokenizer and the classifier.
model_name = 'onlplab/alephbert-base'

# Prefer the GPU when one is visible to torch, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

tokenizer = AutoTokenizer.from_pretrained(model_name)

# No `num_labels` is passed, so the size of the classification head comes from
# the checkpoint config / library default. The evaluation cell reads column 1
# of the softmax output, i.e. it relies on a head with at least two classes.
bert_model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)

## Loading Data

In [None]:
# Input files, resolved relative to the notebook's working directory.
conv_info_path = 'conv_info.csv'
messages_path = 'messages_anonymized.csv'

conv_info_df = pd.read_csv(conv_info_path)
messages_df = pd.read_csv(messages_path)

# Cast the join key to str on both sides so the later merge compares like with like.
# NOTE(review): astype(str) turns a missing id into the literal string 'nan'.
conv_info_df['engagement_id'] = conv_info_df['engagement_id'].astype(str)
messages_df['engagement_id'] = messages_df['engagement_id'].astype(str)

# Drop messages without anonymized text. The explicit .copy() makes the filtered
# frame independent of the one read from disk, so the column assignment below is
# a plain write instead of a chained assignment on a slice.
messages_df = messages_df.loc[messages_df['anonymized'].notna()].copy()
messages_df['name'] = messages_df['name'].fillna('-')

In [None]:
# Conversation ids to keep. `ids` is currently the complete id column, so the
# conversation filter below retains every row; narrowing `ids` here would
# subsample both frames consistently.
ids = conv_info_df['engagement_id']

conv_is_selected = conv_info_df['engagement_id'].isin(ids)
conv_info_df = conv_info_df.loc[conv_is_selected]

# Keep only messages that belong to a selected conversation.
message_is_selected = messages_df['engagement_id'].isin(ids)
messages_df = messages_df.loc[message_is_selected]

## Preprocessing Data

In [None]:
# Attach the conversation-level columns to every message so that text and label
# live in one frame (merge without `how` performs an inner join on the key).
merged_df = messages_df.merge(conv_info_df, on='engagement_id')

# For better results only the help seeker's messages are kept.
merged_df = merged_df[merged_df['seeker'] == True]

# Collapse each conversation into a single row: its messages are concatenated
# with the literal string '[SEP]' between them, and the first 'imsr' value of
# the group becomes the conversation's target.
# NOTE(review): presumably the tokenizer maps '[SEP]' to its separator token — confirm.
merged_df = merged_df.groupby('engagement_id').agg({'anonymized': '[SEP]'.join, 'imsr': 'first'}).reset_index()

# Rename the target column to 'label', the name the training loop reads.
merged_df = merged_df.rename(columns={'imsr': 'label'})

# 80/20 train/test split, stratified by label. A fixed random_state keeps the
# partition identical across runs, so reported metrics are comparable and the
# same conversations always end up in the test set.
train_df, test_df = train_test_split(
    merged_df,
    test_size=0.2,
    stratify=merged_df['label'],
    random_state=42,
)

## Create Dataloaders

In [None]:
def tokenize(batch):
    """Tokenize the 'anonymized' texts of a batch for the model.

    Every sequence is truncated and padded to exactly 512 tokens, so all
    examples come out with the same length.
    """
    return tokenizer(
        batch['anonymized'],
        padding='max_length',
        truncation=True,
        max_length=512,
    )


# Build the Dataset objects from the pandas splits and tokenize them in
# chunks of 16 rows.
train_dataset = Dataset.from_pandas(train_df).map(tokenize, batched=True, batch_size=16)
test_dataset = Dataset.from_pandas(test_df).map(tokenize, batched=True, batch_size=16)

# Expose only the columns the training / evaluation loops read, as torch tensors.
tensor_columns = ['input_ids', 'attention_mask', 'label']
for split_dataset in (train_dataset, test_dataset):
    split_dataset.set_format('torch', columns=tensor_columns)

# Training batches are reshuffled on every pass; the test order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

## Training the Model

In [None]:
# Sanity check: pull a single batch from the training loader and print it.
batch = next(iter(train_loader), None)
if batch is not None:
    print(batch)

In [None]:
optimizer = torch.optim.AdamW(bert_model.parameters(), lr=learning_rate)
bert_model.train()  # switch to training mode before the optimisation loop

# One tick per optimisation step across all epochs. The context manager closes
# the bar even when a step raises (e.g. CUDA out-of-memory), so an interrupted
# run does not leave a dangling progress widget behind.
with tqdm(total=epochs * len(train_loader), desc="Training") as progress_bar:
    for epoch in range(epochs):
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            # Passing `labels` makes the model compute the loss itself.
            outputs = bert_model(input_ids, attention_mask=attention_mask, labels=labels)

            loss = outputs.loss
            loss.backward()
            optimizer.step()

            # Surface the current epoch and loss so divergence or a stalled
            # run is visible while training; the bar redraws on update().
            progress_bar.set_postfix(epoch=epoch + 1, loss=f'{loss.item():.4f}', refresh=False)
            progress_bar.update(1)

## Evaluating the Model

In [None]:
bert_model.eval()  # switch to evaluation mode before scoring the test set

# Accumulators consumed by the metrics cell: gold labels, predicted class ids
# and the probability assigned to class index 1.
labels, preds, pred_probs = [], [], []

for batch in tqdm(test_loader):
    gold = batch['label'].to(device)

    # Forward pass without gradient tracking; no labels are passed, so only
    # the logits are needed from the output.
    with torch.no_grad():
        logits = bert_model(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        ).logits

    class_one_prob = torch.softmax(logits, dim=-1)[:, 1]
    predicted = torch.argmax(logits, dim=-1)

    labels.extend(gold.cpu().numpy())
    preds.extend(predicted.cpu().numpy())
    pred_probs.extend(class_one_prob.cpu().numpy())

In [None]:
# Threshold-based metrics are computed from the predicted class ids;
# ROC-AUC is computed from the class-1 probabilities instead.
accuracy = accuracy_score(labels, preds)
precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
f2 = fbeta_score(labels, preds, beta=2)
roc_auc = roc_auc_score(labels, pred_probs)

# Emit one line per metric, in a fixed order.
report_lines = (
    f'Accuracy: {accuracy}',
    f'Precision: {precision}',
    f'Recall: {recall}',
    f'F1: {f1}',
    f'ROC-AUC: {roc_auc}',
    f'F2: {f2}',
)
for report_line in report_lines:
    print(report_line)