# Background
## Dataset
We provide the model with 2 datasets: **messages** and **conversation info**.

Both datasets contain more information, but we will describe only what's necessary.

**Messages** contains 
* engagement_id 
* text (original messages) 
* anonymized (messages modified by Sahar volunteers)

**Conversation info** contains
* engagement_id
* gsr (suicide score assessment - 0 or 1)

## Flow of the model
We try to predict whether a help-seeker is suicidal from the text of the chat; the gsr score of the conversation serves as the training label.

For each chat, the model takes into account all the messages: both the help-seeker's and the counselor's.

If the conversation is longer than 512 tokens, it is truncated from the beginning, so that only the end of the conversation (at most 512 tokens) is kept.

* We merge both datasets based on engagement_id.
* We tokenize every batch
* We train the model
* OPTIONAL: We can add differential privacy to the model. It seems like the optimal threshold is around 0.009.

In [None]:
# Make the shared helper module importable. The path is machine-specific.
import sys
sys.path.append('/home/astrin/')
# NOTE(review): this star import is where every otherwise-undefined name in the
# notebook (torch, pd, np, tqdm, AutoTokenizer, train_test_split, ...) has to
# come from; presumably `imports` re-exports them — confirm, and prefer
# explicit imports so the dependencies are visible here.
from imports import *

# Helpers provided by the star import. Presumably they seed the random number
# generators and report current memory usage — TODO confirm in `imports`.
seed_everything()
memory_usage()

In [None]:
# Checkpoint name handed to both the tokenizer and the classifier loaders.
model_name = 'onlplab/alephbert-base'
# `epochs` and `learning_rate` are passed to train(); `batch_size` is used by
# both DataLoaders.
epochs = 3
batch_size = 16
learning_rate = 2e-5
# Fixed sequence length (in tokens) that tokenize() truncates and pads to.
max_len = 512

# NOTE(review): neither of the two values below is referenced by any cell
# visible here (test() predicts with argmax, not a threshold) — confirm whether
# they are used further down or are leftovers.
classification_threshold = 0.3
alpha = 0.5

In [None]:
%%capture 

# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the tokenizer and a two-class sequence classifier from the same
# checkpoint, then move the classifier onto the selected device.
tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
).to(device)

In [None]:
# Locations of the two CSV exports (machine-specific absolute paths).
conv_info_path = '/home/astrin/Projects/Sahar/Dataset/conv_info.csv'
messages_path = '/home/astrin/Projects/Sahar/Dataset/messages_anonymized.csv'

conv_info_df = pd.read_csv(conv_info_path)
messages_df = pd.read_csv(messages_path)

# Use the same (string) dtype for the join key in both tables so that the
# merge on engagement_id compares like with like.
conv_info_df['engagement_id'] = conv_info_df['engagement_id'].astype(str)
messages_df['engagement_id'] = messages_df['engagement_id'].astype(str)

# Get all engagement_ids from conv_info_df
ids = conv_info_df['engagement_id']

# Keep only messages that have anonymized text and belong to a conversation
# listed in conv_info_df. The explicit .copy() makes the result an independent
# frame, so the column assignment below does not write into a slice of the
# original (which raises pandas' SettingWithCopyWarning).
# conv_info_df itself needs no filtering: selecting its rows by its own ids
# keeps every row.
messages_df = messages_df[
    messages_df['anonymized'].notna() & messages_df['engagement_id'].isin(ids)
].copy()
messages_df['name'] = messages_df['name'].fillna('-')

In [None]:
# Build one row per conversation:
#   1. attach the conversation-level info (including gsr) to every message,
#   2. join all anonymized messages of a conversation with '[SEP]' and keep
#      the conversation's gsr value,
#   3. rename gsr to `label`, the column name the training code reads.
# NOTE(review): messages are joined in the row order of messages_df; presumably
# that is chronological — confirm, since no explicit sort is done here.
merged_df = (
    messages_df
    .merge(conv_info_df, on='engagement_id')
    .groupby('engagement_id')
    .agg({'anonymized': '[SEP]'.join, 'gsr': 'first'})
    .reset_index()
    .rename(columns={'gsr': 'label'})
)

# 80/20 train/test split that preserves the label distribution in both parts.
# NOTE(review): no random_state is passed, so reproducibility presumably relies
# on seed_everything() having seeded the global RNG — confirm.
train_df, test_df = train_test_split(
    merged_df,
    test_size=0.2,
    stratify=merged_df['label'],
)

# Wrap both splits as Hugging Face Dataset objects.
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

def tokenize(batch):
    """Tokenize a batch of conversations into fixed-length model inputs.

    Each conversation is tokenized in full. If it is longer than ``max_len``
    tokens, the first token and the last ``max_len - 1`` tokens are kept, so
    the end of the conversation is preserved while position 0 still holds the
    tokenizer's leading special token ([CLS] for BERT-style tokenizers), which
    is the position a BERT sequence classifier pools from. Shorter
    conversations are right-padded to ``max_len``.

    Args:
        batch: mapping with an ``'anonymized'`` entry holding the texts.
            NOTE(review): the tokenizer is called without padding, so this
            presumably only works when all texts in the batch tokenize to the
            same length; the visible callers use ``batch_size=1``.

    Returns:
        The tokenizer output with ``'input_ids'`` and ``'attention_mask'``
        replaced by tensors whose second dimension is exactly ``max_len``.
    """
    tokens = tokenizer(batch['anonymized'], return_tensors='pt')
    input_ids = tokens['input_ids']
    attention_mask = tokens['attention_mask']

    seq_len = input_ids.shape[1]
    if seq_len > max_len:
        # Keep position 0 and the tail. Slicing from an explicit start index
        # (rather than a negative one) stays correct even when max_len == 1.
        tail_start = seq_len - (max_len - 1)
        input_ids = torch.cat(
            [input_ids[:, :1], input_ids[:, tail_start:]], dim=1
        )
        attention_mask = torch.cat(
            [attention_mask[:, :1], attention_mask[:, tail_start:]], dim=1
        )

    # Right-pad up to max_len; padded positions are masked out of attention.
    pad_len = max_len - input_ids.shape[1]
    tokens['input_ids'] = F.pad(input_ids, (0, pad_len), value=tokenizer.pad_token_id)
    tokens['attention_mask'] = F.pad(attention_mask, (0, pad_len), value=0)
    return tokens

# Tokenize one conversation per call (batch_size=1): tokenize() asks the
# tokenizer for tensors without padding, so each call gets a single text.
train_dataset = train_dataset.map(tokenize, batched=True, batch_size=1)
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=1)

# Expose only the model inputs and the label, as torch tensors.
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Shuffle the training batches; keep the evaluation order fixed.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
def train(model, loader, epochs, learning_rate, device=device, *,
          max_grad_norm=1.0, noise_std=0.025):
    """Fine-tune `model` with AdamW, optionally clipping and noising gradients.

    For every batch the loss returned by the model is back-propagated, the
    gradient norm is clipped to `max_grad_norm`, Gaussian noise with standard
    deviation `noise_std` is added to every gradient, and one optimizer step
    is taken.

    NOTE(review): clipping is applied to the gradient of the whole batch, not
    to per-example gradients, so this is presumably a heuristic rather than
    DP-SGD with a formal privacy guarantee — confirm the intent.

    Args:
        model: module called as `model(input_ids, attention_mask=...,
            labels=...)` whose output exposes a `.loss`.
        loader: iterable of batches with 'input_ids', 'attention_mask' and
            'label' tensors; must support `len()`.
        epochs: number of passes over `loader`.
        learning_rate: AdamW learning rate.
        device: device the batch tensors are moved to.
        max_grad_norm: maximum total gradient norm; `None` disables clipping.
        noise_std: standard deviation of the Gaussian noise added to each
            gradient; `None` or `0` disables the noise.

    Returns:
        List with the loss value (float) of every training step, in order.
    """
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    losses = []
    add_noise = noise_std is not None and noise_std > 0

    pbar = tqdm(total=epochs * len(loader), leave=False, desc='Training')
    try:
        for _ in range(epochs):
            model.train()

            for batch in loader:
                optimizer.zero_grad()
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['label'].to(device)

                # The model computes the classification loss itself when
                # labels are supplied.
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                loss.backward()

                # Bound the gradient norm before any noise is added.
                if max_grad_norm is not None:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

                # Perturb every available gradient with zero-mean Gaussian noise.
                if add_noise:
                    for param in model.parameters():
                        if param.grad is not None:
                            noise = torch.normal(
                                0, noise_std,
                                size=param.grad.shape,
                                device=param.grad.device,
                            )
                            param.grad = param.grad + noise

                optimizer.step()

                # Read the scalar once; it is used for both the log and the bar.
                loss_value = loss.item()
                losses.append(loss_value)

                pbar.update(1)
                pbar.set_postfix({'loss': loss_value})
    finally:
        # Close the progress bar even if a training step raises.
        pbar.close()
    return losses

def test(model, loader, device=device):
    """Run `model` over `loader` in eval mode and collect its outputs.

    Args:
        model: module called as `model(input_ids, attention_mask=...)` whose
            output exposes `.logits`.
        loader: iterable of batches with 'input_ids', 'attention_mask' and
            'label' tensors.
        device: device the batch tensors are moved to.

    Returns:
        A tuple `(predictions, probs, labels)`: the argmax class of every
        example (list), the softmax probabilities over the logits (NumPy
        array built from the per-example rows), and the true labels (list).
    """
    model.eval()
    all_preds, all_probs, all_labels = [], [], []

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for batch in tqdm(loader, desc='Testing'):
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            batch_labels = batch['label'].to(device)

            logits = model(ids, attention_mask=mask).logits
            batch_probs = torch.softmax(logits, dim=1)
            batch_preds = torch.argmax(batch_probs, dim=1)

            # Move results to the CPU and accumulate them example by example.
            all_preds.extend(batch_preds.cpu().numpy())
            all_probs.extend(batch_probs.cpu().numpy())
            all_labels.extend(batch_labels.cpu().numpy())

    return all_preds, np.array(all_probs), all_labels

def compute_metrics(predictions, probs, labels):
    """Summarise binary-classification quality in a dict of scores.

    Args:
        predictions: predicted class per example.
        probs: 2-D array of class probabilities; column 1 is used as the
            score for the ROC AUC.
        labels: true class per example.

    Returns:
        Dict with the keys 'accuracy', 'precision', 'recall', 'f1' and
        'roc_auc'.
    """
    acc = accuracy_score(labels, predictions)
    # The fourth element returned here is not used.
    prf = precision_recall_fscore_support(labels, predictions, average='binary')
    auc = roc_auc_score(labels, probs[:, 1])
    return dict(
        accuracy=acc,
        precision=prf[0],
        recall=prf[1],
        f1=prf[2],
        roc_auc=auc,
    )

In [None]:
# Fine-tune the classifier on the training split; `losses` holds one loss
# value per training step.
losses = train(bert_model, train_loader, epochs, learning_rate)

In [None]:
# Evaluate on the held-out split and print every metric rounded to 4 decimals.
preds, probs, labels = test(bert_model, test_loader)
# test() already returns probs as a NumPy array; np.array() here just copies it.
metrics = compute_metrics(preds, np.array(probs), labels)
for key, value in metrics.items():
    print(f'{key}: {round(value, 4)}')