In [None]:
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from transformers import AdamW
from torch.nn import CrossEntropyLoss

In [None]:
# Load the pre-trained RoBERTa checkpoint: one name drives both the tokenizer
# and the sequence-classification model so the two always match.
model_name = "roberta-base"

# The tuple is evaluated left to right: tokenizer first, then the model.
tokenizer, model = (
    RobertaTokenizer.from_pretrained(model_name),
    RobertaForSequenceClassification.from_pretrained(model_name),
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Step 2: Tokenize the raw posts for model ingestion
# Convert each post's text to input IDs and an attention mask, and map the class column to binary labels
def prepare_input_data(df, tokenizer=None, max_length=512):
    """Tokenize the posts in ``df`` and build the matching label tensor.

    Args:
        df: DataFrame with a ``text`` column (one post per row) and a
            ``class`` column of strings. The value ``'suicide'``
            (case-insensitive) maps to label 1, anything else to 0.
        tokenizer: Optional callable tokenizer. When omitted, the
            ``roberta-base`` ``RobertaTokenizer`` is loaded.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        Tuple ``(input_ids, attention_masks, labels)``. The first two are
        tensors with one row per post and ``max_length`` columns; ``labels``
        is a 1-D int64 tensor with one entry per post.
    """
    if tokenizer is None:
        tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

    # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`,
    # and `truncation=True` makes explicit what the library otherwise only
    # does after emitting a warning.
    encoded_posts = [
        tokenizer(
            text,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        for text in df['text']
    ]

    if encoded_posts:
        # Each encoding holds a single row; stack them along the first axis.
        input_ids = torch.cat([enc['input_ids'] for enc in encoded_posts], dim=0)
        attention_masks = torch.cat([enc['attention_mask'] for enc in encoded_posts], dim=0)
    else:
        # torch.cat rejects an empty list, so build empty tensors directly.
        input_ids = torch.empty((0, max_length), dtype=torch.long)
        attention_masks = torch.empty((0, max_length), dtype=torch.long)

    # Explicit dtype keeps the labels int64 even when the frame is empty.
    labels = torch.tensor(
        df['class'].apply(lambda x: 1 if x.lower() == 'suicide' else 0).tolist(),
        dtype=torch.long,
    )

    return input_ids, attention_masks, labels

In [None]:
import pandas as pd

# Read the suicide-detection CSV from the Drive path below and preview the first rows.
df = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/mml/datasets/Suicide_Detection.csv",
)
df.head()

Unnamed: 0.1,Unnamed: 0,text,class
0,2,Ex Wife Threatening SuicideRecently I left my ...,suicide
1,3,Am I weird I don't get affected by compliments...,non-suicide
2,4,Finally 2020 is almost over... So I can never ...,non-suicide
3,8,i need helpjust help me im crying so hard,suicide
4,9,"I’m so lostHello, my name is Adam (16) and I’v...",suicide


In [None]:
# Binary target column: 1 when the class string is 'suicide' (case-insensitive), 0 otherwise.
df['label'] = df['class'].map(lambda class_name: int(class_name.lower() == 'suicide'))
df.head()

Unnamed: 0.1,Unnamed: 0,text,class,label
0,2,Ex Wife Threatening SuicideRecently I left my ...,suicide,1
1,3,Am I weird I don't get affected by compliments...,non-suicide,0
2,4,Finally 2020 is almost over... So I can never ...,non-suicide,0
3,8,i need helpjust help me im crying so hard,suicide,1
4,9,"I’m so lostHello, my name is Adam (16) and I’v...",suicide,1


In [None]:
# Tokenize every post and build the label tensor in a single pass over df.
(input_ids, attention_masks, labels) = prepare_input_data(df)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
# Step 3: Hold out 10% of the examples for validation.
# The fixed random_state makes the split repeatable across runs.
(
    train_inputs, val_inputs,
    train_masks, val_masks,
    train_labels, val_labels,
) = train_test_split(
    input_ids,
    attention_masks,
    labels,
    test_size=0.1,
    random_state=42,
)

In [None]:
# Step 4: Train an XGBoost classifier on top of RoBERTa outputs
# RoBERTa is NOT fine-tuned here: it runs in eval mode under torch.no_grad()
# and serves only as a fixed feature extractor for the XGBoost model.
train_dataset = TensorDataset(train_inputs, train_masks)
# shuffle=False keeps the extracted features in the same row order as
# train_labels, which are passed to XGBoost separately below.
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=False)

# Use the GPU when one is available; otherwise fall back to the CPU instead
# of crashing on a hard-coded 'cuda'.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Forward pass over the training set, one batch at a time.
# NOTE(review): `outputs.logits` is the output of the sequence-classification
# head, not encoder hidden states. The load cell's warning reports that head
# as newly initialized and no cell shown here trains it, so these features
# come from an untrained head. Consider switching BOTH this cell and the
# evaluation cell to an encoder embedding -- TODO confirm intent.
hidden_state_batches = []
model.eval()
with torch.no_grad():
    for input_id_batch, mask_batch in train_dataloader:
        outputs = model(
            input_ids=input_id_batch.to(device),
            attention_mask=mask_batch.to(device),
        )
        hidden_state_batches.append(outputs.logits.cpu().numpy())

# One feature row per training example.
hidden_states = np.concatenate(hidden_state_batches, axis=0)

# Train the XGBoost classifier; labels are handed over as a NumPy array
# rather than a torch tensor.
clf = xgb.XGBClassifier()
clf.fit(hidden_states, np.asarray(train_labels))

In [None]:
import pickle

# Persist the RoBERTa weights (state dict only, not the full module object).
torch.save(obj=model.state_dict(), f='roberta_model.pth')

# Persist the fitted XGBoost classifier next to the weights.
# NOTE: only unpickle this file from a trusted location; pickle.load executes
# arbitrary code on untrusted input.
with open('xgboost_classifier.pkl', mode='wb') as f:
    pickle.dump(clf, f)

In [None]:
# Step 5: Evaluation
# Labels travel with the inputs in the dataset so each batch stays aligned
# with its targets.
val_dataset = TensorDataset(val_inputs, val_masks, val_labels)
val_dataloader = DataLoader(val_dataset, batch_size=32)

# Run on whichever device the model's parameters currently live on, rather
# than assuming 'cuda'.
device = next(model.parameters()).device

# Evaluate RoBERTa + XGBoost classifier on the validation set.
# The features are `outputs.logits`, the same quantity the classifier is
# fitted on in the training cell.
val_feature_batches = []
val_targets = []
model.eval()
with torch.no_grad():
    for input_id_batch, mask_batch, label_batch in val_dataloader:
        outputs = model(
            input_ids=input_id_batch.to(device),
            attention_mask=mask_batch.to(device),
        )
        val_feature_batches.append(outputs.logits.cpu().numpy())
        # Targets come from the batch; they are only collected for scoring,
        # so they are never moved to the model's device.
        val_targets.extend(label_batch.cpu().numpy())

val_hidden_states = np.concatenate(val_feature_batches, axis=0)

val_preds = clf.predict(val_hidden_states)
accuracy = accuracy_score(val_targets, val_preds)
print("Validation Accuracy:", accuracy)


Validation Accuracy: 0.7119527749052051


In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Score the validation features with the fitted classifier and derive every
# metric up front; printing happens afterwards.
# NOTE(review): the first label says "Tuned", but no tuning step is visible in
# this notebook -- confirm whether the wording is intended.
val_preds = clf.predict(val_hidden_states)
accuracy = accuracy_score(val_targets, val_preds)
conf_matrix = confusion_matrix(val_targets, val_preds)
class_report = classification_report(val_targets, val_preds)

print("Tuned Validation Accuracy:", accuracy)

# Heading on one line, the corresponding table on the next.
for heading, table in (
    ("Confusion Matrix:", conf_matrix),
    ("Classification Report:", class_report),
):
    print(heading)
    print(table)

Tuned Validation Accuracy: 0.7119527749052051
Confusion Matrix:
[[8187 3512]
 [3173 8336]]
Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.70      0.71     11699
           1       0.70      0.72      0.71     11509

    accuracy                           0.71     23208
   macro avg       0.71      0.71      0.71     23208
weighted avg       0.71      0.71      0.71     23208

