<a href="https://colab.research.google.com/github/Amankp1/Psychological-Profiling-and-Context-Aware-Labeling-of-Hate-Speech/blob/main/Psychological_labeling_hateSpeech_file_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd

In [4]:
df = pd.read_csv("hate_speech_dataset.csv")

In [5]:
label_mapping = {"Depression": 0, "Stress": 1, "Loneliness": 2}
df["label"] = df["label"].map(label_mapping)

In [6]:
from sklearn.model_selection import train_test_split

train_texts, val_texts, train_labels, val_labels = train_test_split(df["text"].tolist(), df["label"].tolist(), test_size=0.2, random_state=42)

In [7]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [8]:
from torch.utils.data import Dataset, DataLoader

class HateSpeechDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

In [9]:
train_dataset = HateSpeechDataset(train_encodings, train_labels)
val_dataset = HateSpeechDataset(val_encodings, val_labels)

In [10]:
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
import torch

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [12]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mamankpatelamankpatel[0m ([33mamankpatelamankpatel-pandit-deendayal-energy-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,0.594,0.635023
2,0.293,0.438358
3,0.2751,0.467926
4,0.0859,0.433937
5,0.0677,0.441325


TrainOutput(global_step=450, training_loss=0.3349243817064497, metrics={'train_runtime': 67.0358, 'train_samples_per_second': 53.404, 'train_steps_per_second': 6.713, 'total_flos': 25935095400480.0, 'train_loss': 0.3349243817064497, 'epoch': 5.0})

In [18]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), dim=1)
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc}

trainer.evaluate()

{'eval_loss': 0.4339368939399719,
 'eval_runtime': 0.2828,
 'eval_samples_per_second': 632.912,
 'eval_steps_per_second': 81.324,
 'epoch': 5.0}

In [19]:
test_texts = [
    # Depression-related hate speech
    "I hate how I wake up every day knowing nothing will change.",
    "No one gives a damn about me, and I’m tired of pretending otherwise.",
    "Every single person in this college is fake, just like my own smile.",
    "No matter what I do, I’m always the forgotten one in the background.",
    "This place is just a constant reminder that I’ll never be good enough.",
    "I’m done trying to fit in when all I get is rejection.",
    "If I vanished today, this college would just keep moving like I never existed.",
    "Everyone pretends to be so caring, but in the end, they leave just like the rest.",
    "The only thing college has taught me is how to hate myself more efficiently.",
    "I despise how people act like they’re my friend, only to push me away when I need them most.",

    # Stress-related hate speech
    "This college drains every ounce of energy and leaves nothing but anxiety.",
    "I hate how I work my ass off, and all I get is stress and sleepless nights.",
    "Professors act like we’re machines who can function without a break.",
    "Everything about this place is designed to break students mentally.",
    "Group projects are just a nightmare that make me hate people even more.",
    "The pressure here is so suffocating, it feels like I’m drowning.",
    "No matter how hard I work, there’s always another impossible deadline.",
    "I hate how this college turns students into stressed-out zombies.",
    "People here act like stress is a competition, and I’m sick of it.",
    "I swear, if one more professor dumps another assignment on me, I’m going to lose it.",

    # Loneliness-related hate speech
    "I hate walking into a crowded room and realizing I have no one to sit with.",
    "This college is just a giant popularity contest, and I was never even considered.",
    "People only notice me when they need something; otherwise, I’m invisible.",
    "No one even bothers to check in on me, and I hate them for it.",
    "I sit alone every single day, watching everyone else form friendships so easily.",
    "I hate how everyone has their group, and I’m just an outsider looking in.",
    "The lonelier I feel, the more I resent every fake smile I see.",
    "No one ever remembers my name, and honestly, I’ve stopped caring.",
    "I could disappear right now, and not a single person here would notice.",
    "This college makes it so easy to be surrounded by people and still feel completely alone."
]

test_labels = [
    "Depression", "Depression", "Depression", "Depression", "Depression",
    "Depression", "Depression", "Depression", "Depression", "Depression",

    "Stress", "Stress", "Stress", "Stress", "Stress",
    "Stress", "Stress", "Stress", "Stress", "Stress",

    "Loneliness", "Loneliness", "Loneliness", "Loneliness", "Loneliness",
    "Loneliness", "Loneliness", "Loneliness", "Loneliness", "Loneliness"
]

In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128, return_tensors="pt").to(device)

with torch.no_grad():
    outputs = model(**test_encodings)
    predictions = torch.argmax(outputs.logits, dim=1).cpu().tolist()

label_mapping = {0: "Depression", 1: "Stress", 2: "Loneliness"}
y_pred_test_labels = [label_mapping[p] for p in predictions]

accuracy = accuracy_score(test_labels, y_pred_test_labels)
conf_matrix = confusion_matrix(test_labels, y_pred_test_labels)
classification_rep = classification_report(test_labels, y_pred_test_labels)

print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", classification_rep)
print("\nAccuracy:", accuracy)

for i in range(len(test_texts)):
    status = "✅ Correct Prediction" if test_labels[i] == y_pred_test_labels[i] else "❌ Incorrect Prediction"
    print(f"Text: {test_texts[i]}\nExpected: {test_labels[i]}, Predicted: {y_pred_test_labels[i]} → {status}\n")


Confusion Matrix:
 [[9 1 0]
 [2 8 0]
 [1 0 9]]

Classification Report:
               precision    recall  f1-score   support

  Depression       0.75      0.90      0.82        10
  Loneliness       0.89      0.80      0.84        10
      Stress       1.00      0.90      0.95        10

    accuracy                           0.87        30
   macro avg       0.88      0.87      0.87        30
weighted avg       0.88      0.87      0.87        30


Accuracy: 0.8666666666666667
Text: I hate how I wake up every day knowing nothing will change.
Expected: Depression, Predicted: Depression → ✅ Correct Prediction

Text: No one gives a damn about me, and I’m tired of pretending otherwise.
Expected: Depression, Predicted: Depression → ✅ Correct Prediction

Text: Every single person in this college is fake, just like my own smile.
Expected: Depression, Predicted: Depression → ✅ Correct Prediction

Text: No matter what I do, I’m always the forgotten one in the background.
Expected: Depression,