In [1]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# Authenticates this kernel with Kaggle before any download is attempted;
# per the note above, private data sources cannot be fetched otherwise.
import kagglehub
kagglehub.login()


VBox(children=(HTML(value='<center> <img\nsrc=https://www.kaggle.com/static/images/site-logo.png\nalt=\'Kaggle…

Kaggle credentials set.
Kaggle credentials successfully validated.


In [2]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Each call returns the local path kagglehub downloaded/extracted the data to.
# NOTE(review): neither path variable is used by the later pd.read_csv calls,
# which read from hard-coded '/content/...' paths -- confirm which location
# is the intended data source.
ai_dragon_path_path = kagglehub.competition_download('ai-dragon-path')
hayounii_datacompe_path = kagglehub.dataset_download('hayounii/datacompe')

print('Data source import complete.')


Downloading from https://www.kaggle.com/api/v1/competitions/data/download-all/ai-dragon-path...


100%|██████████| 30.5M/30.5M [00:00<00:00, 101MB/s] 

Extracting files...





Downloading from https://www.kaggle.com/api/v1/datasets/download/hayounii/datacompe?dataset_version_number=1...


100%|██████████| 30.5M/30.5M [00:00<00:00, 72.8MB/s]

Extracting files...





Data source import complete.


# Mental Health Text Classification with BERT

In [3]:
import os
# Print the full path of every file found under the Kaggle input mount.
for dirname, _, filenames in os.walk('/kaggle/input'):
    paths_in_dir = [os.path.join(dirname, filename) for filename in filenames]
    for file_path in paths_in_dir:
        print(file_path)

In [4]:
import pandas as pd
import numpy as np
import re
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from torch.optim import AdamW
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from collections import Counter

In [7]:
# Load the labelled training data into `df`.
# NOTE(review): '/content/train.csv' is a hard-coded absolute path that is
# unrelated to the kagglehub download paths obtained earlier -- confirm the
# file is actually placed there in the target environment.
df = pd.read_csv('/content/train.csv')

In [8]:
# Normalise target labels: known misspelled / shortened spellings are mapped
# to their canonical class name; any label not listed is kept unchanged.
target_mapping = dict([
    ('anxity', 'anxiety'),
    ('depresion', 'depression'),
    ('ptsd', 'ptsd-and-trauma'),
    ('relationship-and-family-issues', 'relationship-and-family-issues'),
])
df['target'] = df['target'].map(
    lambda raw_label: target_mapping.get(raw_label, raw_label)
)

In [9]:
# Combine title and content into one text field.
# Missing titles/contents are treated as empty strings, exactly as the test
# pipeline does. Plain `title + ' ' + content` yields NaN whenever either side
# is NaN, which made the later dropna discard labelled rows that still had
# usable text and made training preprocessing differ from inference.
df['text'] = df['title'].fillna('') + ' ' + df['content'].fillna('')

In [10]:
# Handle missing values: drop rows whose 'text' or 'target' is NaN.
# The original row index is kept (no reset), so `df` may have index gaps
# after this cell.
df = df.dropna(subset=['text', 'target'])

In [11]:
# Custom text cleaning
# Corrupted spelling -> intended word. Entries are applied to whole words only,
# so legitimate words that merely contain one of these fragments (e.g. the
# 'z' in "amazing") are left untouched.
_MISSPELLINGS = {
    'z': 's', 'znxiety': 'anxiety', 'deptession': 'depression',
    'wotk': 'work', 'thzt': 'that', 'hzs': 'has', 'znd': 'and',
    'fot': 'for', 'tet': 'get', 'zt': 'at', 'zre': 'are',
    'zn': 'an', 'zmd': 'and', 'zct': 'act',
}

# One alternation anchored on word boundaries; longest entries first.
_MISSPELLING_RE = re.compile(
    r'\b(?:'
    + '|'.join(sorted(map(re.escape, _MISSPELLINGS), key=len, reverse=True))
    + r')\b'
)


def clean_text(text):
    """Normalise a raw post into lowercase alphanumeric text.

    Steps, in order:
      1. non-string input (e.g. NaN/None) becomes the empty string;
      2. the text is lowercased, so corrections also match capitalised typos;
      3. known misspellings are corrected as whole words;
      4. every character that is not a letter, digit or whitespace is removed;
      5. runs of whitespace are collapsed to one space and the ends trimmed.

    Args:
        text: the raw text; any non-``str`` value is accepted.

    Returns:
        The cleaned string (possibly empty).
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()

    # Whole-word replacement. Sequential substring replaces are avoided here:
    # a blanket 'z' -> 's' pass would rewrite every 'z' first, so none of the
    # longer 'z...' corrections could ever match.
    text = _MISSPELLING_RE.sub(lambda match: _MISSPELLINGS[match.group(0)], text)

    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Store the cleaned version of every row's text in a new 'cleaned_text' column.
df['cleaned_text'] = df['text'].apply(clean_text)

In [12]:
# Check class distribution: count how many rows carry each target label.
# The recorded output shows unequal class sizes, which is why the split in
# the next cell is stratified on the target.
print(Counter(df['target']))

Counter({'relationship-and-family-issues': 5251, 'depression': 4554, 'anxiety': 2643, 'ptsd-and-trauma': 1432, 'suicidal-thoughts-and-self-harm': 917})


In [13]:
# Split data: 80% train / 20% validation, stratified on the target so both
# parts keep the same class proportions; random_state fixes the shuffle.
train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['target'], random_state=42)

In [14]:
# Dataset class
class MentalHealthDataset(Dataset):
    """Labelled text dataset that tokenizes one example per ``__getitem__``.

    Args:
        texts: indexable sequence of texts (each is passed through ``str``).
        labels: indexable sequence of label values, aligned with ``texts``.
        tokenizer: object exposing ``encode_plus`` that returns a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: value forwarded to the tokenizer as ``max_length``.
        label_map: optional ``{label: class_index}`` mapping. When omitted it
            is derived from the sorted unique values of ``labels``. Pass the
            training set's map when building validation datasets so every
            split encodes labels identically, even if a class is absent from
            one of them.

    Raises:
        ValueError: if ``texts`` and ``labels`` differ in length, or if a
            supplied ``label_map`` does not cover every label.
    """

    def __init__(self, texts, labels, tokenizer, max_len, label_map=None):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )

        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

        if label_map is None:
            # Same default as before: sorted unique labels -> 0..K-1.
            label_map = {label: idx for idx, label in enumerate(sorted(set(labels)))}
        else:
            unknown = set(labels) - set(label_map)
            if unknown:
                raise ValueError(f"labels missing from label_map: {sorted(unknown)}")
        self.label_map = dict(label_map)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.label_map[self.labels[item]]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            # flatten() turns the tokenizer's tensors into 1-D per-example tensors.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [44]:
# Training hyper-parameters.
MAX_LEN = 256          # passed to the data-loader helpers as max_len
BATCH_SIZE = 64        # batch size for the DataLoaders
EPOCHS = 2             # number of passes made by the training loop
LEARNING_RATE = 2e-5   # learning rate handed to AdamW

# Pretrained tokenizer and a sequence-classification model with one output
# per distinct target label.
# NOTE: re-running this cell creates a brand-new `model` object. The cell
# that moves the model to `device` and builds the optimizer must then be
# re-run too; otherwise training fails with a device-mismatch error like the
# one recorded in this notebook's training output.
tokenizer = BertTokenizer.from_pretrained('google-bert/bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(
    'google-bert/bert-base-uncased',
    num_labels=len(set(df['target'])),
    output_attentions=False,
    output_hidden_states=False
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [39]:
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False):
    """Wrap a labelled frame in a ``MentalHealthDataset`` and a ``DataLoader``.

    Args:
        df: frame with a 'cleaned_text' column (inputs) and a 'target' column
            (labels).
        tokenizer: tokenizer forwarded to the dataset.
        max_len: maximum token length forwarded to the dataset.
        batch_size: number of examples per batch.
        shuffle: whether the loader reshuffles the examples on every pass.
            Defaults to False, which preserves the previous behaviour.

    Returns:
        A ``DataLoader`` over the dataset, using 4 worker processes.
    """
    ds = MentalHealthDataset(
        texts=df['cleaned_text'].to_numpy(),
        labels=df['target'].to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )

    return DataLoader(ds, batch_size=batch_size, shuffle=shuffle, num_workers=4)

# Training batches are reshuffled each epoch so the model does not see the
# examples in the same fixed order every pass; validation order stays fixed.
train_data_loader = create_data_loader(train_df, tokenizer, MAX_LEN, BATCH_SIZE, shuffle=True)
val_data_loader = create_data_loader(val_df, tokenizer, MAX_LEN, BATCH_SIZE)

In [40]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Move the model to the device BEFORE building the optimizer; the optimizer
# is bound to the parameters of this particular `model` object, so this cell
# has to be re-run whenever `model` is re-created.
model = model.to(device)
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

In [41]:
def train_epoch(model, data_loader, optimizer, device, n_examples):
    """Run one optimisation pass over ``data_loader``.

    Every batch is a mapping holding 'input_ids', 'attention_mask' and 'label'
    tensors; they are moved to ``device`` before the forward pass. The model is
    called with ``labels=...`` and must return an object exposing ``.loss``
    and ``.logits``. After each batch the loss is back-propagated, the
    optimizer takes a step, and gradients are cleared.

    Returns:
        ``(accuracy, mean_loss)``: the count of correct arg-max predictions
        divided by ``n_examples`` (as a float64 tensor), and ``np.mean`` of
        the per-batch loss values.
    """
    model = model.train()
    batch_losses = []
    correct_predictions = 0

    for batch in tqdm(data_loader):
        labels = batch['label'].to(device)
        outputs = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            labels=labels,
        )
        loss, logits = outputs.loss, outputs.logits

        # Index of the highest logit per row is the predicted class.
        preds = torch.max(logits, dim=1)[1]
        correct_predictions += torch.sum(preds == labels)
        batch_losses.append(loss.item())

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    accuracy = correct_predictions.double() / n_examples
    return accuracy, np.mean(batch_losses)

In [45]:
def eval_model(model, data_loader, device, n_examples):
    """Score ``model`` on ``data_loader`` with gradient tracking disabled.

    Every batch is a mapping holding 'input_ids', 'attention_mask' and 'label'
    tensors; they are moved to ``device`` before the forward pass. The model is
    called with ``labels=...`` and must return an object exposing ``.loss``
    and ``.logits``.

    Returns:
        ``(accuracy, mean_loss)``: the count of correct arg-max predictions
        divided by ``n_examples`` (as a float64 tensor), and ``np.mean`` of
        the per-batch loss values.
    """
    model = model.eval()
    batch_losses = []
    correct_predictions = 0

    with torch.no_grad():
        for batch in data_loader:
            labels = batch['label'].to(device)
            outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                labels=labels,
            )

            # Index of the highest logit per row is the predicted class.
            preds = torch.max(outputs.logits, dim=1)[1]
            correct_predictions += torch.sum(preds == labels)
            batch_losses.append(outputs.loss.item())

    accuracy = correct_predictions.double() / n_examples
    return accuracy, np.mean(batch_losses)

In [46]:
# Best validation accuracy seen so far; starts at 0 so the first epoch with
# any correct prediction is always saved.
best_accuracy = 0

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    # One optimisation pass over the training split.
    train_acc, train_loss = train_epoch(model, train_data_loader, optimizer, device, len(train_df))
    print(f'Train loss {train_loss} accuracy {train_acc}')

    # Score on the held-out validation split.
    val_acc, val_loss = eval_model(model, val_data_loader, device, len(val_df))
    print(f'Val loss {val_loss} accuracy {val_acc}')
    print()

    # Checkpoint only when validation accuracy improves, so the file always
    # holds the weights of the best epoch so far.
    if val_acc > best_accuracy:
        torch.save(model.state_dict(), 'best_model_state.bin')
        best_accuracy = val_acc

Epoch 1/2
----------


  0%|          | 0/370 [00:03<?, ?it/s]


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [29]:
# Restore the weights of the best epoch saved by the training loop.
# map_location remaps the stored tensors onto the current `device`, so a
# checkpoint written on a GPU still loads when this kernel only has a CPU.
model.load_state_dict(torch.load('best_model_state.bin', map_location=device))
model = model.to(device)

In [35]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Load the unlabelled test file.
# NOTE(review): '/content/test.csv' is a hard-coded absolute path that is
# unrelated to the kagglehub download paths obtained earlier -- confirm the
# file is actually placed there in the target environment.
test_df = pd.read_csv('/content/test.csv')

# Combine title and content; missing values become empty strings so no test
# row ends up with a NaN text.
test_df['text'] = test_df['title'].fillna('') + ' ' + test_df['content'].fillna('')

# Clean text using existing clean_text function
test_df['cleaned_text'] = test_df['text'].apply(clean_text)

# Test Dataset (no labels)
class TestDataset(Dataset):
    """Unlabelled text dataset that tokenizes one example per ``__getitem__``.

    Args:
        texts: indexable sequence of texts (each is passed through ``str``).
        tokenizer: object exposing ``encode_plus`` that returns a mapping with
            'input_ids' and 'attention_mask' tensors.
        max_len: value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, tokenizer, max_len):
        self.texts, self.tokenizer, self.max_len = texts, tokenizer, max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        raw_text = str(self.texts[item])

        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(raw_text, **tokenizer_options)

        # flatten() turns the tokenizer's tensors into 1-D per-example tensors.
        sample = {'text': raw_text}
        for field in ('input_ids', 'attention_mask'):
            sample[field] = encoded[field].flatten()
        return sample

# Create test DataLoader
def create_test_data_loader(df, tokenizer, max_len, batch_size):
    """Wrap the 'cleaned_text' column of ``df`` in a ``TestDataset`` loader.

    Returns:
        A ``DataLoader`` over the dataset with the given ``batch_size``,
        using 2 worker processes.
    """
    cleaned_texts = df['cleaned_text'].to_numpy()
    dataset = TestDataset(texts=cleaned_texts, tokenizer=tokenizer, max_len=max_len)
    return DataLoader(dataset, batch_size=batch_size, num_workers=2)

# Build the loader over the cleaned test texts, reusing the tokenizer,
# MAX_LEN and BATCH_SIZE defined for training.
test_data_loader = create_test_data_loader(test_df, tokenizer, MAX_LEN, BATCH_SIZE)

# Predict function for test set
def predict_test(model, data_loader):
    """Return the predicted class index for every example in ``data_loader``.

    The model is switched to eval mode and run without gradient tracking.
    Each batch's 'input_ids' and 'attention_mask' are moved to the
    module-level ``device``; the arg-max over the logits is the prediction.

    Returns:
        A flat list with one integer class index per example, in loader order.
    """
    model.eval()
    collected = []

    with torch.no_grad():
        for batch in tqdm(data_loader):
            outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            )
            batch_preds = torch.max(outputs.logits, dim=1)[1]
            # Move to host memory before converting to NumPy values.
            collected.extend(batch_preds.cpu().numpy())

    return collected

# Run predictions
test_preds = predict_test(model, test_data_loader)

# Class index -> label name. This is the inverse of the encoding used for
# training (sorted unique training labels numbered 0..K-1), built here
# explicitly so the cell does not depend on a leftover kernel variable.
label_map = dict(enumerate(sorted(set(train_df['target']))))

# Convert to labels (int() turns the NumPy integer predictions into plain keys)
test_labels = [label_map[int(p)] for p in test_preds]

# Prepare submission; fall back to the row index when the file has no 'id' column
submission = pd.DataFrame({
    'id': test_df['id'] if 'id' in test_df.columns else test_df.index,
    'target': test_labels
})

# Save to CSV in the current working directory and report where it really went
submission_path = 'submission.csv'
submission.to_csv(submission_path, index=False)
print(f"✅ Submission file saved: {os.path.abspath(submission_path)}")


100%|██████████| 236/236 [01:03<00:00,  3.73it/s]

✅ Submission file saved: /kaggle/working/submission.csv



