In [None]:
import pandas as pd
import numpy as np
import transformers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import DataLoader
from datasets import Dataset
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Location of the combined statements CSV inside the Kaggle input directory.
path = (
    "/kaggle/input/sentiment-analysis-for-mental-health/Combined Data.csv"
)

In [None]:
# Read the raw CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=path)
df.head(n=5)

In [None]:
# Print a concise summary of the frame (columns, non-null counts, dtypes).
df.info()

In [None]:
# Remove the leftover CSV index column. Both `drop` and `reset_index` return
# new frames, so the result has to be bound back to `df`; the previous bare
# `df.reset_index(drop=True)` call discarded its result and had no effect.
df = df.drop(columns='Unnamed: 0').reset_index(drop=True)
df.head()

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Discard every row that has at least one missing value, then confirm that
# no missing values remain.
df.dropna(axis=0, how="any", inplace=True)
df.isna().sum()

In [None]:
# How many rows consist solely of the placeholder statement?
df.loc[df['statement'].eq('what do you mean?')].count()

In [None]:
# Keep only the rows whose statement is not the placeholder text and
# renumber the index from zero.
is_placeholder = df['statement'].eq('what do you mean?')
df = df.loc[~is_placeholder].reset_index(drop=True)

In [None]:
# Re-count the placeholder rows to confirm the filter removed them.
df.loc[df['statement'].eq('what do you mean?')].count()

In [None]:
# Fetch the NLTK stopword corpus and keep the English list as a set so that
# membership tests during cleaning are fast.
nltk.download('stopwords')
stopwords_set = {word for word in stopwords.words('english')}

def clean_text(text, stop_words=None):
    """Normalize a statement: lowercase it, keep letters only, drop stopwords.

    Args:
        text: Raw statement. It is coerced with ``str()``, so non-string
            values are accepted.
        stop_words: Optional collection of lowercase words to remove. When
            omitted, the notebook-level ``stopwords_set`` is used.

    Returns:
        The surviving words joined by single spaces (possibly an empty string).
    """
    if stop_words is None:
        stop_words = stopwords_set
    lowered = str(text).lower()
    # Replace each non-letter with a space instead of deleting it. Deleting
    # fused words that were separated only by punctuation or digits
    # ("sad.tired" -> "sadtired") and glued contractions together
    # ("can't" -> "cant"), which hid them from the stopword filter.
    letters_only = re.sub(r"[^a-z\s]", " ", lowered)
    return " ".join(
        word for word in letters_only.split() if word not in stop_words
    )

# Clean every statement element-wise, overwriting the original column.
df['statement'] = df['statement'].map(clean_text)

In [None]:
# The target is the status column; the features are every other column.
y = df['status']
X = df.drop('status', axis=1)

# Map each status string to an integer class id.
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

# Show which integer id stands for which status.
for class_id, class_name in enumerate(le.classes_):
    print(f"LABEL_{class_id} -> {class_name}")

# Hold out 20% of the rows, stratified on the encoded labels.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    stratify=y_encoded,
    random_state=42,
)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def _tokenize_statements(frame):
    """Tokenize the ``statement`` column of ``frame``, truncating to 512 tokens.

    No padding is applied here on purpose. Padding the whole split up front
    stretches every example to the longest sequence in that split (up to 512
    tokens), which inflates memory use and the compute spent on pad tokens.
    Padding is instead left to the Trainer's batch collator, which pads each
    batch only to its own longest sequence when the Trainer is given the
    tokenizer (as this notebook does).
    """
    return tokenizer(
        frame['statement'].tolist(),
        truncation=True,
        max_length=512,
    )


X_train_tokenized = _tokenize_statements(X_train)
X_test_tokenized = _tokenize_statements(X_test)

# Variable-length token lists plus integer labels; the collator builds tensors.
train_dataset = Dataset.from_dict({
    'input_ids': X_train_tokenized['input_ids'],
    'attention_mask': X_train_tokenized['attention_mask'],
    'labels': y_train.tolist()
})

test_dataset = Dataset.from_dict({
    'input_ids': X_test_tokenized['input_ids'],
    'attention_mask': X_test_tokenized['attention_mask'],
    'labels': y_test.tolist()
})

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

# Per-class weight = n_samples / (n_classes * class_count), so classes with
# fewer training rows receive proportionally larger weights.
class_counts = torch.bincount(torch.tensor(y_train))
num_classes = len(class_counts)
num_samples = len(y_train)

weight_denominator = num_classes * class_counts.float()
class_weights = (num_samples / weight_denominator).to(device)

# Cross-entropy that scales each class's contribution by its weight.
loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)

print("Class counts:", class_counts.tolist())
print("Class weights:", class_weights.cpu().tolist())

# Two-way mapping between integer class ids and status names, passed to the
# model so it is stored alongside its configuration.
id2label = {idx: str(name) for idx, name in enumerate(le.classes_)}
label2id = {name: idx for idx, name in id2label.items()}

# Pretrained BERT encoder with a classification head sized to the label set.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)
model = model.to(device)

class CustomTrainer(Trainer):
    """Trainer whose loss is the notebook's class-weighted cross-entropy (``loss_fn``)."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: The model to run. It may arrive wrapped (for example in
                ``nn.DataParallel`` on a multi-GPU host), so only its forward
                call is relied on here.
            inputs: Mapping of batch tensors; must contain ``"labels"``.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just the loss.
            num_items_in_batch: Accepted for signature compatibility; unused.

        Returns:
            The scalar loss, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.get("labels").to(device)
        # Withhold the labels from the forward pass so the model does not also
        # compute its own, unweighted loss.
        inputs = {k: v.to(device) for k, v in inputs.items() if k != "labels"}
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Read the class count from the logits rather than model.config: a
        # wrapped model (e.g. nn.DataParallel) does not expose `.config`, which
        # made the previous `model.config.num_labels` raise AttributeError.
        loss = loss_fn(
            logits.view(-1, logits.size(-1)),
            labels.view(-1)
        )
        return (loss, outputs) if return_outputs else loss

# Hyper-parameters and bookkeeping options for the fine-tuning run.
training_config = dict(
    output_dir="./results",
    num_train_epochs=6,
    per_device_train_batch_size=40,
    save_strategy="epoch",
    report_to=[],
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=50,
)
training_args = TrainingArguments(**training_config)

# Trainer with the weighted loss; it receives the tokenizer as well.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
)

# Run fine-tuning.
trainer.train()

In [None]:
# Run the model on the held-out split; the result unpacks to
# (predictions, label ids, metrics).
pred, labels, _ = trainer.predict(test_dataset)

# Highest-scoring class per example.
predicted_labels = np.argmax(pred, axis=1)

# Pin the label set to every encoded class so the report rows and the matrix
# rows/columns always line up with `le.classes_`, even if a class happens to
# be absent from both the true and the predicted labels.
class_ids = np.arange(len(le.classes_))

print(classification_report(
    y_test, predicted_labels, labels=class_ids, target_names=le.classes_
))

cm = confusion_matrix(y_test, predicted_labels, labels=class_ids)

# Label the axes with the class names instead of bare integer ids so the
# heatmap can be read without the LABEL_i lookup table.
plt.figure(figsize=(12,8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    xticklabels=le.classes_,
    yticklabels=le.classes_,
)
plt.xlabel('predicted')
plt.ylabel('True')
plt.show()

In [None]:
import os
from huggingface_hub import login, create_repo
from kaggle_secrets import UserSecretsClient

# Read the Hugging Face token from the Kaggle secret store (never hardcode it)
# and authenticate this session.
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("HF_TOKEN")
login(token=hf_token)

repo_id = "BienKieu/mental-health"

# Upload the fine-tuned model and its tokenizer to `repo_id`.
# `Trainer.push_to_hub` takes the commit message as its first positional
# argument, so the previous `trainer.push_to_hub(repo_id)` used the repo id as
# a commit message and uploaded to a repository named after `output_dir`.
# The model/tokenizer `push_to_hub` methods take the repository id directly.
trainer.model.push_to_hub(repo_id)
tokenizer.push_to_hub(repo_id)