In [None]:
!pip install transformers datasets torch scikit-learn pandas numpy accelerate -q

import pandas as pd
import numpy as np
import torch
from transformers import (
    DistilBertTokenizer,
    DistilBertForSequenceClassification,
    TrainingArguments,
    Trainer
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from datasets import Dataset
import warnings
warnings.filterwarnings('ignore')

# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


In [None]:
from google.colab import files
print(" Upload your kaggle.json file (from kaggle.com/settings):")
uploaded = files.upload()

!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

print("\n Downloading dataset...")
# Using the best beginner-friendly dataset
!kaggle datasets download -d suchintikasarkar/sentiment-analysis-for-mental-health
!unzip -q sentiment-analysis-for-mental-health.zip

print(" Dataset downloaded!")

 Upload your kaggle.json file (from kaggle.com/settings):


Saving kaggle.json to kaggle (1).json

 Downloading dataset...
Dataset URL: https://www.kaggle.com/datasets/suchintikasarkar/sentiment-analysis-for-mental-health
License(s): DbCL-1.0
sentiment-analysis-for-mental-health.zip: Skipping, found more recently modified local copy (use --force to force download)
replace Combined Data.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
 Dataset downloaded!


In [None]:
# Read the CSV extracted from the Kaggle archive (adjust the filename if it differs).
df = pd.read_csv('Combined Data.csv')

# Quick orientation: size, a sample of rows, the column names and class balance.
print("Dataset shape:", df.shape)
print("\nFirst few rows:", df.head(), sep="\n")
print("\nColumn names:", df.columns.tolist(), sep="\n")
print("\nClass distribution:", df['status'].value_counts(), sep="\n")

Dataset shape: (53043, 3)

First few rows:
   Unnamed: 0                                          statement   status
0           0                                         oh my gosh  Anxiety
1           1  trouble sleeping, confused mind, restless hear...  Anxiety
2           2  All wrong, back off dear, forward doubt. Stay ...  Anxiety
3           3  I've shifted my focus to something else but I'...  Anxiety
4           4  I'm restless and restless, it's been a month n...  Anxiety

Column names:
['Unnamed: 0', 'statement', 'status']

Class distribution:
status
Normal                  16351
Depression              15404
Suicidal                10653
Anxiety                  3888
Bipolar                  2877
Stress                   2669
Personality disorder     1201
Name: count, dtype: int64


In [None]:
print("\nðŸ”§ PREPROCESSING DATA...")

# Clean data: drop rows missing the text or the label, then very short texts.
# .copy() makes the filtered frame independent, so the column assignments below
# never write into a slice of the raw frame.
df = df.dropna(subset=['statement', 'status'])
df = df[df['statement'].str.len() > 10].copy()  # Remove very short texts

# Check unique labels in your dataset
print(f"\n   Unique labels found: {df['status'].unique()}")

# Map the dataset's status values to 2 classes:
#   0 = any distress category (Anxiety, Stress, Depression, Suicidal, Bipolar,
#       Personality disorder)
#   1 = Normal / Neutral
# Adjust this mapping based on the unique labels printed above.
label_map = {
    'Anxiety': 0,
    'Stress': 0,
    'Depression': 0,
    'Suicidal': 0,
    'Normal': 1,
    'Neutral': 1,
    'Bipolar': 0,
    'Personality disorder': 0
}

# Apply mapping; statuses missing from label_map become NaN and are dropped,
# but report them first so the data loss is visible.
df['label'] = df['status'].map(label_map)
unmapped = df.loc[df['label'].isna(), 'status'].value_counts()
if not unmapped.empty:
    print(f"   WARNING: dropping {int(unmapped.sum())} rows with unmapped labels: {unmapped.to_dict()}")
df = df.dropna(subset=['label'])
df['label'] = df['label'].astype(int)

# Rename for consistency
df = df.rename(columns={'statement': 'text'})

print(f"\n After preprocessing:")
print(f"   Total samples: {len(df)}")
print(f"   Label distribution:")
print(df['label'].value_counts())
# Legend matches label_map above: only classes 0 and 1 exist.
print(f"   0 = Stress/Anxiety/Negative")
print(f"   1 = Neutral")



ðŸ”§ PREPROCESSING DATA...

   Unique labels found: ['Anxiety' 'Normal' 'Depression' 'Suicidal' 'Stress' 'Bipolar'
 'Personality disorder']

 After preprocessing:
   Total samples: 52186
   Label distribution:
label
0    36317
1    15869
Name: count, dtype: int64
   0 = Stress/Anxiety/Negative
   1 = Neutral
   2 = Positive


In [None]:
print("\n  SPLITTING DATA")

# 80/20 hold-out split, stratified on the label so both parts keep the same
# class proportions; the fixed random_state makes the split repeatable.
train_df, test_df = train_test_split(
    df.loc[:, ['text', 'label']],
    stratify=df['label'],
    test_size=0.2,
    random_state=42,
)

print(f"   Training samples: {train_df.shape[0]}")
print(f"   Testing samples: {test_df.shape[0]}")

# Convert to HuggingFace datasets (index reset so it is not carried along as a column).
train_dataset, test_dataset = (
    Dataset.from_pandas(frame.reset_index(drop=True))
    for frame in (train_df, test_df)
)



  SPLITTING DATA
   Training samples: 41748
   Testing samples: 10438


In [None]:
print("\n TOKENIZING TEXT")

# Tokenizer matching the 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')


def tokenize_function(examples):
    """Tokenize the 'text' field of a batch, padded/truncated to 128 tokens."""
    encoding_options = {'padding': 'max_length', 'truncation': True, 'max_length': 128}
    return tokenizer(examples['text'], **encoding_options)


# Tokenize both splits in batched mode.
train_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)

# Set format for PyTorch: expose only the model inputs and the label as tensors.
for split in (train_dataset, test_dataset):
    split.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])



 TOKENIZING TEXT


Map:   0%|          | 0/41748 [00:00<?, ? examples/s]

Map:   0%|          | 0/10438 [00:00<?, ? examples/s]

In [None]:
print("\n LOADING MODEL")

# One output class per distinct value in the 'label' column.
num_labels = df['label'].nunique(dropna=False)

# Pretrained DistilBERT encoder with a classification head sized to num_labels.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', num_labels=num_labels
)
model.to(device)

print(f" Model loaded with {num_labels} classes")


 LOADING MODEL


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


 Model loaded with 2 classes


In [None]:
pip install --upgrade transformers




In [None]:
import transformers
# Show which transformers version the kernel is actually using.
print(transformers.__version__)


4.57.1


In [None]:
# Fine-tuning hyperparameters: 3 epochs, batch size 16 (train) / 32 (eval),
# 500 warmup steps, weight decay 0.01, learning rate 2e-5, loss logged every 100 steps.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    learning_rate=2e-5,
    # Mixed precision requires a CUDA device; enable it only when one is
    # present so the notebook's CPU fallback still works.
    fp16=torch.cuda.is_available(),
    # Disable external experiment trackers so training does not stop to ask for
    # a Weights & Biases API key. Set to 'wandb' to re-enable that logging.
    report_to='none',
)


In [None]:
def compute_metrics(pred):
    """Compute accuracy for one evaluation pass.

    Args:
        pred: object exposing `predictions` (scores; the arg-max over the last
            axis is taken as the predicted class) and `label_ids` (true labels).

    Returns:
        dict with a single key, 'accuracy'.
    """
    predicted_classes = np.argmax(pred.predictions, axis=-1)
    return {'accuracy': accuracy_score(pred.label_ids, predicted_classes)}

In [None]:
# Bundle the model, hyperparameters, both splits and the metric callback.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

print(f"\n{'=' * 50}")
print(" STARTING TRAINING (This will take 30-60 minutes)")
print(f"{'=' * 50}\n")

# Left as the last expression so the notebook displays the returned TrainOutput.
trainer.train()


 STARTING TRAINING (This will take 30-60 minutes)



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 Â·Â·Â·Â·Â·Â·Â·Â·Â·Â·


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mavishiagrawal0309[0m ([33mavishiagrawal0309-iiit-dharwad[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
100,0.5799
200,0.3558
300,0.2154
400,0.1946
500,0.1471
600,0.1681
700,0.1448
800,0.1382
900,0.132
1000,0.1417


TrainOutput(global_step=7830, training_loss=0.08568423879862136, metrics={'train_runtime': 1227.552, 'train_samples_per_second': 102.027, 'train_steps_per_second': 6.379, 'total_flos': 4147686719318016.0, 'train_loss': 0.08568423879862136, 'epoch': 3.0})

In [None]:
print("\n" + "="*50)
print(" EVALUATING MODEL")
print("="*50 + "\n")

# A single predict() pass returns both the metrics and the raw logits, so the
# test split is only run through the model once. metric_key_prefix='eval'
# keeps the metric keys in the 'eval_*' form (e.g. 'eval_accuracy').
predictions = trainer.predict(test_dataset, metric_key_prefix='eval')
results = predictions.metrics
print(f" Test Accuracy: {results['eval_accuracy']*100:.2f}%")

# Detailed predictions: arg-max over the class axis of the logits.
preds = predictions.predictions.argmax(-1)
labels = predictions.label_ids

# Classification report. The slice trims the display names to the number of
# classes the model was built with.
label_names = ['Stress/Anxiety', 'Neutral', 'Positive'][:num_labels]
print("\n Detailed Classification Report:")
print(classification_report(labels, preds, target_names=label_names))


 EVALUATING MODEL



 Test Accuracy: 97.06%

 Detailed Classification Report:
                precision    recall  f1-score   support

Stress/Anxiety       0.98      0.98      0.98      7264
       Neutral       0.96      0.94      0.95      3174

      accuracy                           0.97     10438
     macro avg       0.97      0.96      0.97     10438
  weighted avg       0.97      0.97      0.97     10438



In [None]:
print("\n SAVING MODEL...")
# Model and tokenizer are written to the same directory so that it can be
# reloaded later with from_pretrained().
for artifact in (model, tokenizer):
    artifact.save_pretrained('./mental_health_model')
print(" Model saved to './mental_health_model'")



 SAVING MODEL...
 Model saved to './mental_health_model'


In [None]:
def predict_sentiment(text):
    """Predict the class of a single text with the fine-tuned model.

    Uses the notebook-level `tokenizer`, `model`, `device`, `label_names` and
    `num_labels`.

    Args:
        text: input string; tokenized with truncation to 128 tokens.

    Returns:
        dict with:
            'sentiment': display name of the most probable class,
            'confidence': its probability as a percentage,
            'all_probabilities': {class name: probability in percent}.
    """
    # Inference must run in eval mode: if this is called right after training,
    # dropout may still be active and would make the probabilities noisy.
    model.eval()

    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    # Single input, so row 0 holds the class distribution.
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)[0]
    predicted_class = torch.argmax(probs, dim=-1).item()

    return {
        'sentiment': label_names[predicted_class],
        'confidence': probs[predicted_class].item() * 100,
        'all_probabilities': {label_names[i]: probs[i].item() * 100 for i in range(num_labels)}
    }

# Hand-written sentences used as a quick sanity check of the trained model.
test_texts = [
    "I'm feeling really overwhelmed with work and can't seem to catch a break. Everything is stressing me out.",
    "Had a wonderful day today! Feeling grateful for everything and everyone in my life.",
    "Just finished my project. It was okay, nothing special.",
    "I can't stop worrying about everything. My anxiety is through the roof.",
    "Feeling peaceful and relaxed after a good meditation session."
]

print(f"\n{'=' * 50}")
print(" TESTING LIVE PREDICTIONS")
print(f"{'=' * 50}\n")

# For each sample: first 60 characters, predicted class and confidence, then a blank line.
for i, text in enumerate(test_texts, start=1):
    result = predict_sentiment(text)
    print(
        f"{i}. Text: {text[:60]}...\n"
        f" Sentiment: {result['sentiment']} ({result['confidence']:.1f}% confidence)\n"
    )


 TESTING LIVE PREDICTIONS

1. Text: I'm feeling really overwhelmed with work and can't seem to c...
 Sentiment: Stress/Anxiety (99.8% confidence)

2. Text: Had a wonderful day today! Feeling grateful for everything a...
 Sentiment: Neutral (99.0% confidence)

3. Text: Just finished my project. It was okay, nothing special....
 Sentiment: Neutral (99.9% confidence)

4. Text: I can't stop worrying about everything. My anxiety is throug...
 Sentiment: Stress/Anxiety (99.9% confidence)

5. Text: Feeling peaceful and relaxed after a good meditation session...
 Sentiment: Neutral (99.7% confidence)



In [None]:
print("\n CREATING INTERACTIVE DEMO...")

!pip install gradio -q
import gradio as gr

def analyze_text(text):
    """Format the prediction for `text` as Markdown for the Gradio output.

    Args:
        text: user input; may be None, empty or whitespace-only.

    Returns:
        A prompt to enter text when the input is empty, otherwise a Markdown
        string with the predicted class, its confidence, and one list item per
        class showing its probability and a proportional bar.
    """
    # Guard against None as well as blank input so the UI never surfaces a traceback.
    if not text or not text.strip():
        return " Please enter some text to analyze."

    result = predict_sentiment(text)

    output = f"""
**Sentiment**: {result['sentiment']}
**Confidence**: {result['confidence']:.1f}%

**Detailed Probabilities**:
"""
    for label, prob in result['all_probabilities'].items():
        # One full-block character per 5 percentage points (a bar made of
        # spaces would be invisible).
        bar = "\u2588" * int(prob / 5)
        # "- " makes each class a Markdown list item on its own line.
        output += f"\n- {label}: {prob:.1f}% {bar}"

    return output

# Gradio UI: a multi-line textbox feeds analyze_text, whose Markdown result is
# rendered below; three clickable example inputs are provided.
demo = gr.Interface(
    title=" Mental Health Sentiment Analyzer",
    description="Analyze text for stress, anxiety, and emotional well-being indicators using fine-tuned BERT.",
    fn=analyze_text,
    inputs=gr.Textbox(
        lines=5,
        label="Enter your text",
        placeholder="Type or paste your thoughts here...",
    ),
    outputs=gr.Markdown(label="Analysis Result"),
    examples=[
        [sample]
        for sample in (
            "I feel so stressed and overwhelmed with everything going on.",
            "Today was amazing! I'm so happy and grateful.",
            "Just another day at work, nothing special.",
        )
    ],
)

print("\n Launching demo...")
# share=True requests a temporary public URL for the demo.
demo.launch(share=True)



 CREATING INTERACTIVE DEMO...

 Launching demo...
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://bbbfd0b117d435b2e5.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
!pip install gradio huggingface_hub -q


In [None]:
from huggingface_hub import notebook_login

# Shows the Hugging Face login widget in the notebook; the access token is
# entered interactively rather than written into the notebook source.
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.svâ€¦

In [None]:
%%writefile app.py
import gradio as gr

def predict_sentiment(text):
    if "sad" in text.lower():
        return "Negative ðŸ˜”"
    else:
        return "Positive ðŸ˜Š"

demo = gr.Interface(
    fn=predict_sentiment,
    inputs=gr.Textbox(lines=2, placeholder="Enter text here..."),
    outputs="text",
    title="Mental Health Sentiment Analysis",
    description="Predicts whether the text expresses positive or negative sentiment.",
)

if __name__ == "__main__":
    demo.launch()



Writing app.py


In [None]:
# List the working directory to check which files are present (e.g. app.py).
!ls


app.py	sample_data


In [None]:
# Launches the `demo` object currently defined in the kernel (app.py is only
# written to disk above, not executed here).
# NOTE(review): despite the "OR" below, both lines run when this cell executes,
# so a local launch happens AND the deploy command starts — confirm that is intended.
demo.launch()
# OR, for permanent hosting:
# `gradio deploy` is interactive: it asks for a Hugging Face token with write access.
!gradio deploy


Rerunning server... use `close()` to stop if you need to change `launch()` parameters.
----
It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://7f922c116669244095.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Need [32m'write'[0m access token to create a Spaces repo.

    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Processing Files (0 / 0)      : |          |  0.00B /  0.00B            
New Data Upload               : |          |  0.00B /  0.00B            [A

  ...ample_data/mnist_test.csv: 100% 18.3M/18.3M [00:00<?, ?B/s][A[A


  ...ata/mnist_train_small.csv: 100% 36.5M/36.5M [00:00<?, ?