# Mental Health - Sentiment Analysis

<img src="https://lifework.arizona.edu/sites/default/files/styles/az_large/public/2023-01/news-banner-anxiety-problem.png?itok=5Sk4Xdgp" alt="Imagen Principal" style="width:100%;display:block">

## Descripción del Dataset

This comprehensive dataset is a meticulously curated collection of mental health statuses tagged from various statements. The dataset amalgamates raw data from multiple sources, cleaned and compiled to create a robust resource for developing chatbots and performing sentiment analysis.

# Import Data from Kaggle

In [1]:
import os
import zipfile
from google.colab import files

In [2]:
# Open Colab's upload dialog; the commands below expect the uploaded file to be named kaggle.json.
files.upload()

# Copy the uploaded credentials into ~/.kaggle and restrict the file to
# owner read/write only (mode 600).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


In [3]:
# Fetch the dataset archive with the Kaggle CLI; per the cell output it is saved as a .zip under /content.
!kaggle datasets download -d suchintikasarkar/sentiment-analysis-for-mental-health

Dataset URL: https://www.kaggle.com/datasets/suchintikasarkar/sentiment-analysis-for-mental-health
License(s): DbCL-1.0
Downloading sentiment-analysis-for-mental-health.zip to /content
 45% 5.00M/11.1M [00:00<00:00, 44.7MB/s]
100% 11.1M/11.1M [00:00<00:00, 61.1MB/s]


In [4]:
# Unpack every member of the downloaded archive into the local "data" directory.
with zipfile.ZipFile(
    "/content/sentiment-analysis-for-mental-health.zip", mode="r"
) as archive:
    archive.extractall(path="data")

## Lectura y Preprocesamiento

In [29]:
import numpy as np
import script as pre #Script con las funciones a usar

from transformers import AutoTokenizer, BertModel
from torch import nn
import torch

In [7]:
# CSV produced by extracting the Kaggle archive into /content/data.
path = "/content/data/Combined Data.csv"
# `pre.read` lives in the project-local `script` module (not visible here).
# NOTE(review): presumably returns a pandas DataFrame with `statement` and
# `status` columns, judging by the .head() preview below — confirm.
data = pre.read(path)
# Preview the first rows as the cell output.
data.head()

Unnamed: 0,statement,status
0,oh my gosh,Anxiety
1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,I've shifted my focus to something else but I'...,Anxiety
4,"I'm restless and restless, it's been a month n...",Anxiety


In [21]:
def tokenizer_text(text, max_length = 128, tokenizer = None):
  """Tokenize a single string into fixed-length PyTorch tensors.

  Args:
    text: The string to encode.
    max_length: Length every encoding is truncated or padded to.
    tokenizer: Callable tokenizer to use. When omitted, the notebook-level
      ``tokenizer`` variable is looked up at call time, which is what the
      original one-argument form relied on implicitly.

  Returns:
    The object returned by the tokenizer call. Callers in this notebook
    read ``.input_ids`` and ``.attention_mask`` from it.

  Raises:
    NameError: If no tokenizer is passed and no global ``tokenizer`` exists.
  """
  if tokenizer is None:
    # Backward-compatible fallback: resolve the global lazily so the function
    # can still be defined before the tokenizer cell has been executed.
    try:
      tokenizer = globals()["tokenizer"]
    except KeyError:
      raise NameError("name 'tokenizer' is not defined") from None

  # Calling the tokenizer object directly is the documented entry point that
  # supersedes the legacy encode_plus method; the keyword arguments are the same.
  return tokenizer(text,
                   max_length = max_length,
                   truncation = True,
                   padding = 'max_length',
                   add_special_tokens = True,
                   return_tensors = 'pt')

def tokenizer_data(df, tokenizer, max_length = 128):
  """Tokenize every row of ``df['statement']`` into two dense arrays.

  Args:
    df: Table-like object supporting ``len(df)`` and ``df['statement']``.
    tokenizer: Callable tokenizer; it is invoked once per row and must
      return an object exposing ``input_ids`` and ``attention_mask``.
    max_length: Length every row is truncated or padded to.

  Returns:
    ``(ids, masks)``: two float NumPy arrays of shape
    ``(len(df), max_length)``. Both are empty (zero rows) for an empty ``df``.
  """
  n_rows = len(df)
  ids = np.zeros((n_rows, max_length))
  masks = np.zeros((n_rows, max_length))

  for i, text in enumerate(df['statement']):
    # str() keeps non-string cells (e.g. numbers) from breaking the tokenizer.
    encoded = tokenizer(str(text),
                        max_length = max_length,
                        truncation = True,
                        padding = 'max_length',
                        add_special_tokens = True,
                        return_tensors = 'pt')

    ids[i, :] = encoded.input_ids
    masks[i, :] = encoded.attention_mask

  # The return must sit outside the loop: returning from inside it stopped
  # after the first row and left every other row filled with zeros.
  return ids, masks

In [24]:
# Load the tokenizer of the same checkpoint that Model loads as its encoder
# ("bert-base-uncased"), so the token ids are guaranteed to match the encoder's
# embedding table instead of relying on another checkpoint's vocabulary.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
x_ids, x_masks = tokenizer_data(data, tokenizer)

In [25]:
# Bundle token ids, attention masks and the `status` labels into one dataset.
# NOTE(review): `pre.df_to_Dataset` / `pre.split_data` come from the
# project-local `script` module; label encoding and tensor dtypes are decided
# there — confirm they yield integer ids and integer class labels.
dataset = pre.df_to_Dataset(x_ids, x_masks, data["status"])

# 80 / 10 / 10 split, shuffled, served in batches of 32.
train, val, test = pre.split_data(
    dataset,
    train = 0.8,
    val = 0.1,
    test = 0.1,
    shuffle = True,
    batch_size = 32,
)

# Model

### Construcción del Modelo

In [30]:
class Model(nn.Module):
    """BERT encoder followed by dropout, batch normalisation and a linear head.

    Args:
        num_labels: Number of output classes (width of the logit vector).
        pretrained_name: Checkpoint identifier handed to
            ``BertModel.from_pretrained``.
        dropout: Drop probability applied to the pooled encoder output.
    """

    def __init__(self, num_labels = 7, pretrained_name = "bert-base-uncased", dropout = 0.3):
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_name)
        hidden_size = self.bert.config.hidden_size

        # Attribute names are part of the state_dict keys; keep them stable
        # (including "fc3") so previously saved weights still load.
        self.dropout = nn.Dropout(p = dropout)
        self.batchnorm = nn.BatchNorm1d(hidden_size)
        self.fc3 = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Compute unnormalised class scores for a batch.

        Args:
            input_ids: Token ids forwarded to the BERT encoder.
            attention_mask: Attention mask forwarded to the BERT encoder.

        Returns:
            Logits produced by the final linear layer (last dimension is
            ``num_labels``); no softmax is applied.
        """
        bert_outputs = self.bert(input_ids = input_ids, attention_mask = attention_mask)
        pooled = bert_outputs.pooler_output

        x = self.dropout(pooled)
        # NOTE(review): BatchNorm1d in training mode rejects a batch holding a
        # single sample — make sure the loader never yields a size-1 batch.
        x = self.batchnorm(x)
        return self.fc3(x)

In [31]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Build the classifier with its default arguments and move its weights to `device`.
model = Model()
model = model.to(device)
model

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Model(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=

### Funciones de Entrenamiento y Validación

In [None]:
def train_model(dataloader, model, fn_loss, opt, device = None, log_every = 25):
    """Run one training epoch, printing progress and the epoch metrics.

    Args:
        dataloader: Iterable of ``(input_ids, attention_mask, labels)``
            batches that also exposes ``.dataset`` (used for the progress
            counter).
        model: Module called as ``model(input_ids, attention_mask)`` and
            returning per-class scores.
        fn_loss: Loss callable taking ``(outputs, labels)``.
        opt: Optimizer updating ``model``'s parameters.
        device: Device the batches are moved to. Defaults to the device of
            the model's first parameter (CPU if the model has none), so the
            function no longer depends on a notebook-global ``device``.
        log_every: Print the running batch loss every this many batches;
            ``0``/``None`` disables the per-batch log.

    Returns:
        ``(avg_loss, avg_acc)``: sample-weighted mean loss and accuracy
        (fraction in ``[0, 1]``) over the epoch; both are ``nan`` when the
        loader yields no samples.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()

    train_size = len(dataloader.dataset)
    total_loss = 0.0
    total_correct = 0
    total_samples = 0

    for batch_idx, (input_ids, attention_mask, labels) in enumerate(dataloader, start = 1):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        outputs = model(input_ids, attention_mask)
        loss = fn_loss(outputs, labels)

        opt.zero_grad()
        loss.backward()
        opt.step()

        # Monitoring: weight the (mean) batch loss by the batch size so a
        # smaller final batch does not skew the epoch average.
        batch_size = input_ids.size(0)
        batch_loss = loss.item()
        total_loss += batch_loss * batch_size
        total_correct += (outputs.argmax(dim = 1) == labels).sum().item()
        total_samples += batch_size

        # Keyed on the batch index rather than on `total_samples % 25`, which
        # only fired regularly by coincidence of the batch size.
        if log_every and batch_idx % log_every == 0:
            print(f"Batch Loss: {batch_loss:.7f} [{total_samples :> 5d}/{train_size :> 5d}]")

    # Guard against an empty loader instead of raising ZeroDivisionError.
    avg_loss = total_loss / total_samples if total_samples else float("nan")
    avg_acc = total_correct / total_samples if total_samples else float("nan")
    print(f"Epoch Loss: {avg_loss:.4f}, Epoch Accuracy: {avg_acc:.4f}")
    return avg_loss, avg_acc

def eval_model(dataloader, model, fn_loss, device = None):
    """Evaluate ``model`` on ``dataloader`` without updating any weights.

    Args:
        dataloader: Iterable of ``(input_ids, attention_mask, labels)`` batches.
        model: Module called as ``model(input_ids, attention_mask)`` and
            returning per-class scores.
        fn_loss: Loss callable taking ``(outputs, labels)``.
        device: Device the batches are moved to. Defaults to the device of
            the model's first parameter (CPU if the model has none), so the
            function no longer depends on a notebook-global ``device``.

    Returns:
        ``(avg_loss, avg_acc)``: sample-weighted mean loss and accuracy
        (fraction in ``[0, 1]``); both are ``nan`` when the loader yields no
        samples.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    # Switch dropout / batch-norm layers to inference behaviour.
    model.eval()

    total_loss = 0.0
    total_correct = 0
    total_samples = 0

    with torch.no_grad():
        for input_ids, attention_mask, labels in dataloader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            outputs = model(input_ids, attention_mask)

            # Weight the mean batch loss by the batch size so the result is a
            # true per-sample average even with a smaller last batch.
            batch_size = input_ids.size(0)
            total_loss += fn_loss(outputs, labels).item() * batch_size
            total_correct += (outputs.argmax(dim = 1) == labels).sum().item()
            total_samples += batch_size

    # Guard against an empty loader instead of raising ZeroDivisionError.
    avg_loss = total_loss / total_samples if total_samples else float("nan")
    avg_acc = total_correct / total_samples if total_samples else float("nan")

    print(f"\t\tLoss: {avg_loss:.4f} / Val: Accuracy: {100*avg_acc:.2f}%")
    return avg_loss, avg_acc

## Entrenando el Modelo

In [34]:
# Multi-class objective over the raw logits returned by the model.
fn_loss = nn.CrossEntropyLoss()
# Every BERT weight is being updated, so use a learning rate in the range
# recommended for BERT fine-tuning (2e-5 to 5e-5). The previous 5e-4 is about
# ten times larger and tends to wipe out the pretrained representations.
opt = torch.optim.Adam(model.parameters(), lr = 2e-5, eps = 1e-08)

In [None]:
# Number of full passes over the training split.
epoch = 4
for t in range(epoch):
    print(f"Epoch {t + 1}/{epoch}\n-------------------------------------------------------")
    # One optimisation pass over `train`, followed by a gradient-free
    # evaluation on `val`; both calls print their own metrics.
    train_model(train, model, fn_loss, opt)
    eval_model(val, model, fn_loss)
print("Modelo Entrenado")

### Guardando el modelo para no entrenar de nuevo

In [None]:
# Persist only the weights (state_dict), not the Model object itself, to the
# current working directory.
# NOTE(review): the file name says "rober" although the encoder loaded in
# Model is "bert-base-uncased" — confirm the intended name before anything
# starts depending on this path.
torch.save(model.state_dict(), 'rober_model.pth')