---
## This is a test of the emergency throw stuff at the wall service
---

## Map of stuff included in this notebook at some point.

In [2]:
# Library imports

# core just in case
import os
import sys

# Data handling
import numpy as np
import pandas as pd

# ML framework (TensorFlow -> includes Keras)
import tensorflow as tf
import torch
from torch.utils.data import Dataset
from tensorflow.keras import layers, Model, callbacks, optimizers

# Hugging Face * had a small error with transformers -> (pip install transformers datasets huggingface_hub) -> error go byebye.
from transformers import (
    AutoTokenizer,
    TFAutoModelForSequenceClassification,
    pipeline,)
from datasets import load_dataset
from huggingface_hub import notebook_login

# Evaluation / Utilities
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder

# Logging / visualization
import matplotlib.pyplot as plt
import seaborn as sns

#Note had to install earlier version of Keras via (pip install tf-keras) -> restart kernel -> working:)


In [3]:
# Load data
df = pd.read_csv('../data/sentiment.csv')  
print(df.head())

   Unnamed: 0                                          statement   status
0           0                                         oh my gosh  Anxiety
1           1  trouble sleeping, confused mind, restless hear...  Anxiety
2           2  All wrong, back off dear, forward doubt. Stay ...  Anxiety
3           3  I've shifted my focus to something else but I'...  Anxiety
4           4  I'm restless and restless, it's been a month n...  Anxiety


In [4]:
print(df["status"].value_counts())

status
Normal                  16351
Depression              15404
Suicidal                10653
Anxiety                  3888
Bipolar                  2877
Stress                   2669
Personality disorder     1201
Name: count, dtype: int64


In [5]:
# Note: Leaning on bert instead of installing SpaCy just yet. 

In [8]:
# Dataset Class Definition
class HF_Dataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        # tokenize all at once
        self.encodings = tokenizer(
            texts,
            truncation=True,
            padding="max_length",
            max_length=max_length
        )
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [9]:
# Prep datasets
train_dataset = HF_Dataset(train_texts, train_labels, tokenizer)
test_dataset  = HF_Dataset(test_texts,  test_labels,  tokenizer)

NameError: name 'train_texts' is not defined

In [10]:
# Encode labels
le = LabelEncoder()
df["label"] = le.fit_transform(df["status"])
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("Mapping:", label_mapping)
num_labels = len(le.classes_)

Mapping: {'Anxiety': 0, 'Bipolar': 1, 'Depression': 2, 'Normal': 3, 'Personality disorder': 4, 'Stress': 5, 'Suicidal': 6}


In [11]:
# Train/validation split
train_df, test_df = train_test_split(
    df, test_size=0.1, stratify=df["label"], random_state=42)

print("Train size:", len(train_df))
print("Validation size:", len(test_df))
print("Label distribution (train):")
print(train_df["status"].value_counts(normalize=True))
print("Label distribution (test):")
print(test_df["status"].value_counts(normalize=True))

Train size: 47738
Validation size: 5305
Label distribution (train):
status
Normal                  0.308266
Depression              0.290398
Suicidal                0.200846
Anxiety                 0.073296
Bipolar                 0.054234
Stress                  0.050316
Personality disorder    0.022644
Name: proportion, dtype: float64
Label distribution (test):
status
Normal                  0.308200
Depression              0.290481
Suicidal                0.200754
Anxiety                 0.073327
Bipolar                 0.054288
Stress                  0.050330
Personality disorder    0.022620
Name: proportion, dtype: float64


In [12]:
# Tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def encode(df_subset):
    # force strings and drop/fill NaNs
    texts = df_subset["statement"].astype(str).fillna("").tolist()
    encodings = tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=128,
        return_tensors="tf",
    )
    labels = tf.constant(df_subset["label"].values)
    return encodings, labels

train_enc, train_labels = encode(train_df)
test_enc,   test_labels   = encode(test_df)

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.


In [13]:
# Build tf.data pipelines
train_ds = (
    tf.data.Dataset
      .from_tensor_slices((dict(train_enc), train_labels))
      .shuffle(2000)
      .batch(16)
      .prefetch(tf.data.AUTOTUNE)
)

val_ds = (
    tf.data.Dataset
      .from_tensor_slices((dict(test_enc), test_labels))
      .batch(16)
      .prefetch(tf.data.AUTOTUNE))

In [14]:
# Load & compile model
model = TFAutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels
)

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.Recall(name="recall")])

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


AttributeError: 'Variable' object has no attribute '_distribute_strategy'

In [None]:
# Train
model.fit(
    train_ds,
    validation_data=test_ds,
    epochs=3)