<a href="https://colab.research.google.com/github/CaptainOdin/stress/blob/main/DeepLearningStressPractice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### P1 Load/Explore

In [1]:
!git clone https://github.com/CaptainOdin/stress.git

fatal: destination path 'stress' already exists and is not an empty directory.


In [4]:
#Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import re
from tensorflow.keras import layers, models, preprocessing
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split

In [3]:
stress = pd.read_csv('/content/stress/Stressv2.csv')
stress

Unnamed: 0,subreddit,post_id,sentence_range,text,label,confidence,social_timestamp
0,ptsd,8601tu,"(15, 20)","He said he had not felt that way before, sugge...",1,0.800000,1521614353
1,assistance,8lbrx9,"(0, 5)","Hey there r/assistance, Not sure if this is th...",0,1.000000,1527009817
2,ptsd,9ch1zh,"(15, 20)",My mom then hit me with the newspaper and it s...,1,0.800000,1535935605
3,relationships,7rorpp,"[5, 10]","until i met my new boyfriend, he is amazing, h...",1,0.600000,1516429555
4,survivorsofabuse,9p2gbc,"[0, 5]",October is Domestic Violence Awareness Month a...,1,0.800000,1539809005
...,...,...,...,...,...,...,...
2833,relationships,7oee1t,"[35, 40]","* Her, a week ago: Precious, how are you? (I i...",0,1.000000,1515187044
2834,ptsd,9p4ung,"[20, 25]",I don't have the ability to cope with it anymo...,1,1.000000,1539827412
2835,anxiety,9nam6l,"(5, 10)",In case this is the first time you're reading ...,0,1.000000,1539269312
2836,almosthomeless,5y53ya,"[5, 10]",Do you find this normal? They have a good rela...,0,0.571429,1488938143


#### LSTM and BERT
##### First preprocess text

In [11]:
# Extract the text and labels from the dataset, use NLTK
texts = stress['text'].values
labels = stress['label'].values

import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Initialize the lemmatizer and the list of stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    # Remove non-ASCII characters
    text = ''.join(c for c in text if 0 < ord(c) < 127)

    # Lowercase the text
    text = text.lower()

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Remove special characters and digits
    text = re.sub(r'\W+|\d+', ' ', text)

    # Tokenize the text
    words = word_tokenize(text)

    # Lemmatize the words and filter out stopwords
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

    # Join the words back into a single string
    cleaned_text = ' '.join(words)

    return cleaned_text

# Clean the text data
cleaned_texts = [clean_text(text) for text in texts]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
# 1. Print a Sample
sample_size = 5
for i in range(sample_size):
    print("Original Text:", texts[i])
    print("Cleaned Text:", cleaned_texts[i])
    print("-" * 50)

# 2. Check for Stopwords
for i in range(sample_size):
    remaining_stopwords = [word for word in cleaned_texts[i].split() if word in stop_words]
    print(f"Cleaned Text {i+1} Remaining Stopwords:", remaining_stopwords)


Original Text: He said he had not felt that way before, suggeted I go rest and so ..TRIGGER AHEAD IF YOUI'RE A HYPOCONDRIAC LIKE ME: i decide to look up "feelings of doom" in hopes of maybe getting sucked into some rabbit hole of ludicrous conspiracy, a stupid "are you psychic" test or new age b.s., something I could even laugh at down the road. No, I ended up reading that this sense of doom can be indicative of various health ailments; one of which I am prone to.. So on top of my "doom" to my gloom..I am now f'n worried about my heart. I do happen to have a physical in 48 hours.
Cleaned Text: said felt way suggeted go rest trigger ahead youi hypocondriac like decide look feeling doom hope maybe getting sucked rabbit hole ludicrous conspiracy stupid psychic test new age b something could even laugh road ended reading sense doom indicative various health ailment one prone top doom gloom f n worried heart happen physical hour
--------------------------------------------------
Original Te

In [13]:
# Define max_words
max_words = 600

# Continue with tokenization and other preprocessing steps
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(cleaned_texts)
sequences = tokenizer.texts_to_sequences(cleaned_texts)

# Pad the sequences
max_length = 150
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')

In [14]:
cleaned_texts = ' '.join(stress['text']).split()
word_series = pd.Series(cleaned_texts)
word_counts = word_series.value_counts()

# Display the top N most frequent words
print(word_counts.head(50))

# Analyze the cumulative distribution
cumulative_distribution = word_counts.cumsum() / word_counts.sum()
print(cumulative_distribution)
max_words = cumulative_distribution[cumulative_distribution < 0.99].shape[0]


I          10513
to          8163
and         7669
the         5775
a           5217
my          3938
of          3583
in          2676
that        2641
for         2417
me          2328
was         2241
is          2069
have        2053
it          2009
with        1996
but         1804
this        1520
he          1433
on          1418
be          1286
so          1226
I'm         1201
her         1174
just        1167
you         1155
about       1151
or          1141
not         1128
like        1104
at          1081
she         1064
as           974
out          902
had          899
if           880
because      856
been         840
get          829
up           826
do           793
what         780
we           774
know         765
from         750
feel         739
would        737
are          735
when         716
can          715
dtype: int64
I              0.043240
to             0.076815
and            0.108357
the            0.132110
a              0.153567
                 

# Testing

In [15]:
# Split the dataset
train_data, test_data, train_labels, test_labels = train_test_split(padded_sequences, labels, test_size=0.2, random_state=36)
val_data, test_data, val_labels, test_labels = train_test_split(test_data, test_labels, test_size=0.5, random_state=36)

In [None]:
train_data.shape

In [16]:
#LSTM Model - v1
embedding_dim = 64

model = models.Sequential([
    layers.Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_length),
    layers.SpatialDropout1D(0.3),
    layers.Bidirectional(layers.LSTM(64, return_sequences=True, dropout=0.5)),
    layers.Bidirectional(layers.LSTM(32)),
    layers.Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.001)),
    layers.Dropout(0.5),
    layers.Dense(1, activation='sigmoid')
])
# Use Adam optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)

model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Implement early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True)

# Train and test
history = model.fit(train_data, train_labels, epochs=20, batch_size=75, validation_data=(val_data, val_labels), callbacks=[early_stopping])
test_loss, test_acc = model.evaluate(test_data, test_labels)
print(f'Test accuracy: {test_acc}')

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Test accuracy: 0.7147887349128723


In [None]:
#LSTM Model - v2
embedding_dim = 64

model = models.Sequential([
    layers.Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_length),
    layers.SpatialDropout1D(0.2),
    layers.Bidirectional(layers.LSTM(32, return_sequences=True, dropout=0.4)),
    layers.Bidirectional(layers.LSTM(16)),
    layers.Dense(32, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.001)),
    layers.Dropout(0.4),
    layers.Dense(1, activation='sigmoid')
])

# Use Adam optimizer with gradient clipping
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001, clipnorm=1.0)

model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Implement early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train and test
history = model.fit(train_data, train_labels, epochs=20, batch_size=50, validation_data=(val_data, val_labels), callbacks=[early_stopping])
test_loss, test_acc = model.evaluate(test_data, test_labels)
print(f'Test accuracy: {test_acc}')

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20

In [None]:
#LSTM Model - v3
embedding_dim = 50

model = models.Sequential([
    layers.Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_length),
    layers.SpatialDropout1D(0.4),
    layers.Bidirectional(layers.LSTM(32, return_sequences=True, dropout=0.5)),
    layers.Bidirectional(layers.LSTM(16)),
    layers.Dense(32, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.001)),
    layers.Dropout(0.6),
    layers.Dense(1, activation='sigmoid')
])

# Use Adam optimizer with a learning rate scheduler
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
lr_schedule = tf.keras.callbacks.LearningRateScheduler(lambda epoch: 0.0001 * 10**(epoch/20))

model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Implement early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True)

# Train and test
history = model.fit(train_data, train_labels, epochs=20, batch_size=40, validation_data=(val_data, val_labels), callbacks=[early_stopping, lr_schedule])
test_loss, test_acc = model.evaluate(test_data, test_labels)
print(f'Test accuracy: {test_acc}')

In [17]:
#LSTM Model - v4
embedding_dim = 50

model = models.Sequential([
    layers.Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_length),
    layers.SpatialDropout1D(0.5),
    layers.Bidirectional(layers.LSTM(32, return_sequences=True, dropout=0.6)),
    layers.Bidirectional(layers.LSTM(16)),
    layers.Dense(32, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    layers.Dropout(0.7),
    layers.Dense(1, activation='sigmoid')
])

# Use Adam optimizer with a learning rate scheduler
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
lr_schedule = tf.keras.callbacks.LearningRateScheduler(lambda epoch: 0.0001 * 10**(epoch/20))
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=0.00001)

model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Implement early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train and test
history = model.fit(train_data, train_labels, epochs=20, batch_size=40, validation_data=(val_data, val_labels), callbacks=[early_stopping, lr_schedule, reduce_lr])
test_loss, test_acc = model.evaluate(test_data, test_labels)
print(f'Test accuracy: {test_acc}')

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test accuracy: 0.6901408433914185


BERT

In [None]:
# BERT V1
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures

# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")

# Convert data to InputExample format
def convert_to_input_example(data, label):
    input_examples = []
    for text, label in zip(data, label):
        input_examples.append(InputExample(guid=None, text_a=text, text_b=None, label=label))
    return input_examples

train_examples = convert_to_input_example(train_data, train_labels)
val_examples = convert_to_input_example(val_data, val_labels)

# Convert InputExamples to InputFeatures
def convert_to_input_features(examples, max_length=128):
    features = []
    for example in examples:
        # Extract the token IDs directly from the example
        input_ids = example.text_a.tolist()

        # Ensure the length is consistent with max_length
        if len(input_ids) > max_length:
            input_ids = input_ids[:max_length]
        else:
            input_ids += [0] * (max_length - len(input_ids))

        # Create attention mask
        attention_mask = [1 if token_id != 0 else 0 for token_id in input_ids]

        # Since BERT typically expects token_type_ids, we'll create a dummy one
        token_type_ids = [0] * max_length

        features.append(
            InputFeatures(
                input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=example.label
            )
        )
    return features

train_features = convert_to_input_features(train_examples)
val_features = convert_to_input_features(val_examples)


# Prepare dataset
train_dataset = tf.data.Dataset.from_tensor_slices(({"input_ids": [f.input_ids for f in train_features], "attention_mask": [f.attention_mask for f in train_features]}, [f.label for f in train_features]))
val_dataset = tf.data.Dataset.from_tensor_slices(({"input_ids": [f.input_ids for f in val_features], "attention_mask": [f.attention_mask for f in val_features]}, [f.label for f in val_features]))

# Compile and train the model
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
model.fit(train_dataset.batch(32), validation_data=val_dataset.batch(32), epochs=3)
# Print the results
 print(f"Training   - Accuracy: {train_accuracy*100:.2f}% | Loss: {train_loss:.4f}")
 print(f"Validation - Accuracy: {val_accuracy*100:.2f}% | Loss: {val_loss:.4f}")

In [None]:
# Bert V2
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")

def convert_data_to_features(data, labels, max_length=128):
    """Convert data and labels into BERT's InputFeatures format."""
    # Convert data to InputExample format
    examples = [InputExample(guid=None, text_a=text, text_b=None, label=label) for text, label in zip(data, labels)]

    # Convert examples to InputFeatures
    features = []
    for example in examples:
        input_ids = example.text_a.tolist()
        input_ids = input_ids[:max_length] + [0] * (max_length - len(input_ids))
        attention_mask = [1 if token_id != 0 else 0 for token_id in input_ids]
        token_type_ids = [0] * max_length
        features.append(InputFeatures(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=example.label))

    return features

train_features = convert_data_to_features(train_data, train_labels)
val_features = convert_data_to_features(val_data, val_labels)

# Prepare dataset
train_dataset = tf.data.Dataset.from_tensor_slices(({"input_ids": [f.input_ids for f in train_features], "attention_mask": [f.attention_mask for f in train_features]}, [f.label for f in train_features]))
val_dataset = tf.data.Dataset.from_tensor_slices(({"input_ids": [f.input_ids for f in val_features], "attention_mask": [f.attention_mask for f in val_features]}, [f.label for f in val_features]))

# Compile and train the model
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
model.fit(train_dataset.batch(32), validation_data=val_dataset.batch(32), epochs=3)

# Number of epochs
num_epochs = 3

for epoch in range(1, num_epochs + 1):
    print(f"\nEpoch {epoch}/{num_epochs}")
    print("-" * 30)

    # Train the model for one epoch
    train_history = model.fit(train_dataset.batch(32), verbose=0)

    # Validate the model
    val_loss, val_accuracy = model.evaluate(val_dataset.batch(32), verbose=0)

    # Extract training accuracy and loss from the history
    train_accuracy = train_history.history['accuracy'][0]
    train_loss = train_history.history['loss'][0]

    # Print the results
    print(f"Training   - Accuracy: {train_accuracy*100:.2f}% | Loss: {train_loss:.4f}")
    print(f"Validation - Accuracy: {val_accuracy*100:.2f}% | Loss: {val_loss:.4f}")