In [2]:
import pickle
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords

In [4]:
# Access the file
file_path = 'Dataset_Suicidal_Sentiment.csv'

In [5]:
df = pd.read_csv(file_path)
# Calculate the length of each post in terms of words
df['text_length'] = df['Post'].apply(lambda x: len(str(x).split()))
df.head()

Unnamed: 0.1,Unnamed: 0,Post,Suicidal_label,Sentiment_label,text_length
0,0,Ex Wife Threatening SuicideRecently I left my ...,0,0,146
1,1,Am I weird I don t get affected by compliments...,1,1,29
2,2,Finally is almost over So I can never hear ...,1,0,25
3,3,i need helpjust help me im crying so hard,0,0,9
4,4,I m so lostHello my name is Adam and I ve b...,0,0,452


In [6]:
df['Suicidal_label'].value_counts()

Suicidal_label
0    113534
1    113419
Name: count, dtype: int64

In [7]:
df['text_length'].describe()

count    226953.000000
mean        136.649355
std         221.631959
min           1.000000
25%          27.000000
50%          63.000000
75%         161.000000
max        9685.000000
Name: text_length, dtype: float64

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226953 entries, 0 to 226952
Data columns (total 5 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   Unnamed: 0       226953 non-null  int64 
 1   Post             226895 non-null  object
 2   Suicidal_label   226953 non-null  int64 
 3   Sentiment_label  226953 non-null  int64 
 4   text_length      226953 non-null  int64 
dtypes: int64(4), object(1)
memory usage: 8.7+ MB


In [9]:
df_cleaned = df.dropna()
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 226895 entries, 0 to 226952
Data columns (total 5 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   Unnamed: 0       226895 non-null  int64 
 1   Post             226895 non-null  object
 2   Suicidal_label   226895 non-null  int64 
 3   Sentiment_label  226895 non-null  int64 
 4   text_length      226895 non-null  int64 
dtypes: int64(4), object(1)
memory usage: 10.4+ MB


In [10]:
# Download the stopwords from NLTK
nltk.download('stopwords')

# Get the list of English stop words
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
# Function to remove stop words from a given text
def remove_stop_words(text):
    # Split the text into words
    words = str(text).split()

    # Filter out stop words
    filtered_words = [word for word in words if word.lower() not in stop_words]

    # Join the words back into a single string
    return ' '.join(filtered_words)

# Apply the function to your dataset
df_cleaned.loc[:, 'Post'] = df_cleaned['Post'].apply(remove_stop_words)

In [12]:
# Since the original dataset does not provide headers you need to index the columns by their index
text = df_cleaned['Post'].to_numpy()
labels = df_cleaned['Suicidal_label'].to_numpy()

# Create the dataset
dataset = tf.data.Dataset.from_tensor_slices((text, labels))

# Get the first 5 elements of the dataset
examples = list(dataset.take(5))

print(f"dataset contains {len(dataset)} examples\n")

print(f"Text of second example look like this: {examples[1][0].numpy().decode('utf-8')}\n")
print(f"Labels of first 5 examples look like this: {[x[1].numpy() for x in examples]}")

dataset contains 226895 examples

Text of second example look like this: weird get affected compliments coming someone know irl feel really good internet strangers

Labels of first 5 examples look like this: [np.int64(0), np.int64(1), np.int64(1), np.int64(0), np.int64(0)]


In [13]:
EMBEDDING_DIM = 128
MAX_LENGTH = 150
TRAINING_SPLIT = 0.7
VAL_SPLIT = 0.2
TEST_SPLIT = 0.1
NUM_BATCHES = 64
PREFETCH_BUFFER_SIZE = tf.data.AUTOTUNE
SHUFFLE_BUFFER_SIZE = 1000

In [14]:
def split_datasets(dataset):
  train_size = int(TRAINING_SPLIT * len(list(dataset)))
  val_size = int(VAL_SPLIT * len(list(dataset)))

  print(train_size)
  print(val_size)

  train_dataset = dataset.take(train_size)
  remaining = dataset.skip(train_size)
  val_dataset = remaining.take(val_size)
  test_dataset = remaining.skip(val_size)

  # Turn the dataset into a batched dataset with num_batches batches
  train_dataset = train_dataset.cache().shuffle(SHUFFLE_BUFFER_SIZE).prefetch(PREFETCH_BUFFER_SIZE).batch(NUM_BATCHES)
  val_dataset = val_dataset.cache().shuffle(SHUFFLE_BUFFER_SIZE).prefetch(PREFETCH_BUFFER_SIZE).batch(NUM_BATCHES)
  test_dataset = test_dataset.cache().shuffle(SHUFFLE_BUFFER_SIZE).prefetch(PREFETCH_BUFFER_SIZE).batch(NUM_BATCHES)

  return train_dataset, val_dataset, test_dataset

In [15]:
train_dataset, val_dataset, test_dataset = split_datasets(dataset)
print(train_dataset.element_spec)
print(f"There are {len(train_dataset)} batches for a total of {NUM_BATCHES*len(train_dataset)} elements for training.\n")
print(f"There are {len(val_dataset)} batches for a total of {NUM_BATCHES*len(val_dataset)} elements for validation.\n")
print(f"There are {len(test_dataset)} batches for a total of {NUM_BATCHES*len(test_dataset)} elements for validation.\n")

print(f"Total elements in dataset: {(NUM_BATCHES*len(train_dataset)) + (NUM_BATCHES*len(val_dataset)) + (NUM_BATCHES*len(test_dataset))}")

158826
45379
(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.int64, name=None))
There are 2482 batches for a total of 158848 elements for training.

There are 710 batches for a total of 45440 elements for validation.

There are 355 batches for a total of 22720 elements for validation.

Total elements in dataset: 227008


In [16]:
def fit_vectorizer(dataset):
    # Instantiate the TextVectorization class, defining the necessary arguments alongside their corresponding values
    vectorizer = tf.keras.layers.TextVectorization(
        standardize='lower_and_strip_punctuation',
        output_sequence_length=MAX_LENGTH
    )

    # Fit the tokenizer to the training sentences
    vectorizer.adapt(dataset)

    return vectorizer

In [17]:
# Get only the texts out of the dataset
text_only_dataset = train_dataset.map(lambda text, label: text)

# Adapt the vectorizer to the training sentences
vectorizer = fit_vectorizer(text_only_dataset)

# Check size of vocabulary
vocab_size = vectorizer.vocabulary_size()

print(f"Vocabulary contains {vocab_size} words\n")

Vocabulary contains 122113 words



In [18]:
# Apply vectorization to train and val datasets
train_dataset_vectorized = train_dataset.map(lambda x,y: (vectorizer(x), y))
val_dataset_vectorized = val_dataset.map(lambda x,y: (vectorizer(x), y))
print(train_dataset_vectorized.element_spec)
for text_batch, label_batch in train_dataset_vectorized.take(1):
    print(f"Text batch shape: {text_batch.shape}")
    print(f"Label batch shape: {label_batch.shape}")

(TensorSpec(shape=(None, None), dtype=tf.int64, name=None), TensorSpec(shape=(None,), dtype=tf.int64, name=None))
Text batch shape: (64, 150)
Label batch shape: (64,)


In [19]:
def create_model(vocab_size):

    model = tf.keras.Sequential([
        tf.keras.Input(shape=(MAX_LENGTH,)),
        tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM),
        tf.keras.layers.Conv1D(32, 5, activation='relu'),
        tf.keras.layers.Bidirectional(tf.keras.layers.GRU(32, return_sequences=True, kernel_regularizer=tf.keras.regularizers.l2(0.01))),
        tf.keras.layers.Bidirectional(tf.keras.layers.GRU(64)),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
        ])

    model.compile(
        loss='binary_crossentropy',
        optimizer='rmsprop',
        metrics=['accuracy']
    )

    return model

In [None]:
class EarlyStoppingCallback(tf.keras.callbacks.Callback):

    # Define the correct function signature for on_epoch_end method
    def on_epoch_end(self, epoch, logs=None):

        # Check if the accuracy is greater or equal to 0.95
        if  logs['val_accuracy']>= 0.95:

            # Stop training once the above condition is met
            self.model.stop_training = True

            print("\nReached 95% accuracy so cancelling training!")

In [21]:
# Create your untrained model
model = create_model(vocab_size)

In [22]:
# Take an example batch of data
example_batch = train_dataset_vectorized.take(1)
print(example_batch.element_spec)
try:
	model.evaluate(example_batch, verbose=False)
except:
	print("Your model is not compatible with the dataset you defined earlier. Check that the loss function and last layer are compatible with one another.")
else:
	predictions = model.predict(example_batch, verbose=False)
	print(f"predictions have shape: {predictions.shape}")

(TensorSpec(shape=(None, None), dtype=tf.int64, name=None), TensorSpec(shape=(None,), dtype=tf.int64, name=None))
predictions have shape: (64, 1)


In [23]:
# Train the model and save the training history
history = model.fit(
	train_dataset_vectorized,
	epochs=10,
	validation_data=val_dataset_vectorized,
	callbacks=[EarlyStoppingCallback()],
  verbose=1
)

Epoch 1/10
[1m2482/2482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1316s[0m 514ms/step - accuracy: 0.8731 - loss: 0.3992 - val_accuracy: 0.9353 - val_loss: 0.1854
Epoch 2/10
[1m2482/2482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2456s[0m 990ms/step - accuracy: 0.9366 - loss: 0.1765 - val_accuracy: 0.9377 - val_loss: 0.1723
Epoch 3/10
[1m2482/2482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1169s[0m 471ms/step - accuracy: 0.9457 - loss: 0.1544 - val_accuracy: 0.9371 - val_loss: 0.1763
Epoch 4/10
[1m2482/2482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10515s[0m 4s/step - accuracy: 0.9528 - loss: 0.1373 - val_accuracy: 0.9409 - val_loss: 0.1675
Epoch 5/10
[1m2482/2482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1158s[0m 451ms/step - accuracy: 0.9606 - loss: 0.1197 - val_accuracy: 0.9411 - val_loss: 0.1694
Epoch 6/10
[1m2482/2482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2186s[0m 881ms/step - accuracy: 0.9669 - loss: 0.1029 - val_accuracy: 0.9396 - val_l