In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK corpora this notebook relies on: the stop-word lists read via
# stopwords.words('english') and the WordNet data used by WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Read the raw CSV into a DataFrame and preview its first rows.
csv_path = '/mental.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0.1,Unnamed: 0,statement,status
0,0,oh my gosh,Anxiety
1,1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,3,I've shifted my focus to something else but I'...,Anxiety
4,4,"I'm restless and restless, it's been a month n...",Anxiety


In [None]:
# Keep every row whose label is anything other than 'Suicidal'.
not_suicidal = data['status'].ne('Suicidal')
data = data.loc[not_suicidal]

In [None]:
# Keep only the text and label columns and discard rows missing either one.
# Done as a single chained, non-inplace expression: calling
# dropna(inplace=True) on a frame obtained by slicing triggers pandas'
# SettingWithCopyWarning and makes the cell harder to reason about on re-run.
data = data[['statement', 'status']].dropna(subset=['statement', 'status'])

In [None]:
# Shared helpers for clean_text: a set of English stop words (set membership
# checks are cheap) and a single reusable WordNet lemmatizer instance.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [None]:
def clean_text(text):
  """Normalize a raw statement into a space-separated string of lemmas.

  Steps: lowercase, replace every non-letter with a space, split on
  whitespace, drop stop words, lemmatize, drop one-character tokens.

  Args:
    text: The raw statement. Anything that is not a ``str`` (for example a
      missing value) is treated as empty.

  Returns:
    The cleaned text; an empty string when nothing survives the filtering.
  """
  if not isinstance(text, str):
    return ''

  # Replace (rather than delete) non-letters with a space so that words joined
  # only by punctuation ("hello,world") stay separate, and contractions such as
  # "i'm" split into pieces the stop-word / single-character filters can drop.
  text = re.sub(r'[^a-zA-Z\s]', ' ', text.lower())

  # split() with no arguments also collapses any run of whitespace.
  words = [word for word in text.split() if word not in stop_words]

  # Lemmatize the surviving words.
  words = [lemmatizer.lemmatize(word) for word in words]

  # Drop single characters (like "a", "b", etc.) and rebuild the string.
  return ' '.join(word for word in words if len(word) > 1)

In [None]:
# Run every statement through clean_text, element by element.
data['statement'] = data['statement'].map(clean_text)

In [None]:
# Fit the encoder on the label column, then overwrite that column with the
# resulting integer codes. The fitted encoder is kept so predictions can be
# mapped back to label names later.
label_encoder = LabelEncoder().fit(data['status'])
data['status'] = label_encoder.transform(data['status'])

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [None]:
# max_length: fixed sequence length used when padding/truncating.
# max_words: vocabulary cap passed to the tokenizer and the embedding layer.
max_length = 100
max_words = 10_000

In [None]:
# Build the vocabulary from the training statements only; words outside it
# are mapped to the "<OOV>" token.
tokenizer = Tokenizer(
    oov_token="<OOV>",
    num_words=max_words,
)
tokenizer.fit_on_texts(train_data['statement'])

In [None]:
# Convert the statements of each split into lists of token ids, using the
# tokenizer fitted on the training split.
X_train, X_test = (
    tokenizer.texts_to_sequences(split['statement'])
    for split in (train_data, test_data)
)

In [None]:
# Pad or truncate every sequence at its end so all rows have max_length ids.
pad_options = dict(maxlen=max_length, padding='post', truncating='post')
X_train = pad_sequences(X_train, **pad_options)
X_test = pad_sequences(X_test, **pad_options)

In [None]:
# One-hot encode the integer labels. The width is fixed from the fitted label
# encoder: left to itself, to_categorical infers it from the largest label in
# each array, so a split missing the highest class would yield a narrower
# matrix than the other split (and than the model's output layer).
num_classes = len(label_encoder.classes_)
y_train = tf.keras.utils.to_categorical(train_data['status'], num_classes=num_classes)
y_test = tf.keras.utils.to_categorical(test_data['status'], num_classes=num_classes)

In [None]:
# Size the output layer from the fitted label encoder rather than a
# hard-coded 6, so it always matches the classes present in the data.
num_classes = len(label_encoder.classes_)

model = tf.keras.Sequential([
    # Token ids -> 64-dimensional vectors. `input_length` is intentionally not
    # passed: it is deprecated in Keras 3 and the length is taken from the input.
    tf.keras.layers.Embedding(input_dim=max_words, output_dim=64),
    # The BiLSTM returns the full sequence so it can be max-pooled over time.
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.GlobalMaxPooling1D(),
    # L2-regularized dense head with light dropout.
    tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(32, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    # One softmax probability per class.
    tf.keras.layers.Dense(num_classes, activation='softmax')
])



In [None]:
# Multi-class setup: one-hot targets with categorical cross-entropy, Adam
# optimizer, accuracy reported during training.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Save the model to disk only when validation loss reaches a new minimum,
# logging each save.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath='best_suicidal_detection_model.keras',
    save_best_only=True,
    monitor='val_loss',
    mode='min',
    verbose=1,
)

In [None]:
# Train for 10 epochs, holding out 20% of the training data for validation;
# the checkpoint callback persists the best epoch. verbose=2 prints one line
# per epoch.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=32,
    epochs=10,
    callbacks=[checkpoint],
    verbose=2,
)

Epoch 1/10

Epoch 1: val_loss improved from inf to 0.72481, saving model to best_suicidal_detection_model.keras
841/841 - 137s - 162ms/step - accuracy: 0.7120 - loss: 1.0431 - val_accuracy: 0.7707 - val_loss: 0.7248
Epoch 2/10

Epoch 2: val_loss improved from 0.72481 to 0.65169, saving model to best_suicidal_detection_model.keras
841/841 - 104s - 124ms/step - accuracy: 0.8001 - loss: 0.6164 - val_accuracy: 0.8051 - val_loss: 0.6517
Epoch 3/10

Epoch 3: val_loss improved from 0.65169 to 0.62132, saving model to best_suicidal_detection_model.keras
841/841 - 133s - 159ms/step - accuracy: 0.8496 - loss: 0.5047 - val_accuracy: 0.8190 - val_loss: 0.6213
Epoch 4/10

Epoch 4: val_loss improved from 0.62132 to 0.58121, saving model to best_suicidal_detection_model.keras
841/841 - 142s - 169ms/step - accuracy: 0.8796 - loss: 0.4239 - val_accuracy: 0.8351 - val_loss: 0.5812
Epoch 5/10

Epoch 5: val_loss did not improve from 0.58121
841/841 - 142s - 168ms/step - accuracy: 0.8996 - loss: 0.3645 - v

In [None]:
# Reload the checkpoint with the lowest validation loss and score *that* model
# on the held-out test set. `model` still holds the last-epoch weights, so
# evaluating it would not reflect the checkpoint that was just loaded.
best_model = tf.keras.models.load_model('best_suicidal_detection_model.keras')
loss, accuracy = best_model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.2f}")

[1m263/263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 35ms/step - accuracy: 0.8476 - loss: 0.6954
Test Accuracy: 0.85


In [None]:
def predict_status(input_text):
    """Predict the status label for a single piece of text.

    The text goes through the same pipeline as the training data: clean_text,
    the fitted tokenizer, then post-padding/truncation to max_length.

    Args:
        input_text: Raw text to classify.

    Returns:
        The predicted label, decoded back to its original name through
        label_encoder.
    """
    cleaned_text = clean_text(input_text)

    sequence = tokenizer.texts_to_sequences([cleaned_text])
    padded_sequence = pad_sequences(sequence, maxlen=max_length, padding='post', truncating='post')

    # Use the restored best checkpoint rather than the last-epoch `model`.
    prediction = best_model.predict(padded_sequence)

    # Index of the most probable class for the single input row.
    predicted_class = np.argmax(prediction, axis=1)

    predicted_status = label_encoder.inverse_transform(predicted_class)

    return predicted_status[0]

# Ask the user for a sentence, classify it, and print the predicted label.
# (The prompt and result strings shown to the user are in Indonesian.)
user_input = input("Masukkan kalimat: ")
predicted_status = predict_status(user_input)
result_line = f"Hasil Prediksi: {predicted_status}"
print(result_line)


Masukkan kalimat: trouble sleeping, confused mind, restless heart. All out of tune
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 572ms/step
Hasil Prediksi: Anxiety
