In [None]:
Mental Health Detection - Main Script

Copyright (C) 2025 Yanuar Noor Wicaksono A.K.A Christianus Yanuar

This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with this program. If not, see https://www.gnu.org/licenses/.

In [2]:
pip install -r requirements.txt

Collecting nltk (from -r requirements.txt (line 4))
  Downloading nltk-3.9.2-py3-none-any.whl.metadata (3.2 kB)
Collecting regex>=2021.8.3 (from nltk->-r requirements.txt (line 4))
  Downloading regex-2025.10.23-cp312-cp312-win_amd64.whl.metadata (41 kB)
Downloading nltk-3.9.2-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ------------- -------------------------- 0.5/1.5 MB 4.2 MB/s eta 0:00:01
   -------------------- ------------------- 0.8/1.5 MB 2.0 MB/s eta 0:00:01
   --------------------------- ------------ 1.0/1.5 MB 1.7 MB/s eta 0:00:01
   --------------------------- ------------ 1.0/1.5 MB 1.7 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 1.5 MB/s  0:00:01
Downloading regex-2025.10.23-cp312-cp312-win_amd64.whl (276 kB)
Installing collected packages: regex, nltk

   ---------------------------------------- 0/2 [regex]
   --------------


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: C:\Users\chris\AR\venv_312\Scripts\python.exe -m pip install --upgrade pip


In [8]:
# -------------------------------------------------------------------
# BAGIAN 1: INSTALASI DAN IMPORT LIBRARY
# -------------------------------------------------------------------

import pandas as pd
import numpy as np
import re
import nltk

# Untuk Preprocessing dengan NLTK
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag  # Ini untuk Part-of-Speech (POS) Tagging

# Untuk Metode 1: Frequency-Based
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Untuk Metode 2: Prediction-Based
from gensim.models import Word2Vec

# --- Mengunduh data yang diperlukan oleh NLTK ---
# Jalankan ini sekali saja.
print("Mengunduh paket NLTK yang diperlukan...")
try:
    nltk.download('punkt', quiet=True) # Untuk tokenization
    nltk.download('punkt_tab')
    nltk.download('wordnet', quiet=True) # Untuk lemmatization
    nltk.download('averaged_perceptron_tagger_eng')
    nltk.download('averaged_perceptron_tagger', quiet=True) # Untuk POS tagging
    nltk.download('stopwords', quiet=True) # Untuk stop words
    print("Paket NLTK (punkt, punkt_tab, wordnet, averaged_perceptron_tagger, stopwords) berhasil diunduh/sudah ada.")
except Exception as e:
    print(f"Error saat mengunduh paket NLTK: {e}")
    print("Pastikan Anda terhubung ke internet.")
    
# Siapkan stop words dan lemmatizer
STOP_WORDS = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

Mengunduh paket NLTK yang diperlukan...


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...


Paket NLTK (punkt, punkt_tab, wordnet, averaged_perceptron_tagger, stopwords) berhasil diunduh/sudah ada.


[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.


In [6]:
# -------------------------------------------------------------------
# BAGIAN 2: MEMUAT DAN MEMBERSIHKAN DATA
# -------------------------------------------------------------------
file_path = 'reddit_mentalhealth_sample (1).csv'

try:
    df = pd.read_csv(file_path)
    print(f"Dataset '{file_path}' berhasil dimuat.")
    print(f"Bentuk data awal: {df.shape}")

    # 1. Ganti string kosong dengan NaN (jika ada)
    df.replace(r'^\s*$', np.nan, regex=True, inplace=True)

    # 2. Hapus baris yang memiliki nilai NaN di 'content' atau 'subreddit'
    df_cleaned = df.dropna(subset=['content', 'subreddit'])
    print(f"Bentuk data setelah menghapus NaN: {df_cleaned.shape}")

    # 3. Hapus duplikat (jika ada)
    df_cleaned = df_cleaned.drop_duplicates()
    print(f"Bentuk data setelah menghapus duplikat: {df_cleaned.shape}")

    # 4. Tampilkan 5 baris pertama
    print("\nContoh data (5 baris pertama):")
    print(df_cleaned.head())

except FileNotFoundError:
    print(f"ERROR: File '{file_path}' tidak ditemukan.")
    print("Pastikan file tersebut berada di folder yang sama dengan notebook Anda.")
except Exception as e:
    print(f"Terjadi error: {e}")

Dataset 'reddit_mentalhealth_sample (1).csv' berhasil dimuat.
Bentuk data awal: (700, 2)
Bentuk data setelah menghapus NaN: (700, 2)
Bentuk data setelah menghapus duplikat: (700, 2)

Contoh data (5 baris pertama):
  subreddit                                            content
0   Anxiety  My #1 biggest fear is death. Losing my conscio...
1   Anxiety  Trying my 4th med out soon. The first 3 anxiet...
2   Anxiety  Nauseated when thinking about a holiday How ca...
3   Anxiety  Angerxiety? Does anyone else experience anger ...
4   Anxiety  No idea how to deal with new anxiety For as lo...


In [9]:
# -------------------------------------------------------------------
# BAGIAN 3: ADVANCED PREPROCESSING (CLEANING + POS TAGGING NLTK)
# -------------------------------------------------------------------

# Ini adalah fungsi bantuan untuk mengubah format tag NLTK
# agar sesuai dengan format yang diterima oleh lemmatizer.
def get_wordnet_pos(nltk_tag):
    """Konversi tag NLTK ke tag WordNet."""
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:
        # Default-nya adalah NOUN jika tidak dikenali
        return wordnet.NOUN

# Daftar POS Tag yang ingin kita simpan
# NOUN, VERB, ADJ (Kata Benda, Kata Kerja, Kata Sifat)
KEPT_POS_TAGS = [wordnet.NOUN, wordnet.VERB, wordnet.ADJ]

def advanced_preprocess_nltk(text):
    """
    Fungsi untuk membersihkan dan memfilter teks menggunakan NLTK.
    """
    # 1. Hapus tanda baca, angka, dan karakter khusus
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A)
    
    # 2. Ubah jadi huruf kecil
    text = text.lower()
    
    # 3. Tokenisasi (memecah teks menjadi kata-kata)
    tokens = word_tokenize(text)
    
    # 4. Dapatkan POS Tag untuk setiap token
    # Hasilnya: [('my', 'PRP$'), ('fear', 'NN'), ('is', 'VBZ'), ('death', 'NN')]
    tagged_tokens = pos_tag(tokens)
    
    clean_lemmas = []
    # 5. Lakukan Lemmatization, Stop Word Removal, dan POS Tag Filtering
    for token, tag in tagged_tokens:
        
        # Konversi tag NLTK ke tag WordNet
        wordnet_tag = get_wordnet_pos(tag)
        
        # Periksa 3 kondisi:
        # - Token BUKAN stop word
        # - Panjang token > 2 (menghapus kata seperti 'I', 'a')
        # - POS tag-nya ada di daftar yang kita inginkan
        if (token not in STOP_WORDS) and (len(token) > 2) and (wordnet_tag in KEPT_POS_TAGS):
            
            # Ambil bentuk dasarnya (lemma) menggunakan tag POS yang benar
            lemma = lemmatizer.lemmatize(token, pos=wordnet_tag)
            clean_lemmas.append(lemma)
            
    # 6. Mengembalikan daftar token/lemma yang sudah bersih
    return clean_lemmas


print("Memulai proses preprocessing lanjutan (NLTK POS Tagging & Lemmatization)...")

# 'processed_tokens' akan berisi list dari token
df_cleaned['processed_tokens'] = df_cleaned['content'].apply(advanced_preprocess_nltk)

# 'processed_content' akan berisi string yang sudah digabung kembali
df_cleaned['processed_content'] = df_cleaned['processed_tokens'].apply(lambda tokens: ' '.join(tokens))

print("Preprocessing selesai.")
print("\nContoh hasil preprocessing:")
print(df_cleaned[['content', 'processed_content']].head())

Memulai proses preprocessing lanjutan (NLTK POS Tagging & Lemmatization)...
Preprocessing selesai.

Contoh hasil preprocessing:
                                             content  \
0  My #1 biggest fear is death. Losing my conscio...   
1  Trying my 4th med out soon. The first 3 anxiet...   
2  Nauseated when thinking about a holiday How ca...   
3  Angerxiety? Does anyone else experience anger ...   
4  No idea how to deal with new anxiety For as lo...   

                                   processed_content  
0  big fear death lose consciousness since little...  
1  try med first anxiety med make bad buspar zolo...  
2  nauseate think holiday deal nausea talk trip m...  
3  angerxiety anyone else experience anger alongs...  
4  idea deal new anxiety long remember anxious pe...  


In [11]:
# -------------------------------------------------------------------
# BAGIAN 4: MEMBAGI DATA (TRAIN/TEST SPLIT)
# -------------------------------------------------------------------

X = df_cleaned['processed_content'] # Fitur (teks yang sudah bersih)
y = df_cleaned['subreddit']         # Label (target)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"Total data: {len(df_cleaned)}")
print(f"Jumlah data training: {len(X_train)}")
print(f"Jumlah data testing: {len(X_test)}")

Total data: 700
Jumlah data training: 560
Jumlah data testing: 140


In [15]:
# -------------------------------------------------------------------
# METODE 1: FREQUENCY-BASED (TF-IDF)
# -------------------------------------------------------------------
# TfidfVectorizer akan mengubah teks menjadi matriks angka
# berdasarkan skor Term Frequency-Inverse Document Frequency.
# Inisialisasi TfidfVectorizer
# max_features=5000 berarti kita hanya ambil 5000 kata paling penting
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# "Fit" (belajar) kosakata HANYA dari data training
tfidf_vectorizer.fit(X_train)

# "Transform" (ubah) data training dan testing menjadi matriks TF-IDF
X_train_tfidf = tfidf_vectorizer.transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print("\nBentuk matriks TF-IDF (data training):", X_train_tfidf.shape)
print("Bentuk matriks TF-IDF (data testing):", X_test_tfidf.shape)
print(f"Artinya: {X_train_tfidf.shape[0]} dokumen dan {X_train_tfidf.shape[1]} fitur (kata).")

# Data 'X_train_tfidf' dan 'X_test_tfidf' ini sudah siap
# dimasukkan ke model Machine Learning (misal: LogisticRegression, RandomForest, dll)


Bentuk matriks TF-IDF (data training): (560, 5000)
Bentuk matriks TF-IDF (data testing): (140, 5000)
Artinya: 560 dokumen dan 5000 fitur (kata).


In [14]:
# -------------------------------------------------------------------
# METODE 2: PREDICTION-BASED (WORD2VEC)
# -------------------------------------------------------------------
# Langkah 1: Kita harus "melatih" model Word2Vec pada data kita.
# Langkah 2: Kita ubah setiap *dokumen* (postingan) menjadi satu vektor
#            dengan cara merata-ratakan vektor dari semua kata di dalamnya.

# --- Langkah 2.1: Melatih Model Word2Vec ---
# Input untuk Word2Vec adalah list of list of tokens.
# Kita sudah punya ini di 'df_cleaned['processed_tokens']'
corpus_tokens = df_cleaned['processed_tokens'].tolist()

# Latih model
# vector_size=100 -> setiap kata akan direpresentasikan sebagai vektor 100-dimensi
# window=5 -> melihat 5 kata sebelum dan 5 kata sesudah saat belajar konteks
# min_count=5 -> abaikan kata yang muncul kurang dari 5 kali
print("Melatih model Word2Vec...")
w2v_model = Word2Vec(sentences=corpus_tokens, vector_size=100, window=5, min_count=5, workers=4)
print("Model Word2Vec selesai dilatih.")

# Contoh: Melihat vektor untuk satu kata
try:
    print("\nVektor untuk kata 'anxiety':\n", w2v_model.wv['anxiety'][:10]) # tampilkan 10 dimensi pertama
except KeyError:
    print("\nKata 'anxiety' tidak ada di vocabulary (mungkin karena min_count).")

# Contoh: Melihat kata yang mirip
try:
    print("\nKata yang mirip dengan 'anxiety':\n", w2v_model.wv.most_similar('anxiety', topn=5))
except KeyError:
    pass

# --- Langkah 2.2: Membuat Vektor Dokumen (Document Vectors) ---
# Kita buat fungsi untuk merata-ratakan vektor kata dalam satu dokumen
def create_document_vector(tokens, model, vector_size=100):
    # Buat vektor nol sebagai dasar
    doc_vector = np.zeros(vector_size)
    word_count = 0
    
    # Ambil vocabulary dari model
    vocab = set(model.wv.index_to_key)
    
    for word in tokens:
        if word in vocab:
            doc_vector += model.wv[word]
            word_count += 1
            
    # Ambil rata-ratanya
    if word_count > 0:
        doc_vector /= word_count
        
    return doc_vector

print("\nMembuat vektor dokumen (merata-ratakan vektor kata)...")
# Terapkan fungsi ini ke data training dan testing
# Kita harus menggunakan 'processed_tokens' dari data split
X_train_tokens = df_cleaned.loc[X_train.index, 'processed_tokens']
X_test_tokens = df_cleaned.loc[X_test.index, 'processed_tokens']

X_train_w2v = np.array([create_document_vector(tokens, w2v_model) for tokens in X_train_tokens])
X_test_w2v = np.array([create_document_vector(tokens, w2v_model) for tokens in X_test_tokens])

print("Bentuk matriks Word2Vec (data training):", X_train_w2v.shape)
print("Bentuk matriks Word2Vec (data testing):", X_test_w2v.shape)

# Data 'X_train_w2v' dan 'X_test_w2v' ini juga sudah siap
# dimasukkan ke model Machine Learning.

Melatih model Word2Vec...
Model Word2Vec selesai dilatih.

Vektor untuk kata 'anxiety':
 [-0.2224123   0.20925497 -0.2845556   0.45667154  0.16534568 -0.6691461
  0.34463271  0.7962702  -0.45252115  0.01727195]

Kata yang mirip dengan 'anxiety':
 [('take', 0.9996471405029297), ('self', 0.9996382594108582), ('make', 0.9996116161346436), ('issue', 0.9996101260185242), ('bipolar', 0.9996057152748108)]

Membuat vektor dokumen (merata-ratakan vektor kata)...
Bentuk matriks Word2Vec (data training): (560, 100)
Bentuk matriks Word2Vec (data testing): (140, 100)
