In [1]:
import pandas as pd

# Baca file CSV
df = pd.read_csv('Combined Data.csv')

# Tampilkan beberapa baris pertama untuk verifikasi
print("Sebelum perubahan label:")
print(df[['statement', 'status']].head())

# Ubah label: normal -> positive, lainnya -> negative
df['label'] = df['status'].apply(lambda x: 'positive' if str(x).lower().strip() == 'Normal' else 'negative')

# Tampilkan hasil setelah perubahan
print("\nSetelah perubahan label:")
print(df[['statement', 'status', 'label']].head())

# (Opsional) Simpan hasil ke file baru
df.to_csv('Combined_Data_Labeled.csv', index=False)


Sebelum perubahan label:
                                           statement   status
0                                         oh my gosh  Anxiety
1  trouble sleeping, confused mind, restless hear...  Anxiety
2  All wrong, back off dear, forward doubt. Stay ...  Anxiety
3  I've shifted my focus to something else but I'...  Anxiety
4  I'm restless and restless, it's been a month n...  Anxiety

Setelah perubahan label:
                                           statement   status     label
0                                         oh my gosh  Anxiety  negative
1  trouble sleeping, confused mind, restless hear...  Anxiety  negative
2  All wrong, back off dear, forward doubt. Stay ...  Anxiety  negative
3  I've shifted my focus to something else but I'...  Anxiety  negative
4  I'm restless and restless, it's been a month n...  Anxiety  negative


In [2]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

# 1. Load Data
df = pd.read_csv('Combined_Data_Labeled.csv')

# 2. Drop NaN
df = df.dropna(subset=['statement', 'status'])

# 3. Mapping Label: 'normal' -> positive, selain itu negative
df['label'] = df['status'].apply(lambda x: 'positive' if str(x).strip().lower() == 'normal' else 'negative')

# 4. Cek distribusi label
print("Distribusi label:\n", df['label'].value_counts())

# 5. Bersihkan teks
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)   # hapus tanda baca
    text = re.sub(r'\d+', '', text)       # hapus angka
    return text.strip()

df['cleaned_text'] = df['statement'].apply(clean_text)

# 6. Vectorization - BoW
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['cleaned_text'])

# 7. Encode label
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['label'])  # positive=1, negative=0

# 8. Split Data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 9. Train Model - Logistic Regression
model = LogisticRegression()
model.fit(X_train, y_train)

# 10. Evaluasi Model
y_pred = model.predict(X_test)
print("\nAkurasi:", accuracy_score(y_test, y_pred))
print("\nLaporan Klasifikasi:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))


Distribusi label:
 label
negative    36338
positive    16343
Name: count, dtype: int64

Akurasi: 0.9510297048495777

Laporan Klasifikasi:
               precision    recall  f1-score   support

    negative       0.97      0.95      0.96      7268
    positive       0.90      0.94      0.92      3269

    accuracy                           0.95     10537
   macro avg       0.94      0.95      0.94     10537
weighted avg       0.95      0.95      0.95     10537



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
