In [1]:
import joblib
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import train_test_split

# Pastikan preprocessing.py ada di folder yang sama
from preprocessing import TextPreprocessor, LexiconCount

# 1) Load the preprocessed data; it must contain the columns 'text_clean'
#    (string) and 'Rating'.
df = pd.read_csv('DataFix_Preprocessed.csv', encoding='utf-8')

# Coerce ratings to numbers and drop rows whose rating is missing or not
# numeric. Without this, a NaN rating fails both comparisons used for
# labelling and would silently end up labelled 'positif'.
rating = pd.to_numeric(df['Rating'], errors='coerce')
valid_rating = rating.notna()
n_dropped = int((~valid_rating).sum())
if n_dropped:
    print(f"{n_dropped} baris dengan Rating kosong/tidak valid dibuang")
df = df.loc[valid_rating].copy()
rating = rating.loc[valid_rating]

# Build the sentiment label:
#   rating <= 2 -> 'negatif', rating == 3 -> 'netral', anything else -> 'positif'
df['sentiment'] = np.select(
    [(rating <= 2).to_numpy(), (rating == 3).to_numpy()],
    ['negatif', 'netral'],
    default='positif',
)

# Use the text_clean column (as strings) for X; missing text becomes ''.
X = df['text_clean'].fillna('').astype(str).tolist()
y = df['sentiment'].tolist()

# 2) Hold out a stratified 20% test split, then balance the training part only
X_tr, X_te, y_tr, y_te = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=28,
)
train_df = pd.DataFrame({'text': X_tr, 'sentiment': y_tr})

# Size of the largest training class; every class is resampled to this size.
max_n = train_df['sentiment'].value_counts().max()

# Sample with replacement only for classes that have fewer than max_n rows.
parts = [
    grp.sample(n=max_n, replace=len(grp) < max_n, random_state=28)
    for _, grp in train_df.groupby('sentiment')
]
train_bal = pd.concat(parts)
X_trb = train_bal['text'].tolist()
y_trb = train_bal['sentiment'].tolist()

# 3) Define the stacking pipeline
pre = TextPreprocessor()

# Feature extraction: unigram + bigram TF-IDF combined with LexiconCount.
tfidf_step = ('tfidf', TfidfVectorizer(ngram_range=(1, 2)))
lex_step = ('lex', LexiconCount())
feat_union = FeatureUnion([tfidf_step, lex_step])

# Base learners: both naive Bayes variants get an explicit uniform prior over
# the three classes; the logistic regression uses balanced class weights.
mnb_clf = MultinomialNB(alpha=0.1, class_prior=[1/3]*3)
cnb_clf = ComplementNB(alpha=0.1, class_prior=[1/3]*3)
lr_clf = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=28,
)
base_learners = [
    ('mnb', mnb_clf),
    ('cnb', cnb_clf),
    ('lr', lr_clf),
]

# Meta-learner trained on the base learners' 5-fold predict_proba outputs.
meta_clf = LogisticRegression(max_iter=1000, random_state=28)
stack_clf = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_clf,
    cv=5,
    stack_method='predict_proba',
)

# Full pipeline: preprocessing -> features -> stacked classifier.
pipe = Pipeline(steps=[
    ('pre', pre),
    ('feat', feat_union),
    ('clf', stack_clf),
])

# 4) Fit the pipeline, then tune the decision threshold for class 'negatif'
print("Training stacked model…")
pipe.fit(X_trb, y_trb)

print("Menentukan threshold optimal untuk kelas 'negatif'…")
# NOTE(review): despite the name `proba_val`, these probabilities come from the
# held-out test split (X_te). A threshold tuned here makes any metric later
# reported on the same split optimistic; consider a separate validation split.
proba_val = pipe.predict_proba(X_te)
neg_idx   = list(pipe.classes_).index('negatif')

# One-vs-rest target: 1 where the true label is 'negatif', else 0.
y_bin     = np.array(y_te) == 'negatif'
prec, rec, thr = precision_recall_curve(y_bin.astype(int), proba_val[:, neg_idx])

# precision/recall carry one trailing point that has no matching entry in
# `thr`. Drop it so f1_scores lines up with thr and the argmax below can never
# index past the end of thr. The 1e-9 avoids division by zero when both
# precision and recall are 0.
f1_scores = 2 * prec[:-1] * rec[:-1] / (prec[:-1] + rec[:-1] + 1e-9)
best_t = thr[np.nanargmax(f1_scores)]
print(f"Threshold_negatif = {best_t:.3f}")

# 5) Persist the fitted pipeline together with the tuned threshold in one file
artifact = dict(pipeline=pipe, threshold=best_t)
joblib.dump(artifact, 'sentiment_model.pkl')
print("✅ Model dan threshold tersimpan di sentiment_model.pkl")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Training stacked model…
Menentukan threshold optimal untuk kelas 'negatif'…
Threshold_negatif = 0.263
✅ Model dan threshold tersimpan di sentiment_model.pkl
