MODEL SVM + TF-IDF

In [6]:
# ================================
# 1. IMPORT LIBRARY
# ================================
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

# ================================
# 2. LOAD DATA
# ================================
df = pd.read_excel("Data_Manual.xlsx")  # ubah jika namanya beda

# Rename 'Label' column to 'label' for consistency
if 'Label' in df.columns:
    df = df.rename(columns={'Label': 'label'})

# Pastikan label rapi
df['label'] = df['label'].astype(str).str.strip()
df['label'] = df['label'].replace({'': np.nan, 'nan': np.nan})

# ================================
# 3. PISAHKAN LABELED & UNLABELED
# ================================
labeled = df[df['label'].notna()]
unlabeled = df[df['label'].isna()]

print("Data Berlabel  :", len(labeled))
print("Data Unlabeled :", len(unlabeled))

# ================================
# 4. TRAIN MODEL BASELINE
# ================================
X_train, X_test, y_train, y_test = train_test_split(
    labeled['review'], labeled['label'], test_size=0.2, stratify=labeled['label'], random_state=42
)

model = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1,2), max_features=8000)),
    ('svm', SVC(kernel='linear', probability=True))
])

model.fit(X_train, y_train)

# EVALUASI SAAT BASELINE
print("\n=== Evaluasi Model Awal ===")
pred = model.predict(X_test)
print(classification_report(y_test, pred))

# ================================
# 5. PSEUDO LABELING
# ================================
pseudo = model.predict_proba(unlabeled['review'])
conf = np.max(pseudo, axis=1)

# Ambil data confidence >= 0.90
threshold = 0.90
mask = conf >= threshold

pseudo_labels = model.predict(unlabeled['review'][mask])

unlabeled.loc[mask, 'label'] = pseudo_labels

print("\nPseudo-label ditambahkan:", mask.sum(), "data")

# Gabung kembali hanya data yang memiliki label (original atau pseudo)
df_final = pd.concat([labeled, unlabeled[unlabeled['label'].notna()]], ignore_index=True)

# ================================
# 6. TRAIN MODEL FINAL
# ================================
model.fit(df_final['review'], df_final['label'])

print("\n=== Model Final Siap Dipakai ===")


Data Berlabel  : 251
Data Unlabeled : 817

=== Evaluasi Model Awal ===
              precision    recall  f1-score   support

     Negatif       0.81      0.81      0.81        21
      Netral       0.00      0.00      0.00        10
     Positif       0.60      0.90      0.72        20

    accuracy                           0.69        51
   macro avg       0.47      0.57      0.51        51
weighted avg       0.57      0.69      0.62        51



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Pseudo-label ditambahkan: 4 data

=== Model Final Siap Dipakai ===


In [7]:
# SIMPAN DATA AKHIR YANG SUDAH TERLABEL
df_final.to_csv("hasil_label_svm.csv", index=False, encoding='utf-8')

print("\nFile berhasil disimpan ➤ hasil_label_svm.csv")



File berhasil disimpan ➤ hasil_label_svm.csv
