MODEL SVM + TF-IDF

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

# Load the manually annotated review dataset.
df = pd.read_excel("Data_Manual.xlsx")

# Rename 'Label' column to 'label' for consistency
if 'Label' in df.columns:
    df = df.rename(columns={'Label': 'label'})

# Fail early, with a readable message, if a column used further down is absent
# (the notebook reads both 'review' and 'label').
missing_cols = [col for col in ('review', 'label') if col not in df.columns]
if missing_cols:
    raise KeyError(f"Data_Manual.xlsx is missing required column(s): {missing_cols}")

# Normalise labels. astype(str) turns missing cells into the text 'nan', so both
# blank strings and the text 'nan' are mapped back to a real NaN afterwards.
df['label'] = df['label'].astype(str).str.strip()
df['label'] = df['label'].replace({'': np.nan, 'nan': np.nan})

# Take explicit copies: a boolean-mask selection assigned to later on triggers
# pandas' SettingWithCopyWarning, and the write may not land where expected.
labeled = df[df['label'].notna()].copy()
unlabeled = df[df['label'].isna()].copy()

print("Data Berlabel  :", len(labeled))
print("Data Unlabeled :", len(unlabeled))

# Train base model on a stratified 80/20 split of the manually labelled rows.
X_train, X_test, y_train, y_test = train_test_split(
    labeled['review'], labeled['label'], test_size=0.2, stratify=labeled['label'], random_state=42
)

# TF-IDF over unigrams and bigrams (capped at 8000 features) feeding a linear SVM.
# random_state pins the data shuffling SVC performs for its probability
# estimates (probability=True), so predict_proba is reproducible across runs.
model = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_features=8000)),
    ('svm', SVC(kernel='linear', probability=True, random_state=42))
])

model.fit(X_train, y_train)

# Baseline evaluation on the held-out split
print("\n=== Evaluasi Model Awal ===")
pred = model.predict(X_test)
# zero_division=0 reports 0.00 for a class that receives no predictions (same
# numbers as the default) without emitting UndefinedMetricWarning for each metric.
print(classification_report(y_test, pred, zero_division=0))

# ---- Make sure every row ends up with a label ----
# Rows without a manual label receive the base model's prediction (pseudo-label).
# NOTE(review): after this step nothing in df_final distinguishes manual labels
# from predicted ones, and the retrained model below is fitted on both.
print("\nMelabeli seluruh data unlabeled...")

if unlabeled.empty:
    # Nothing to predict; scikit-learn estimators are expected to reject
    # zero-sample input, so skip prediction and keep the labelled rows only.
    df_final = labeled.reset_index(drop=True)
else:
    # assign() builds a new frame instead of writing into a slice of df,
    # which avoids pandas' SettingWithCopyWarning.
    unlabeled = unlabeled.assign(label=model.predict(unlabeled['review']))

    # Recombine: labelled rows first, then the newly pseudo-labelled rows.
    df_final = pd.concat([labeled, unlabeled], ignore_index=True)

# Retrain the model using all rows (manual labels + pseudo-labels)
model.fit(df_final['review'], df_final['label'])

print("\n=== Model Final Siap Dipakai (Semua Data Sudah Berlabel) ===")
print("Total rows:", len(df_final))
print("Label distribution:")
print(df_final['label'].value_counts())



Data Berlabel  : 251
Data Unlabeled : 817

=== Evaluasi Model Awal ===
              precision    recall  f1-score   support

     Negatif       0.81      0.81      0.81        21
      Netral       0.00      0.00      0.00        10
     Positif       0.60      0.90      0.72        20

    accuracy                           0.69        51
   macro avg       0.47      0.57      0.51        51
weighted avg       0.57      0.69      0.62        51


Melabeli seluruh data unlabeled...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unlabeled['label'] = model.predict(unlabeled['review'])



=== Model Final Siap Dipakai (Semua Data Sudah Berlabel) ===
Total rows: 1068
Label distribution:
label
Positif    799
Negatif    216
Netral      53
Name: count, dtype: int64


In [None]:
# Renumber the rows 0..n-1 so the frame is tidy before export.
df_final = df_final.reset_index(drop=True)

# Write the fully labelled dataset to CSV (UTF-8, without the index column).
df_final.to_csv(
    path_or_buf="hasil_labeling_sentimen.csv",
    encoding='utf-8',
    index=False,
)

print("\nFile berhasil disimpan ➤ hasil_labeling_sentimen.csv")



File berhasil disimpan ➤ hasil_labeling_sentimen.csv
