In [62]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [63]:
# LOAD DATA
data_labeled = pd.read_csv("Manual_label.csv", sep=';')
data_unlabeled = pd.read_csv("Model Test.csv", sep=';')

# HAPUS SEMUA KOLOM "Unnamed"
data_labeled = data_labeled.loc[:, ~data_labeled.columns.str.contains("^Unnamed")]
data_unlabeled = data_unlabeled.loc[:, ~data_unlabeled.columns.str.contains("^Unnamed")]

# Handle potential NaN values in 'Cleaned_Text' by filling them with empty strings
X_text = data_labeled["Cleaned_Text"].fillna('')
y = data_labeled["Label"]

# Menampilkan 5 baris pertama dari dataset untuk verifikasi
display(data_labeled.head())
print('')

Unnamed: 0,Cleaned_Text,Label
0,cerita kalo dijadiin wattpad novel film seru b...,positive
1,keren banget mimpi,positive
2,bahagia sedih tiara,positive
3,nonton ceritanya kak tiara seru banget mana ce...,positive
4,banyak cerita suka cerita,positive





In [64]:

# Menghitung TF-IDF
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(X_text)

# 3. SPLIT DATA 80:20
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [65]:
# 4. TRAINING MODEL SVM
model = LinearSVC()
model.fit(X_train, y_train)

# 5. EVALUASI MODEL
y_pred = model.predict(X_test)

print("AKURASI:", accuracy_score(y_test, y_pred))
print("\nCLASSIFICATION REPORT:\n", classification_report(y_test, y_pred))
print("CONFUSION MATRIX:\n", confusion_matrix(y_test, y_pred))

AKURASI: 0.7333333333333333

CLASSIFICATION REPORT:
               precision    recall  f1-score   support

    negative       0.88      0.64      0.74        22
     neutral       0.50      0.81      0.62        16
    positive       0.94      0.77      0.85        22

    accuracy                           0.73        60
   macro avg       0.77      0.74      0.74        60
weighted avg       0.80      0.73      0.75        60

CONFUSION MATRIX:
 [[14  8  0]
 [ 2 13  1]
 [ 0  5 17]]


In [66]:
# Label Otomatis DATA BELUM BERLABEL
X_unlabeled = tfidf.transform(data_unlabeled["Cleaned_Text"].fillna(''))
hasil_prediksi = model.predict(X_unlabeled)

data_unlabeled["label_prediksi"] = hasil_prediksi

# Drop columns from data_unlabeled that are entirely NaN (like 'Unnamed: X')
data_unlabeled = data_unlabeled.dropna(axis=1, how='all')

# 7. SIMPAN KE CSV
data_unlabeled.to_csv("hasil_label_otomatis.csv", index=False)

print("\n✅ MODEL BERHASIL DIBUAT!")
print("✅ DATA BERHASIL DILABELI OTOMATIS!")
print("✅ File: hasil_label_otomatis.csv")


✅ MODEL BERHASIL DIBUAT!
✅ DATA BERHASIL DILABELI OTOMATIS!
✅ File: hasil_label_otomatis.csv
