In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

In [34]:
# Load dataset
df = pd.read_csv("tokopedia_score_content_cleaned.csv", sep=",")
columns = ["score", "content"]
df.columns = columns

In [35]:
df.head()

Unnamed: 0,score,content
0,2,masih perlu belajar untuk hasil maksimal
1,5,tokopedia kenapa sekarang pencairan dana selle...
2,5,ngga expect ternyata tokopedia sebagus itu pen...
3,3,kocak cs lu status pesanan tiba di tempat tuju...
4,1,agh biaya admin beli pulsa nya mahal tidak sem...


In [47]:
len(df)

5000

In [36]:
# Pastikan kolom content tidak kosong
df["content"] = df["content"].fillna("").astype(str)

In [37]:
# Label sentimen
def label_sentiment(score):
    if score >= 4:
        return 'positif'
    elif score == 3:
        return 'netral'
    else:
        return 'negatif'

df['label'] = df['score'].apply(label_sentiment)

In [38]:
# Fitur dan label
X = df['content'].values
y = df['label'].values

In [39]:
# Encode label ke angka
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [40]:
# TF-IDF vektorisasi
vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer.fit_transform(X)

In [41]:
# K-Fold Cross Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
accuracies = []
f1_scores = []

In [42]:
for fold, (train_index, test_index) in enumerate(kf.split(X_tfidf)):
    X_train, X_test = X_tfidf[train_index], X_tfidf[test_index]
    y_train, y_test = y_encoded[train_index], y_encoded[test_index]

    model = SVC(kernel="linear", random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)

    report = classification_report(
        y_test, y_pred,
        target_names=label_encoder.classes_,
        output_dict=True,
        zero_division=0
    )
    
    f1_scores.append({label: report[label]['f1-score'] for label in label_encoder.classes_})

    # Print per fold
    print(f"\nHasil evaluasi Fold-{fold + 1}")
    print(classification_report(
        y_test, y_pred,
        target_names=label_encoder.classes_,
        zero_division=0
    ))



Hasil evaluasi Fold-1
              precision    recall  f1-score   support

     negatif       0.79      0.90      0.84       424
      netral       0.00      0.00      0.00        56
     positif       0.88      0.88      0.88       520

    accuracy                           0.84      1000
   macro avg       0.56      0.59      0.57      1000
weighted avg       0.79      0.84      0.81      1000


Hasil evaluasi Fold-2
              precision    recall  f1-score   support

     negatif       0.77      0.92      0.84       409
      netral       0.00      0.00      0.00        57
     positif       0.90      0.87      0.88       534

    accuracy                           0.84      1000
   macro avg       0.56      0.59      0.57      1000
weighted avg       0.80      0.84      0.81      1000


Hasil evaluasi Fold-3
              precision    recall  f1-score   support

     negatif       0.79      0.90      0.84       429
      netral       0.00      0.00      0.00        51
     p

In [43]:
# Rata-rata akurasi dan f1
print("\nAkurasi tiap Fold:")
for i, acc in enumerate(accuracies):
    print(f"Fold {i + 1}: {acc:.2f}")
print(f"\nRata-rata akurasi K-Fold: {np.mean(accuracies):.2f}")



Akurasi tiap Fold:
Fold 1: 0.84
Fold 2: 0.84
Fold 3: 0.83
Fold 4: 0.84
Fold 5: 0.84

Rata-rata akurasi K-Fold: 0.84


In [44]:
# Rata-rata F1-score per kelas
print("\nRata-rata F1-score per kelas:")
for label in label_encoder.classes_:
    avg_f1 = np.mean([f[label] for f in f1_scores])
    print(f"{label}: {avg_f1:.2f}")


Rata-rata F1-score per kelas:
negatif: 0.84
netral: 0.00
positif: 0.88
