In [60]:
import time

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import mutual_info_classif, SelectKBest, chi2
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC

In [61]:
# 1. Load the labeling-result CSV into a DataFrame.
csv_path = "/content/Hasil_Labeling_imdb.csv"
data = pd.read_csv(csv_path)

In [62]:
# 2. Separate the feature column ('reviews') from the label column ('sentiment').
X, y = data['reviews'], data['sentiment']

In [57]:
# 3. Hold out a test set.
# 0.1111 is the author's 1000/9000 ratio; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1111,
    random_state=42,
)

In [58]:
# 4. Feature extraction (n-grams): count unigrams and bigrams.
# Change NGRAM_RANGE to try other n-gram settings.
NGRAM_RANGE = (1, 2)
vectorizer = CountVectorizer(ngram_range=NGRAM_RANGE)
# The vocabulary is learned from the training split only and reused for the test split.
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [59]:
# 5. Feature selection (mutual information).
# Original author's note: this step is very slow, even without SelectKBest.
# mutual_info_classif returns one score per column of the unigram+bigram matrix;
# presumably that scoring call, not the top-k pick below, is where the time goes.
# NOTE(review): only the top 3 columns are kept here while the chi-square
# selector keeps 1000 — confirm the mismatch is intended before comparing models.
MI_TOP_K = 3

start_time = time.perf_counter()  # monotonic clock intended for elapsed-time measurement

mi_scores = mutual_info_classif(X_train_vectorized, y_train)
# A stable argsort on the negated scores yields descending order with ties kept
# in column order — the same ranking as sorted(range(n), key=..., reverse=True),
# but without invoking a Python lambda once per vocabulary entry.
k_best_features = (-mi_scores).argsort(kind="stable")[:MI_TOP_K].tolist()
X_train_mi = X_train_vectorized[:, k_best_features]
X_test_mi = X_test_vectorized[:, k_best_features]

end_time = time.perf_counter()
print("Waktu eksekusi untuk seleksi fitur menggunakan Mutual Information: {:.2f} detik".format(end_time - start_time))

KeyboardInterrupt: 

In [None]:
# 6. Feature selection (chi-square): keep the 1000 best-scoring columns.
# Adjust k as needed.
chi2_selector = SelectKBest(chi2, k=1000)
# Scores are fitted on the training matrix only; the test matrix is then
# reduced to the same selected columns.
chi2_selector.fit(X_train_vectorized, y_train)
X_train_chi2 = chi2_selector.transform(X_train_vectorized)
X_test_chi2 = chi2_selector.transform(X_test_vectorized)

In [None]:
# 7. Linear SVM trained on the mutual-information features.
# Skip this cell if the mutual-information selection cell was skipped.
svm_model_mi = SVC(kernel='linear').fit(X_train_mi, y_train)
y_pred_mi = svm_model_mi.predict(X_test_mi)
accuracy_mi = accuracy_score(y_true=y_test, y_pred=y_pred_mi)
mi_accuracy_message = "Akurasi menggunakan Mutual Information setelah seleksi fitur: {:.2f}"
print(mi_accuracy_message.format(accuracy_mi))

In [None]:
# 8. Linear SVM trained on the chi-square features.
svm_model_chi2 = SVC(kernel='linear').fit(X_train_chi2, y_train)
y_pred_chi2 = svm_model_chi2.predict(X_test_chi2)
accuracy_chi2 = accuracy_score(y_true=y_test, y_pred=y_pred_chi2)
print("Accuracy using chi-square:", accuracy_chi2)

In [None]:
# 9. Confusion matrix and visualization (mutual-information model).
cm_mi = confusion_matrix(y_test, y_pred_mi)
print('Confusion matrix using mutual information:\n\n', cm_mi)

# With no explicit `labels`, confusion_matrix orders rows/columns by the sorted
# set of labels appearing in y_true or y_pred; rebuild that order for the ticks.
class_labels_mi = sorted(set(y_test) | set(y_pred_mi))

# Rows are true labels and columns are predicted labels, so label the axes
# accordingly; without this the heatmap cannot be read on its own.
fig, ax = plt.subplots()
sns.heatmap(
    cm_mi,
    annot=True,
    fmt='d',
    cmap='YlGnBu',
    xticklabels=class_labels_mi,
    yticklabels=class_labels_mi,
    ax=ax,
)
ax.set(
    xlabel='Predicted label',
    ylabel='True label',
    title='Confusion matrix (mutual information)',
)
plt.show()

In [None]:
# Confusion matrix and visualization (chi-square model).
cm_chi2 = confusion_matrix(y_test, y_pred_chi2)
print('Confusion matrix using chi-square:\n\n', cm_chi2)

# With no explicit `labels`, confusion_matrix orders rows/columns by the sorted
# set of labels appearing in y_true or y_pred; rebuild that order for the ticks.
class_labels_chi2 = sorted(set(y_test) | set(y_pred_chi2))

# Rows are true labels and columns are predicted labels, so label the axes
# accordingly; without this the heatmap cannot be read on its own.
fig, ax = plt.subplots()
sns.heatmap(
    cm_chi2,
    annot=True,
    fmt='d',
    cmap='YlGnBu',
    xticklabels=class_labels_chi2,
    yticklabels=class_labels_chi2,
    ax=ax,
)
ax.set(
    xlabel='Predicted label',
    ylabel='True label',
    title='Confusion matrix (chi-square)',
)
plt.show()

In [None]:
# 10. Classification report for both feature-selection variants.
# Both reports use the same zero_division setting so they can be compared.
# zero_division=0 reports an undefined precision/recall (e.g. a class that is
# never predicted) as 0.0 without a warning. The earlier zero_division=1, set
# on the chi-square report only, would have shown such a class as a perfect 1.0.
print('Classification Report using mutual information:\n\n', classification_report(y_test, y_pred_mi, zero_division=0))
print('Classification Report using chi-square:\n\n', classification_report(y_test, y_pred_chi2, zero_division=0))

In [None]:
# 11. K-fold cross-validation splitter used by the cross-validation cells.
# Those cells report their results as "stratified" scores, so the splitter has
# to keep the class proportions in every fold; plain KFold does not do that.
# Shuffling with a fixed seed keeps the folds repeatable.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

In [None]:
# Cross-validate a linear-kernel SVM on the mutual-information features.
# Skip this cell if the mutual-information selection cell was skipped.
# NOTE(review): the feature columns were chosen using all of X_train before the
# folds are cut, so these scores may be optimistic.
clf_linear = SVC(kernel='linear')
linear_scores = cross_val_score(estimator=clf_linear, X=X_train_mi, y=y_train, cv=kfold)
linear_mean = linear_scores.mean()
print('Stratified cross-validation scores dengan linear kernel:\n\n{}'.format(linear_scores))
print('Rata-rata stratified cross-validation score dengan linear kernel:{:.4f}'.format(linear_mean))

In [None]:
# Cross-validate an RBF-kernel SVM on chi-square-selected features.
# The selector is refit inside every fold through a pipeline. Scoring the
# pre-selected X_train_chi2 instead would let each validation fold influence
# which columns were kept (that selection was fitted on all of X_train), which
# biases the cross-validation scores upward.
clf_rbf = SVC(kernel='rbf')
rbf_pipeline = make_pipeline(
    SelectKBest(score_func=chi2, k=chi2_selector.k),  # reuse the k configured on chi2_selector
    clf_rbf,
)
# cross_val_score clones the pipeline for each fold, so clf_rbf itself stays unfitted.
rbf_scores = cross_val_score(rbf_pipeline, X_train_vectorized, y_train, cv=kfold)
print('Stratified Cross-validation scores dengan rbf kernel:\n\n{}'.format(rbf_scores))
print('Rata-rata stratified cross-validation dengan with rbf kernel:{:.4f}'.format(rbf_scores.mean()))