In [20]:
# Standard Libraries
import csv
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Pandas options
pd.options.mode.chained_assignment = None
seed = 0
np.random.seed(seed)

# Scikit-learn
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Imbalanced data handling
from imblearn.over_sampling import SMOTE, RandomOverSampler

# TensorFlow / Keras
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import to_categorical


In [21]:
df = pd.read_csv("clean_dataset.csv")

df

Unnamed: 0,at,userName,score,content,text_clean,text_casefoldingText,text_slangwords,text_tokenizingText,text_stopword,text_akhir,polarity_score,polarity
0,2025-03-29 16:56:30,Pengguna Google,2,mengapa sangat sulit mengganti KTP padahal sem...,mengapa sangat sulit mengganti KTP padahal sem...,mengapa sangat sulit mengganti ktp padahal sem...,mengapa sangat sulit mengganti ktp padahal sem...,"['mengapa', 'sangat', 'sulit', 'mengganti', 'k...","['sulit', 'mengganti', 'ktp', 'persyaratan', '...",sulit mengganti ktp persyaratan terpenuhi vide...,2,positive
1,2025-03-27 18:49:41,Pengguna Google,1,Emang aplikasi BUSUK.Saya transaksi sudah terj...,Emang aplikasi BUSUKSaya transaksi sudah terja...,emang aplikasi busuksaya transaksi sudah terja...,emang aplikasi busuksaya transaksi sudah terja...,"['emang', 'aplikasi', 'busuksaya', 'transaksi'...","['emang', 'aplikasi', 'busuksaya', 'transaksi'...",emang aplikasi busuksaya transaksi berhasil bu...,8,positive
2,2025-04-08 10:18:29,Pengguna Google,2,kurangin bintang nya ga bisa di buka kalo jari...,kurangin bintang nya ga bisa di buka kalo jari...,kurangin bintang nya ga bisa di buka kalo jari...,kurangin bintang nya ga bisa di buka kalo jari...,"['kurangin', 'bintang', 'nya', 'ga', 'bisa', '...","['kurangin', 'bintang', 'buka', 'kalo', 'jarin...",kurangin bintang buka kalo jaringan data mah p...,-9,negative
3,2025-04-08 23:00:03,Pengguna Google,3,"Dulu sbnrnya ini apk bgs, tapi lama lama error...",Dulu sbnrnya ini apk bgs tapi lama lama error ...,dulu sbnrnya ini apk bgs tapi lama lama error ...,dulu sbnrnya ini apk bagus tapi lama lama erro...,"['dulu', 'sbnrnya', 'ini', 'apk', 'bagus', 'ta...","['sbnrnya', 'apk', 'bagus', 'error', 'isi', 's...",sbnrnya apk bagus error isi saldo error tf err...,-17,negative
4,2025-04-03 05:24:42,Pengguna Google,1,"Akun tiba"" terhubung ke UC Drive Premium, Sald...",Akun tiba terhubung ke UC Drive Premium Saldo ...,akun tiba terhubung ke uc drive premium saldo ...,akun tiba terhubung ke uc drive premium saldo ...,"['akun', 'tiba', 'terhubung', 'ke', 'uc', 'dri...","['akun', 'terhubung', 'uc', 'drive', 'premium'...",akun terhubung uc drive premium saldo terpoton...,-12,negative
...,...,...,...,...,...,...,...,...,...,...,...,...
49995,2025-01-14 01:08:50,NOVI ARDANA,5,aplikasi dana sangat membantu untuk tf di mana...,aplikasi dana sangat membantu untuk tf di mana...,aplikasi dana sangat membantu untuk tf di mana...,aplikasi Dana sangat membantu untuk tf di mana...,"['aplikasi', 'Dana', 'sangat', 'membantu', 'un...","['aplikasi', 'Dana', 'membantu', 'tf', 'lpun']",aplikasi Dana membantu tf lpun,-4,negative
49996,2019-11-15 20:12:48,Pengguna Google,1,"Saya baru pake aplikasi ini, karna ada kepenti...",Saya baru pake aplikasi ini karna ada kepentin...,saya baru pake aplikasi ini karna ada kepentin...,saya baru pake aplikasi ini karna ada kepentin...,"['saya', 'baru', 'pake', 'aplikasi', 'ini', 'k...","['pake', 'aplikasi', 'karna', 'kepentingan', '...",pake aplikasi karna kepentingan kerjaan downlo...,-4,negative
49997,2023-04-10 15:29:46,Raja Malam,1,Ini gimana si dulu mudah mudah aja masuk akun ...,Ini gimana si dulu mudah mudah aja masuk akun ...,ini gimana si dulu mudah mudah aja masuk akun ...,ini gimana si dulu mudah mudah aja masuk akun ...,"['ini', 'gimana', 'si', 'dulu', 'mudah', 'muda...","['gimana', 'si', 'mudah', 'mudah', 'aja', 'mas...",gimana si mudah mudah aja masuk akun Dana suli...,4,positive
49998,2021-05-07 21:27:09,Ahmad Saipullah,1,Lama bener ni lelet. Udh di wa antrian selalu ...,Lama bener ni lelet Udh di wa antrian selalu p...,lama bener ni lelet udh di wa antrian selalu p...,lama bener ni lambat udh di wa antrian selalu ...,"['lama', 'bener', 'ni', 'lambat', 'udh', 'di',...","['bener', 'ni', 'lambat', 'udh', 'wa', 'antria...",bener ni lambat udh wa antrian penuh ditanggap...,-3,negative


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   at                    50000 non-null  object
 1   userName              50000 non-null  object
 2   score                 50000 non-null  int64 
 3   content               50000 non-null  object
 4   text_clean            50000 non-null  object
 5   text_casefoldingText  50000 non-null  object
 6   text_slangwords       50000 non-null  object
 7   text_tokenizingText   50000 non-null  object
 8   text_stopword         50000 non-null  object
 9   text_akhir            49993 non-null  object
 10  polarity_score        50000 non-null  int64 
 11  polarity              50000 non-null  object
dtypes: int64(2), object(10)
memory usage: 4.6+ MB


In [23]:
df.isnull().sum()

Unnamed: 0,0
at,0
userName,0
score,0
content,0
text_clean,0
text_casefoldingText,0
text_slangwords,0
text_tokenizingText,0
text_stopword,0
text_akhir,7


In [24]:
df = df.dropna(subset=['text_akhir'])

In [25]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
X = df['text_akhir']
y = df['polarity']

le = LabelEncoder()
y_encoded = le.fit_transform(y)

#Pelatihan: SVM,    Ekstraksi Fitur: TF-IDF,    Pembagian Data: 80/20

In [26]:
tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf.fit_transform(X)

print("TF-IDF shape:", X_tfidf.shape)

X_train2, X_test2, y_train2, y_test2 = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

label_encoder = LabelEncoder()
label_encoder.fit(pd.concat([y_train2, y_test2]))
y_train_encoded = label_encoder.transform(y_train2)
y_test_encoded = label_encoder.transform(y_test2)

sm = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = sm.fit_resample(X_train2, y_train_encoded)


TF-IDF shape: (49993, 5000)


In [27]:
svm_model = SVC(kernel='linear')

start_time = time.time()
svm_model.fit(X_train_resampled, y_train_resampled)
print("Training selesai dalam", round(time.time() - start_time, 2), "detik")

y_pred_train = svm_model.predict(X_train2)
y_pred_test = svm_model.predict(X_test2)
y_pred_train_original = label_encoder.inverse_transform(y_pred_train)
y_pred_test_original = label_encoder.inverse_transform(y_pred_test)

accuracy_train = accuracy_score(y_train2, y_pred_train_original)
accuracy_test = accuracy_score(y_test2, y_pred_test_original)

print('SVM - akurasi data train:', accuracy_train)
print('SVM - akurasi data test:', accuracy_test)


Training selesai dalam 681.8 detik
SVM - akurasi data train: 0.9348152222833425
SVM - akurasi data test: 0.8782878287828783


#Pelatihan: SVM,    Ekstraksi Fitur: TF-IDF,    Pembagian Data: 70/30

In [36]:
tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf.fit_transform(X)

print("TF-IDF shape:", X_tfidf.shape)

X_train2, X_test2, y_train2, y_test2 = train_test_split(X_tfidf, y, test_size=0.3, random_state=42)

label_encoder = LabelEncoder()
label_encoder.fit(pd.concat([y_train2, y_test2]))
y_train_encoded = label_encoder.transform(y_train2)
y_test_encoded = label_encoder.transform(y_test2)

sm = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = sm.fit_resample(X_train2, y_train_encoded)


TF-IDF shape: (49993, 5000)


In [37]:
svm_model = SVC(kernel='linear')

start_time = time.time()
svm_model.fit(X_train_resampled, y_train_resampled)
print("Training selesai dalam", round(time.time() - start_time, 2), "detik")

y_pred_train = svm_model.predict(X_train2)
y_pred_test = svm_model.predict(X_test2)
y_pred_train_original = label_encoder.inverse_transform(y_pred_train)
y_pred_test_original = label_encoder.inverse_transform(y_pred_test)

accuracy_train = accuracy_score(y_train2, y_pred_train_original)
accuracy_test = accuracy_score(y_test2, y_pred_test_original)

print('SVM - akurasi data train:', accuracy_train)
print('SVM - akurasi data test:', accuracy_test)


Training selesai dalam 571.35 detik
SVM - akurasi data train: 0.9352478925560794
SVM - akurasi data test: 0.872182957727697


#Pelatihan: CNN, Ekstraksi Fitur: Embedding, Pembagian Data: 80/20

In [30]:
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X)

X_seq = tokenizer.texts_to_sequences(X)
maxlen = max(len(seq) for seq in X_seq)
X_pad = pad_sequences(X_seq, padding='post', maxlen=100)

ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_pad, y_encoded)

X_train, X_test, y_train, y_test = train_test_split(X_pad, y_encoded, test_size=0.2, random_state=42)

vocab_size = len(tokenizer.word_index) + 1

model_cnn = Sequential([
    Embedding(input_dim=10000, output_dim=64, input_length=100),
    Conv1D(128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dropout(0.3),
    Dense(10, activation='relu'),
    Dropout(0.3),
    Dense(3, activation='softmax')
])

model_cnn.build(input_shape=(None, maxlen))
model_cnn.summary()



In [31]:
model_cnn.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)

history = model_cnn.fit(
              X_train, y_train,
              epochs=200,
              batch_size=32,
              validation_data=(X_test, y_test),
              callbacks=[early_stopping]
          )

cnn_pred = model_cnn.predict(X_test).argmax(axis=1)
print(classification_report(y_test, cnn_pred, target_names=le.classes_))

Epoch 1/200
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 29ms/step - accuracy: 0.6869 - loss: 0.7083 - val_accuracy: 0.8545 - val_loss: 0.3999
Epoch 2/200
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 29ms/step - accuracy: 0.8583 - loss: 0.3937 - val_accuracy: 0.8607 - val_loss: 0.3681
Epoch 3/200
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 30ms/step - accuracy: 0.8939 - loss: 0.3056 - val_accuracy: 0.8642 - val_loss: 0.3756
Epoch 4/200
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 28ms/step - accuracy: 0.9095 - loss: 0.2533 - val_accuracy: 0.8617 - val_loss: 0.4183
Epoch 5/200
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 30ms/step - accuracy: 0.9187 - loss: 0.2176 - val_accuracy: 0.8621 - val_loss: 0.4377
Epoch 6/200
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 31ms/step - accuracy: 0.9273 - loss: 0.1908 - val_accuracy: 0.8529 - val_loss: 0.478

In [32]:
cnn_pred_train = model_cnn.predict(X_train).argmax(axis=1)
cnn_pred_test = model_cnn.predict(X_test).argmax(axis=1)

accuracy_train_cnn = accuracy_score(y_train, cnn_pred_train)
accuracy_test_cnn = accuracy_score(y_test, cnn_pred_test)

print("CNN - akurasi data train:", accuracy_train_cnn)
print("CNN - akurasi data test:", accuracy_test_cnn)

[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 7ms/step
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step
CNN - akurasi data train: 0.9115367305095764
CNN - akurasi data test: 0.8606860686068607


#EKSPORT

In [38]:
import joblib
from google.colab import drive
drive.mount('/content/drive')

joblib.dump(svm_model, 'svm_model.pkl')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


['svm_model.pkl']

In [39]:
model_cnn.save('/content/drive/MyDrive/cnn_model.h5')
joblib.dump(svm_model, '/content/drive/MyDrive/svm_model.pkl')

joblib.dump(tfidf, '/content/drive/MyDrive/tfidf_vectorizer.pkl')
joblib.dump(tokenizer, '/content/drive/MyDrive/tokenizer.pkl')



['/content/drive/MyDrive/tokenizer.pkl']