# Import & Install Library

In [1]:
!pip install Sastrawi
!pip install gensim scikit-learn nltk

Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/209.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1
Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m70.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [7]:
import pandas as pd
import numpy as np
import re
import string
import json
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [8]:
# Load dataset
df = pd.read_csv('myim3_dataset.csv')
df = df.drop(columns=['appVersion'], errors='ignore')
print(f"\nJumlah data awal: {len(df)}")
print("\nDistribusi score awal:")
print(df['score'].value_counts().sort_index())


Jumlah data awal: 10000

Distribusi score awal:
score
1    4469
2     654
3     611
4     639
5    3627
Name: count, dtype: int64


# Preprocessing

In [16]:
# Konversi score ke label sentimen
# Score 1-2 = Negatif (0), Score 4-5 = Positif (1), Score 3 = Netral (hapus)
def score_to_label(score):
    if score <= 3:
        return 'negatif'
    elif score >= 4:
        return 'positif'
    else:
        return 'netral'

df['label'] = df['score'].apply(score_to_label)
# Hitung jumlah tiap label
print(df['label'].value_counts())

label
negatif    5734
positif    4266
Name: count, dtype: int64


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   userName  10000 non-null  object
 1   content   10000 non-null  object
 2   score     10000 non-null  int64 
 3   at        10000 non-null  object
 4   label     10000 non-null  object
dtypes: int64(1), object(4)
memory usage: 390.8+ KB


In [11]:
df.isnull().sum()

Unnamed: 0,0
userName,0
content,0
score,0
at,0
label,0


In [12]:
df.duplicated().sum()

np.int64(0)

In [17]:
# Undersampling
target = 4000  # Sesuaikan dengan jumlah data minoritas
df_under = (
    df.groupby('label', group_keys=False)
    .apply(lambda x: x.head(min(target, len(x))))
)
print(f"\nJumlah data setelah undersampling: {len(df_under)}")
print("\nDistribusi label setelah undersampling:")
print(df_under['label'].value_counts())


Jumlah data setelah undersampling: 8000

Distribusi label setelah undersampling:
label
negatif    4000
positif    4000
Name: count, dtype: int64


In [22]:
# Fungsi preprocessing
def remove_emoji(text):
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emotikon
        u"\U0001F300-\U0001F5FF"  # simbol & pictograph
        u"\U0001F680-\U0001F6FF"  # transport & map
        u"\U0001F1E0-\U0001F1FF"  # bendera
        u"\U00002700-\U000027BF"  # dingbats
        u"\U0001F900-\U0001F9FF"  # simbol tambahan
        u"\U0001FA70-\U0001FAFF"  # simbol tambahan 2
        u"\U00002600-\U000026FF"  # simbol misc
        u"\U00002000-\U000023FF"  # simbol tambahan
        "]+", flags=re.UNICODE
    )
    return emoji_pattern.sub('', text)

def clean_text(text):
    text = str(text).lower()  # case folding
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)  # hapus URL
    text = re.sub(r'@\w+|#\w+', '', text)  # hapus mention & hashtag
    text = re.sub(r'\d+', '', text)  # hapus angka
    text = text.translate(str.maketrans('', '', string.punctuation))  # hapus tanda baca
    text = remove_emoji(text)  # hapus emoji
    text = text.strip()
    return text

def preprocess_text(text):
    text = clean_text(text)
    return text

# Terapkan cleaning
print("\nMelakukan text cleaning...")
df_under['content'] = df_under['content'].apply(preprocess_text)
df_under['content'] = df_under['content'].apply(lambda x: x.split())
print(df_under.head())



Melakukan text cleaning...
                userName                                            content  \
0   UCIL OLOL LEHO [UOL]  [kenapa, indosat, ada, tuyulnya, sekarang, pul...   
2              Xeraphine  [tolong, itu, sistem, login, nomor, utama, dan...   
9              mang pecu  [pelayanan, bayar, bulanan, iseng, isi, pulsa,...   
10         Adelia Raidha  [im, jaringannyaa, jelek, banget, percumaa, mo...   
11         Hayato Hayato  [sinyal, jelek, rugi, make, m, sudah, paket, m...   

    score                   at    label  
0       1  2024-02-02 05:17:07  negatif  
2       3  2024-02-02 05:13:29  negatif  
9       1  2024-02-02 04:45:17  negatif  
10      1  2024-02-02 04:40:51  negatif  
11      1  2024-02-02 04:39:42  negatif  


In [24]:
# Slang word normalization
try:
    def read_dictionary_from_file(file_path):
        with open(file_path, 'r') as file:
            dictionary = json.load(file)
        return dictionary

    file_path = 'slangwords.txt'
    my_dictionary = read_dictionary_from_file(file_path)

    def change(tokens):
        new_list = []
        for token in tokens:
            if token in my_dictionary:
                new_list.append(my_dictionary[token])
            else:
                new_list.append(token)
        return new_list

    print("Melakukan normalisasi slang words...")
    df_under['content'] = df_under['content'].apply(change)
except FileNotFoundError:
    print("File slangwords.txt tidak ditemukan, skip normalisasi slang words")
print(df_under.head())

Melakukan normalisasi slang words...
                userName                                            content  \
0   UCIL OLOL LEHO [UOL]  [kenapa, indosat, ada, tuyulnya, sekarang, pul...   
2              Xeraphine  [tolong, itu, sistem, login, nomor, utama, dan...   
9              mang pecu  [pelayanan, bayar, bulanan, iseng, isi, pulsa,...   
10         Adelia Raidha  [im, jaringannyaa, jelek, banget, percumaa, mo...   
11         Hayato Hayato  [sinyal, jelek, rugi, make, m, sudah, paket, m...   

    score                   at    label  
0       1  2024-02-02 05:17:07  negatif  
2       3  2024-02-02 05:13:29  negatif  
9       1  2024-02-02 04:45:17  negatif  
10      1  2024-02-02 04:40:51  negatif  
11      1  2024-02-02 04:39:42  negatif  


In [26]:
# Stopword removal
try:
    def import_words_from_file(file_path):
        word_list = []
        with open(file_path, 'r') as file:
            for line in file:
                word_list.append(line.strip())
        return word_list

    file_path = 'stopwords.txt'
    stopwords = import_words_from_file(file_path)

    def stopword_removal(tokens):
        new_list = []
        for token in tokens:
            if token not in stopwords:
                new_list.append(token)
        return new_list

    print("Melakukan stopword removal...")
    df_under['content'] = df_under['content'].apply(stopword_removal)
except FileNotFoundError:
    print("File stopwords.txt tidak ditemukan, skip stopword removal")
print(df_under.head())

Melakukan stopword removal...
                userName                                            content  \
0   UCIL OLOL LEHO [UOL]  [indosat, tuyulnya, pulsa, berkurang, tidak, p...   
2              Xeraphine  [sistem, login, nomor, utama, nomor, sekunder,...   
9              mang pecu  [pelayanan, bayar, bulanan, iseng, isi, pulsa,...   
10         Adelia Raidha  [im, jaringannyaa, jelek, banget, percumaa, mo...   
11         Hayato Hayato  [sinyal, jelek, rugi, make, m, paket, mahal, s...   

    score                   at    label  
0       1  2024-02-02 05:17:07  negatif  
2       3  2024-02-02 05:13:29  negatif  
9       1  2024-02-02 04:45:17  negatif  
10      1  2024-02-02 04:40:51  negatif  
11      1  2024-02-02 04:39:42  negatif  


In [27]:
# Stemming
print("Melakukan stemming...")
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def stemming(tokens):
    new_list = []
    for token in tokens:
        if token:  # Skip empty tokens
            stemmed = stemmer.stem(token)
            new_list.append(stemmed)
    return new_list

df_under['content'] = df_under['content'].apply(stemming)
print(df_under.head())

Melakukan stemming...
                userName                                            content  \
0   UCIL OLOL LEHO [UOL]  [indosat, tuyul, pulsa, kurang, tidak, paket, ...   
2              Xeraphine  [sistem, login, nomor, utama, nomor, sekunder,...   
9              mang pecu  [layan, bayar, bulan, iseng, isi, pulsa, perlu...   
10         Adelia Raidha  [im, jaringannyaa, jelek, banget, percumaa, mo...   
11         Hayato Hayato  [sinyal, jelek, rugi, make, m, paket, mahal, s...   

    score                   at    label  
0       1  2024-02-02 05:17:07  negatif  
2       3  2024-02-02 05:13:29  negatif  
9       1  2024-02-02 04:45:17  negatif  
10      1  2024-02-02 04:40:51  negatif  
11      1  2024-02-02 04:39:42  negatif  


In [28]:
# Hapus data dengan content kosong setelah preprocessing
df_under = df_under[df_under['content'].apply(len) > 0].copy()
print(f"\nJumlah data setelah hapus content kosong: {len(df_under)}")


Jumlah data setelah hapus content kosong: 7498


In [29]:
# Label encoding
label_map = {'negatif': 0, 'positif': 1}
df_under['label_num'] = df_under['label'].map(label_map)

print("\nContoh data setelah preprocessing:")
print(df_under[['content', 'score', 'label', 'label_num']].head(10))


Contoh data setelah preprocessing:
                                              content  score    label  \
0   [indosat, tuyul, pulsa, kurang, tidak, paket, ...      1  negatif   
2   [sistem, login, nomor, utama, nomor, sekunder,...      3  negatif   
9   [layan, bayar, bulan, iseng, isi, pulsa, perlu...      1  negatif   
10  [im, jaringannyaa, jelek, banget, percumaa, mo...      1  negatif   
11  [sinyal, jelek, rugi, make, m, paket, mahal, s...      1  negatif   
12                    [tidak, registrasi, isi, pulsa]      1  negatif   
20                      [beli, internet, tidak, guna]      2  negatif   
22                             [top, up, game, tidak]      1  negatif   
24  [kecewa, banget, kartu, beli, paket, gb, habis...      1  negatif   
31  [aplikasi, g, benar, beli, kuota, padah, data,...      1  negatif   

    label_num  
0           0  
2           0  
9           0  
10          0  
11          0  
12          0  
20          0  
22          0  
24          0  


# SPLITTING DATA

In [30]:
X_texts = df_under['content'].values  # Array of token lists
y = df_under['label_num'].values

X_train_texts, X_test_texts, y_train, y_test = train_test_split(
    X_texts, y, test_size=0.2, random_state=42, stratify=y
)

In [32]:
print(f"\nData training: {len(X_train_texts)} samples")
print(f"Data testing: {len(X_test_texts)} samples")
print(f"\nDistribusi label training:")
unique, counts = np.unique(y_train, return_counts=True)
for label, count in zip(unique, counts):
    label_name = 'Negatif' if label == 0 else 'Positif'
    print(f"  {label_name} ({label}): {count}")
print(f"\nDistribusi label testing:")
unique, counts = np.unique(y_test, return_counts=True)
for label, count in zip(unique, counts):
    label_name = 'Negatif' if label == 0 else 'Positif'
    print(f"  {label_name} ({label}): {count}")


Data training: 5998 samples
Data testing: 1500 samples

Distribusi label training:
  Negatif (0): 3153
  Positif (1): 2845

Distribusi label testing:
  Negatif (0): 788
  Positif (1): 712


# Word2Vec Embedding

In [33]:
sentences_train = X_train_texts.tolist()
print(f"\nJumlah kalimat untuk training Word2Vec: {len(sentences_train)}")

# Parameter Word2Vec
vector_size = 100  # dimensi vektor
window = 5  # jendela konteks
min_count = 2  # minimum frekuensi kata
workers = 4  # jumlah thread
sg = 1  # 1 = skip-gram, 0 = CBOW

print(f"\nParameter Word2Vec:")
print(f"- Vector size: {vector_size}")
print(f"- Window: {window}")
print(f"- Min count: {min_count}")
print(f"- Algorithm: {'Skip-gram' if sg == 1 else 'CBOW'}")


Jumlah kalimat untuk training Word2Vec: 5998

Parameter Word2Vec:
- Vector size: 100
- Window: 5
- Min count: 2
- Algorithm: Skip-gram


In [34]:
w2v_model = Word2Vec(
    sentences=sentences_train,
    vector_size=vector_size,
    window=window,
    min_count=min_count,
    workers=workers,
    sg=sg,
    epochs=10
)

print(f"\nVocabulary size: {len(w2v_model.wv)}")


Vocabulary size: 1595


In [35]:
# Fungsi untuk mengubah kalimat menjadi vektor rata-rata
def sentence_to_vec(sentence, model):
    vectors = []
    for word in sentence:
        if word in model.wv:
            vectors.append(model.wv[word])

    if len(vectors) > 0:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

In [36]:
# Konversi data training ke vektor
print("\nMengkonversi data training ke vektor...")
X_train = np.array([sentence_to_vec(sent, w2v_model) for sent in sentences_train])
print(f"Shape X_train: {X_train.shape}")

# Konversi data testing ke vektor
print("\nMengkonversi data testing ke vektor...")
sentences_test = X_test_texts.tolist()
X_test = np.array([sentence_to_vec(sent, w2v_model) for sent in sentences_test])
print(f"Shape X_test: {X_test.shape}")

print(f"\nShape label y_train: {y_train.shape}")
print(f"Shape label y_test: {y_test.shape}")


Mengkonversi data training ke vektor...
Shape X_train: (5998, 100)

Mengkonversi data testing ke vektor...
Shape X_test: (1500, 100)

Shape label y_train: (5998,)
Shape label y_test: (1500,)


# SVM TRAINING

In [37]:
# Inisialisasi SVM dengan kernel RBF
svm_model = SVC(
    kernel='rbf',
    C=1.0,
    gamma='scale',
    random_state=42,
    verbose=True
)

print("\nParameter SVM:")
print(f"- Kernel: {svm_model.kernel}")
print(f"- C: {svm_model.C}")
print(f"- Gamma: {svm_model.gamma}")

print("\nMulai training SVM...")
svm_model.fit(X_train, y_train)
print("Training selesai!")


Parameter SVM:
- Kernel: rbf
- C: 1.0
- Gamma: scale

Mulai training SVM...
[LibSVM]Training selesai!


In [38]:
# Prediksi
y_pred_train = svm_model.predict(X_train)
y_pred_test = svm_model.predict(X_test)

# Akurasi
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred_test)

print(f"\nAkurasi Training: {train_accuracy:.4f} ({train_accuracy*100:.2f}%)")
print(f"Akurasi Testing: {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")


Akurasi Training: 0.8631 (86.31%)
Akurasi Testing: 0.8687 (86.87%)


In [39]:
print("\n" + "-"*50)
print("CLASSIFICATION REPORT (TEST SET)")
print("-"*50)
target_names = ['Negatif', 'Positif']
print(classification_report(y_test, y_pred_test, target_names=target_names))

# Confusion matrix
print("\n" + "-"*50)
print("CONFUSION MATRIX (TEST SET)")
print("-"*50)
cm = confusion_matrix(y_test, y_pred_test)
print("\n           Predicted")
print("           Negatif  Positif")
print(f"Actual Negatif  {cm[0][0]:6d}  {cm[0][1]:6d}")
print(f"       Positif  {cm[1][0]:6d}  {cm[1][1]:6d}")


--------------------------------------------------
CLASSIFICATION REPORT (TEST SET)
--------------------------------------------------
              precision    recall  f1-score   support

     Negatif       0.85      0.92      0.88       788
     Positif       0.90      0.82      0.86       712

    accuracy                           0.87      1500
   macro avg       0.87      0.87      0.87      1500
weighted avg       0.87      0.87      0.87      1500


--------------------------------------------------
CONFUSION MATRIX (TEST SET)
--------------------------------------------------

           Predicted
           Negatif  Positif
Actual Negatif     722      66
       Positif     131     581


# Testing Model

In [40]:
def predict_sentiment(text, w2v_model, svm_model, stemmer):
    # Preprocessing
    processed = preprocess_text(text)
    tokens = processed.split()

    # Try slang normalization
    try:
        tokens = change(tokens)
    except:
        pass

    # Try stopword removal
    try:
        tokens = stopword_removal(tokens)
    except:
        pass

    # Stemming
    tokens = stemming(tokens)

    # Convert to vector
    vec = sentence_to_vec(tokens, w2v_model)
    vec = vec.reshape(1, -1)

    # Predict
    prediction = svm_model.predict(vec)[0]
    sentiment = 'Positif' if prediction == 1 else 'Negatif'

    return sentiment, tokens

# Contoh testing berdasarkan data asli
test_samples = [
    "Kenapa Indosat ada tuyulnya sekarang pulsa tiba-tiba berkurang",
    "aplikasi ini sangat bagus",
    "Pelayanan bayar bulanan buruk",
]

for i, sample in enumerate(test_samples, 1):
    sentiment, tokens = predict_sentiment(sample, w2v_model, svm_model, stemmer)
    print(f"\n{i}. Text: {sample}")
    print(f"   Tokens: {tokens}")
    print(f"   Prediksi Sentimen: {sentiment}")

print("\n" + "="*50)
print("PROSES SELESAI")
print("="*50)


1. Text: Kenapa Indosat ada tuyulnya sekarang pulsa tiba-tiba berkurang
   Tokens: ['indosat', 'tuyul', 'pulsa', 'kurang']
   Prediksi Sentimen: Negatif

2. Text: aplikasi ini sangat bagus
   Tokens: ['aplikasi', 'bagus']
   Prediksi Sentimen: Positif

3. Text: Pelayanan bayar bulanan buruk
   Tokens: ['layan', 'bayar', 'bulan', 'buruk']
   Prediksi Sentimen: Negatif

PROSES SELESAI


In [41]:
# Simpan model
print("\nMenyimpan model...")
w2v_model.save("word2vec_model.model")
import pickle
with open('svm_model.pkl', 'wb') as f:
    pickle.dump(svm_model, f)

# Simpan juga data preprocessed untuk dashboard
df_under.to_csv('data_preprocessed.csv', index=False)
print("Model dan data berhasil disimpan!")
print("\nFile yang dihasilkan:")
print("- word2vec_model.model")
print("- svm_model.pkl")
print("- data_preprocessed.csv")


Menyimpan model...
Model dan data berhasil disimpan!

File yang dihasilkan:
- word2vec_model.model
- svm_model.pkl
- data_preprocessed.csv
