# Sentiment analysis of the Signal app

In [94]:
pip install Sastrawi

Note: you may need to restart the kernel to use updated packages.


## Import Library

In [95]:
import pandas as pd
pd.options.mode.chained_assignment = None
import re
import string
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import csv
import requests
from io import StringIO
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aininurpadilah/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aininurpadilah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [96]:
# load dataset
app_reviews_df = pd.read_csv('reviews_signal.csv')
app_reviews_df.head()

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion
0,4fafbc04-ea97-455e-a287-33b937b31c3b,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,Tampilan aplikasi sudah bagus dan enak dipanda...,2,15,1.5.12,2025-01-28 09:15:05,,,1.5.12
1,7fc0d4c4-eb64-4475-b67e-91431fa9ad3d,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,Dulu mantap. Sekarang sampah. Maaf ya min. Mau...,1,37,1.5.12,2025-02-02 15:47:59,"Hai kak, terima kasih atas ulasannya. Terus gu...",2022-04-18 16:38:53,1.5.12
2,f5d168c3-84e8-456c-b042-30aad0d1b0ee,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,Tidak ada opsi utk memilih pembayaran dgn bank...,3,20,1.5.12,2025-01-11 15:53:53,,,1.5.12
3,cfb09d3b-67ea-4072-a5bb-43e60203f2e2,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,Mau TAMBAH kendaraan aja ga bisa terus muncul ...,1,34,1.5.12,2025-01-17 07:20:51,,,1.5.12
4,bc5b9102-b2b5-4125-a762-e7349dcb2e1a,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,ketika harus ada maintenace atau gangguan haru...,1,14,1.5.12,2025-01-16 12:13:59,,,1.5.12


In [97]:
# dataset information
app_reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48794 entries, 0 to 48793
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   reviewId              48794 non-null  object
 1   userName              48794 non-null  object
 2   userImage             48794 non-null  object
 3   content               48794 non-null  object
 4   score                 48794 non-null  int64 
 5   thumbsUpCount         48794 non-null  int64 
 6   reviewCreatedVersion  43729 non-null  object
 7   at                    48794 non-null  object
 8   replyContent          10645 non-null  object
 9   repliedAt             10645 non-null  object
 10  appVersion            43729 non-null  object
dtypes: int64(2), object(9)
memory usage: 4.1+ MB


In [98]:
# clean dataset
clean_df = app_reviews_df.dropna()
clean_df = clean_df.drop_duplicates()
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9750 entries, 1 to 47821
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   reviewId              9750 non-null   object
 1   userName              9750 non-null   object
 2   userImage             9750 non-null   object
 3   content               9750 non-null   object
 4   score                 9750 non-null   int64 
 5   thumbsUpCount         9750 non-null   int64 
 6   reviewCreatedVersion  9750 non-null   object
 7   at                    9750 non-null   object
 8   replyContent          9750 non-null   object
 9   repliedAt             9750 non-null   object
 10  appVersion            9750 non-null   object
dtypes: int64(2), object(9)
memory usage: 914.1+ KB


### Text Preprocess Functions

In [99]:
def cleaningText(text):
    text = re.sub(r'@[A-Za-z0-9]+', '', text) # menghapus mention
    text = re.sub(r'#[A-Za-z0-9]+', '', text) # menghapus hashtag
    text = re.sub(r'RT[\s]', '', text) # menghapus RT
    text = re.sub(r"http\S+", '', text) # menghapus link
    text = re.sub(r'[0-9]+', '', text) # menghapus angka
    text = re.sub(r'[^\w\s]', '', text) # menghapus karakter selain huruf dan angka

    text = text.replace('\n', ' ') # mengganti baris baru dengan spasi
    text = text.translate(str.maketrans('', '', string.punctuation)) # menghapus semua tanda baca
    text = text.strip(' ') # menghapus karakter spasi dari kiri dan kanan teks
    return text

def casefoldingText(text): # Mengubah semua karakter dalam teks menjadi huruf kecil
    text = text.lower()
    return text

def tokenizingText(text): # Memecah atau membagi string, teks menjadi daftar token
    text = word_tokenize(text)
    return text

def filteringText(text): # Menghapus stopwords dalam teks
    listStopwords = set(stopwords.words('indonesian'))
    listStopwords1 = set(stopwords.words('english'))
    listStopwords.update(listStopwords1)
    listStopwords.update([listStopwords.update(['aplikasi', 'aja', 'dong', 'tolong', 'nih', 'kayak', 'bisa', 'ga', 'gak', 'nya', 'deh', 'dong', 'min', 'sih', 'kan', 'loh', 'yah', 'ya'])
    ])
    filtered = []
    for txt in text:
        if txt not in listStopwords:
            filtered.append(txt)
    text = filtered
    return text

def stemmingText(text): # Mengurangi kata ke bentuk dasarnya yang menghilangkan imbuhan awalan dan akhiran atau ke akar kata
    # Membuat objek stemmer
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()

    # Memecah teks menjadi daftar kata
    words = text.split()

    # Menerapkan stemming pada setiap kata dalam daftar
    stemmed_words = [stemmer.stem(word) for word in words]

    # Menggabungkan kata-kata yang telah distem
    stemmed_text = ' '.join(stemmed_words)

    return stemmed_text

def toSentence(list_words): # Mengubah daftar kata menjadi kalimat
    sentence = ' '.join(word for word in list_words)
    return sentence

### Indonesian Slangwords

In [100]:
slangwords = {
    "@": "di",
    "abis": "habis",
    "wtb": "beli",
    "masi": "masih",
    "wts": "jual",
    "wtt": "tukar",
    "bgt": "banget",
    "maks": "maksimal",
    "sampe": "sampai",
    "tdk": "tidak",
    "krn": "karena",
    "dgn": "dengan",
    "sblm": "sebelum",
    "udh": "sudah",
    "blm": "belum",
    "trs": "terus",
    "lg": "lagi",
    "gpp": "tidak apa-apa",
    "tp": "tapi",
    "msh": "masih",
    "kyk": "kayak",
    "jg": "juga",
    "dr": "dari",
    "sm": "sama",
    "qta": "kita",
    "gak": "tidak",
    "ga": "tidak",
    "bsk": "besok",
    "kpn": "kapan",
    "kmrn": "kemarin",
    "td": "tadi",
    "sy": "saya",
    "gw": "saya",
    "gue": "saya",
    "loe": "kamu",
    "lu": "kamu",
    "knp": "kenapa",
    "brb": "sebentar",
    "btw": "ngomong-ngomong",
    "dmn": "dimana",
    "mlm": "malam",
    "pg": "pagi",
    "cm": "cuma"
}
def fix_slangwords(text):
    words = text.split()
    fixed_words = []

    for word in words:
        if word.lower() in slangwords:
            fixed_words.append(slangwords[word.lower()])
        else:
            fixed_words.append(word)

    fixed_text = ' '.join(fixed_words)
    return fixed_text


In [101]:
clean_df

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion
1,7fc0d4c4-eb64-4475-b67e-91431fa9ad3d,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,Dulu mantap. Sekarang sampah. Maaf ya min. Mau...,1,37,1.5.12,2025-02-02 15:47:59,"Hai kak, terima kasih atas ulasannya. Terus gu...",2022-04-18 16:38:53,1.5.12
6,5bf20a77-7c33-464b-ad95-6214b0c01eac,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,asli dan semua jadi lebih mudah.. tinggal dudu...,2,58,1.5.12,2025-01-07 11:42:54,"Hai kak, terima kasih atas ulasannya. Terus gu...",2023-01-16 09:05:24,1.5.12
136,9409d8cd-49be-4cfc-8d98-9acb896aeb49,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,aplikasi yang sangat susah dalam proses pendaf...,4,46,1.5.12,2025-01-14 11:57:57,"Hai, kami mohon maaf atas ketidaknyamanannya. ...",2022-08-08 13:43:46,1.5.12
207,4f688ed7-1d25-4363-bc92-8b5e524bd13d,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,Kenapa aplikasi signal dari tgl 5 sampe tangga...,5,0,1.5.12,2025-01-09 18:24:05,"Hai kak, terima kasih atas respon positifnya. ...",2024-01-18 15:07:26,1.5.12
462,bb193c27-3235-48e1-a47e-98957264ae15,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,"Dulu bisa dan lancar, tetapi setelah ganti kal...",2,0,1.5.12,2025-02-05 13:26:52,"Hai kak, terima kasih atas ulasannya. Terus gu...",2023-01-06 15:12:49,1.5.12
...,...,...,...,...,...,...,...,...,...,...,...
47816,d5f5db72-7949-490b-92b6-3be713f58c1e,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,Mantap,5,0,1.2.4,2021-07-31 07:29:14,"Hai, terima kasih atas ulasannya :) Terus guna...",2021-08-03 17:41:35,1.2.4
47817,3c2adf77-77a1-46ff-8ad0-039c06d9955e,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,Bagus,5,0,1.2.4,2021-08-09 08:58:18,"Hai Febri, terima kasih atas ulasannya :) Teru...",2021-08-10 18:03:05,1.2.4
47818,a544355d-7028-42e8-b1ce-f7fc391124f7,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,Bermanfaat,5,0,1.2.4,2021-08-05 19:24:13,"Hai, terima kasih atas ulasannya :) Terus guna...",2021-08-04 18:46:49,1.2.4
47820,c860e4e0-496b-434e-ad2b-ee722c55bc1a,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,ngemudahin banget,5,0,1.2.4,2021-08-26 15:09:25,"Hai, terima kasih atas ulasannya :) Terus guna...",2021-08-27 10:55:46,1.2.4


### text preprocess

In [102]:
# Membersihkan teks dan menyimpannya di kolom 'text_clean'
clean_df['text_clean'] = clean_df['content'].apply(cleaningText)

# Mengubah huruf dalam teks menjadi huruf kecil dan menyimpannya di 'text_casefoldingText'
clean_df['text_casefoldingText'] = clean_df['text_clean'].apply(casefoldingText)

# Mengganti kata-kata slang dengan kata-kata standar dan menyimpannya di 'text_slangwords'
clean_df['text_slangwords'] = clean_df['text_casefoldingText'].apply(fix_slangwords)

# Memecah teks menjadi token (kata-kata) dan menyimpannya di 'text_tokenizingText'
clean_df['text_tokenizingText'] = clean_df['text_slangwords'].apply(tokenizingText)

# Menghapus kata-kata stop (kata-kata umum) dan menyimpannya di 'text_stopword'
clean_df['text_stopword'] = clean_df['text_tokenizingText'].apply(filteringText)

# Menggabungkan token-token menjadi kalimat dan menyimpannya di 'text_akhir'
clean_df['text_akhir'] = clean_df['text_stopword'].apply(toSentence)

### Fetch Positive and Negative Words Dictionary

In [103]:
lexicon_positive = dict()

response = requests.get('https://raw.githubusercontent.com/angelmetanosaa/dataset/main/lexicon_positive.csv')

if response.status_code == 200:
    reader = csv.reader(StringIO(response.text), delimiter=',')
    for row in reader:
        lexicon_positive[row[0]] = int(row[1])
else:
    print("Failed to fetch positive lexicon data")

lexicon_negative = dict()

response = requests.get('https://raw.githubusercontent.com/angelmetanosaa/dataset/main/lexicon_negative.csv')

if response.status_code == 200:
    reader = csv.reader(StringIO(response.text), delimiter=',')
    for row in reader:
        lexicon_negative[row[0]] = int(row[1])
else:
    print("Failed to fetch negative lexicon data")

### Sentiment Analysis Function

In [104]:
def sentiment_analysis(text):
    score = 0

    for word in text:
        if word in lexicon_positive:
            score += lexicon_positive[word]

    for word in text:
        if word in lexicon_negative:
            score += lexicon_negative[word]

    if score > 0:
        polarity = 'positive'
    elif score < 0:
        polarity = 'negative'
    else:
        polarity = 'neutral'

    return score, polarity

### Perform data labeling

In [105]:
results = clean_df['text_stopword'].apply(sentiment_analysis)
results = list(zip(*results))
clean_df['polarity_score'] = results[0]
clean_df['polarity'] = results[1]
print(clean_df['polarity'].value_counts())

polarity
positive    4349
negative    3806
neutral     1595
Name: count, dtype: int64


In [106]:
print(clean_df['polarity'].value_counts())

from sklearn.utils import resample

df_majority = clean_df[clean_df['polarity'] == 'positive']
df_minority_neg = clean_df[clean_df['polarity'] == 'negative']
df_minority_neu = clean_df[clean_df['polarity'] == 'neutral']

df_minority_neg_upsampled = resample(df_minority_neg,
                                     replace=True,
                                     n_samples=len(df_majority),
                                     random_state=42)

df_minority_neu_upsampled = resample(df_minority_neu,
                                     replace=True,
                                     n_samples=len(df_majority),
                                     random_state=42)

clean_df = pd.concat([df_majority, df_minority_neg_upsampled, df_minority_neu_upsampled])
print(clean_df['polarity'].value_counts())

polarity
positive    4349
negative    3806
neutral     1595
Name: count, dtype: int64
polarity
positive    4349
negative    4349
neutral     4349
Name: count, dtype: int64


### Data Extraction: TF-IDF

In [107]:
X = clean_df['text_akhir']
y = clean_df['polarity']

tfidf = TfidfVectorizer(max_features=5000, min_df=5, max_df=0.8, ngram_range=(1,3))
X_tfidf = tfidf.fit_transform(X)

tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=tfidf.get_feature_names_out())

tfidf_df

Unnamed: 0,aamiin,acc,account,ad,adakah,adany,admin,admin rp,admin signal,administrasi,...,yg terimapadahal,yg terimapadahal pake,yg terkait,yg udah,yg urus,yg yg,yng,youtube,zaman,zonk
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13042,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13043,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13044,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13045,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Feature Extraction

### Data Extraction: Word2Vec

In [108]:
X2 = clean_df['text_akhir'].apply(lambda x: x.split())

w2v_model = Word2Vec(sentences=X2, vector_size=100, window=5, min_count=2, workers=4)

def document_vector(w2v_model, doc):
    """Create document vectors by averaging the word vectors in the document"""
    doc = [word for word in doc if word in w2v_model.wv]
    if len(doc) == 0:
        return np.zeros(w2v_model.vector_size)
    return np.mean([w2v_model.wv[word] for word in doc], axis=0)

X_w2v = np.array([document_vector(w2v_model, doc) for doc in X2])

w2v_df = pd.DataFrame(X_w2v)

w2v_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.332504,0.172245,-0.116238,0.094762,-0.049907,-0.788415,0.094166,0.636442,-0.264668,-0.075670,...,0.386844,0.352235,0.041700,0.145713,0.755200,0.211378,0.181323,-0.343293,-0.139336,-0.316692
1,-0.589436,0.263487,-0.232126,0.173135,-0.038550,-1.217594,0.178163,0.976128,-0.433724,-0.089417,...,0.555034,0.582209,0.080897,0.221712,1.192142,0.329190,0.339222,-0.478558,-0.244708,-0.572417
2,-0.345709,0.153301,-0.141187,0.119342,-0.083962,-0.749887,0.090132,0.594755,-0.260055,-0.034043,...,0.347890,0.355982,0.014971,0.158488,0.716392,0.183754,0.217741,-0.293098,-0.167957,-0.323338
3,-0.432365,0.194523,-0.215070,0.132175,-0.079571,-0.929174,0.111993,0.738147,-0.323012,-0.022582,...,0.390612,0.463735,0.034536,0.160252,0.880091,0.239239,0.249958,-0.348369,-0.183390,-0.427813
4,-0.279774,0.139654,-0.120850,0.089880,-0.102853,-0.678232,0.051077,0.522569,-0.222429,-0.037328,...,0.314768,0.314552,0.003595,0.139178,0.617549,0.164465,0.165511,-0.268333,-0.132418,-0.258280
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13042,-0.006089,0.004384,0.008655,0.009733,0.002660,-0.008338,-0.007564,-0.002700,-0.000516,-0.008517,...,0.009356,-0.004861,0.008112,0.009054,-0.008368,-0.002677,-0.008360,-0.008883,-0.008379,-0.001913
13043,-0.523359,0.305356,-0.149150,0.089477,0.042632,-1.293895,0.192093,1.063071,-0.363038,-0.233954,...,0.607264,0.589680,0.125108,0.160800,1.162051,0.376272,0.150795,-0.588625,-0.130647,-0.488587
13044,-0.689724,0.270339,-0.320044,0.261136,-0.002876,-1.259431,0.216002,1.008751,-0.477269,0.005765,...,0.484293,0.657897,0.106884,0.249119,1.301834,0.359773,0.435488,-0.426840,-0.326925,-0.675156
13045,0.007902,0.001804,-0.001760,0.008787,0.006511,-0.007556,0.006201,-0.008532,-0.004208,0.001095,...,-0.008522,0.005386,0.007392,-0.000901,-0.008288,-0.008883,-0.007196,0.006209,-0.007223,0.008165


### Split Dataset

In [109]:
# TF-IDF 80/20
tfidf_X_train1, tfidf_X_test1, tfidf_y_train1, tfidf_y_test1 = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Word2Vec 80/20
w2v_X_train, w2v_X_test, w2v_y_train, w2v_y_test = train_test_split(X_w2v, y, test_size=0.2, random_state=42)

# TF-IDF 70/30
tfidf_X_train2, tfidf_X_test2, tfidf_y_train2, tfidf_y_test2 = train_test_split(X_tfidf, y, test_size=0.3, random_state=42)

### Random Forest | TF-IDF | 70/30

In [110]:
random_forest_tfidf = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest_tfidf.fit(tfidf_X_train2, tfidf_y_train2)

y_pred_train_rf = random_forest_tfidf.predict(tfidf_X_train2)
accuracy_train_rf = accuracy_score(tfidf_y_train2, y_pred_train_rf)

y_pred_test_rf = random_forest_tfidf.predict(tfidf_X_test2)
accuracy_test_rf = accuracy_score(tfidf_y_test2, y_pred_test_rf)

print('Random Forest - TF-IDF 70/30')
print(f'Accuracy Train: {accuracy_train_rf * 100:.2f}%')
print(f'Accuracy Test: {accuracy_test_rf * 100:.2f}%')
print(classification_report(tfidf_y_test2, y_pred_test_rf))

Random Forest - TF-IDF 70/30
Accuracy Train: 99.93%
Accuracy Test: 91.34%
              precision    recall  f1-score   support

    negative       0.88      0.93      0.91      1309
     neutral       0.91      0.97      0.94      1266
    positive       0.95      0.84      0.89      1340

    accuracy                           0.91      3915
   macro avg       0.92      0.91      0.91      3915
weighted avg       0.92      0.91      0.91      3915



### Random Forest |  Word2Vec | 80/20

In [111]:
random_forest_w2v = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest_w2v.fit(w2v_X_train, w2v_y_train)

y_pred_train_rf = random_forest_w2v.predict(w2v_X_train)
accuracy_train_rf = accuracy_score(w2v_y_train, y_pred_train_rf)

y_pred_test_rf = random_forest_w2v.predict(w2v_X_test)
accuracy_test_rf = accuracy_score(w2v_y_test, y_pred_test_rf)

print('Random Forest - Word2Vec 80/20')
print(f'Accuracy Train: {accuracy_train_rf * 100:.2f}%')
print(f'Accuracy Test: {accuracy_test_rf * 100:.2f}%')
print(classification_report(w2v_y_test, y_pred_test_rf))

Random Forest - Word2Vec 80/20
Accuracy Train: 99.98%
Accuracy Test: 87.09%
              precision    recall  f1-score   support

    negative       0.80      0.89      0.84       845
     neutral       0.95      0.96      0.95       876
    positive       0.87      0.76      0.81       889

    accuracy                           0.87      2610
   macro avg       0.87      0.87      0.87      2610
weighted avg       0.87      0.87      0.87      2610



### SVM | TF-IDF | 80/20

In [112]:
svm_model = SVC(kernel='linear')
svm_model.fit(tfidf_X_train1, tfidf_y_train1)

y_pred_train_svm = svm_model.predict(tfidf_X_train1)
accuracy_train_svm = accuracy_score(tfidf_y_train1, y_pred_train_svm)

y_pred_test_svm = svm_model.predict(tfidf_X_test1)
accuracy_test_svm = accuracy_score(tfidf_y_test1, y_pred_test_svm)

print('Support Vector Machine - TF-IDF 80/20')
print(f'Accuracy Train: {accuracy_train_svm * 100:.2f}%')
print(f'Accuracy Test: {accuracy_test_svm * 100:.2f}%')
print(classification_report(tfidf_y_test1, y_pred_test_svm))

Support Vector Machine - TF-IDF 80/20
Accuracy Train: 96.89%
Accuracy Test: 92.15%
              precision    recall  f1-score   support

    negative       0.92      0.94      0.93       845
     neutral       0.91      0.95      0.93       876
    positive       0.94      0.88      0.91       889

    accuracy                           0.92      2610
   macro avg       0.92      0.92      0.92      2610
weighted avg       0.92      0.92      0.92      2610



### Inference

In [113]:
# SVM TF-IDF
def predict_svm(text):
    cleaned = cleaningText(text)
    cleaned = casefoldingText(cleaned)
    cleaned = fix_slangwords(cleaned)
    tokens = tokenizingText(cleaned)
    filtered = filteringText(tokens)
    final_text = toSentence(filtered)
    vector = tfidf.transform([final_text])
    prediction = svm_model.predict(vector)
    return prediction[0]

# RF TF-IDF
def predict_rf_tfidf(text):
    cleaned = cleaningText(text)
    cleaned = casefoldingText(cleaned)
    cleaned = fix_slangwords(cleaned)
    tokens = tokenizingText(cleaned)
    filtered = filteringText(tokens)
    final_text = toSentence(filtered)
    vector = tfidf.transform([final_text])
    prediction = random_forest_tfidf.predict(vector)
    return prediction[0]

# RF Word2Vec
def predict_rf_word2vec(text):
    cleaned = cleaningText(text)
    cleaned = casefoldingText(cleaned)
    cleaned = fix_slangwords(cleaned)
    tokens = tokenizingText(cleaned)
    filtered = filteringText(tokens)
    final_text = toSentence(filtered)
    tokens = final_text.split()
    vector = document_vector(w2v_model, tokens).reshape(1, -1)
    prediction = random_forest_w2v.predict(vector)
    return prediction[0]

### Predict

In [114]:
sample_text = "Sangat dimudahkan dengan adanya aplikasi Caranya mudah dan cepat"

# SVM TF-IDF
svm_prediction = predict_svm(sample_text)
print(f"SVM Prediction: {svm_prediction}")

# Random Forest TF-IDF
rf_tfidf_prediction = predict_rf_tfidf(sample_text)
print(f"Random Forest Prediction (TF-IDF): {rf_tfidf_prediction}")

# Random Forest Word2Vec
rf_word2vec_prediction = predict_rf_word2vec(sample_text)
print(f"Random Forest Prediction (Word2Vec): {rf_word2vec_prediction}")

SVM Prediction: positive
Random Forest Prediction (TF-IDF): positive
Random Forest Prediction (Word2Vec): positive
