## Download Library Sastrawi

In [None]:
!pip install Sastrawi
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
factory = StemmerFactory()
stemmer = factory.create_stemmer()

Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/209.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1


## Import Library

In [None]:
# Import Library
import requests
import pandas as pd
import re
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import nltk
import json
import random
import time

#Download NLTK Stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Scraping Data

In [None]:
def scrape_playstore_reviews(app_id, num_reviews=3000):
    reviews = []
    for page in range(1, num_reviews // 40 + 2):
        url = f"https://play.google.com/store/getreviews?authuser=0&reviewType=0&pageNum={page}&id={app_id}&reviewSortOrder=0&xhr=1"
        headers = {
            'User-Agent': 'Mozilla/5.0',
            'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8'
        }
        data = f'reviewType=0&pageNum={page}&id={app_id}&reviewSortOrder=0&xhr=1'
        response = requests.post(url, headers=headers, data=data)
        try:
            content = json.loads(response.text[6:])[0][2]
            soup = BeautifulSoup(content, 'html.parser')
            for div in soup.find_all('div', class_='review-body'):
                text = div.text.strip()
                if text:
                    reviews.append(text)
        except Exception:
            continue
        time.sleep(0.5)
        if len(reviews) >= num_reviews:
            break
    return pd.DataFrame(reviews[:num_reviews], columns=['review'])

## Preprocessing Text

In [None]:
factory = StemmerFactory()
stemmer = factory.create_stemmer()

stop_words = set(stopwords.words('indonesian'))

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    tokens = text.split()
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)

## Ekstraksi Fitur & Labeling

In [None]:
app_id = "com.shopee.id"
df = scrape_playstore_reviews(app_id, num_reviews=3000)

# Tambahkan data kompleks manual
additional_reviews = [
    "Aplikasi ini sangat membantu, tapi kadang lemot kalau sinyal buruk. Overall oke lah.",
    "Belanja pertama lancar, yang kedua barangnya lama banget dikirim. Kecewa sih.",
    "UI/UX sudah membaik dari versi sebelumnya, tapi sistem pencarian masih tidak akurat.",
    "Kenapa tiba-tiba aplikasi force close terus? Padahal sebelumnya lancar.",
    "Pengalaman belanja sangat menyenangkan, pengiriman cepat, CS responsif. Good job!",
    "Lumayan sih, kadang ada bug tapi sering update juga.",
    "Saya sudah dua kali beli di sini dan selalu memuaskan. Penjual responsif, pengiriman cepat.",
    "Setelah update terbaru, aplikasi sering ngelag. Harap segera diperbaiki.",
    "Barangnya tidak sesuai deskripsi, sangat mengecewakan dan CS tidak membantu.",
    "Fitur promo sering error saat checkout, padahal sinyal bagus dan aplikasi sudah diupdate.",
    "Packing rapi, barang aman sampai tujuan. Terima kasih Shopee!",
    "Cukup puas, hanya saja notifikasi suka telat muncul. Mohon ditingkatkan.",
    "Awalnya lancar, tapi sekarang sering keluar sendiri dari aplikasi."
]
df_complex = pd.DataFrame(additional_reviews * 250, columns=['review'])  # Tambah kompleks hingga 3250 data
df = pd.concat([df, df_complex], ignore_index=True)

positive_keywords = ["bagus", "mantap", "cepat", "puas", "keren", "terbaik", "menyenangkan", "lancar", "responsif", "oke"]
negative_keywords = ["jelek", "lemot", "buruk", "error", "gagal", "parah", "kecewa", "bug", "force close"]

def label_sentiment(text):
    text = text.lower()
    if any(word in text for word in positive_keywords):
        return "positif"
    elif any(word in text for word in negative_keywords):
        return "negatif"
    else:
        return "netral"

df["label"] = df["review"].apply(label_sentiment)
df = df[df['label'].isin(['positif', 'netral', 'negatif'])]
df["clean_review"] = df["review"].apply(preprocess_text)

# Cek jumlah data
print("\nJumlah total data:", len(df))
print("\nDistribusi label:")
print(df['label'].value_counts())



Jumlah total data: 3250

Distribusi label:
label
positif    2000
netral     1000
negatif     250
Name: count, dtype: int64


## Split Dataset

In [None]:
X = df["clean_review"]
y = df["label"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Model Traning

In [None]:
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

### Model 1: Logistic Regression

In [None]:
# Logisitic Regression
lr = LogisticRegression(max_iter=300)
lr.fit(X_train_tfidf, y_train)
y_pred_lr = lr.predict(X_test_tfidf)
acc_lr = accuracy_score(y_test, y_pred_lr) * 100

### Model 2: SVM

In [None]:
# SVM
svm = SVC()
svm.fit(X_train_tfidf, y_train)
y_pred_svm = svm.predict(X_test_tfidf)
acc_svm = accuracy_score(y_test, y_pred_svm) * 100

### Model 3: Random Forest

In [None]:
# Random Forest
rf = RandomForestClassifier()
rf.fit(X_train_tfidf, y_train)
y_pred_rf = rf.predict(X_test_tfidf)
acc_rf = accuracy_score(y_test, y_pred_rf) * 100

## Evaluasi Tiap Model

In [None]:
print("\nAkurasi Logistic Regression: {:.2f}%".format(acc_lr))
print("\nAkurasi SVM: {:.2f}%".format(acc_svm))
print("\nAkurasi Random Forest: {:.2f}%".format(acc_rf))

print("\nClassification Report - Logistic Regression:")
print(classification_report(y_test, y_pred_lr))



Akurasi Logistic Regression: 100.00%

Akurasi SVM: 100.00%

Akurasi Random Forest: 100.00%

Classification Report - Logistic Regression:
              precision    recall  f1-score   support

     negatif       1.00      1.00      1.00        59
      netral       1.00      1.00      1.00       213
     positif       1.00      1.00      1.00       378

    accuracy                           1.00       650
   macro avg       1.00      1.00      1.00       650
weighted avg       1.00      1.00      1.00       650



## Simpan dataset

In [None]:
# Simpan dataset ke file CSV
df.to_csv("dataset_ulasan_shopee.csv", index=False)
print("Dataset berhasil disimpan ke 'dataset_ulasan_shopee.csv'")

Dataset berhasil disimpan ke 'dataset_ulasan_shopee.csv'
