<a href="https://colab.research.google.com/github/AlbertSanjaya88/AlbertSanjaya88/blob/main/Sentiment_Analysis_from_Google_Play_Reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install google-play-scraper
from google_play_scraper import reviews, app
import pandas as pd

# Ulasan aplikasi
app_package = 'com.shopee.id'  # Ganti dengan ID aplikasi yang diinginkan
result, _ = reviews(
    app_package,
    lang='id',  # Bahasa ulasan
    country='id',  # Negara
    count=10000  # Jumlah ulasan
)

# Simpan ke DataFrame
df = pd.DataFrame(result)
df.to_csv('playstore_reviews.csv', index=False)
print(f"{len(result)} File Saved")

10000 File Saved


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
import re
import nltk  # Import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources used further down: the stopword lists and the
# 'punkt_tab' tokenizer data.
for nltk_resource in ('stopwords', 'punkt_tab'):
    nltk.download(nltk_resource)

# Read the saved reviews CSV back into a DataFrame
file_path = 'playstore_reviews.csv'
df = pd.read_csv(file_path)

# Data preprocessing
def clean_text(text):
    """Normalize a raw review string before tokenization.

    Processing order: drop URLs, replace non-word characters with spaces,
    drop digits, collapse whitespace runs to a single space, lowercase, trim.

    Args:
        text: Raw review text. Any non-string value (e.g. the NaN pandas
            produces for an empty CSV cell, or None) is treated as empty text.

    Returns:
        The cleaned, lowercased string; '' for non-string input.
    """
    if not isinstance(text, str):
        # re.sub would raise TypeError on NaN/None; treat missing text as empty.
        return ''
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'\W', ' ', text)  # Replace non-word characters with spaces
    text = re.sub(r'\d', '', text)  # Remove digits
    # Collapse whitespace only after digits are gone; otherwise removing a
    # standalone number leaves a double space behind.
    text = re.sub(r'\s+', ' ', text)
    return text.lower().strip()

# Apply cleaning. Empty review cells come back from read_csv as NaN (a float),
# which clean_text's regex calls cannot handle, so replace them with '' first.
df['cleaned_review'] = df['content'].fillna('').apply(clean_text)

# Tokenization and stopword removal
# (requires the NLTK 'stopwords' and 'punkt_tab' resources to be downloaded)
stop_words = set(stopwords.words('indonesian'))
df['tokenized_review'] = df['cleaned_review'].apply(
    lambda x: ' '.join([word for word in word_tokenize(x) if word not in stop_words])
)

# Sentiment labeling (example rule: positive = 1 if stars >= 3, negative = 0 if stars < 3)
df['sentiment'] = df['score'].apply(lambda x: 1 if x >= 3 else 0)

# Features: the stopword-filtered review text. Targets: the binary sentiment label.
X, y = df['tokenized_review'], df['sentiment']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified although the labels are imbalanced —
# consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# TF-IDF vectorization: fitted on the training split only, then applied to the
# test split; the vocabulary is capped at 5000 features.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

def _fit_and_predict(model):
    """Fit `model` on the TF-IDF training matrix and return its test-set predictions."""
    model.fit(X_train_tfidf, y_train)
    return model.predict(X_test_tfidf)


def _print_report(heading, preds):
    """Print the heading, the classification report and the accuracy for `preds`."""
    print(heading)
    print(classification_report(y_test, preds))
    print(f"Accuracy: {accuracy_score(y_test, preds)}")


# Naive Bayes model
nb_model = MultinomialNB()
nb_preds = _fit_and_predict(nb_model)

# SVM model with a linear kernel
svm_model = SVC(kernel='linear')
svm_preds = _fit_and_predict(svm_model)

# Evaluation on the held-out test split
_print_report("Naive Bayes Performance", nb_preds)
_print_report("\nSVM Performance", svm_preds)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Naive Bayes Performance
              precision    recall  f1-score   support

           0       0.83      0.46      0.59       372
           1       0.89      0.98      0.93      1628

    accuracy                           0.88      2000
   macro avg       0.86      0.72      0.76      2000
weighted avg       0.88      0.88      0.87      2000

Accuracy: 0.8815

SVM Performance
              precision    recall  f1-score   support

           0       0.75      0.61      0.67       372
           1       0.91      0.95      0.93      1628

    accuracy                           0.89      2000
   macro avg       0.83      0.78      0.80      2000
weighted avg       0.88      0.89      0.88      2000

Accuracy: 0.8895
