# Install dan Import Library

In [33]:
!pip install google-play-scraper nltk tensorflow scikit-learn pandas matplotlib numpy

import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
from google_play_scraper import reviews
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.preprocessing import LabelEncoder

nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Scraping Ulasan dari Google Play Store

In [34]:
from google_play_scraper import reviews, Sort

result, _ = reviews(
    'com.getmimo',
    lang='en',
    country='us',
    sort=Sort.NEWEST,
    count=4000
)

df = pd.DataFrame(result)

df = df[['userName', 'score', 'content']]
df.dropna(inplace=True)

df.to_csv('mimo_reviews.csv', index=False)

print("Scraping selesai! Data tersimpan dalam mimo_reviews.csv")

Scraping selesai! Data tersimpan dalam mimo_reviews.csv


# Preprocessing Data

In [35]:
nltk.download('vader_lexicon')
nltk.download('stopwords')

sia = SentimentIntensityAnalyzer()

def label_sentiment(text):
    score = sia.polarity_scores(text)['compound']
    if score >= 0.05:
        return "Positive"
    elif score <= -0.05:
        return "Negative"
    else:
        return "Neutral"

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'\W', ' ', text)
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [36]:
df = pd.read_csv("/content/mimo_reviews.csv")

print("Proses Cleaning Text...")
df['cleaned_text'] = df['content'].apply(clean_text)

df['cleaned_text'] = df['cleaned_text'].astype(str)
df = df[df['cleaned_text'].str.strip() != '']

print("Proses Sentiment Analysis...")
df['sentiment'] = df['cleaned_text'].apply(label_sentiment)

Proses Cleaning Text...
Proses Sentiment Analysis...


In [37]:
label_mapping = {"Positive": 1, "Neutral": 0, "Negative": -1}
df['sentiment_encoded'] = df['sentiment'].map(label_mapping)

print("Proses Tokenisasi...")
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['cleaned_text'])
df['tokenized'] = tokenizer.texts_to_sequences(df['cleaned_text'])

df.to_csv("mimo_reviews_cleaned.csv", index=False)
print("Preprocessing selesai!")

df.head()

Proses Tokenisasi...
Preprocessing selesai!


Unnamed: 0,userName,score,content,cleaned_text,sentiment,sentiment_encoded,tokenized
0,A Google user,5,nice,nice,Positive,1,[12]
1,A Google user,3,it's really helpful application. We can enhanc...,really helpful application enhance skills smoo...,Positive,1,"[11, 24, 35, 957, 104, 471, 16]"
2,A Google user,4,Ok,ok,Positive,1,[274]
3,A Google user,4,"So far, I'm captivated. Seems like a great pla...",far captivated seems like great place begin le...,Positive,1,"[21, 1746, 147, 9, 6, 261, 522, 5, 59, 1210, 1..."
4,A Google user,5,I even just did not know anything about html c...,even know anything html coding etc using mimo ...,Positive,1,"[33, 39, 198, 40, 3, 313, 50, 10, 208, 32, 26,..."


#  Tokenisasi & Padding

In [38]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(df['cleaned_text'])

sequences = tokenizer.texts_to_sequences(df['cleaned_text'])
X = pad_sequences(sequences, maxlen=100, padding='post', truncating='post')

encoder = LabelEncoder()
y = encoder.fit_transform(df['sentiment'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

vocab_size = len(tokenizer.word_index) + 1

# Membangun Model LSTM

In [39]:
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=100, input_length=100),
    LSTM(128, return_sequences=True),
    Dropout(0.3),
    LSTM(64),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(3, activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test))




Epoch 1/5
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 352ms/step - accuracy: 0.8571 - loss: 0.5541 - val_accuracy: 0.8613 - val_loss: 0.4852
Epoch 2/5
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 264ms/step - accuracy: 0.8726 - loss: 0.4696 - val_accuracy: 0.8613 - val_loss: 0.4917
Epoch 3/5
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 259ms/step - accuracy: 0.8677 - loss: 0.4802 - val_accuracy: 0.8613 - val_loss: 0.4863
Epoch 4/5
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 265ms/step - accuracy: 0.8713 - loss: 0.4621 - val_accuracy: 0.8613 - val_loss: 0.4853
Epoch 5/5
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 274ms/step - accuracy: 0.8648 - loss: 0.4769 - val_accuracy: 0.8613 - val_loss: 0.4882


# Evaluasi Model

In [40]:
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)

accuracy = accuracy_score(y_test, y_pred_classes)
report = classification_report(y_test, y_pred_classes, target_names=encoder.classes_)

print(f"Akurasi Model: {accuracy:.4f}")
print(report)

[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 78ms/step
Akurasi Model: 0.8613
              precision    recall  f1-score   support

    Negative       0.00      0.00      0.00        30
     Neutral       0.00      0.00      0.00        80
    Positive       0.86      1.00      0.93       683

    accuracy                           0.86       793
   macro avg       0.29      0.33      0.31       793
weighted avg       0.74      0.86      0.80       793



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Kesimpulan

In [41]:
print("Kesimpulan Proyek Analisis Sentimen ")
print("-"*50)

print(f"Akurasi Model pada Testing Set: {accuracy:.4f}")

from sklearn.metrics import precision_score, recall_score, f1_score

precision = precision_score(y_test, y_pred_classes, average='weighted')
recall = recall_score(y_test, y_pred_classes, average='weighted')
f1 = f1_score(y_test, y_pred_classes, average='weighted')

print(f"Precision : {precision:.4f}")
print(f"Recall    : {recall:.4f}")
print(f"F1-Score  : {f1:.4f}")

if accuracy >= 0.85:
    print("\nModel Memenuhi Standar")
else:
    print("\nModel Belum Memenuhi Standar")

Kesimpulan Proyek Analisis Sentimen 
--------------------------------------------------
Akurasi Model pada Testing Set: 0.8613
Precision : 0.7418
Recall    : 0.8613
F1-Score  : 0.7971

Model Memenuhi Standar


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
