In [20]:
import tensorflow as tf
import tensorflow_hub as tf_hub
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import phik
import re
import ast
import nltk

# Pre-processing & Feature engineering
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.impute import SimpleImputer
from tensorflow.keras.utils import plot_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, TextVectorization, Reshape
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from wordcloud import STOPWORDS
from wordcloud import WordCloud

# Modelling
from tensorflow import keras
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.layers import Input, LSTM, Dropout, Dense, Bidirectional
from tensorflow.keras.models import Model, Sequential
from keras.callbacks import EarlyStopping

# Save Model
import joblib

import warnings
warnings.filterwarnings("ignore")

from tensorflow.keras.models import load_model

from selenium import webdriver
from selenium.webdriver.common.by import By
from time import sleep
from bs4 import BeautifulSoup
import pandas as pd
from selenium.common.exceptions import NoSuchElementException, TimeoutException


In [11]:
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\MSI\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\MSI\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\MSI\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     C:\Users\MSI\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\MSI\AppData\Roaming\nltk_data...
[nltk_data]

True

In [12]:
# Define Stopwords
stpwds_id = list(set(stopwords.words('indonesian')))

# Define Stemming
stemmer = StemmerFactory().create_stemmer()

# Open slangwords_indonesian.txt
with open('slangwords_indonesian.txt') as f:
    data = f.read()

slangwords_indonesian =  ast.literal_eval(data)

In [13]:
# Membuat suatu fungsi yang berisi full preprocessing step
def text_preprocessing(text, slangwords_indonesian):
    # Fungsi untuk mengubah teks menjadi huruf kecil
    def lower(text):
        return text.lower()

    # Fungsi untuk mengganti abbreviation
    def check_slang(text):
        temp = []
        for slang in text.split():
            if slang in slangwords_indonesian:
                temp.append(slangwords_indonesian[slang])
            else:
                temp.append(slang)
        return " ".join(temp)

    # Fungsi untuk menghapus tanda baca, newlines, dan whitespace ekstra
    def check_punctuation(text):
        # Non-letter removal (seperti emoticon, symbol (seperti μ, $, 兀), dan lain-lain
        text = re.sub("[^a-zA-Z]", ' ', text)
        # Hashtags removal
        text = re.sub("#[A-Za-z0-9_]+", " ", text)
        # Mention removal
        text = re.sub("@[A-Za-z0-9_]+", " ", text)
        # Menghapus teks yang ada di dalam tanda kurung siku ([...]) dalam document
        text = re.sub('\[[^]]*\]', ' ', text)
        # Mengganti setiap baris baru (newline) dengan spasi
        text = re.sub(r"\\n", " ", text)
        # Menghapus whitespace ekstra di awal dan akhir token
        text = text.strip()
        # Menghapus spasi berlebih di antara kata-kata (hanya menyisakan satu spasi antar kata)
        text = ' '.join(text.split())
        return text

    # Fungsi untuk tokenisasi, menghapus stopwords, dan stemming
    def token_stopwords_stem(text):
        # Tokenization
        tokens = word_tokenize(text)
        # Stopwords removal
        tokens = [word for word in tokens if word not in stpwds_id]
        # Stemming
        tokens = [stemmer.stem(word) for word in tokens]
        # Combining Tokens
        text = ' '.join(tokens)  # Menggunakan 'text' untuk menggabungkan kembali tokens
        return text

    # Proses Preprocessing
    text = lower(text)
    text = check_slang(text)
    text = check_punctuation(text)
    text = token_stopwords_stem(text)
    
    return text

In [14]:
final_model = tf.keras.models.load_model('nlp_model')





In [25]:
def scrape_reviews_and_ratings(product_urls):
    reviews = []
    ratings = []

    options = webdriver.ChromeOptions()
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'
    options.add_argument(f'user-agent={user_agent}')
    options.add_argument("--headless")  # Run browser in headless mode
    options.add_argument("--disable-gpu")  # Disable GPU (recommended for headless mode)
    options.add_argument("--enable-automation")
    options.add_argument("--useAutomationExtension")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-extensions")
    options.add_argument("--dns-prefetch-disable")
    options.add_argument("--disable-dev-shm-usage")  # Fix shared memory issues in some environments
    options.add_argument("window-size=1920,1080")

    driver = webdriver.Chrome(options=options)

    try:
        driver.get(product_urls)
        sleep(3)

        current_page = 1  # Initialize current page

        while True:
            # Scroll the page to load all reviews
            for _ in range(20):
                driver.execute_script("window.scrollBy(0, 250)")
                sleep(1)

            html = driver.page_source
            soup = BeautifulSoup(html, "html.parser")

            # Extract reviews and ratings
            for product in soup.find_all('div', {"class": "css-1k41fl7"}):
                review_element = product.find('span', {"data-testid": "lblItemUlasan"})
                reviews.append(review_element.get_text() if review_element else 'None')

                rating_element = product.find('div', {"class": "rating"})
                ratings.append(rating_element.get('aria-label') if rating_element else 'None')

            # Break if the maximum number of pages (e.g., 2) is reached
            if current_page >= 2:
                break

            # Check if "Next" button exists and is enabled
            try:
                next_button_container = driver.find_element(By.CLASS_NAME, "css-1xqkwi8")
                next_button = next_button_container.find_element(
                    By.XPATH, './/button[contains(@class, "css-16uzo3v-unf-pagination-item") and @aria-label="Laman berikutnya"]'
                )
                is_disabled = next_button.get_attribute("disabled")  # Check if button is disabled
                if is_disabled:
                    break

                # Scroll to and click the next button
                driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", next_button)
                driver.execute_script("arguments[0].click();", next_button)
                sleep(2)
                current_page += 1  # Increment current page
            except (NoSuchElementException, TimeoutException):
                break

    finally:
        driver.quit()

    print(f"Scraped {len(reviews)} reviews from {current_page} pages.")
    data = pd.DataFrame({'Review': reviews, 'Rating': ratings})
    return data

In [26]:
data_inf = scrape_reviews_and_ratings('https://www.tokopedia.com/aerostreet/aerostreet-37-44-massive-low-hitam-natural-sepatu-sneakers-casual-39?extParam=ivf%3Dtrue')

Scraped 20 reviews from 2 pages.


In [27]:
# Applying Text Preprocessing to the Dataset
data_inf['Review_processed'] = data_inf['Review'].apply(lambda x: text_preprocessing(x, slangwords_indonesian))
data_inf

Unnamed: 0,Review,Rating,Review_processed
0,"bahan tebal tapi nyaman, soalnya empuk jadi ny...",bintang 5,bahan tebal nyaman empuk nyaman dipake jahit r...
1,"untuk ukuran sangat pas , dan gua saranin kala...",bintang 5,ukur pas saranin pakai sandal sepatu yh kaki g...
2,"modelnya keren , nyaman dipakai , ada bagian t...",bintang 5,model keren nyaman pakai rapi over all bagus
3,"Bahan bagus, dipake nyaman, model keren abis, ...",bintang 5,bahan bagus dipake nyaman model keren abis sol...
4,"- Suka sama modelnya,bahan menurut saya bagus,...",bintang 5,suka model bahan bagus produk aman sesuai kelu...
5,"Model nya keren abis sih, barang nya juga data...",bintang 5,model nya keren habis sih barang nya cepat iya...
6,pengiriman lama bgt asli dah. terus untuk harg...,bintang 4,kirim banget asli dah harga sih oke model sepa...
7,"Terjamin kualitasnya, ukuran pas, cocok dipaka...",bintang 5,jamin kualitas ukur pas cocok pakai cwk cwk
8,Ada sedikit cacat produk dibagian bawah. tapi ...,bintang 3,cacat produk bagi ganti kasih voucher kompensasi
9,"mantab... pengiriman cepat, kualitas terbaik t...",bintang 5,mantab kirim cepat kualitas baik murah sesuai ...


In [28]:
data_inf['Review_processed']

0     bahan tebal nyaman empuk nyaman dipake jahit r...
1     ukur pas saranin pakai sandal sepatu yh kaki g...
2          model keren nyaman pakai rapi over all bagus
3     bahan bagus dipake nyaman model keren abis sol...
4     suka model bahan bagus produk aman sesuai kelu...
5     model nya keren habis sih barang nya cepat iya...
6     kirim banget asli dah harga sih oke model sepa...
7           jamin kualitas ukur pas cocok pakai cwk cwk
8      cacat produk bagi ganti kasih voucher kompensasi
9     mantab kirim cepat kualitas baik murah sesuai ...
10         ok cocok pas kaki ringan model simpel trendi
11    sepatu bagus keren coba bantal kaki nyaman emp...
12           sepatu bagus kokoh moga awet saran up size
13    terimakasih sepatu bagus pas coba dalam jahit ...
14    bahan bagus awet kirim lumayan cepat sol sepat...
15    kirim cepat jelek masa emang si sepatu engak p...
16    barang sesuai deskripsi emas bubblewarp lapis ...
17    sumpah kali nemu brand lokal pas kak sempi

In [39]:
y_pred_inf = final_model.predict(data_inf['Review_processed'])
y_pred_inf = np.where(y_pred_inf >= 0.5, 1, 0)
y_pred_inf = y_pred_inf.ravel()  # Ensures it's a 1D array




In [40]:
# Combine into dataframe
inffinal = pd.DataFrame()
inffinal['Review_processed'] = data_inf['Review_processed']
inffinal['Recommended'] = pd.DataFrame(y_pred_inf)
inffinal

Unnamed: 0,Review_processed,Recommended
0,bahan tebal nyaman empuk nyaman dipake jahit r...,0
1,ukur pas saranin pakai sandal sepatu yh kaki g...,1
2,model keren nyaman pakai rapi over all bagus,0
3,bahan bagus dipake nyaman model keren abis sol...,1
4,suka model bahan bagus produk aman sesuai kelu...,0
5,model nya keren habis sih barang nya cepat iya...,1
6,kirim banget asli dah harga sih oke model sepa...,0
7,jamin kualitas ukur pas cocok pakai cwk cwk,1
8,cacat produk bagi ganti kasih voucher kompensasi,0
9,mantab kirim cepat kualitas baik murah sesuai ...,1


In [41]:
inffinal['Recommended_meaning'] = inffinal['Recommended'].apply(lambda x: 'Positive' if x != 0 else 'Negative')
inffinal

Unnamed: 0,Review_processed,Recommended,Recommended_meaning
0,bahan tebal nyaman empuk nyaman dipake jahit r...,0,Negative
1,ukur pas saranin pakai sandal sepatu yh kaki g...,1,Positive
2,model keren nyaman pakai rapi over all bagus,0,Negative
3,bahan bagus dipake nyaman model keren abis sol...,1,Positive
4,suka model bahan bagus produk aman sesuai kelu...,0,Negative
5,model nya keren habis sih barang nya cepat iya...,1,Positive
6,kirim banget asli dah harga sih oke model sepa...,0,Negative
7,jamin kualitas ukur pas cocok pakai cwk cwk,1,Positive
8,cacat produk bagi ganti kasih voucher kompensasi,0,Negative
9,mantab kirim cepat kualitas baik murah sesuai ...,1,Positive
