<a href="https://colab.research.google.com/github/ArmFriiz/Dicoding-Submission-FDL/blob/main/Analisis%20Sentimen/scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Muhammad Faris Akbar**
<br></br>
**Fundamental Deep Learning - Sentimen Analisis Shopee**

In [1]:
!pip install google-play-scraper transformers

Collecting google-play-scraper
  Downloading google_play_scraper-1.2.7-py3-none-any.whl.metadata (50 kB)
[?25l     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/50.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m50.2/50.2 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Downloading google_play_scraper-1.2.7-py3-none-any.whl (28 kB)
Installing collected packages: google-play-scraper
Successfully installed google-play-scraper-1.2.7


In [2]:
import sys
import os
import pandas as pd
import re

In [3]:
# try:
#     # Jika dijalankan sebagai file .py
#     base_path = os.path.dirname(os.path.abspath(__file__))
# except NameError:
#     # Jika dijalankan di Jupyter/Interactive
#     base_path = os.getcwd()

# parent_dir = os.path.abspath(os.path.join(base_path, '..'))
# sys.path.append(parent_dir)

# print("Base Path:", base_path)
# print("Parent Directory:", parent_dir)

# from Helper import *

In [4]:
from google_play_scraper import reviews, Sort
from transformers import pipeline
from tqdm import tqdm

**Helper Function**

In [5]:
def cek_nan(data):
  """
  Function yang digunakan untuk memeriksa nilai nan dari data
  input   : data
  output  : nan_info (DataFrame)
  return  : nan_info (DataFrame)
  """
  nan_info = pd.DataFrame(data.isna().sum().sort_values(ascending=False), columns=['Jumlah Nilai Missing'])

  if nan_info['Jumlah Nilai Missing'].sum() == 0:
    print("Tidak ada nilai missing")
    return None
  else:
    nan_info['Persentase Nilai Missing (%)'] = ((nan_info['Jumlah Nilai Missing'] / len(data)) * 100).round(3)

    # return data dengan nilai missing, apabila ingin mereturn keseluruhan data maka ubah menjadi return nan_info
    return nan_info[nan_info['Jumlah Nilai Missing'] > 0]

In [6]:
def visualize_row_with_nan(data, method='all', columns=None):
  """
  Procedure yang digunakan untuk menvisualisasikan baris yang mengandung NaN berdasarkan metode yang dipilih
  input   : data, method, columns
  output  : baris_nan (DataFrame)
  """
  try:
    if method not in ['all', 'column', 'columns']:
      raise ValueError("Metode tidak valid. Pilih salah satu dari 'all', 'column', 'columns'.")

    if method == 'all':
      display(data[data.isna().any(axis=1)])

    elif method == 'column':
      if not isinstance(columns, str):
          raise TypeError("Untuk metode 'column', argumen 'columns' harus berupa String.")
      if columns not in data.columns:
          raise KeyError(f"Kolom '{columns}' tidak ditemukan dalam DataFrame.")
      display(data[data[columns].isna()])

    elif method == 'columns':
      if not isinstance(columns, list):
          raise TypeError("Untuk metode 'columns', argumen 'columns' harus berupa list.")
      for column in columns:
          if column not in data.columns:
              raise KeyError(f"Kolom '{column}' tidak ditemukan dalam DataFrame.")
      display(data[data[columns].isna().all(axis=1)])

  except (TypeError, KeyError) as e:
    print(f"Kesalahan dalam memproses data: {e}")

In [7]:
def visualize_row_with_duplicated(data):
  """
  Procedure yang digunakan untuk menvisualisasikan baris yang mengandung data duplikat
  input   : data
  output  : baris_duplikat (DataFrame)
  """
  duplicated = data.duplicated().sum()

  if duplicated > 0:
    print("Jumlah Data Duplikat :", duplicated)
    all_duplicates = data[data.duplicated(keep=False)]

    duplicate_indices = all_duplicates.groupby(list(all_duplicates.columns)).groups
    print("Pasangan Data Duplikat :")
    for group_indices in duplicate_indices.values():
      if len(group_indices) > 1:
        display(data.iloc[list(group_indices)])
        print("\n")
  else:
    print("Tidak ada data duplikat")

**Main Section**

In [8]:
def scrape_google_play(app_id, total_count, country='id', lang='id'):
    """
    Fungsi untuk melakukan scraping ulasan dari Google Play Store.

    app_id (str): ID aplikasi di Play Store (contoh: 'com.shopee.id')
    total_count (int): Target jumlah data yang ingin diambil
    country (str): Kode negara (default Indonesia 'id')
    lang (str): Bahasa ulasan (default Indonesia 'id')
    """
    print(f"Proses scraping untuk aplikasi: {app_id}...")

    result, continuation_token = reviews(
        app_id,
        lang=lang,
        country=country,
        sort=Sort.NEWEST, # Sort.NEWEST digunakan untuk mendapat data terbaru
        count=total_count,
        filter_score_with=None # Mengambil semua rating (1-5)
    )

    print(f"Berhasil mengambil {len(result)} data mentah.")

    # Konversi hasil scraping ke dalam DataFrame (Tabel)
    df = pd.DataFrame(result)

    return df

In [9]:
TARGET_APP = 'com.shopee.id'
JUMLAH_DATA = 5000

df_ulasan = scrape_google_play(TARGET_APP, JUMLAH_DATA)

Proses scraping untuk aplikasi: com.shopee.id...
Berhasil mengambil 5000 data mentah.


In [10]:
# df_ulasan = pd.read_csv('https://raw.githubusercontent.com/ArmFriiz/Dicoding-Submission-FDL/refs/heads/main/Analisis%20Sentimen/dataset_ulasan_playstore.csv')

In [11]:
df_ulasan.head(5)

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion
0,1c630f10-f578-4425-8836-1098530bb01f,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,siiip..,5,0,3.67.25,2026-01-28 07:44:28,"Hi kak Heryanto Yusuf, makasih buat review bin...",2026-01-28 08:08:51,3.67.25
1,88526ce0-4a8c-4fe4-9c09-bb3a6785a12e,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,Dengan adanya jasa pengiriman di bawah naungan...,1,0,3.66.27,2026-01-28 07:43:29,"Hai kak Qinenk Tj, mohon maaf atas ketidaknyam...",2026-01-28 08:15:32,3.66.27
2,6c159037-ed16-4166-a0b6-f31025de419b,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,bagus,5,0,,2026-01-28 07:43:28,"Hi Kak Yudi Hartono, makasih ya buat bintang 5...",2026-01-28 08:07:43,
3,dfa14d20-03a3-45fa-907e-29a839b58af7,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,mantap,5,0,3.67.25,2026-01-28 07:42:49,Hai kak it's me Rapzz. Maaf terkait kendala pe...,2025-05-23 06:12:11,3.67.25
4,f9cdba56-cd79-4cee-a603-7954a4e3c537,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,"over all uda bgs, tp tlg tambahin gopay dong m...",4,0,,2026-01-28 07:41:59,Wuiihhh makasih banyak review dan bintangnya :...,2026-01-28 08:09:56,


In [12]:
df_ulasan.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   reviewId              5000 non-null   object        
 1   userName              5000 non-null   object        
 2   userImage             5000 non-null   object        
 3   content               5000 non-null   object        
 4   score                 5000 non-null   int64         
 5   thumbsUpCount         5000 non-null   int64         
 6   reviewCreatedVersion  3868 non-null   object        
 7   at                    5000 non-null   datetime64[ns]
 8   replyContent          4832 non-null   object        
 9   repliedAt             4832 non-null   datetime64[ns]
 10  appVersion            3868 non-null   object        
dtypes: datetime64[ns](2), int64(2), object(7)
memory usage: 429.8+ KB


In [13]:
df_ulasan.describe(include='all')

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion
count,5000,5000,5000,5000,5000.0,5000.0,3868,5000,4832,4832,3868
unique,5000,496,498,4032,,,139,,4706,,139
top,b95255e2-3381-4088-a02a-f6767979e794,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,bagus,,,3.66.27,,"Hai Kak , makasih ya buat penilaiannya, semoga...",,3.66.27
freq,1,4504,4503,165,,,2271,,72,,2271
mean,,,,,4.1152,2.559,,2026-01-25 07:37:25.935800064,,2025-12-21 18:54:08.749171968,
min,,,,,1.0,0.0,,2026-01-22 11:01:25,,2020-11-18 09:06:01,
25%,,,,,4.0,0.0,,2026-01-23 22:58:31.500000,,2026-01-23 14:11:27.750000128,
50%,,,,,5.0,0.0,,2026-01-25 06:51:57.500000,,2026-01-25 04:14:12.500000,
75%,,,,,5.0,0.0,,2026-01-26 14:56:57.500000,,2026-01-26 13:42:58.249999872,
max,,,,,5.0,8284.0,,2026-01-28 07:44:28,,2026-01-28 12:57:14,


**Cek Validitas dan Kebersihan Data**

In [14]:
cek_nan(df_ulasan)

Unnamed: 0,Jumlah Nilai Missing,Persentase Nilai Missing (%)
reviewCreatedVersion,1132,22.64
appVersion,1132,22.64
replyContent,168,3.36
repliedAt,168,3.36


In [15]:
visualize_row_with_nan(df_ulasan)

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion
2,6c159037-ed16-4166-a0b6-f31025de419b,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,bagus,5,0,,2026-01-28 07:43:28,"Hi Kak Yudi Hartono, makasih ya buat bintang 5...",2026-01-28 08:07:43,
4,f9cdba56-cd79-4cee-a603-7954a4e3c537,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,"over all uda bgs, tp tlg tambahin gopay dong m...",4,0,,2026-01-28 07:41:59,Wuiihhh makasih banyak review dan bintangnya :...,2026-01-28 08:09:56,
14,45b1a79b-aa5f-4e1d-bcd1-8b357257003f,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,uang jualan selalu di tilep shopee,1,0,,2026-01-28 07:35:46,"Hi kak Nugroho Nusantoro, maaf banget ya udah ...",2026-01-28 08:10:20,
15,cb767cb6-76ef-41ed-aae9-0e6ac5a2443a,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,barang tiba tepat waktu dan ordersnnya sesuai,5,0,,2026-01-28 07:35:33,Hallo kak Suhatim Suhatim. Makasih yaa buat re...,2026-01-28 08:07:43,
18,c8dc9394-0a8b-4006-af40-08020600cc07,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,sangat mantap.mempermudah urusan financial saya,5,0,,2026-01-28 07:33:03,"Hi kak Syamsoel Boy, makasih ya buat bintang 5...",2026-01-28 08:08:41,
...,...,...,...,...,...,...,...,...,...,...,...
4980,ce0fb20c-e105-4d17-b0c1-49dd8e97784a,Indri Jon istovel,https://play-lh.googleusercontent.com/a-/ALV-U...,Shopee mantab,5,0,,2026-01-22 11:17:35,"Hai Kak , makasih ya buat penilaiannya, semoga...",2026-01-22 13:35:58,
4983,d28174e4-258c-49a8-91c0-bc24d54aa64d,Helmi Alfariz,https://play-lh.googleusercontent.com/a/ACg8oc...,sangat memuaskan dalam berbelanja segala kebut...,5,0,,2026-01-22 11:15:47,Hallo kak Helmi Alfariz. Makasih yaa buat revi...,2026-01-22 13:40:40,
4985,68ab749f-7f5d-40fc-ba36-cd61c572bbf3,Indriyani 17,https://play-lh.googleusercontent.com/a/ACg8oc...,iklanny ada di mna' gedeg bngt tiba' masuk k a...,3,0,,2026-01-22 11:15:29,"Hi kak Indriyani 17, mohon maaf ya kak atas ke...",2026-01-22 13:36:48,
4991,4ca309b3-da0b-4ae0-82d9-fc0b9639d49f,Wisonggeni,https://play-lh.googleusercontent.com/a-/ALV-U...,üëç,5,0,,2026-01-22 11:09:29,"Hai Kak , makasih ya buat penilaiannya, semoga...",2026-01-22 13:38:56,


In [16]:
visualize_row_with_duplicated(df_ulasan)

Tidak ada data duplikat


**Filter kolom content dan score untuk konten data**

In [17]:
df = df_ulasan[['content', 'score']]

In [18]:
visualize_row_with_duplicated(df)

Jumlah Data Duplikat : 921
Pasangan Data Duplikat :


Unnamed: 0,content,score
424,Alhamdulillah,5
826,Alhamdulillah,5
1829,Alhamdulillah,5
3009,Alhamdulillah,5






Unnamed: 0,content,score
232,Bagus,5
663,Bagus,5
719,Bagus,5
1442,Bagus,5
1739,Bagus,5
1808,Bagus,5
1925,Bagus,5
3203,Bagus,5
3694,Bagus,5
3761,Bagus,5






Unnamed: 0,content,score
354,Baik,5
1563,Baik,5
3502,Baik,5
4318,Baik,5






Unnamed: 0,content,score
1519,Good,5
1683,Good,5
3198,Good,5
3636,Good,5
4647,Good,5






Unnamed: 0,content,score
303,Mantap,5
1225,Mantap,5
1385,Mantap,5
1806,Mantap,5
1858,Mantap,5
2719,Mantap,5
2743,Mantap,5
3290,Mantap,5
3766,Mantap,5
4487,Mantap,5






Unnamed: 0,content,score
646,OK,5
2421,OK,5
2867,OK,5
4063,OK,5






Unnamed: 0,content,score
230,Ok,5
414,Ok,5
1693,Ok,5
3107,Ok,5
4386,Ok,5
4595,Ok,5






Unnamed: 0,content,score
130,Sangat membantu,5
399,Sangat membantu,5
2456,Sangat membantu,5






Unnamed: 0,content,score
798,amanah,5
2355,amanah,5
3168,amanah,5
3575,amanah,5
4279,amanah,5
4581,amanah,5






Unnamed: 0,content,score
3321,aplikasi yg sangat membantu,5
4192,aplikasi yg sangat membantu,5






Unnamed: 0,content,score
1319,bagus,1
1489,bagus,1
3743,bagus,1






Unnamed: 0,content,score
153,bagus,4
205,bagus,4
1079,bagus,4
1809,bagus,4
2480,bagus,4
4312,bagus,4
4994,bagus,4






Unnamed: 0,content,score
2,bagus,5
25,bagus,5
41,bagus,5
58,bagus,5
88,bagus,5
...,...,...
4911,bagus,5
4919,bagus,5
4951,bagus,5
4955,bagus,5






Unnamed: 0,content,score
591,bagus banget,5
761,bagus banget,5
823,bagus banget,5
2684,bagus banget,5
2724,bagus banget,5
2825,bagus banget,5
3221,bagus banget,5
3303,bagus banget,5
3699,bagus banget,5
4653,bagus banget,5






Unnamed: 0,content,score
2939,bagus bgt,5
4844,bagus bgt,5






Unnamed: 0,content,score
2880,bagus dan sangat membantu,5
4441,bagus dan sangat membantu,5






Unnamed: 0,content,score
1080,bagus mantap,5
1916,bagus mantap,5






Unnamed: 0,content,score
244,bagus sekali,5
249,bagus sekali,5
433,bagus sekali,5
3813,bagus sekali,5
3954,bagus sekali,5
4489,bagus sekali,5
4803,bagus sekali,5






Unnamed: 0,content,score
1649,bagus terpercaya,5
2991,bagus terpercaya,5






Unnamed: 0,content,score
1023,baguss,5
1044,baguss,5
1340,baguss,5
2585,baguss,5
2951,baguss,5
2959,baguss,5
4170,baguss,5
4186,baguss,5
4871,baguss,5






Unnamed: 0,content,score
121,bagusss,5
2369,bagusss,5
3067,bagusss,5
3427,bagusss,5






Unnamed: 0,content,score
1859,baik,1
3262,baik,1
4087,baik,1






Unnamed: 0,content,score
12,baik,5
61,baik,5
359,baik,5
367,baik,5
596,baik,5
843,baik,5
847,baik,5
888,baik,5
1090,baik,5
1157,baik,5






Unnamed: 0,content,score
1421,baik sekali,5
2132,baik sekali,5






Unnamed: 0,content,score
659,barang sesuai dengan pesanan,5
4211,barang sesuai dengan pesanan,5






Unnamed: 0,content,score
2815,berkualitas,5
3024,berkualitas,5






Unnamed: 0,content,score
1244,best,5
3589,best,5
3992,best,5






Unnamed: 0,content,score
144,bgus,5
1427,bgus,5
1938,bgus,5
2162,bgus,5
2499,bgus,5
2887,bgus,5
3541,bgus,5






Unnamed: 0,content,score
1613,buruk,1
2541,buruk,1
3187,buruk,1






Unnamed: 0,content,score
2099,cukup memuaskan,5
4477,cukup memuaskan,5






Unnamed: 0,content,score
453,good,4
3243,good,4






Unnamed: 0,content,score
129,good,5
211,good,5
300,good,5
426,good,5
704,good,5
749,good,5
771,good,5
820,good,5
982,good,5
1058,good,5






Unnamed: 0,content,score
294,good aplikasi,5
1365,good aplikasi,5






Unnamed: 0,content,score
119,good job,5
240,good job,5
259,good job,5
594,good job,5
1424,good job,5
3419,good job,5
4553,good job,5






Unnamed: 0,content,score
3185,is the best,5
3488,is the best,5






Unnamed: 0,content,score
1776,jelek,1
4172,jelek,1
4269,jelek,1
4912,jelek,1






Unnamed: 0,content,score
43,jos,5
802,jos,5
1437,jos,5
1487,jos,5
1940,jos,5
2960,jos,5
3887,jos,5
4822,jos,5






Unnamed: 0,content,score
1642,josss,4
1898,josss,4






Unnamed: 0,content,score
1625,josss,5
3136,josss,5
3627,josss,5
4722,josss,5






Unnamed: 0,content,score
328,keren,5
441,keren,5
657,keren,5
911,keren,5
1194,keren,5
1523,keren,5
1689,keren,5
1736,keren,5
1982,keren,5
2268,keren,5






Unnamed: 0,content,score
219,lancar,5
1038,lancar,5






Unnamed: 0,content,score
607,luar biasa,5
1096,luar biasa,5
1356,luar biasa,5
1400,luar biasa,5
2218,luar biasa,5
2859,luar biasa,5
4125,luar biasa,5
4645,luar biasa,5






Unnamed: 0,content,score
1133,lumayan bagus,5
4574,lumayan bagus,5






Unnamed: 0,content,score
3031,lumayan lah,5
4928,lumayan lah,5






Unnamed: 0,content,score
446,mantaaap,5
1488,mantaaap,5






Unnamed: 0,content,score
2506,mantaap,5
4105,mantaap,5






Unnamed: 0,content,score
5,mantab,5
1046,mantab,5
1878,mantab,5
1964,mantab,5
2550,mantab,5
2715,mantab,5
3397,mantab,5
4552,mantab,5






Unnamed: 0,content,score
1056,mantap,1
3401,mantap,1






Unnamed: 0,content,score
2937,mantap,3
4080,mantap,3






Unnamed: 0,content,score
227,mantap,4
346,mantap,4
537,mantap,4
909,mantap,4
1221,mantap,4
2390,mantap,4
2393,mantap,4
3377,mantap,4
3908,mantap,4
4421,mantap,4






Unnamed: 0,content,score
3,mantap,5
111,mantap,5
124,mantap,5
133,mantap,5
185,mantap,5
...,...,...
4781,mantap,5
4799,mantap,5
4814,mantap,5
4949,mantap,5






Unnamed: 0,content,score
67,mantap banget,5
4673,mantap banget,5






Unnamed: 0,content,score
777,mantap keren,5
1819,mantap keren,5






Unnamed: 0,content,score
4131,mantap lah,5
4481,mantap lah,5






Unnamed: 0,content,score
2547,mantap shopee,5
3714,mantap shopee,5






Unnamed: 0,content,score
1267,mantul,5
2573,mantul,5
4661,mantul,5






Unnamed: 0,content,score
556,membantu,5
1813,membantu,5
3124,membantu,5






Unnamed: 0,content,score
555,membantu sekali,5
737,membantu sekali,5






Unnamed: 0,content,score
369,memuaskan,4
3129,memuaskan,4






Unnamed: 0,content,score
2255,memuaskan,5
2332,memuaskan,5
2678,memuaskan,5
2847,memuaskan,5
2896,memuaskan,5
3127,memuaskan,5
4649,memuaskan,5
4797,memuaskan,5
4859,memuaskan,5
4929,memuaskan,5






Unnamed: 0,content,score
3233,menyenangkan,5
3728,menyenangkan,5
3837,menyenangkan,5






Unnamed: 0,content,score
799,mudah dan murah,5
1697,mudah dan murah,5






Unnamed: 0,content,score
1758,mudah di gunakan,5
3513,mudah di gunakan,5
4424,mudah di gunakan,5






Unnamed: 0,content,score
3088,murah,5
4868,murah,5






Unnamed: 0,content,score
7,nice,5
1187,nice,5
1391,nice,5
2296,nice,5
2718,nice,5
2801,nice,5
3439,nice,5
3640,nice,5
3783,nice,5
4320,nice,5






Unnamed: 0,content,score
112,ok,1
3145,ok,1






Unnamed: 0,content,score
442,ok,4
456,ok,4
1321,ok,4
3441,ok,4
3550,ok,4
3749,ok,4
3847,ok,4






Unnamed: 0,content,score
109,ok,5
188,ok,5
324,ok,5
363,ok,5
493,ok,5
543,ok,5
549,ok,5
711,ok,5
862,ok,5
905,ok,5






Unnamed: 0,content,score
183,ok banget,5
840,ok banget,5
1943,ok banget,5
2434,ok banget,5
2717,ok banget,5
4578,ok banget,5






Unnamed: 0,content,score
161,oke,5
214,oke,5
411,oke,5
597,oke,5
776,oke,5
1041,oke,5
1648,oke,5
1652,oke,5
1662,oke,5
1699,oke,5






Unnamed: 0,content,score
1001,oke banget,5
2694,oke banget,5
4098,oke banget,5
4270,oke banget,5






Unnamed: 0,content,score
2574,pengiriman cepat,5
4744,pengiriman cepat,5






Unnamed: 0,content,score
2663,pokoknya mantap,5
4321,pokoknya mantap,5






Unnamed: 0,content,score
458,puas,5
2891,puas,5
3154,puas,5
3869,puas,5
4058,puas,5
4303,puas,5
4877,puas,5






Unnamed: 0,content,score
476,sampah,1
3332,sampah,1






Unnamed: 0,content,score
675,sangat bagus,4
2965,sangat bagus,4
3504,sangat bagus,4






Unnamed: 0,content,score
265,sangat bagus,5
542,sangat bagus,5
575,sangat bagus,5
673,sangat bagus,5
1083,sangat bagus,5
1138,sangat bagus,5
1171,sangat bagus,5
1364,sangat bagus,5
1432,sangat bagus,5
1465,sangat bagus,5






Unnamed: 0,content,score
533,sangat bagus dan mudah,5
1190,sangat bagus dan mudah,5






Unnamed: 0,content,score
3382,sangat bagus sekali,5
3919,sangat bagus sekali,5






Unnamed: 0,content,score
166,sangat baik,5
236,sangat baik,5
403,sangat baik,5
829,sangat baik,5
890,sangat baik,5
1149,sangat baik,5
1339,sangat baik,5
1431,sangat baik,5
2591,sangat baik,5
3328,sangat baik,5






Unnamed: 0,content,score
1338,sangat bermanfaat,5
1525,sangat bermanfaat,5
2599,sangat bermanfaat,5
2876,sangat bermanfaat,5






Unnamed: 0,content,score
2990,sangat cocok bagi saya,5
3008,sangat cocok bagi saya,5






Unnamed: 0,content,score
23,sangat membantu,5
92,sangat membantu,5
301,sangat membantu,5
448,sangat membantu,5
582,sangat membantu,5
906,sangat membantu,5
938,sangat membantu,5
1268,sangat membantu,5
1694,sangat membantu,5
1752,sangat membantu,5






Unnamed: 0,content,score
507,sangat memuaskan,5
688,sangat memuaskan,5
766,sangat memuaskan,5
958,sangat memuaskan,5
1156,sangat memuaskan,5
1208,sangat memuaskan,5
1346,sangat memuaskan,5
3036,sangat memuaskan,5
3041,sangat memuaskan,5
3044,sangat memuaskan,5






Unnamed: 0,content,score
1363,sangat menyenangkan,5
3141,sangat menyenangkan,5






Unnamed: 0,content,score
38,sangat puas,5
79,sangat puas,5
563,sangat puas,5
762,sangat puas,5
797,sangat puas,5
1218,sangat puas,5
1381,sangat puas,5
1402,sangat puas,5
1658,sangat puas,5
1914,sangat puas,5






Unnamed: 0,content,score
1269,sangat puas belanja di shopee,5
1843,sangat puas belanja di shopee,5
3693,sangat puas belanja di shopee,5
4165,sangat puas belanja di shopee,5






Unnamed: 0,content,score
463,sangat puas dgn shopee,5
3089,sangat puas dgn shopee,5
3896,sangat puas dgn shopee,5






Unnamed: 0,content,score
200,sangat rekomen,5
362,sangat rekomen,5






Unnamed: 0,content,score
1609,sangat senang,5
4623,sangat senang,5






Unnamed: 0,content,score
1578,sesuai,5
1857,sesuai,5
4406,sesuai,5






Unnamed: 0,content,score
2192,shopee is the best,5
2334,shopee is the best,5
2515,shopee is the best,5
2762,shopee is the best,5
4247,shopee is the best,5
4697,shopee is the best,5






Unnamed: 0,content,score
376,shopee terbaik,5
1834,shopee terbaik,5






Unnamed: 0,content,score
922,siiip,5
1491,siiip,5






Unnamed: 0,content,score
1091,simpel,5
1588,simpel,5






Unnamed: 0,content,score
292,sippp,5
1231,sippp,5
2454,sippp,5






Unnamed: 0,content,score
4345,suka,5
4557,suka,5
4591,suka,5






Unnamed: 0,content,score
53,terbaik,5
275,terbaik,5
283,terbaik,5
561,terbaik,5
669,terbaik,5
1479,terbaik,5
1672,terbaik,5
1968,terbaik,5
2227,terbaik,5
2416,terbaik,5






Unnamed: 0,content,score
881,terimakasih,5
1467,terimakasih,5
1954,terimakasih,5
2826,terimakasih,5
3178,terimakasih,5
4468,terimakasih,5
4815,terimakasih,5
4835,terimakasih,5






Unnamed: 0,content,score
616,terpercaya,5
2219,terpercaya,5
3196,terpercaya,5
3228,terpercaya,5
3284,terpercaya,5
4758,terpercaya,5






Unnamed: 0,content,score
81,the best,5
1627,the best,5
4867,the best,5






Unnamed: 0,content,score
637,top,5
1937,top,5
2078,top,5
2625,top,5
2653,top,5
3062,top,5
3730,top,5
3810,top,5
3949,top,5
4187,top,5






Unnamed: 0,content,score
3072,top markotop,5
3703,top markotop,5
4795,top markotop,5






Unnamed: 0,content,score
1835,very good,5
2252,very good,5






Unnamed: 0,content,score
1160,üëç,5
1372,üëç,5
2381,üëç,5
3528,üëç,5
3776,üëç,5
4154,üëç,5
4290,üëç,5
4457,üëç,5
4991,üëç,5






Unnamed: 0,content,score
3105,üëçüèª,5
3113,üëçüèª,5






Unnamed: 0,content,score
1216,üëçüëçüëç,5
1263,üëçüëçüëç,5
2414,üëçüëçüëç,5
3085,üëçüëçüëç,5
3456,üëçüëçüëç,5






In [19]:
df.drop_duplicates(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop_duplicates(inplace=True)


In [20]:
visualize_row_with_duplicated(df)

Tidak ada data duplikat


In [21]:
print(f"Ukuran Data Setelah Pembersihan NaN dan Duplicated: {df.shape}")

Ukuran Data Setelah Pembersihan NaN dan Duplicated: (4079, 2)


**Soft Cleaning Data**

In [22]:
def cleaning_untuk_labeling(text):
    text = str(text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text) # Hapus URL
    text = re.sub(r'<.*?>', '', text) # Hapus HTML tags
    text = re.sub(r'@[A-Za-z0-9_]+', '', text) # Hapus Mentions (@user)
    text = re.sub(r'#[A-Za-z0-9]+', '', text) # Hapus hashtag
    text = re.sub(r'(.)\1{2,}', r'\1\1', text) # Hapus kata berulang yang muncul lebih dari 2x
    text = re.sub(r'[a-zA-Z]+\d+\w*|\w*\d+[a-zA-Z]+', '', text) # Hapus kombinasi angka dan huruf seperti m4ndi, 4yam, dll
    text = re.sub(r'\b\d{7,}\b', '', text) # Hapus angka yang panjangnya lebih dari 7
    text = ' '.join(text.split())

    return text

In [23]:
df['soft_clean_content'] = df['content'].apply(cleaning_untuk_labeling)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['soft_clean_content'] = df['content'].apply(cleaning_untuk_labeling)


In [24]:
df.head(5)

Unnamed: 0,content,score,soft_clean_content
0,siiip..,5,siip..
1,Dengan adanya jasa pengiriman di bawah naungan...,1,Dengan adanya jasa pengiriman di bawah naungan...
2,bagus,5,bagus
3,mantap,5,mantap
4,"over all uda bgs, tp tlg tambahin gopay dong m...",4,"over all uda bgs, tp tlg tambahin gopay dong m..."


In [25]:
cek_nan(df)

Tidak ada nilai missing


In [26]:
visualize_row_with_nan(df)

Unnamed: 0,content,score,soft_clean_content


In [27]:
df.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)


**Labeling Data**

In [28]:
# def labeling_data(df):
#     """
#     Fungsi untuk memberikan label otomatis berdasarkan skor bintang.
#     Menggunakan logika:
#     1-2 Bintang = Negatif
#     3 Bintang   = Netral
#     4-5 Bintang = Positif
#     """
#     def get_sentiment(score):
#         if score <= 2:
#             return 'Negatif'
#         elif score == 3:
#             return 'Netral'
#         else:
#             return 'Positif'

#     # Terapkan fungsi get_sentiment ke kolom 'score'
#     df['label'] = df['score'].apply(get_sentiment)

#     return df

In [29]:
# print("Labeling data berdasarkan skor bintang")
# df_labeled = labeling_data(df)

In [30]:
def label_with_indobert(df):
  pretrained_name = "w11wo/indonesian-roberta-base-sentiment-classifier"

  nlp = pipeline(
      "sentiment-analysis",
      model=pretrained_name,
      tokenizer=pretrained_name,
      truncation=True, # Potong teks jika terlalu panjang (>512 kata)
      max_length=512
    )

  labels = []
  scores = []

  for text in tqdm(df['soft_clean_content']):
    try:
      result = nlp(text)[0] # Prediksi sentimen

      label = result['label'] # output: 'positive', 'neutral', 'negative'

      # Mapping ulang ke format Bahasa Indonesia
      label_map = {
        'positive': 'Positif',
        'neutral': 'Netral',
        'negative': 'Negatif'
      }
      labels.append(label_map.get(label, label))
      scores.append(result['score'])

    except Exception as e:
      print(f"Error pada teks: {text}")
      labels.append("Netral")
      scores.append(0.0)

  df['sentiment_label'] = labels
  df['confidence_score'] = scores

  return df

In [None]:
df_labeled = label_with_indobert(df)

In [32]:
df_labeled.head(5)

Unnamed: 0,content,score,soft_clean_content,sentiment_label,confidence_score
0,siiip..,5,siip..,Negatif,0.998343
1,Dengan adanya jasa pengiriman di bawah naungan...,1,Dengan adanya jasa pengiriman di bawah naungan...,Negatif,0.569122
2,bagus,5,bagus,Positif,0.998097
3,mantap,5,mantap,Positif,0.999183
4,"over all uda bgs, tp tlg tambahin gopay dong m...",4,"over all uda bgs, tp tlg tambahin gopay dong m...",Netral,0.789292


**Pemeriksaan Distribusi Data, memastikan apakah terdapat imbalance atau tidak**

In [33]:
print("Distribusi Data per Kelas:")
print(df_labeled['sentiment_label'].value_counts())

Distribusi Data per Kelas:
sentiment_label
Positif    2478
Negatif    1376
Netral      225
Name: count, dtype: int64


**Konversi ke csv untuk mempermudah dalam pembersihan data lebih lanjut**

In [34]:
nama_file = 'dataset_ulasan_playstore.csv'
df_labeled.to_csv(nama_file, index=False)

print(f"Selesai! Data berhasil disimpan ke '{nama_file}'")
print(f"Total data: {len(df_labeled)}")

Selesai! Data berhasil disimpan ke 'dataset_ulasan_playstore.csv'
Total data: 4079
