**Load Package dan Library**

In [74]:
!pip install gensim



In [75]:
import pandas as pd
import numpy as np
import tensorflow as tf
import re
import string
import nltk

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [76]:
# nltk.download('punkt_tab')
# nltk.download('stopwords')
# nltk.download('wordnet')

In [77]:
import os
import kagglehub
from google.colab import userdata

# 1. Copy the Kaggle credentials stored in Colab Secrets into environment
#    variables of the same name.
for secret_name in ("KAGGLE_USERNAME", "KAGGLE_KEY"):
    os.environ[secret_name] = userdata.get(secret_name)

# 2. Login (per the original note, the variables above are now detected
#    automatically, so the explicit call stays disabled).
# kagglehub.login()

**Load Dataset Sentimen**

In [78]:
# Read the Play Store review CSV straight from its raw GitHub URL.
df = pd.read_csv(
    'https://raw.githubusercontent.com/ArmFriiz/Dicoding-Submission-FDL/refs/heads/main/Analisis%20Sentimen/dataset_ulasan_playstore.csv'
)

In [79]:
df.head(5)

Unnamed: 0,content,score,label
0,CEPAT AMAN TERPERCAYA,5,Positif
1,top markotop,5,Positif
2,belanja online nomor 1 satu di Indonesia.,5,Positif
3,GANGGU DOANG LU ORG LAGI MAIN JUGA,1,Negatif
4,jadi langganan belanja di shopee..,5,Positif


In [80]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73478 entries, 0 to 73477
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   content  73478 non-null  object
 1   score    73478 non-null  int64 
 2   label    73478 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.7+ MB


In [81]:
df.describe(include='all')

Unnamed: 0,content,score,label
count,73478,73478.0,73478
unique,72705,,3
top,Baik,,Positif
freq,5,,51173
mean,,3.862802,
std,,1.668418,
min,,1.0,
25%,,2.0,
50%,,5.0,
75%,,5.0,


In [82]:
# Drop the numeric rating, leaving only 'content' and 'label'.
# Rebinding `df` instead of using inplace=True, together with errors='ignore',
# keeps this cell safe to re-run: a second execution no longer raises KeyError
# because 'score' has already been removed.
df = df.drop(columns=['score'], errors='ignore')

In [83]:
df.columns

Index(['content', 'label'], dtype='object')

**Load Kamus Normalisasi**

In [84]:
# Read the colloquial-Indonesian lexicon CSV straight from its raw GitHub URL.
normalization_df = pd.read_csv(
    'https://raw.githubusercontent.com/nasalsabila/kamus-alay/refs/heads/master/colloquial-indonesian-lexicon.csv'
)

In [85]:
normalization_df.head()

Unnamed: 0,slang,formal,In-dictionary,context,category1,category2,category3
0,woww,wow,1,wow,elongasi,0,0
1,aminn,amin,1,Selamat ulang tahun kakak tulus semoga panjang...,elongasi,0,0
2,met,selamat,1,Met hari netaas kak!? Wish you all the best @t...,abreviasi,0,0
3,netaas,menetas,1,Met hari netaas kak!? Wish you all the best @t...,afiksasi,elongasi,0
4,keberpa,keberapa,0,Birthday yg keberpa kak?,abreviasi,0,0


In [86]:
normalization_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15006 entries, 0 to 15005
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   slang          15006 non-null  object
 1   formal         15006 non-null  object
 2   In-dictionary  15006 non-null  int64 
 3   context        15006 non-null  object
 4   category1      15006 non-null  object
 5   category2      15006 non-null  object
 6   category3      15006 non-null  object
dtypes: int64(1), object(6)
memory usage: 820.8+ KB


In [87]:
# Build the slang -> formal lookup table. If a slang term appears in more than
# one row, the last row's formal form is the one that is kept.
normalization_dict = {
    slang: formal
    for slang, formal in zip(normalization_df['slang'], normalization_df['formal'])
}

In [88]:
def _normalize_token(token, normalization_dict):
    """Return the lexicon's formal form for one whitespace-delimited token.

    The raw token is looked up first. If that fails, the token is retried with
    leading/trailing punctuation stripped, so slang written as "yg," or "(gk)"
    is still normalized. Tokens with no match are returned unchanged.
    """
    if token in normalization_dict:
        return normalization_dict[token]
    core = token.strip(string.punctuation)
    if core and core in normalization_dict:
        # The stripped punctuation is not re-attached: the caller deletes every
        # non-letter character afterwards anyway.
        return normalization_dict[core]
    return token


def cleaning_text(text, normalization_dict):
    """Lowercase, de-noise and normalize one review text.

    Steps, in order:
      1. lowercase;
      2. remove @mentions, URLs starting with "http", and #hashtags;
      3. collapse any character repeated 3+ times in a row down to 2
         ("wowwww" -> "woww");
      4. replace slang tokens using ``normalization_dict``;
      5. delete every character that is neither a letter nor whitespace;
      6. collapse runs of whitespace and trim both ends.

    Args:
        text: Raw review text. Any non-string value (e.g. NaN or None from a
            DataFrame column) is treated as an empty review.
        normalization_dict: Mapping of slang token -> formal replacement.

    Returns:
        The cleaned text as a single-space-separated string (possibly empty).
    """
    if not isinstance(text, str):
        # Missing values would otherwise crash on .lower().
        return ''

    text = text.lower()
    # \w also covers "_", so handles/tags such as @toko_resmi are removed in
    # full instead of leaving the part after the underscore behind.
    text = re.sub(r'@\w+', '', text)       # mentions
    text = re.sub(r'http\S+', '', text)    # URLs
    text = re.sub(r'#\w+', '', text)       # hashtags
    # Collapse elongated characters (3 or more repeats) to exactly two.
    text = re.sub(r'(.)\1{2,}', r'\1\1', text)

    # Slang normalization runs before punctuation removal so that lexicon
    # entries containing digits or symbols can still match the raw token.
    tokens = [_normalize_token(token, normalization_dict) for token in text.split()]
    text = ' '.join(tokens)

    # Non-letters are deleted (not replaced by a space), so "bagus,murah"
    # becomes "bagusmurah".
    text = ''.join(char for char in text if char.isalpha() or char.isspace())
    return ' '.join(text.split())

In [89]:
# def stemmingText(text, stemmer): # Mengurangi kata ke bentuk dasarnya yang menghilangkan imbuhan awalan dan akhiran atau ke akar kata
#     # Memecah teks menjadi daftar kata
#     words = text

#     # Menerapkan stemming pada setiap kata dalam daftar
#     stemmed_words = [stemmer.stem(word) for word in words]

#     # Menggabungkan kata-kata yang telah distem
#     stemmed_text = ' '.join(stemmed_words)

#     return stemmed_text

In [90]:
# def tokenizingText(text): # Memecah atau membagi string, teks menjadi daftar token
#     text = word_tokenize(text)
#     return text

In [91]:
# def filteringText(text): # Menghapus stopwords dalam teks
#     listStopwords = set(stopwords.words('indonesian'))
#     # listStopwords1 = set(stopwords.words('english'))
#     # listStopwords.update(listStopwords1)
#     # listStopwords.update(['iya','yaa','gak','nya','na','sih','ku',"di","ga","ya","gaa","loh","kah","woi","woii","woy"])
#     filtered = []
#     for txt in text:
#         if txt not in listStopwords:
#             filtered.append(txt)
#     text = filtered
#     return text

In [92]:
# factory = StemmerFactory()
# stemmer = factory.create_stemmer()

In [93]:
# Clean every review and store the result next to the raw text.
df['clean_content'] = df['content'].apply(cleaning_text, args=(normalization_dict,))
# df['tokenized_content'] = df['clean_content'].apply(tokenizingText)
# df['filtered_content'] = df['tokenized_content'].apply(filteringText)
# df['stemmed_content'] = df['filtered_content'].apply(lambda x: stemmingText(x, stemmer))

In [94]:
df

Unnamed: 0,content,label,clean_content
0,CEPAT AMAN TERPERCAYA,Positif,cepat aman terpercaya
1,top markotop,Positif,top markotop
2,belanja online nomor 1 satu di Indonesia.,Positif,belanja online nomor satu di indonesia
3,GANGGU DOANG LU ORG LAGI MAIN JUGA,Negatif,ganggu doang lu orang lagi main juga
4,jadi langganan belanja di shopee..,Positif,jadi langganan belanja di shopee
...,...,...,...
73473,murah pokoknu beli di shopee,Positif,murah pokoknu beli di shopee
73474,"selalu puas belanja di shopee, mudah di aplika...",Positif,selalu puas belanja di shopee mudah di aplikas...
73475,"berbelanja murah, pengiriman cepat dan pelayan...",Positif,berbelanja murah pengiriman cepat dan pelayana...
73476,"sejak shopee indonesia ada, yg namanya outfit,...",Positif,sejak shopee indonesia ada yang namanya outfit...


In [95]:
# top_20 = df['tokenized_content'].explode().value_counts().head(20)
# top_20