<a href="https://colab.research.google.com/github/ArmFriiz/Dicoding-Submission-FDL/blob/main/Analisis%20Sentimen/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Load package dan library**

In [37]:
!pip install gensim



In [72]:
import pandas as pd
import numpy as np
import tensorflow as tf
import re
import string
import nltk
import kagglehub
import gensim

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import class_weight

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical

In [39]:
path = kagglehub.dataset_download("bhimantoros/pretrained-word2vec-indonesia")
print("Path to dataset files:", path)

Using Colab cache for faster access to the 'pretrained-word2vec-indonesia' dataset.
Path to dataset files: /kaggle/input/pretrained-word2vec-indonesia


**Load dataset sentimen**

In [40]:
df = pd.read_csv('https://raw.githubusercontent.com/ArmFriiz/Dicoding-Submission-FDL/refs/heads/main/Analisis%20Sentimen/dataset_ulasan_playstore.csv')

In [41]:
df.head()

Unnamed: 0,content,score,label
0,CEPAT AMAN TERPERCAYA,5,Positif
1,top markotop,5,Positif
2,belanja online nomor 1 satu di Indonesia.,5,Positif
3,GANGGU DOANG LU ORG LAGI MAIN JUGA,1,Negatif
4,jadi langganan belanja di shopee..,5,Positif


In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73478 entries, 0 to 73477
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   content  73478 non-null  object
 1   score    73478 non-null  int64 
 2   label    73478 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.7+ MB


In [43]:
df.describe(include='all')

Unnamed: 0,content,score,label
count,73478,73478.0,73478
unique,72705,,3
top,Baik,,Positif
freq,5,,51173
mean,,3.862802,
std,,1.668418,
min,,1.0,
25%,,2.0,
50%,,5.0,
75%,,5.0,


**Drop column score karena hanya digunakan untuk labelling awal saja**

In [44]:
df.drop(columns=['score'], inplace=True)

In [45]:
df.columns

Index(['content', 'label'], dtype='object')

**Cek ulang kondisi data**

In [46]:
df.isna().sum()

Unnamed: 0,0
content,0
label,0


In [47]:
df.duplicated().sum()

np.int64(472)

In [48]:
df.drop_duplicates(inplace=True)

In [49]:
df.duplicated().sum()

np.int64(0)

In [50]:
df.describe(include='all')

Unnamed: 0,content,label
count,73006,73006
unique,72705,3
top,bagus,Positif
freq,3,50760


**Load model normalisasi kata untuk perbaikan kata slank atau typo**

In [51]:
normalization_df = pd.read_csv('https://raw.githubusercontent.com/nasalsabila/kamus-alay/refs/heads/master/colloquial-indonesian-lexicon.csv')

In [52]:
normalization_df.head()

Unnamed: 0,slang,formal,In-dictionary,context,category1,category2,category3
0,woww,wow,1,wow,elongasi,0,0
1,aminn,amin,1,Selamat ulang tahun kakak tulus semoga panjang...,elongasi,0,0
2,met,selamat,1,Met hari netaas kak!? Wish you all the best @t...,abreviasi,0,0
3,netaas,menetas,1,Met hari netaas kak!? Wish you all the best @t...,afiksasi,elongasi,0
4,keberpa,keberapa,0,Birthday yg keberpa kak?,abreviasi,0,0


In [53]:
normalization_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15006 entries, 0 to 15005
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   slang          15006 non-null  object
 1   formal         15006 non-null  object
 2   In-dictionary  15006 non-null  int64 
 3   context        15006 non-null  object
 4   category1      15006 non-null  object
 5   category2      15006 non-null  object
 6   category3      15006 non-null  object
dtypes: int64(1), object(6)
memory usage: 820.8+ KB


In [54]:
normalization_dict = dict(zip(normalization_df['slang'], normalization_df['formal']))

**Cleaning data (Case folding, Punctuation, Kata berulang, Slankword atau typo)**

In [55]:
def cleaning_text(text, normalization_dict):
  text = text.lower() # Ubah teks menjadi huruf kecil
  text = re.sub(r'@[A-Za-z0-9]+', '', text) # Hapus mention
  text = re.sub(r'http\S+', '', text) # Hapus URL
  text = re.sub(r'#[A-Za-z0-9]+', '', text) # Hapus hashtag
  text = re.sub(r'(.)\1{2,}', r'\1\1', text) # Hapus kata berulang yang muncul lebih dari 2x

  words = text.split()
  words = [normalization_dict.get(word, word) for word in words]
  text = ' '.join(words)

  text = ''.join([char for char in text if char.isalpha() or char.isspace()])
  text = ' '.join(text.split()) # Bersihkan spasi berlebih di tengah dan di ujung
  return text

In [56]:
df['clean_content'] = df['content'].apply(cleaning_text, normalization_dict=normalization_dict)

In [57]:
df.head()

Unnamed: 0,content,label,clean_content
0,CEPAT AMAN TERPERCAYA,Positif,cepat aman terpercaya
1,top markotop,Positif,top markotop
2,belanja online nomor 1 satu di Indonesia.,Positif,belanja online nomor satu di indonesia
3,GANGGU DOANG LU ORG LAGI MAIN JUGA,Negatif,ganggu doang lu orang lagi main juga
4,jadi langganan belanja di shopee..,Positif,jadi langganan belanja di shopee


**Encoding label data**

In [58]:
le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['label'])

In [59]:
df.head()

Unnamed: 0,content,label,clean_content,label_encoded
0,CEPAT AMAN TERPERCAYA,Positif,cepat aman terpercaya,2
1,top markotop,Positif,top markotop,2
2,belanja online nomor 1 satu di Indonesia.,Positif,belanja online nomor satu di indonesia,2
3,GANGGU DOANG LU ORG LAGI MAIN JUGA,Negatif,ganggu doang lu orang lagi main juga,0
4,jadi langganan belanja di shopee..,Positif,jadi langganan belanja di shopee,2


In [60]:
x = df['clean_content'].values

In [61]:
print(x)

['cepat aman terpercaya' 'top markotop'
 'belanja online nomor satu di indonesia' ...
 'berbelanja murah pengiriman cepat dan pelayanan yang ramah'
 'sejak shopee indonesia ada yang namanya outfit sepatu perlengkapan saya membuat kue semuanya saya pesan di shoopee apalagi kosmetik saya dan anak saya berlangganan di shoopee kalau ty pernah mengalami kekecewaan enggak karena shoope pernah tapi mereka siap terima keritikan dan masukan dari kita mungkin shallernya atau penjualnya yang harus jujur dan terbuka soalnya kadang kita meminta yang model atau warna apa kirimmya yang model dan warna lain tapi jarang sih pokoknya sukses ya shoope sudah membantu kami'
 'bagus banget buat belanja secara daring dan berguna untuk orang mageran']


In [62]:
y = df['label_encoded'].values

In [63]:
print(y)
print(le.classes_)

[2 2 2 ... 2 2 2]
['Negatif' 'Netral' 'Positif']


In [64]:
y_onehot = to_categorical(y, num_classes=3)
y_onehot

array([[0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]])

**Splitting Data (Training-Testing-Validation) (80-20)**

In [65]:
X_train, X_test, y_train, y_test = train_test_split(x, y_onehot, test_size=0.20, random_state=42, stratify=y)

print(f"Jumlah Data Train : {len(X_train)}")
print(f"Jumlah Data Test  : {len(X_test)}")

Jumlah Data Train : 58404
Jumlah Data Test  : 14602


**Tokenization, sequence dan padding**

In [67]:
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

vocab_size = len(tokenizer.word_index) + 1

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq  = tokenizer.texts_to_sequences(X_test)

lens = [len(s) for s in X_train_seq]
max_len_dynamic = int(np.percentile(lens, 95))

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len_dynamic, padding='post')
X_test_pad  = pad_sequences(X_test_seq,  maxlen=max_len_dynamic, padding='post')

**Load model Word2Vec untuk transfer learning (input model)**

In [69]:
w2v_model = gensim.models.Word2Vec.load('/kaggle/input/pretrained-word2vec-indonesia/wiki.id.case.model').wv

In [66]:
EMBEDDING_DIM = 400

In [75]:
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))

hits = 0
misses = 0

for word, i in word_index.items():
  if word in w2v_model:
    embedding_matrix[i] = w2v_model[word]
    hits += 1
  else:
    misses += 1

print(f"Berhasil: {hits}, Gagal (OOV): {misses}")

Berhasil: 11052, Gagal (OOV): 24729


**Balancing class menggunakan pembobotan**

In [74]:
y_train_int = np.argmax(y_train, axis=1)

class_weights_val = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train_int),
    y=y_train_int
)
class_weights_dict = dict(enumerate(class_weights_val))
print(class_weights_dict)

{0: np.float64(1.2827304473874943), 1: np.float64(7.433371515845742), 2: np.float64(0.4794129235618597)}


**Modelling**