# Bangkit 2022 Capstone Project
This project aims to classify the sentiment of a text as either positive or negative. It involves transfer learning using IndoBERT. The data collection is done through a combination of semi-manual scraping, automated scraping, and open data from the internet.

In [2]:
import tensorflow as tf
import pandas as pd
import numpy as np
from transformers import BertTokenizer, TFBertModel

  from .autonotebook import tqdm as notebook_tqdm


### Training data preprocessing:

In [3]:
df_gmaps = pd.read_csv("./data/google_maps.csv")
df_tokped = pd.read_csv("./data/dataset_review_tokped_labelled.csv")

df_tokped.head()

Unnamed: 0,Review,Rating,Sentiment
0,enak kuacinya,5,positive
1,pengiriman cepat packing bagus sesuai pesanan ...,5,positive
2,pengemasan luar biasa baik untuk rasa menurut ...,4,negative
3,terimakasih min,5,neutral
4,udah order untuk kesekian kali jos,5,neutral


In [4]:
df_tokped.drop("Rating", axis="columns", inplace=True)
df_tokped.head()

Unnamed: 0,Review,Sentiment
0,enak kuacinya,positive
1,pengiriman cepat packing bagus sesuai pesanan ...,positive
2,pengemasan luar biasa baik untuk rasa menurut ...,negative
3,terimakasih min,neutral
4,udah order untuk kesekian kali jos,neutral


In [5]:
df_tokped.columns = ["text", "label"]
df_tokped.head()

Unnamed: 0,text,label
0,enak kuacinya,positive
1,pengiriman cepat packing bagus sesuai pesanan ...,positive
2,pengemasan luar biasa baik untuk rasa menurut ...,negative
3,terimakasih min,neutral
4,udah order untuk kesekian kali jos,neutral


In [6]:
# change "positive" or "neutral" to 1, change "negative" to 0
df_tokped["label"] = df_tokped["label"].map(lambda row: 0 if row == "negative" else 1)
df_tokped.head()

Unnamed: 0,text,label
0,enak kuacinya,1
1,pengiriman cepat packing bagus sesuai pesanan ...,1
2,pengemasan luar biasa baik untuk rasa menurut ...,0
3,terimakasih min,1
4,udah order untuk kesekian kali jos,1


In [7]:
df_tokped["label"].value_counts()

1    3488
0     572
Name: label, dtype: int64

In [8]:
df = pd.concat([df_gmaps, df_tokped], ignore_index=True)

df.head()

Unnamed: 0,text,label
0,Tempat yang enak untuk hang out bersama teman ...,1
1,Tempatnya nyaman krn smoking areanya benar2 te...,1
2,Tempat ternyaman dan deket banget sama kantor....,1
3,"Tempatnya luas bgtt, nyaman kalo buat nugas ku...",1
4,Tempatnya cukup luas. Bisa blocking. Instagram...,1


### Train test split

In [9]:
# shuffle training data
df = df.sample(frac=1, ignore_index=True)

df.head()

Unnamed: 0,text,label
0,agak lama sih ngirimnya,0
1,kurang sedap kurang kerasa ikan nya,0
2,selesai dgn baik dan dpt bonus pie tq seller,1
3,pengiriman cepat sesuai pesanan recommended se...,1
4,ada bonus nya juga,1


In [10]:
# train-valid-test split 70-20-10
train_size = int(len(df) * 0.7)
valid_size = int(len(df) * 0.2)

df_train = df[:train_size]
df_valid = df[train_size:train_size + valid_size]
df_test = df[train_size + valid_size:]

print(len(df_train))
print(len(df_valid))
print(len(df_test))

2982
852
426


In [11]:
x_train = df_train["text"].values
y_train = df_train["label"].values

x_valid = df_valid["text"].values
y_valid = df_valid["label"].values

x_test = df_test["text"].values
y_test = df_test["label"].values

### Modelling

In [13]:
# download the IndoBERT pre-trained model
model_name='cahya/bert-base-indonesian-522M'
bert_tokenizer = BertTokenizer.from_pretrained(model_name)
bert_model = TFBertModel.from_pretrained(model_name)
bert_model.trainable = False

Some layers from the model checkpoint at cahya/bert-base-indonesian-522M were not used when initializing TFBertModel: ['mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at cahya/bert-base-indonesian-522M.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [14]:
# tokenize the training data using bert tokenizer
x_train_tokenized = bert_tokenizer(x_train.tolist(), truncation=True, max_length=100, padding=True, return_tensors="tf")
x_valid_tokenized = bert_tokenizer(x_valid.tolist(), truncation=True, max_length=100, padding=True, return_tensors="tf")
x_test_tokenized = bert_tokenizer(x_test.tolist(), truncation=True, max_length=100, padding=True, return_tensors="tf")

In [15]:
# define the model layers
input_layer = tf.keras.Input(shape=(x_train_tokenized.input_ids.shape[1],), dtype=tf.int32, name="input_ids")
bert_output = bert_model(input_layer)
last_hidden_states = bert_output.last_hidden_state
hidden_layer = tf.keras.layers.Dense(128, activation='relu')(last_hidden_states[:, 0, :])
classification_layer = tf.keras.layers.Dense(1, activation='sigmoid')(hidden_layer)

model = tf.keras.models.Model(inputs=input_layer, outputs=classification_layer)

In [14]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [15]:
model.fit(x_train_tokenized.input_ids, y_train, epochs=10, batch_size=100, validation_data=(x_valid_tokenized.input_ids, y_valid))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2304947af10>

### Evaluation

In [16]:
# Evaluate the model on the test set
loss, accuracy = model.evaluate(x_test_tokenized.input_ids, y_test, batch_size=100)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

# Make predictions on the test set
predictions = model.predict(x_test_tokenized.input_ids)
predicted_labels = (predictions > 0.5).astype(int)

# Calculate evaluation metrics
from sklearn.metrics import classification_report, confusion_matrix

print("Classification Report:")
print(classification_report(y_test, predicted_labels))

print("Confusion Matrix:")
print(confusion_matrix(y_test, predicted_labels))


Test Loss: 0.3344416320323944
Test Accuracy: 0.8497652411460876
Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.21      0.33        75
           1       0.85      0.99      0.92       351

    accuracy                           0.85       426
   macro avg       0.81      0.60      0.62       426
weighted avg       0.84      0.85      0.81       426

Confusion Matrix:
[[ 16  59]
 [  5 346]]


In [17]:
# Convert the model to TFLite
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()

# Save the TFLite model to a file
tflite_model_file = "model.tflite"
with open(tflite_model_file, "wb") as f:
    f.write(tflite_model)



INFO:tensorflow:Assets written to: C:\Users\vania\AppData\Local\Temp\tmpt00l2wwr\assets


INFO:tensorflow:Assets written to: C:\Users\vania\AppData\Local\Temp\tmpt00l2wwr\assets
