# Skema Pelatihan 3

### Menggunakan Transfer Learning dengan cara finetune distilbert indo dengan pelabelan menggunakan model pretrained

In [None]:
import torch
import pandas as pd
import re
import requests
import ast
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader
from tqdm import tqdm
import numpy as np
import torch.nn as nn
import torch.nn.functional as F

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


# Preprocessing

In [3]:
def getSlangWordList(url):
    response = requests.get(url)
    response.raise_for_status()  
    content = response.text
    data_dict = ast.literal_eval(content)
    return data_dict

slangwords = getSlangWordList('https://raw.githubusercontent.com/louisowen6/NLP_bahasa_resources/master/combined_slang_words.txt')
slangwords['gak'] = 'tidak'
slangwords['ga'] = 'tidak'
slangwords['bagu'] = 'bagus'
slangwords['gk'] ='tidak'
slangwords['udh'] = 'sudah'
slangwords['sdh'] = 'sudah'



In [None]:
df = pd.read_csv('data/ulasan_aplikasi_gojek_clean_100k.csv',on_bad_lines='skip')

In [5]:
df

Unnamed: 0,Review,Rating,tokenizing_clean,final_text,polarity_score_lexicon,polarity_lexicon,polarity_rating
0,terlalu terlalu terlalu... apk yg tidak bisa d...,1,"['apk', 'percaya', 'kuota', 'cepat', 'habis', ...",apk percaya kuota cepat habis update bug hp la...,-6,negative,negative
1,"Gak usah pasang tarif tarif hemat, soalnya par...",1,"['pasang', 'tarif', 'tarif', 'hemat', 'drivern...",pasang tarif tarif hemat drivernya ambil order...,25,positive,negative
2,tinggal 2menit lg driver sampe di lokasi tiba-...,1,"['tinggal', 'menit', 'lokasi', 'tibatiba', 'di...",tinggal menit lokasi tibatiba dibatalin otomat...,9,positive,negative
3,sebagai pengguna lama baru kali ini saya kecew...,1,"['pengguna', 'kali', 'kecewa', 'gocar', 'kadan...",pengguna kali kecewa gocar kadang sesuai harap...,26,positive,negative
4,susah untuk dpt driver walaupun di map ada ban...,1,"['susah', 'map', 'jalan', 'sekalinya', 'hujan'...",susah map jalan sekalinya hujan harga langsung...,16,positive,negative
...,...,...,...,...,...,...,...
129851,Cuman lupa email semua transaksi hafal akun ma...,1,"['cuman', 'lupa', 'email', 'transaksi', 'hafal...",cuman lupa email transaksi hafal akun nyangkut...,8,positive,negative
129852,Untuk apk gojek tetap di pertahankan dalam pel...,5,"['apk', 'pertahankan', 'pelayanan', 'goride', ...",apk pertahankan pelayanan goride dsb min terim...,9,positive,positive
129853,Dengan Go-Jek saya lebih terbantu kalau mau pe...,5,"['terbantu', 'pesan', 'makan', 'tinggal', 'men...",terbantu pesan makan tinggal mengunakan,4,positive,positive
129854,Driverny pada blagu2.aplikasi on tapi gak mau ...,1,"['driverny', 'blaguaplikasi', 'narikkan', 'don...",driverny blaguaplikasi narikkan dongo,0,neutral,negative


In [6]:
def processingTextForBert(text):
    # Cleaning Text
    text = re.sub(r'@[A-Za-z0-9]+', '', text) 
    text = re.sub(r'#[A-Za-z0-9]+', '', text) 
    text = re.sub(r'RT[\s]', '', text) 
    text = re.sub(r"http\S+", '', text) 
    text = re.sub(r'[0-9]+', '', text)

    text = text.replace('\n', ' ') 
    text = text.strip(' ') 

    # Casefolding Text
    text = text.lower()

    tokenize = text.split()
    tokenize = [word for word in tokenize if word not in slangwords]
    text = ' '.join(tokenize)
    return text



In [7]:
df['processed_bert'] = df['Review'].apply(processingTextForBert)

In [8]:
df['polarity_lexicon'] = df['polarity_lexicon'].apply(lambda x: 0 if x == 'negative' else 1 if x == 'neutral' else 2)
df['polarity_rating'] = df['polarity_rating'].apply(lambda x: 0 if x == 'negative' else 1 if x == 'neutral' else 2)

# Labeling Using Model Pretrained (w11wo/indonesian-roberta-base-sentiment-classifier)

In [9]:
tokenizer_predict = AutoTokenizer.from_pretrained("w11wo/indonesian-roberta-base-sentiment-classifier")
model_predict = AutoModelForSequenceClassification.from_pretrained(
    "w11wo/indonesian-roberta-base-sentiment-classifier",
    num_labels=3
)
model_predict.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [10]:
datasets_for_predict = Dataset.from_pandas(df[['processed_bert']])

def tokenize(examples):
    return tokenizer_predict(examples["processed_bert"], padding="max_length", truncation=True, max_length=35)
datasets_for_predict = datasets_for_predict.map(tokenize, batched=True)

Map: 100%|██████████| 129856/129856 [00:06<00:00, 19245.98 examples/s]


In [11]:
def predict_batch(batch):
    input_ids = torch.tensor(batch["input_ids"]).to(device)
    attention_mask = torch.tensor(batch["attention_mask"]).to(device)

    with torch.no_grad():
        output = model_predict(input_ids=input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(output.logits, dim=1).cpu().numpy()

    return {"predicted_label": predictions}

# Apply map with batched=True
dataset = datasets_for_predict.map(predict_batch, batched=True, batch_size=64)  # kamu bisa atur batch_size sesuai GPU


Map: 100%|██████████| 129856/129856 [09:47<00:00, 220.94 examples/s]


In [None]:
dataset.to_pandas()[['processed_bert', 'predicted_label']].to_csv('data/dataset_labeling_with_model_100k.csv', index=False)

# Import dataset dengan pelabelan baru

In [None]:
df = pd.read_csv('data/dataset_labeling_with_model_100k.csv')

In [14]:
print(df['predicted_label'].value_counts())

predicted_label
2    71471
0    42616
1    15769
Name: count, dtype: int64


# Cek hasil prediksi

In [None]:
df[df['predicted_label'] == 2].values

array([['terlalu terlalu terlalu... apk tidak bisa di percaya. kuota cepat minta update terus. bug banyak. hp !!!.. kok bisa nyuri data pribadi ya... diliat dari apk terus berjln dilatar belakang dan terus mengirim data. g data dikirim ?? wah wah wah... bahaya bobol bisa atm wkwkwk',
        2],
       ['pasang tarif tarif hemat, soalnya para driver nya ada yang mau ambil orderan, dengan alasan sesuai dengan harga, dan bahasa mereka merendahkan costumer, karena pesen paket hemat, di adain paket hemat, salah punya aplikasi lah, pengen kasih promo tapi para driver nya mau.',
        2],
       ['tinggal menit driver di lokasi tiba-tiba dibatalin otomatis sama aplikasinya!!! mencari driver baru, dan ini kejadian sekali dua kali tapi sering! map sering di arahin ke jalan sulit di lewati mobil!! alhasil banyak batalin! komplain via chat cuma bot doang, kita complain bisa ketik sendiri masalahnya, complain lewat email atau bisa!!! aplikasi niat terima komplain!!!',
        2],
       ...,
  

In [16]:
df[df['predicted_label'] == 1].values

array([['sudah top up dan konfirmasi sudah masuk pas di cek saldo ada, padahal transaksi juga sukses dan di riwayat gopay juga ada dana masuk tapi tetep saldo nya ada, di cek takut nya dana nya sudah keluar tapi tetep ada, fix bug nya aneh',
        1],
       ['saya mengisi saldo go-pay sebesar . di tanggal // di keterangan history sudah berhasil, tetapi saldo tidak bertambah, kemudian saya mengisi lagi saldo sebesar . di tanggal yang sama tetap saja saldo tidak bertambah, kemudian saya melaporkan masalah tersebut di fitur pusat bantuan yang ada di aplikasi tetapi tanggapan nya sangat lama sekali, tolong di perbaiki kualitas aplikasi nya',
        1],
       ['buat aku jika gofood bisa di ambil langsung cancel aja. bukan nunggu sejam lebih, wa cs baru di camcel. kasih soal gocar hemat, gocar biasa dan prioritas. sampai klik hemat eh nya ac.',
        1],
       ...,
       ['cuman lupa email semua transaksi hafal akun masih nyangkut di hp suruh buat pin dan nomer buat verif hilang big

In [17]:
df[df['predicted_label'] == 0].values

array([['suka sih sama apk ini,promo nya gede",tapi giliran kita promonya driver banyak orderan kita, alasan driver nya nya kecil mknya pada gakmau ambil,yg bermasalah brti di apk nya dong,yg kena customer,pesan makanan sejam lebih baru ada mau ambil orderan nya,mau marah ke driver nya yaa mereka juga salah,lebih di tingkatkan lagi yaa',
        0],
       ['gocar lebih sering dapat ac yang tidak dingin...sy kasih bintang untuk apresiasi pada unit yang ac nya dingin saja, selebihnya buat ac di unit nya tidak dingin, padahal pakai gocar biasa dan gocar comfort, bukan gocar hemat,,, ttp gitulah maka bersyukur banget jika dapat unit yang ac nya',
        0],
       ['pernah bisa % percaya map nya gojek walaupun sudah di rumah, sudah di pin poin, pakai koordinat, walaupun hidup gps, map ny sepintar map nya grab yang titik % sesuai ekspektasi. sebagai pengguna disuruh cek and recheck yang mana efektif kalau buru buru. berkali kali pesan goride ujung disuruh jalan sm mitra karena titik berbe

Dari hasil yang dilihat, hal yang mungkin untuk label adalah
- 0 positif
- 2 negatif
- 1 netral

# Splitting Dataset

In [18]:
from datasets import ClassLabel

datasets = Dataset.from_pandas(df)
class_label = ClassLabel(num_classes=3, names=["negative", "neutral", "positive"])
datasets = datasets.cast_column("predicted_label", class_label)

splitted = datasets.train_test_split(test_size=0.2, stratify_by_column="predicted_label")
data_train = splitted['train']
data_test = splitted['test']

Casting the dataset: 100%|██████████| 129856/129856 [00:00<00:00, 2803499.85 examples/s]


# Load Model For Fine Tune

In [19]:
tokenizer = AutoTokenizer.from_pretrained("cahya/distilbert-base-indonesian")
model = AutoModelForSequenceClassification.from_pretrained(
    "cahya/distilbert-base-indonesian",
    num_labels=3
)
model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at cahya/distilbert-base-indonesian and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


# Tokenizing Dataset

In [20]:
datasets = Dataset.from_pandas(df)

def tokenize(examples):
    return tokenizer(examples["processed_bert"], padding="max_length", truncation=True, max_length=35)

data_train_tokenized = data_train.map(tokenize, batched=True)
data_test_tokenized = data_test.map(tokenize, batched=True)


Map: 100%|██████████| 103884/103884 [00:09<00:00, 10686.44 examples/s]
Map: 100%|██████████| 25972/25972 [00:02<00:00, 12189.52 examples/s]


In [21]:
def rename_label(example):
    example["labels"] = example["predicted_label"]
    return example

data_train_tokenized = data_train_tokenized.map(rename_label)
data_test_tokenized = data_test_tokenized.map(rename_label)

Map: 100%|██████████| 103884/103884 [00:07<00:00, 14142.97 examples/s]
Map: 100%|██████████| 25972/25972 [00:01<00:00, 13643.67 examples/s]


In [22]:
data_train_tokenized.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
data_test_tokenized.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [23]:
data_train_tokenized = data_train_tokenized.remove_columns(["predicted_label",'processed_bert'])
data_test_tokenized = data_test_tokenized.remove_columns(["predicted_label",'processed_bert'])

In [24]:
data_train_tokenized

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 103884
})

In [25]:
data_test_tokenized

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 25972
})

In [26]:
train_loader = DataLoader(data_train_tokenized, batch_size=32, shuffle=True)
test_loader = DataLoader(data_test_tokenized, batch_size=32, shuffle=False)

# Training

In [None]:
# Menggunakan focal loss untuk handle imbalance data
class FocalLoss(nn.Module):
    def __init__(self, gamma=2, weight=None):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.weight = weight

    def forward(self, logits, targets):
        ce_loss = F.cross_entropy(logits, targets, weight=self.weight, reduction="none")
        p_t = torch.exp(-ce_loss)  
        focal_loss = ((1 - p_t) ** self.gamma) * ce_loss
        return focal_loss.mean()

In [28]:

from sklearn.utils import compute_class_weight
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)

labels = np.array(data_train_tokenized['labels'])
unique_labels = np.unique(labels)
class_weights = compute_class_weight('balanced', classes=unique_labels, y=labels)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)
criterion = FocalLoss(gamma=2, weight=class_weights_tensor.to(device))

  class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)


In [29]:
num_epochs = 10
weight_decay = 1e-4 


for epoch in range(num_epochs):
    # Training Phase
    model.train()
    running_loss = 0.0
    correct, total = 0, 0

    train_loop = tqdm(train_loader, leave=True)
    for batch in train_loop:
        inputs = batch['input_ids'].to(device)
        masks = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=inputs, attention_mask=masks)
        logits = outputs.logits

        loss = criterion(logits, labels)

        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        
        optimizer.step()

        running_loss += loss.item()
        _, predicted = torch.max(logits, 1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

        train_loop.set_description(f"Epoch [{epoch+1}/{num_epochs}]")
        train_loop.set_postfix(loss=loss.item(), acc=100 * correct / total)

    train_loss = running_loss / len(train_loader)
    train_accuracy = 100 * correct / total
    
    # Validation Phase
    model.eval()
    val_running_loss = 0.0
    val_correct, val_total = 0, 0
    
    with torch.no_grad():
        val_loop = tqdm(test_loader, leave=True, desc="Validation")
        for batch in val_loop:
            inputs = batch['input_ids'].to(device)
            masks = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=inputs, attention_mask=masks)
            logits = outputs.logits

            val_loss = criterion(logits, labels)

            val_running_loss += val_loss.item()
            _, predicted = torch.max(logits, 1)
            val_correct += (predicted == labels).sum().item()
            val_total += labels.size(0)
            
            val_loop.set_postfix(loss=val_loss.item(), acc=100 * val_correct / val_total)
    
    val_loss = val_running_loss / len(test_loader)
    val_accuracy = 100 * val_correct / val_total
    
    
    print(f"Epoch {epoch+1} - Train Loss: {train_loss:.4f}, Train Acc: {train_accuracy:.2f}% | "
          f"Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.2f}%")
    
    
    model.save_pretrained("saved_model")
    tokenizer.save_pretrained("saved_model")
    

Epoch [1/10]: 100%|██████████| 3247/3247 [11:46<00:00,  4.60it/s, acc=76.9, loss=0.064] 
Validation: 100%|██████████| 812/812 [01:08<00:00, 11.87it/s, acc=78.7, loss=0.0751]


Epoch 1 - Train Loss: 0.1832, Train Acc: 76.87% | Val Loss: 0.1530, Val Acc: 78.65%


Epoch [2/10]: 100%|██████████| 3247/3247 [12:06<00:00,  4.47it/s, acc=83.2, loss=0.139] 
Validation: 100%|██████████| 812/812 [01:08<00:00, 11.81it/s, acc=83.6, loss=0.152] 


Epoch 2 - Train Loss: 0.1264, Train Acc: 83.20% | Val Loss: 0.1971, Val Acc: 83.58%


Epoch [3/10]: 100%|██████████| 3247/3247 [12:05<00:00,  4.48it/s, acc=87.2, loss=0.0299] 
Validation: 100%|██████████| 812/812 [01:08<00:00, 11.92it/s, acc=86, loss=0.172]   


Epoch 3 - Train Loss: 0.0908, Train Acc: 87.20% | Val Loss: 0.2229, Val Acc: 86.02%


Epoch [4/10]: 100%|██████████| 3247/3247 [11:59<00:00,  4.51it/s, acc=90.4, loss=0.0159] 
Validation: 100%|██████████| 812/812 [01:07<00:00, 11.98it/s, acc=84.8, loss=0.0959] 


Epoch 4 - Train Loss: 0.0655, Train Acc: 90.38% | Val Loss: 0.2922, Val Acc: 84.76%


Epoch [5/10]: 100%|██████████| 3247/3247 [11:59<00:00,  4.51it/s, acc=92.7, loss=0.267]  
Validation: 100%|██████████| 812/812 [01:07<00:00, 11.96it/s, acc=85.7, loss=0.24]   


Epoch 5 - Train Loss: 0.0504, Train Acc: 92.71% | Val Loss: 0.3306, Val Acc: 85.72%


Epoch [6/10]: 100%|██████████| 3247/3247 [12:09<00:00,  4.45it/s, acc=94.2, loss=0.0114]  
Validation: 100%|██████████| 812/812 [01:07<00:00, 12.03it/s, acc=85.7, loss=0.0947]


Epoch 6 - Train Loss: 0.0418, Train Acc: 94.23% | Val Loss: 0.2890, Val Acc: 85.74%


Epoch [7/10]: 100%|██████████| 3247/3247 [12:11<00:00,  4.44it/s, acc=95.5, loss=0.00307] 
Validation: 100%|██████████| 812/812 [01:10<00:00, 11.46it/s, acc=86.7, loss=0.357]  


Epoch 7 - Train Loss: 0.0330, Train Acc: 95.49% | Val Loss: 0.3960, Val Acc: 86.72%


Epoch [8/10]: 100%|██████████| 3247/3247 [12:08<00:00,  4.46it/s, acc=96.4, loss=0.0779]  
Validation: 100%|██████████| 812/812 [01:08<00:00, 11.84it/s, acc=85.8, loss=0.538]   


Epoch 8 - Train Loss: 0.0287, Train Acc: 96.37% | Val Loss: 0.4150, Val Acc: 85.80%


Epoch [9/10]: 100%|██████████| 3247/3247 [12:00<00:00,  4.51it/s, acc=96.9, loss=0.717]   
Validation: 100%|██████████| 812/812 [01:07<00:00, 11.97it/s, acc=87, loss=0.353]     


Epoch 9 - Train Loss: 0.0261, Train Acc: 96.94% | Val Loss: 0.4490, Val Acc: 86.98%


Epoch [10/10]: 100%|██████████| 3247/3247 [11:52<00:00,  4.56it/s, acc=97.5, loss=0.000738]
Validation: 100%|██████████| 812/812 [01:07<00:00, 12.00it/s, acc=86.4, loss=0.348]  


Epoch 10 - Train Loss: 0.0234, Train Acc: 97.51% | Val Loss: 0.4103, Val Acc: 86.43%


# Evaluasi Model

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import torch


def evaluate_model(model, test_dataloader, device):
    model.eval() 
    
    all_preds = []
    all_labels = []
    all_loss = []
    
    with torch.no_grad(): 
        for batch in test_dataloader:

            inputs = batch['input_ids']
            masks = batch['attention_mask']
            labels = batch['labels']

            inputs, masks, labels = inputs.to(device), masks.to(device), labels.to(device)
            
            outputs = model(input_ids=inputs, attention_mask=masks)
            logits = outputs.logits
            
            loss = criterion(logits, labels)
            all_loss.append(loss.item())
            
            _, preds = torch.max(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    avg_loss = sum(all_loss) / len(all_loss)
    accuracy = accuracy_score(all_labels, all_preds)
    report = classification_report(all_labels, all_preds)
    cm = confusion_matrix(all_labels, all_preds)
    
    print(f"Average Loss: {avg_loss:.4f}")
    print(f"Accuracy: {accuracy:.2f}%")
    print("Classification Report:")
    print(report)
    print("Confusion Matrix:")
    print(cm)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)  

evaluate_model(model, test_loader, device)



Average Loss: 0.4103
Accuracy: 0.86%
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.91      0.89      8523
           1       0.66      0.73      0.69      3154
           2       0.91      0.87      0.89     14295

    accuracy                           0.86     25972
   macro avg       0.81      0.84      0.82     25972
weighted avg       0.87      0.86      0.87     25972

Confusion Matrix:
[[ 7715   220   588]
 [  263  2305   586]
 [  882   985 12428]]
