In [1]:
import pandas as pd
import numpy as np

import torch
from tqdm import tqdm
# from tf2crf import CRF

import gc

# from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils.class_weight import compute_class_weight

import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dropout, Dense, TimeDistributed, MaxPool1D
from tensorflow.keras.callbacks import ModelCheckpoint,TensorBoard,EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences
# from tensorflow.keras.layers import Dense, Flatten, Embedding, Input, LSTM, Conv1D, MaxPool1D, Bidirectional, Dropout, BatchNormalization


# from skmultilearn.problem_transform import LabelPowerset
# from sklearn.multioutput import ClassifierChain

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, hamming_loss, f1_score
from sklearn.preprocessing import LabelEncoder

# from imblearn.over_sampling import SMOTE

from keras import layers
import keras
from keras.utils import to_categorical
from keras_preprocessing import sequence
from keras_preprocessing.text import Tokenizer
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model

import seaborn as sns
# import xgboost as xgb
from sklearn.linear_model import LogisticRegression
# from skmultilearn.problem_transform import BinaryRelevance

from transformers import BertTokenizer, BertModel

from nltk.tokenize import word_tokenize
import os

import matplotlib.pyplot as plt

In [2]:
PRETRAINED_MODEL_NAME = 'indobenchmark/indobert-base-p1'
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

In [3]:
from torch.utils.data import Dataset

class FakeNewsDataset(Dataset):
    def __init__(self, mode, tokenizer):
        assert mode in ['train', 'val', 'test']
        self.mode = mode
        self.df = pd.read_csv(mode + '.tsv', sep='\t').fillna("")
        self.len = len(self.df)
        self.tokenizer = tokenizer  # BERT tokenizer
    
    def __getitem__(self, idx):
        if self.mode == 'test':
            statement, label = self.df.iloc[idx, :].values
            label_tensor = torch.tensor(label)
        else:
            statement, label = self.df.iloc[idx, :].values
            label_tensor = torch.tensor(label)
            
        word_pieces = ['[CLS]']
        statement = self.tokenizer.tokenize(statement)
        word_pieces += statement + ['[SEP]']
        len_st = len(word_pieces)
        
        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.tensor(ids)
        
        segments_tensor = torch.tensor([0] * len_st, dtype=torch.long)
        
        return (tokens_tensor, segments_tensor, label_tensor)
    
    def __len__(self):
        return self.len
    
    
# Initialize Datasets for Transformation
trainset = FakeNewsDataset('train', tokenizer=tokenizer)
valset = FakeNewsDataset('val', tokenizer=tokenizer)
testset = FakeNewsDataset('test', tokenizer=tokenizer)

print('trainset size:' ,trainset.__len__())
print('valset size:',valset.__len__())
print('testset size: ',testset.__len__())

trainset size: 3194
valset size: 798
testset size:  998


In [4]:
sample_idx = 0
statement, label = trainset.df.iloc[sample_idx].values
tokens_tensor, segments_tensor, label_tensor = trainset[sample_idx]

tokens = tokenizer.convert_ids_to_tokens(tokens_tensor.tolist())
combined_text = " ".join(tokens)

print(f"""
original_statement: 
{statement}

tokens: 
{tokens}

label: {label}

--------------------

tokens_tensor: 
{tokens_tensor}

segments_tensor: 
{segments_tensor}

label_tensor: 
{label_tensor}

""")


original_statement: 
anies ganjar kemarin ngasih kritik gibran sopan mahfud bain dukung

tokens: 
['[CLS]', 'anies', 'ganja', '##r', 'kemarin', 'ngasih', 'kritik', 'gib', '##ran', 'sopan', 'mahfud', 'bai', '##n', 'dukung', '[SEP]']

label: 5

--------------------

tokens_tensor: 
tensor([    2, 19145, 18006, 30359,  3601, 13710,  6476, 19044,  3823,  9036,
        20469, 11217, 30355,  9162,     3])

segments_tensor: 
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

label_tensor: 
5




In [5]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

def create_mini_batch(samples):
    tokens_tensors = [s[0] for s in samples]
    segments_tensors = [s[1] for s in samples]
    
    if samples[0][2] is not None:
        label_ids = torch.stack([s[2] for s in samples])
    else:
        label_ids = None
    
    tokens_tensors = pad_sequence(tokens_tensors, batch_first=True)
    segments_tensors = pad_sequence(segments_tensors, batch_first=True)
    

    masks_tensors = torch.zeros(tokens_tensors.shape, dtype=torch.long)
    masks_tensors = masks_tensors.masked_fill(tokens_tensors != 0, 1)
    
    return tokens_tensors, segments_tensors, masks_tensors, label_ids

BATCH_SIZE = 16
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, collate_fn=create_mini_batch)
valloader = DataLoader(valset, batch_size=BATCH_SIZE, collate_fn=create_mini_batch)
testloader = DataLoader(testset, batch_size=BATCH_SIZE,collate_fn=create_mini_batch)
     

In [6]:
data = next(iter(trainloader))

tokens_tensors, segments_tensors, masks_tensors, label_ids = data

print(f"""
tokens_tensors.shape   = {tokens_tensors.shape} 
{tokens_tensors}
------------------------
segments_tensors.shape = {segments_tensors.shape}
{segments_tensors}
------------------------
masks_tensors.shape    = {masks_tensors.shape}
{masks_tensors}
------------------------
label_ids.shape        = {label_ids.shape}
{label_ids}
""")


tokens_tensors.shape   = torch.Size([16, 23]) 
tensor([[    2, 19145, 18006, 30359,  3601, 13710,  6476, 19044,  3823,  9036,
         20469, 11217, 30355,  9162,     3,     0,     0,     0,     0,     0,
             0,     0,     0],
        [    2, 19145,  1023,  8137,  9420, 21167,  5822, 10634,  3500,     3,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0],
        [    2, 19145,  2911, 14979,  2424,  1375,  3341, 27715,  5454,  5035,
          8292, 22710,   678,  9558, 12058,     3,     0,     0,     0,     0,
             0,     0,     0],
        [    2,  3681,    84,  3681,    84,  8181,  3624,  2655,   986,   986,
          4772,  1587,  3577,   986,  2501, 15994, 18006, 30359,  7979,  6378,
         20469,     3,     0],
        [    2, 11283,  2873, 11935,  2690,   494, 23918,     3,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0],
       

In [7]:
from transformers import BertForSequenceClassification
from IPython.display import display, clear_output

NUM_LABELS = 8

model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_MODEL_NAME, num_labels=NUM_LABELS)

clear_output()

print("""
name             module
-----------------------""")
for name, module in model.named_children():
    if name == "bert":
        for n, _ in module.named_children():
            print(f"{name}:{n}")
    else:
        print("{:16} {}".format(name, module))


name             module
-----------------------
bert:embeddings
bert:encoder
bert:pooler
dropout          Dropout(p=0.1, inplace=False)
classifier       Linear(in_features=768, out_features=8, bias=True)


In [22]:
import torch.nn as nn

class BERT_BiLSTM(nn.Module):
    def __init__(self, pretrained_model_name, hidden_dim, num_labels):
        super(BERT_BiLSTM, self).__init__()
        self.bert = BertModel.from_pretrained(pretrained_model_name)
        self.lstm = nn.LSTM(input_size=self.bert.config.hidden_size,
                            hidden_size=hidden_dim,
                            num_layers=1,
                            bidirectional=True,
                            batch_first=True)
        self.classifier = nn.Linear(hidden_dim * 2, num_labels)
    
    def forward(self, input_ids, token_type_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, 
                            token_type_ids=token_type_ids, 
                            attention_mask=attention_mask)
        
        sequence_output = outputs[0]  # shape: (batch_size, sequence_length, hidden_size)
        
        lstm_output, _ = self.lstm(sequence_output)  # shape: (batch_size, sequence_length, hidden_dim*2)
        
        lstm_output = lstm_output[:, 0, :]  # Take the output of the first token (CLS token)
        
        logits = self.classifier(lstm_output)  # shape: (batch_size, num_labels)
        
        return logits

# Initialize the model
NUM_LABELS = 8
HIDDEN_DIM = 100
model = BERT_BiLSTM(PRETRAINED_MODEL_NAME, HIDDEN_DIM, NUM_LABELS)

# Print the model architecture
print(model)



BERT_BiLSTM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [23]:
# Example training loop (simplified)
from torch.optim import Adam
from torch.nn import CrossEntropyLoss

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = Adam(model.parameters(), lr=2e-5)
loss_fn = CrossEntropyLoss()

# Training loop
for epoch in range(3):  # Example: 3 epochs
    model.train()
    for data in trainloader:
        tokens_tensors, segments_tensors, masks_tensors, label_ids = [t.to(device) for t in data]
        
        optimizer.zero_grad()
        
        outputs = model(tokens_tensors, segments_tensors, masks_tensors)
        loss = loss_fn(outputs, label_ids)
        
        loss.backward()
        optimizer.step()
    
    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")
    
# Validation loop (simplified)
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for data in valloader:
        tokens_tensors, segments_tensors, masks_tensors, label_ids = [t.to(device) for t in data]
        
        outputs = model(tokens_tensors, segments_tensors, masks_tensors)
        _, predicted = torch.max(outputs, 1)
        
        total += label_ids.size(0)
        correct += (predicted == label_ids).sum().item()

print(f"Validation Accuracy: {100 * correct / total}%")