### Import Data yang dibutuhkan

In [3]:
import os, sys
sys.path.append('../')
os.chdir('../')

In [4]:
pwd

'C:\\Users\\ahmad\\Project\\Tugas Akhir'

In [5]:
import random
import numpy as np
import pandas as pd
import torch
from torch import optim
import torch.nn.functional as F
from tqdm import tqdm

from transformers import BertForSequenceClassification, BertConfig, BertTokenizer
from nltk.tokenize import TweetTokenizer

from indonlu.utils.forward_fn import forward_sequence_classification
from indonlu.utils.metrics import document_sentiment_metrics_fn
from indonlu.utils.data_utils import DocumentSentimentDataset, DocumentSentimentDataLoader

In [6]:
###
# common functions
###
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    
def count_param(module, trainable=False):
    if trainable:
        return sum(p.numel() for p in module.parameters() if p.requires_grad)
    else:
        return sum(p.numel() for p in module.parameters())
    
def get_lr(optimizer):
    for param_group in optimizer.param_groups:
        return param_group['lr']

def metrics_to_string(metric_dict):
    string_list = []
    for key, value in metric_dict.items():
        string_list.append('{}:{:.2f}'.format(key, value))
    return ' '.join(string_list)

In [7]:
# Set random seed
set_seed(26092020)

In [8]:
# Load Tokenizer and Config
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p1')
config = BertConfig.from_pretrained('indobenchmark/indobert-base-p1')
config.num_labels = DocumentSentimentDataset.NUM_LABELS

# Instantiate model
model = BertForSequenceClassification.from_pretrained('indobenchmark/indobert-base-p1', config=config)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [11]:
import pandas as pd

df = pd.read_table(r"C:\Users\ahmad\Project\Tugas Akhir\Platinum\train_preprocess.tsv.txt")
df.head()

Unnamed: 0,"warung ini dimiliki oleh pengusaha pabrik tahu yang sudah puluhan tahun terkenal membuat tahu putih di bandung . tahu berkualitas , dipadu keahlian memasak , dipadu kretivitas , jadilah warung yang menyajikan menu utama berbahan tahu , ditambah menu umum lain seperti ayam . semuanya selera indonesia . harga cukup terjangkau . jangan lewatkan tahu bletoka nya , tidak kalah dengan yang asli dari tegal !",positive
0,mohon ulama lurus dan k212 mmbri hujjah partai...,neutral
1,lokasi strategis di jalan sumatera bandung . t...,positive
2,betapa bahagia nya diri ini saat unboxing pake...,positive
3,duh . jadi mahasiswa jangan sombong dong . kas...,negative
4,"makanan beragam , harga makanan di food stall ...",positive


In [12]:
from fast_ml.model_development import train_valid_test_split

X_train, y_train, X_valid, y_valid, X_test, y_test = train_valid_test_split(df, target = 'positive', 
                                                                            train_size=0.8, valid_size=0.1, test_size=0.1)

print(X_train.shape), print(y_train.shape)
print(X_valid.shape), print(y_valid.shape)
print(X_test.shape), print(y_test.shape)

(8799, 1)
(8799,)
(1100, 1)
(1100,)
(1100, 1)
(1100,)


(None, None)

In [13]:
train_preprocess = pd.concat([X_train,y_train],axis=1)
valid_preprocess = pd.concat([X_valid,y_valid],axis=1)
test_preprocess = pd.concat([X_test,y_test],axis=1)

In [19]:
test_preprocess

Unnamed: 0,"warung ini dimiliki oleh pengusaha pabrik tahu yang sudah puluhan tahun terkenal membuat tahu putih di bandung . tahu berkualitas , dipadu keahlian memasak , dipadu kretivitas , jadilah warung yang menyajikan menu utama berbahan tahu , ditambah menu umum lain seperti ayam . semuanya selera indonesia . harga cukup terjangkau . jangan lewatkan tahu bletoka nya , tidak kalah dengan yang asli dari tegal !",positive
8358,saya bekerja di jakarta yang sering melakukan ...,positive
307,penjual meminta saya membuat video testing unt...,negative
5773,"tjiang , memiliki beberapa cabang di bandung y...",positive
82,"kalau djarum memang agak tidak kuat , dia baru...",negative
6106,kopitiam oey yang dimiliki oleh pak bondan mak...,positive
...,...,...
4710,hotel mercure bangkok siam payah ! pelayanan t...,negative
561,menu nya lumayan banyak pilihan dan pelayanan ...,positive
4571,restoran ini berada di kawasan dago . waktu pe...,positive
3162,aman gemstone menyediakan berbagai cincin / pe...,neutral


In [14]:
train_preprocess.to_csv('train.tsv', sep='\t', index=False)
valid_preprocess.to_csv('valid.tsv', sep='\t', index=False)
test_preprocess.to_csv('test.tsv', sep='\t', index=False)

In [15]:
train_dataset_path = 'train.tsv'
valid_dataset_path = 'valid.tsv'
test_dataset_path = 'test.tsv'

In [16]:
train_dataset = DocumentSentimentDataset(train_dataset_path, tokenizer, lowercase=True)
valid_dataset = DocumentSentimentDataset(valid_dataset_path, tokenizer, lowercase=True)
test_dataset = DocumentSentimentDataset(test_dataset_path, tokenizer, lowercase=True)

train_loader = DocumentSentimentDataLoader(dataset=train_dataset, max_seq_len=512, batch_size=4, num_workers=0, shuffle=True)  
valid_loader = DocumentSentimentDataLoader(dataset=valid_dataset, max_seq_len=512, batch_size=4, num_workers=0, shuffle=False)  
test_loader = DocumentSentimentDataLoader(dataset=test_dataset, max_seq_len=512, batch_size=4, num_workers=0, shuffle=False)

In [18]:
test_dataset

<indonlu.utils.data_utils.DocumentSentimentDataset at 0x237d7432790>

In [17]:
test_loader

<indonlu.utils.data_utils.DocumentSentimentDataLoader at 0x237d59d8e80>

In [36]:
w2i, i2w = DocumentSentimentDataset.LABEL2INDEX, DocumentSentimentDataset.INDEX2LABEL
print(w2i)
print(i2w)

{'positive': 0, 'neutral': 1, 'negative': 2}
{0: 'positive', 1: 'neutral', 2: 'negative'}


# Test model on sample sentences

In [37]:
text = 'Bahagia hatiku melihat pernikahan putri sulungku yang cantik jelita'
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: Bahagia hatiku melihat pernikahan putri sulungku yang cantik jelita | Label : neutral (41.519%)


In [38]:
text = 'Dasar anak sialan!! Kurang ajar!!'
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)
text = 'Bahagia hatiku melihat pernikahan putri sulungku yang cantik jelita'
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')
logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: Bahagia hatiku melihat pernikahan putri sulungku yang cantik jelita | Label : positive (44.977%)
Text: Bahagia hatiku melihat pernikahan putri sulungku yang cantik jelita | Label : neutral (40.136%)


In [39]:
optimizer = optim.Adam(model.parameters(), lr=3e-6)

In [40]:
# Train
n_epochs = 3
for epoch in range(n_epochs):
    model.train()
    torch.set_grad_enabled(True)
 
    total_train_loss = 0
    list_hyp, list_label = [], []

    train_pbar = tqdm(train_loader, leave=True, total=len(train_loader))
    for i, batch_data in enumerate(train_pbar):
        # Forward model
        loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cpu')

        # Update model
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        tr_loss = loss.item()
        total_train_loss = total_train_loss + tr_loss

        # Calculate metrics
        list_hyp += batch_hyp
        list_label += batch_label

        train_pbar.set_description("(Epoch {}) TRAIN LOSS:{:.4f} LR:{:.8f}".format((epoch+1),
            total_train_loss/(i+1), get_lr(optimizer)))

    # Calculate train metric
    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) TRAIN LOSS:{:.4f} {} LR:{:.8f}".format((epoch+1),
        total_train_loss/(i+1), metrics_to_string(metrics), get_lr(optimizer)))

    # Evaluate on validation
    model.eval()
    torch.set_grad_enabled(False)
    
    total_loss, total_correct, total_labels = 0, 0, 0
    list_hyp, list_label = [], []

    pbar = tqdm(valid_loader, leave=True, total=len(valid_loader))
    for i, batch_data in enumerate(pbar):
        batch_seq = batch_data[-1]        
        loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cpu')
        
        # Calculate total loss
        valid_loss = loss.item()
        total_loss = total_loss + valid_loss

        # Calculate evaluation metrics
        list_hyp += batch_hyp
        list_label += batch_label
        metrics = document_sentiment_metrics_fn(list_hyp, list_label)

        pbar.set_description("VALID LOSS:{:.4f} {}".format(total_loss/(i+1), metrics_to_string(metrics)))
        
    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) VALID LOSS:{:.4f} {}".format((epoch+1),
        total_loss/(i+1), metrics_to_string(metrics)))

(Epoch 1) TRAIN LOSS:0.2462 LR:0.00000300: 100%|█████████████████████████████████| 2200/2200 [1:40:01<00:00,  2.73s/it]


(Epoch 1) TRAIN LOSS:0.2462 ACC:0.91 F1:0.88 REC:0.86 PRE:0.90 LR:0.00000300


VALID LOSS:0.1865 ACC:0.94 F1:0.91 REC:0.89 PRE:0.93: 100%|██████████████████████████| 276/276 [02:09<00:00,  2.13it/s]


(Epoch 1) VALID LOSS:0.1865 ACC:0.94 F1:0.91 REC:0.89 PRE:0.93


(Epoch 2) TRAIN LOSS:0.1266 LR:0.00000300: 100%|█████████████████████████████████| 2200/2200 [1:29:03<00:00,  2.43s/it]


(Epoch 2) TRAIN LOSS:0.1266 ACC:0.96 F1:0.95 REC:0.94 PRE:0.95 LR:0.00000300


VALID LOSS:0.1896 ACC:0.94 F1:0.93 REC:0.92 PRE:0.94: 100%|██████████████████████████| 276/276 [02:47<00:00,  1.64it/s]


(Epoch 2) VALID LOSS:0.1896 ACC:0.94 F1:0.93 REC:0.92 PRE:0.94


(Epoch 3) TRAIN LOSS:0.0770 LR:0.00000300: 100%|█████████████████████████████████| 2200/2200 [1:33:33<00:00,  2.55s/it]


(Epoch 3) TRAIN LOSS:0.0770 ACC:0.98 F1:0.97 REC:0.97 PRE:0.97 LR:0.00000300


VALID LOSS:0.1926 ACC:0.95 F1:0.93 REC:0.92 PRE:0.94: 100%|██████████████████████████| 276/276 [02:06<00:00,  2.18it/s]

(Epoch 3) VALID LOSS:0.1926 ACC:0.95 F1:0.93 REC:0.92 PRE:0.94





In [42]:
# Evaluate on test
model.eval()
torch.set_grad_enabled(False)

total_loss, total_correct, total_labels = 0, 0, 0
list_hyp, list_label = [], []

pbar = tqdm(test_loader, leave=True, total=len(test_loader))
for i, batch_data in enumerate(pbar):
    _, batch_hyp, _ = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cpu')
    list_hyp += batch_hyp

# Save prediction
df = pd.DataFrame({'label':list_hyp}).reset_index()
df.to_csv('pred.txt', index=False)

print(df)

100%|████████████████████████████████████████████████████████████████████████████████| 276/276 [01:57<00:00,  2.35it/s]

      index     label
0         0  positive
1         1  positive
2         2  negative
3         3  positive
4         4  negative
...     ...       ...
1096   1096  negative
1097   1097  positive
1098   1098  positive
1099   1099   neutral
1100   1100   neutral

[1101 rows x 2 columns]





In [45]:
model.save_pretrained('PretrainedBART.pkl')