In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m48.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m68.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1


In [None]:
# negative = 2
# positive = 0
# neutral = 1

In [None]:
from transformers import pipeline
import transformers

import random
import numpy as np
import pandas as pd
import torch
from torch import optim
import torch.nn.functional as F
from tqdm import tqdm

from transformers import BertForSequenceClassification, BertConfig, BertTokenizer
from nltk.tokenize import TweetTokenizer

# Load external function

In [None]:
import torch

###
# Forward Function
###

# Forward function for sequence classification
def forward_sequence_classification(model, batch_data, i2w, is_test=False, device='cpu', **kwargs):
    """Run one batch through a sequence-classification model.

    Args:
        model: callable invoked as
            ``model(subwords, attention_mask=..., token_type_ids=..., labels=...)``
            whose first two outputs are ``(loss, logits)``.
        batch_data: ``(subwords, masks, labels)`` or
            ``(subwords, masks, token_types, labels)``. Labels are indexed as
            ``label[j][0]``, i.e. one class id in column 0 of each row.
        i2w: mapping from class id to label name.
        is_test: unused; kept for signature compatibility.
        device: tensors are moved to this device when it names a CUDA device
            ("cuda", "cuda:0", ...); otherwise they stay on the CPU.
        **kwargs: ignored.

    Returns:
        ``(loss, list_hyp, list_label)`` where the two lists hold the predicted
        and gold label names for every item of the batch.

    Raises:
        ValueError: if ``batch_data`` does not have 3 or 4 elements.
    """
    # Unpack batch data; fail loudly instead of hitting an UnboundLocalError later.
    if len(batch_data) == 3:
        subword_batch, mask_batch, label_batch = batch_data
        token_type_batch = None
    elif len(batch_data) == 4:
        subword_batch, mask_batch, token_type_batch, label_batch = batch_data
    else:
        raise ValueError(
            'batch_data must have 3 or 4 elements, got {}'.format(len(batch_data)))

    # Prepare input & label
    subword_batch = torch.LongTensor(subword_batch)
    mask_batch = torch.FloatTensor(mask_batch)
    if token_type_batch is not None:
        token_type_batch = torch.LongTensor(token_type_batch)
    label_batch = torch.LongTensor(label_batch)

    # Accept any CUDA device spec, not only the exact string "cuda".
    if str(device).startswith("cuda"):
        subword_batch = subword_batch.to(device)
        mask_batch = mask_batch.to(device)
        if token_type_batch is not None:
            token_type_batch = token_type_batch.to(device)
        label_batch = label_batch.to(device)

    # Forward model
    outputs = model(subword_batch, attention_mask=mask_batch,
                    token_type_ids=token_type_batch, labels=label_batch)
    loss, logits = outputs[:2]

    # Top-1 class id per row -> label name; gold id is column 0 of the labels.
    pred_ids = torch.topk(logits, 1)[1].squeeze(-1).tolist()
    gold_ids = label_batch[:, 0].tolist()
    list_hyp = [i2w[idx] for idx in pred_ids]
    list_label = [i2w[idx] for idx in gold_ids]

    return loss, list_hyp, list_label

# Forward function for word classification
def forward_word_classification(model, batch_data, i2w, is_test=False, device='cpu', **kwargs):
    """Run one batch through a word-level (token tagging) model.

    Args:
        model: callable invoked as
            ``model(subwords, subword_to_word_indices, attention_mask=...,
            token_type_ids=..., labels=...)`` whose first two outputs are
            ``(loss, logits)``.
        batch_data: ``(subwords, masks, subword_to_word_indices, labels)`` or
            ``(subwords, masks, token_types, subword_to_word_indices, labels)``.
        i2w: mapping from class id to label name.
        is_test: unused; kept for signature compatibility.
        device: tensors are moved to this device when it names a CUDA device
            ("cuda", "cuda:0", ...); otherwise they stay on the CPU.
        **kwargs: ignored.

    Returns:
        ``(loss, list_hyps, list_labels)``: per sequence, the predicted and gold
        label names up to (not including) the first label equal to -100.

    Raises:
        ValueError: if ``batch_data`` does not have 4 or 5 elements.
    """
    # Unpack batch data; fail loudly instead of hitting an UnboundLocalError later.
    if len(batch_data) == 4:
        subword_batch, mask_batch, subword_to_word_indices_batch, label_batch = batch_data
        token_type_batch = None
    elif len(batch_data) == 5:
        (subword_batch, mask_batch, token_type_batch,
         subword_to_word_indices_batch, label_batch) = batch_data
    else:
        raise ValueError(
            'batch_data must have 4 or 5 elements, got {}'.format(len(batch_data)))

    # Prepare input & label
    subword_batch = torch.LongTensor(subword_batch)
    mask_batch = torch.FloatTensor(mask_batch)
    if token_type_batch is not None:
        token_type_batch = torch.LongTensor(token_type_batch)
    subword_to_word_indices_batch = torch.LongTensor(subword_to_word_indices_batch)
    label_batch = torch.LongTensor(label_batch)

    # Accept any CUDA device spec, not only the exact string "cuda".
    if str(device).startswith("cuda"):
        subword_batch = subword_batch.to(device)
        mask_batch = mask_batch.to(device)
        if token_type_batch is not None:
            token_type_batch = token_type_batch.to(device)
        subword_to_word_indices_batch = subword_to_word_indices_batch.to(device)
        label_batch = label_batch.to(device)

    # Forward model
    outputs = model(subword_batch, subword_to_word_indices_batch, attention_mask=mask_batch,
                    token_type_ids=token_type_batch, labels=label_batch)
    loss, logits = outputs[:2]

    # Top-1 class id per position, converted once to nested Python lists.
    pred_ids = torch.topk(logits, k=1, dim=-1)[1].squeeze(dim=-1).tolist()
    gold_ids = label_batch.tolist()

    list_hyps, list_labels = [], []
    for seq_pred, seq_gold in zip(pred_ids, gold_ids):
        seq_hyp, seq_label = [], []
        for pred_id, gold_id in zip(seq_pred, seq_gold):
            if gold_id == -100:
                # -100 marks the end of the labelled positions for this sequence.
                break
            seq_hyp.append(i2w[pred_id])
            seq_label.append(i2w[gold_id])
        list_hyps.append(seq_hyp)
        list_labels.append(seq_label)

    return loss, list_hyps, list_labels

# Forward function for sequence multilabel classification
def forward_sequence_multi_classification(model, batch_data, i2w, is_test=False, device='cpu', **kwargs):
    """Run one batch through a multi-label sequence-classification model.

    Args:
        model: callable invoked as
            ``model(subwords, attention_mask=..., token_type_ids=..., labels=...)``
            whose first two outputs are ``(loss, logits)``; ``logits`` is iterated
            as one tensor per label, each indexed by batch item.
        batch_data: ``(subwords, masks, labels)`` or
            ``(subwords, masks, token_types, labels)``.
        i2w: mapping from class id to label name.
        is_test: unused; kept for signature compatibility.
        device: tensors are moved to this device when it names a CUDA device
            ("cuda", "cuda:0", ...); otherwise they stay on the CPU.
        **kwargs: ignored.

    Returns:
        ``(loss, list_hyp, list_label)``: for every batch item, the list of
        predicted label names (one per label head) and the list of gold names.

    Raises:
        ValueError: if ``batch_data`` does not have 3 or 4 elements.
    """
    # Unpack batch data; fail loudly instead of hitting an UnboundLocalError later.
    if len(batch_data) == 3:
        subword_batch, mask_batch, label_batch = batch_data
        token_type_batch = None
    elif len(batch_data) == 4:
        subword_batch, mask_batch, token_type_batch, label_batch = batch_data
    else:
        raise ValueError(
            'batch_data must have 3 or 4 elements, got {}'.format(len(batch_data)))

    # Prepare input & label
    subword_batch = torch.LongTensor(subword_batch)
    mask_batch = torch.FloatTensor(mask_batch)
    if token_type_batch is not None:
        token_type_batch = torch.LongTensor(token_type_batch)
    label_batch = torch.LongTensor(label_batch)

    # Accept any CUDA device spec, not only the exact string "cuda".
    if str(device).startswith("cuda"):
        subword_batch = subword_batch.to(device)
        mask_batch = mask_batch.to(device)
        if token_type_batch is not None:
            token_type_batch = token_type_batch.to(device)
        label_batch = label_batch.to(device)

    # Forward model
    outputs = model(subword_batch, attention_mask=mask_batch,
                    token_type_ids=token_type_batch, labels=label_batch)
    loss, logits = outputs[:2]  # logits: one tensor per label head

    # Top-1 class id per label head; each entry is indexed by batch item.
    pred_per_head = [torch.topk(logit, 1)[1] for logit in logits]
    gold_rows = label_batch.tolist()  # tolist() also works for CUDA tensors

    list_hyp, list_label = [], []
    for i in range(label_batch.shape[0]):
        list_hyp.append([i2w[head_pred[i].item()] for head_pred in pred_per_head])
        list_label.append([i2w[gold_id] for gold_id in gold_rows[i]])

    return loss, list_hyp, list_label


# Load pre-trained model

In [None]:
# Hub identifier used for both the model weights and the tokenizer.
pretrained_name = "afbudiman/indobert-classification"

# Build a ready-to-call "sentiment-analysis" pipeline from the published checkpoint.
nlp2 = pipeline(
    "sentiment-analysis",
    model=pretrained_name,
    tokenizer=pretrained_name
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/522 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/229k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/709k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
nlp2.model.num_labels

3

In [None]:
nlp2("""
Dasar anak sialan!! Kurang ajar!!
""")

[{'label': 'LABEL_2', 'score': 0.9998917579650879}]

In [None]:
nlp2("Pelayanan hotel ini sangat baik.")

[{'label': 'LABEL_0', 'score': 0.9996488094329834}]

In [None]:
nlp2("'Min tes antigen bisa di pelabuhan makassar pas hari keberangkatan?")

[{'label': 'LABEL_1', 'score': 0.9995580315589905}]

# Fine Tuned new Model

## Load Model

In [None]:
import random
import numpy as np
import pandas as pd
import torch
from torch import optim
import torch.nn.functional as F
from tqdm import tqdm

from transformers import BertForSequenceClassification, BertConfig, BertTokenizer
from nltk.tokenize import TweetTokenizer

import sys
sys.path.append('/content/drive/MyDrive/nolimit/sentiment/sentiment-v3-bert/indonlu/utils')

from forward_fn import forward_sequence_classification
# from metrics import document_sentiment_metrics_fn
from data_utils import DocumentSentimentDataset, DocumentSentimentDataLoader

In [None]:
###
# common functions
###
def set_seed(seed):
    """Seed Python's `random`, NumPy, and PyTorch (CPU and current CUDA device)."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for seed_fn in seeders:
        seed_fn(seed)

def count_param(module, trainable=False):
    """Return the number of scalar parameters in `module`.

    With `trainable=True`, only parameters whose `requires_grad` is set are counted.
    """
    params = module.parameters()
    if trainable:
        params = (p for p in params if p.requires_grad)
    return sum(p.numel() for p in params)

def get_lr(optimizer):
    """Return the learning rate of the optimizer's first parameter group.

    Returns None when the optimizer has no parameter groups.
    """
    return next((group['lr'] for group in optimizer.param_groups), None)

def metrics_to_string(metric_dict):
    """Format a metric dict as space-separated `name:value` pairs, values to 2 decimals."""
    return ' '.join(
        '{}:{:.2f}'.format(name, score) for name, score in metric_dict.items()
    )

In [None]:
# Set random seed
set_seed(2023)

In [None]:
# Load tokenizer, config and model
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig

# One checkpoint name for tokenizer, config and weights so they cannot drift apart.
PRETRAINED_CHECKPOINT = 'indobenchmark/indobert-base-p1'

tokenizer = BertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
config = BertConfig.from_pretrained(PRETRAINED_CHECKPOINT)
config.num_labels = DocumentSentimentDataset.NUM_LABELS

# Instantiate the model. The cells under "Test model pre-trained" call `model(...)`,
# so it must exist after a fresh Restart & Run All.
# NOTE(review): the previously commented-out line pointed at
# 'indobenchmark/indobert-lite-large-p2', which does not match the tokenizer/config
# loaded above -- confirm which checkpoint is intended.
model = BertForSequenceClassification.from_pretrained(PRETRAINED_CHECKPOINT, config=config)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/229k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

## Dataset

In [None]:
import random
import numpy as np
import pandas as pd
import torch
from torch import optim
import torch.nn.functional as F
from tqdm import tqdm

from transformers import BertForSequenceClassification, BertConfig, BertTokenizer
from nltk.tokenize import TweetTokenizer

import sys
sys.path.append('/content/drive/MyDrive/nolimit/sentiment/sentiment-v3-bert/indonlu/utils')

from forward_fn import forward_sequence_classification
# from metrics import document_sentiment_metrics_fn
from data_utils import DocumentSentimentDataset, DocumentSentimentDataLoader

In [None]:
%cd /content/drive/MyDrive/nolimit/sentiment/sentiment-v3-bert/indonlu

/content/drive/MyDrive/nolimit/sentiment/sentiment-v3-bert/indonlu


In [None]:
train_dataset_path = 'dataset/smsa_doc-sentiment-prosa/train_preprocess.tsv'
valid_dataset_path = 'dataset/smsa_doc-sentiment-prosa/valid_preprocess.tsv'
test_dataset_path = 'dataset/smsa_doc-sentiment-prosa/test_preprocess_masked_label.tsv'

In [None]:
# Tokenized datasets for the three splits (text is lowercased by the dataset class).
train_dataset = DocumentSentimentDataset(train_dataset_path, tokenizer, lowercase=True)
valid_dataset = DocumentSentimentDataset(valid_dataset_path, tokenizer, lowercase=True)
test_dataset = DocumentSentimentDataset(test_dataset_path, tokenizer, lowercase=True)

# Loader settings shared by all splits; only the training split is shuffled.
loader_options = dict(max_seq_len=512, batch_size=32, num_workers=16)

train_loader = DocumentSentimentDataLoader(dataset=train_dataset, shuffle=True, **loader_options)
valid_loader = DocumentSentimentDataLoader(dataset=valid_dataset, shuffle=False, **loader_options)
test_loader = DocumentSentimentDataLoader(dataset=test_dataset, shuffle=False, **loader_options)



In [None]:
w2i, i2w = DocumentSentimentDataset.LABEL2INDEX, DocumentSentimentDataset.INDEX2LABEL
print(w2i)
print(i2w)

{'positive': 0, 'neutral': 1, 'negative': 2}
{0: 'positive', 1: 'neutral', 2: 'negative'}


## Test model pre-trained

In [None]:
text = 'Bahagia hatiku melihat pernikahan putri sulungku yang cantik jelita'
# Encode to a (1, seq_len) batch on the model's device.
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# Inference only: no autograd graph is needed.
with torch.no_grad():
    logits = model(subwords)[0]
probs = F.softmax(logits, dim=-1).squeeze()
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({probs[label] * 100:.3f}%)')

Text: Bahagia hatiku melihat pernikahan putri sulungku yang cantik jelita | Label : positive (37.290%)


In [None]:
text = 'Budi pergi ke pondok indah mall membeli cakwe'
# Encode to a (1, seq_len) batch on the model's device.
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# Inference only: no autograd graph is needed.
with torch.no_grad():
    logits = model(subwords)[0]
probs = F.softmax(logits, dim=-1).squeeze()
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({probs[label] * 100:.3f}%)')

Text: Budi pergi ke pondok indah mall membeli cakwe | Label : positive (40.026%)


In [None]:
text = 'Dasar anak sialan!! Kurang ajar!!'
# Encode to a (1, seq_len) batch on the model's device.
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# Inference only: no autograd graph is needed.
with torch.no_grad():
    logits = model(subwords)[0]
probs = F.softmax(logits, dim=-1).squeeze()
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({probs[label] * 100:.3f}%)')

Text: Dasar anak sialan!! Kurang ajar!! | Label : positive (38.593%)


# Train new model (di server)

In [None]:
# optimizer = optim.Adam(model.parameters(), lr=3e-6)

In [None]:
# """## Train new model"""
# print('TRAINING DIMULAIII....')

# optimizer = optim.Adam(model.parameters(), lr=3e-6)

# # Train
# n_epochs = 1
# for epoch in range(n_epochs):
#     model.train()
#     torch.set_grad_enabled(True)

#     total_train_loss = 0
#     list_hyp, list_label = [], []

#     train_pbar = tqdm(train_loader, leave=True, total=len(train_loader))
#     for i, batch_data in enumerate(train_pbar):
#         # Forward model
#         loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cpu')

#         # Update model
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()

#         tr_loss = loss.item()
#         total_train_loss = total_train_loss + tr_loss

#         # Calculate metrics
#         list_hyp += batch_hyp
#         list_label += batch_label

#         train_pbar.set_description("(Epoch {}) TRAIN LOSS:{:.4f} LR:{:.8f}".format((epoch+1),
#             total_train_loss/(i+1), get_lr(optimizer)))

TRAINING DIMULAIII....


(Epoch 1) TRAIN LOSS:1.1007 LR:0.00000300:   0%|          | 1/344 [02:16<13:00:05, 136.46s/it]

In [None]:
# import pickle

# filename = 'model_sentiment_bert_10ep_combine_20rb.pkl'
# pickle.dump(model, open(filename, 'wb'))

# print('TRAINING SELESAIIII....')

# Test new model

In [None]:
import torch
import pickle
from transformers import BertTokenizer
from transformers import pipeline
import transformers

# Load the fine-tuned model from the .pkl file.
# NOTE(security): pickle.load can execute arbitrary code from the file --
# only load checkpoints that were produced by a trusted training run.
with open('/content/drive/MyDrive/nolimit/sentiment/sentiment-v3-bert/model_sentiment_bert_10ep_combine95rb_nobalance_basep1.pkl', 'rb') as f:
    model = pickle.load(f)

# Set the model to evaluation mode. A pickled module keeps its training flag, so
# without this call dropout may stay active and the same sentence can get
# different confidences on every run.
model.eval()

tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p1')

In [None]:
i2w

{0: 'positive', 1: 'neutral', 2: 'negative'}

In [None]:
text = 'Bahagia hatiku melihat pernikahan putri sulungku yang cantik jelita'
# Encode to a (1, seq_len) batch on the model's device.
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# Inference only: no autograd graph is needed.
with torch.no_grad():
    logits = model(subwords)[0]
probs = F.softmax(logits, dim=-1).squeeze()
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({probs[label] * 100:.3f}%)')

label

Text: Bahagia hatiku melihat pernikahan putri sulungku yang cantik jelita | Label : positive (97.478%)


0

In [None]:
text = 'Budi pergi ke pondok indah mall membeli cakwe'
# Encode to a (1, seq_len) batch on the model's device.
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# Inference only: no autograd graph is needed.
with torch.no_grad():
    logits = model(subwords)[0]
probs = F.softmax(logits, dim=-1).squeeze()
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({probs[label] * 100:.3f}%)')

label

Text: Budi pergi ke pondok indah mall membeli cakwe | Label : neutral (99.825%)


1

In [None]:
text = 'Dasar anak sialan!! Kurang ajar!!'
# Encode to a (1, seq_len) batch on the model's device.
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# Inference only: no autograd graph is needed.
with torch.no_grad():
    logits = model(subwords)[0]
probs = F.softmax(logits, dim=-1).squeeze()
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({probs[label] * 100:.3f}%)')

label

Text: Dasar anak sialan!! Kurang ajar!! | Label : negative (99.770%)


2

In [None]:
df = pd.read_csv('/content/drive/MyDrive/nolimit/sentiment/sentiment-v3-bert/dataset/data_untuk_test_20rb_januari2023.csv', encoding='latin-1')
df.head()

Unnamed: 0,original_id,object_id,content,fixed_sentiment
0,'1618561744867323905,'07febd73-283e-4886-a74b-28738834e948,'@BFIFinance saya kecewa dengan sistem BFI FIn...,1
1,'1618980900443938816,'178c9be6-e4c5-4dfa-828b-37a95a11c9e1,'âÃÂ¶@FIFCLUBâÃÂ© luar biasa matketing F...,1
2,'1618903837661863942,'178c9be6-e4c5-4dfa-828b-37a95a11c9e1,'@henagoesra @FIFCLUB Palsu pak.. di mintain i...,1
3,'1618440694364008459,'178c9be6-e4c5-4dfa-828b-37a95a11c9e1,'@FIFCLUB mau bayar angsuran tapi di livin man...,0
4,'1618043178275463172,'178c9be6-e4c5-4dfa-828b-37a95a11c9e1,'@FIFCLUB kenapa angsuran naik Rp.7.500 ya?,0


In [None]:
df.columns

Index(['original_id', 'object_id', 'content', 'fixed_sentiment'], dtype='object')

In [None]:
df = df.rename(columns={'fixed_sentiment': 'final_sentiment_human'})
df = df.rename(columns={'content': 'text'})

In [None]:
df = df[['text', 'final_sentiment_human']]

In [None]:
df.head()

Unnamed: 0,text,final_sentiment_human
0,'@BFIFinance saya kecewa dengan sistem BFI FIn...,1
1,'âÃÂ¶@FIFCLUBâÃÂ© luar biasa matketing F...,1
2,'@henagoesra @FIFCLUB Palsu pak.. di mintain i...,1
3,'@FIFCLUB mau bayar angsuran tapi di livin man...,0
4,'@FIFCLUB kenapa angsuran naik Rp.7.500 ya?,0


In [None]:
# (df['final_sentiment_machine'] == df['final_sentiment_human']).sum()

1393

In [None]:
# # mengambil baris yang memenuhi kriteria
# idx_to_drop = df.index[df['final_sentiment_machine'] == df['final_sentiment_human']].tolist()

# # menjatuhkan baris yang memenuhi kriteria
# df = df.drop(idx_to_drop)

In [None]:
# (df['final_sentiment_machine'] == df['final_sentiment_human']).sum()

0

In [None]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words_id = stopwords.words("indonesian")

def text_cleaning_id(text, remove_stop_words=False):
    """Normalize an Indonesian social-media text before it is fed to the models.

    Steps, in order: lowercase; drop @mentions; drop [bracketed] spans; drop
    URLs; strip ASCII punctuation; drop every token containing a digit; strip
    curly quotes and the ellipsis character; delete newlines. Newlines are
    deleted, not replaced by a space, so words separated only by a newline
    end up joined.

    Args:
        text: raw input string.
        remove_stop_words: when True, also drop tokens found in the
            module-level `stop_words_id` list and re-join the rest with
            single spaces.

    Returns:
        The cleaned string.
    """
    # Local import: the notebook only imports `string` in a later cell, so the
    # function must not depend on that cell having been run first.
    import string

    text = text.lower()  # case folding
    text = re.sub(r'@[^\s]+', '', text)  # remove usernames
    text = re.sub(r'\[.*?\]', '', text)  # remove square-bracketed spans
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', '', text)  # remove URLs
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)  # remove punctuation
    text = re.sub(r'\w*\d\w*', '', text)  # remove tokens that contain a digit
    text = re.sub('[‘’“”…]', '', text)  # remove curly quotes / ellipsis
    text = text.replace('\n', '')  # delete newlines

    # Optionally, remove stop words (text is already lowercase at this point).
    if remove_stop_words:
        text = " ".join(w for w in text.split() if w not in stop_words_id)

    return text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
print(i2w)
# in the internal dataset: 0 = neutral, 1 = negative, 2 = positive

{0: 'positive', 1: 'neutral', 2: 'negative'}


In [None]:
df.head(1)

Unnamed: 0,text,final_sentiment_human
0,'@BFIFinance saya kecewa dengan sistem BFI FIn...,1


In [None]:
import warnings
warnings.filterwarnings('ignore')

# Internal dataset ids -> label names (0: neutral, 1: negative, 2: positive).
internal_id_to_name = {0: 'neutral', 1: 'negative', 2: 'positive'}
df['final_sentiment_human'] = df['final_sentiment_human'].replace(internal_id_to_name)

In [None]:
df.final_sentiment_human.value_counts()

negative    19782
positive      111
neutral       107
Name: final_sentiment_human, dtype: int64

In [None]:
# Round trip through the BERT label ids: first map label names to ids
# (positive=0, neutral=1, negative=2) ...
df['final_sentiment_human'] = df['final_sentiment_human'].replace('positive', 0)
df['final_sentiment_human'] = df['final_sentiment_human'].replace('neutral', 1)
df['final_sentiment_human'] = df['final_sentiment_human'].replace('negative', 2)

# ... then map the ids back to names through i2w, so the column ends up
# holding label names again.
df['final_sentiment_human'] = df['final_sentiment_human'].replace(i2w)

In [None]:
df.final_sentiment_human.value_counts()

negative    19782
positive      111
neutral       107
Name: final_sentiment_human, dtype: int64

In [None]:
df.isna().sum()

text                     0
final_sentiment_human    0
dtype: int64

In [None]:
df = df.dropna().reset_index(drop=True)
df = df.drop_duplicates().reset_index(drop=True)

In [None]:
import string

df['clean_fix'] = df['text'].apply(text_cleaning_id)

In [None]:
i2w

{0: 'positive', 1: 'neutral', 2: 'negative'}

In [None]:
df = df[['text', 'clean_fix', 'final_sentiment_human']]

In [None]:
df

Unnamed: 0,text,clean_fix,final_sentiment_human
0,'@BFIFinance saya kecewa dengan sistem BFI FIn...,saya kecewa dengan sistem bfi financecek dm,negative
1,'âÃÂ¶@FIFCLUBâÃÂ© luar biasa matketing F...,âãâ¶â© luar biasa matketing fif sampai mal...,negative
2,'@henagoesra @FIFCLUB Palsu pak.. di mintain i...,palsu pak di mintain info doang tp tdk pern...,negative
3,'@FIFCLUB mau bayar angsuran tapi di livin man...,mau bayar angsuran tapi di livin mandiri opsi...,neutral
4,'@FIFCLUB kenapa angsuran naik Rp.7.500 ya?,kenapa angsuran naik ya,neutral
...,...,...,...
19995,'Saldo gue pernah keambil juga pas isi emoney ...,saldo gue pernah keambil juga pas isi emoney d...,negative
19996,'Gblk bgt telkomsel,gblk bgt telkomsel,negative
19997,"'Cape bgt pake telkomsel, orbit. Busuk semua g...",cape bgt pake telkomsel orbit busuk semua gaad...,negative
19998,'Sepertinya di Telkomsel sekarang lagi ada hur...,sepertinya di telkomsel sekarang lagi ada huru...,negative


In [None]:
df = df.drop_duplicates(subset=['clean_fix']).reset_index(drop=True)

In [None]:
df.final_sentiment_human.value_counts()

negative    19600
neutral        89
positive       80
Name: final_sentiment_human, dtype: int64

In [None]:
# Keep every positive/neutral row and down-sample the dominant negative class.
# A fixed random_state makes the sampled subset (and every number computed
# from it below) reproducible across runs.
N_NEGATIVE_SAMPLES = 1500
SAMPLE_SEED = 2023

pos = df[df['final_sentiment_human'] == 'positive']
neg = df[df['final_sentiment_human'] == 'negative'].sample(N_NEGATIVE_SAMPLES, random_state=SAMPLE_SEED)
neu = df[df['final_sentiment_human'] == 'neutral']

In [None]:
df = pd.concat([pos,neg,neu]).reset_index(drop=True)

In [None]:
df.final_sentiment_human.value_counts()

negative    1500
neutral       89
positive      80
Name: final_sentiment_human, dtype: int64

In [None]:
df.duplicated(subset=['clean_fix']).sum()

0

In [None]:
df = df.drop_duplicates(subset=['clean_fix']).reset_index(drop=True)

In [None]:
# Predict every cleaned text with the fine-tuned BERT model.
MAX_SEQ_LEN = 512  # same limit the data loaders use

model.eval()  # disable dropout so the confidences are deterministic

predicted_labels, predicted_confidences, failed_rows = [], [], []
with torch.no_grad():  # inference only: no autograd graph is needed
    for i, text in tqdm(df['clean_fix'].items(), total=len(df)):
        try:
            subwords = tokenizer.encode(text, truncation=True, max_length=MAX_SEQ_LEN)
            subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

            logits = model(subwords)[0]
            probs = F.softmax(logits, dim=-1).squeeze().tolist()  # convert probabilities to a list
            label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

            predicted_labels.append(i2w[label])
            predicted_confidences.append(probs[label])
        except Exception as err:
            # Keep the row (as missing values) but report the failure instead of hiding it.
            failed_rows.append(i)
            predicted_labels.append(None)
            predicted_confidences.append(np.nan)
            print(f'Row {i} could not be predicted: {err!r}')

df['hasil_prediksi_mesin_baru'] = predicted_labels
df['confidence_new_model'] = predicted_confidences

print(f'{len(failed_rows)} of {len(df)} rows failed')

In [None]:
df

Unnamed: 0,text,clean_fix,final_sentiment_human,hasil_prediksi_mesin_baru,confidence_new_model
0,'Buat perbandingan. DC @kredivo dateng baik2 d...,buat perbandingan dc dateng dan bayar minimu...,positive,positive,0.760830
1,'Kemudahan dalam proses pengajuan. Cukup unduh...,kemudahan dalam proses pengajuan cukup unduh a...,positive,positive,0.999493
2,'Kredivo memberikan kamu kemudahan dalam membe...,kredivo memberikan kamu kemudahan dalam membel...,positive,positive,0.998311
3,'Karna yang murni itu tidak bercampur dengan u...,karna yang murni itu tidak bercampur dengan un...,positive,positive,0.932126
4,'Menparekraf: Presiden Minta Target Wisman dan...,menparekraf presiden minta target wisman dan w...,positive,positive,0.500274
...,...,...,...,...,...
1664,'@sandiuno @Kemenparekraf Udah gituu aja?? Kun...,udah gituu aja kunjungan terus melihathemmmh,neutral,positive,0.999571
1665,"'@sandiuno fyi, libur sekolah 2022, kami juga ...",fyi libur sekolah kami juga ke jogja dengan ...,neutral,positive,0.990642
1666,'@sandiuno @Kemenparekraf mana hasil kerjanya?,mana hasil kerjanya,neutral,negative,0.995669
1667,'Mulu,mulu,neutral,negative,0.548094


In [None]:
# Round trip through the BERT label ids: first map label names to ids
# (positive=0, neutral=1, negative=2) ...
df['final_sentiment_human'] = df['final_sentiment_human'].replace('positive', 0)
df['final_sentiment_human'] = df['final_sentiment_human'].replace('neutral', 1)
df['final_sentiment_human'] = df['final_sentiment_human'].replace('negative', 2)

# ... then map the ids back to names through i2w, so the column ends up
# holding label names again.
df['final_sentiment_human'] = df['final_sentiment_human'].replace(i2w)

In [None]:
df.final_sentiment_human.value_counts()

negative    1500
neutral       89
positive      80
Name: final_sentiment_human, dtype: int64

In [None]:
df.hasil_prediksi_mesin_baru.value_counts()

negative    1392
neutral      169
positive     107
Name: hasil_prediksi_mesin_baru, dtype: int64

In [None]:
(df['final_sentiment_human'] == df['hasil_prediksi_mesin_baru']).sum()

1468

In [None]:
df.to_csv('/content/drive/MyDrive/nolimit/sentiment/sentiment-v3-bert/hasil_untuk_dicek2.csv', index=False)

# Test old Model

In [None]:
import joblib
from os.path import dirname, join, realpath

# text preprocessing modules
from string import punctuation
# text preprocessing modules
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re  # regular expression
import os
from os.path import dirname, join, realpath
import joblib
import pickle
import string

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# scikit-learn              1.1.1
# scipy                     1.8.0

In [None]:
!pip install scikit-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import joblib
# Path of the serialized old model.
filename = '/content/drive/MyDrive/nolimit/sentiment/model/new/model_bulan7.pkl'
# NOTE(review): this rebinds `model`, which earlier cells use for the BERT
# classifier; re-run the BERT loading cell before doing BERT inference again.
model = joblib.load(filename)

In [None]:
df = pd.read_csv('/content/drive/MyDrive/nolimit/sentiment/sentiment-v3-bert/dataset/data_test_balance15rb.csv', encoding='latin-1')
df.head()

Unnamed: 0,clean_fix,label
0,sukses sahabat sayang,positive
1,acara usaha mikro,positive
2,bismilahirahmanirahim melik hajar dewantara ha...,positive
3,ninjaxpres ninjagiveaway ayo,positive
4,salah huruf makhraj langam latih sifat huruf t...,positive


In [None]:
i2w

{0: 'positive', 1: 'neutral', 2: 'negative'}

In [None]:
df = df.rename(columns={'label' : 'final_sentiment_human'})

In [None]:
# Predict with the old model on the cleaned text.
y_preds = model.predict(df['clean_fix'])

# The internal dataset encodes 0: neutral, 1: negative, 2: positive ...
old_id_to_name = {0: 'neutral', 1: 'negative', 2: 'positive'}
# ... while the BERT side uses {0: 'positive', 1: 'neutral', 2: 'negative'}.
name_to_bert_id = {'positive': 0, 'neutral': 1, 'negative': 2}

df['hasil_prediksi_mesin_lama'] = y_preds
# Go through the label names so the two id schemes cannot be confused.
df['hasil_prediksi_mesin_lama'] = (
    df['hasil_prediksi_mesin_lama']
    .replace(old_id_to_name)
    .replace(name_to_bert_id)
)

In [None]:
# Encode the human labels with the BERT id scheme so both columns are comparable.
df['final_sentiment_human'] = df['final_sentiment_human'].replace(
    {'positive': 0, 'neutral': 1, 'negative': 2}
)

In [None]:
df

Unnamed: 0,clean_fix,final_sentiment_human,hasil_prediksi_mesin_lama
0,sukses sahabat sayang,0,0
1,acara usaha mikro,0,1
2,bismilahirahmanirahim melik hajar dewantara ha...,0,0
3,ninjaxpres ninjagiveaway ayo,0,0
4,salah huruf makhraj langam latih sifat huruf t...,0,0
...,...,...,...
14995,'Jelek bener dah jaringan Telkomsel perbaiki napa,2,2
14996,'min mau nanya nih admin gak ngerasa pelit ata...,2,2
14997,'Udah ngumpulin banyak banyak poin 400 lebih.....,2,1
14998,'@habibthink Ternyata selama inii yang gue min...,2,1


In [None]:
(df['final_sentiment_human'] == df['hasil_prediksi_mesin_lama']).sum()

9216

In [None]:
example = "'Momen haru Mas Menteri @sandiuno dengen Emak-Emak, Mashallah ?ü??ü´?  . . . #sandiuno #sandiagauno #bolu #WeGotYou Ngobrolin UrusanGue Di Bandung Serunya MilenialGenZ YukGabung Kerenin Indonesia BPJS tontonan hari ramadhan Lampung kandidat menteri pilihan rakyat https://t.co/9wCzwuWwbu"

cleaned_review = text_cleaning_id(example)
prediction = model.predict([cleaned_review])
int(prediction[0])

2

In [None]:
# sentiments = {0: "neutral", 1: "negative", 2: "positive"}

In [None]:
model.classes_

array([0, 1, 2])

# Kesimpulan Evaluasi

Model BERT dilatih dengan 20rb data training (batch_size = 32, epoch = 10) dan menghasilkan akurasi 92% saat training.

Model kemudian diuji kembali menggunakan 1607 data yang telah divalidasi oleh validator, di mana *semua kalimat gagal diprediksi oleh model yang lama*. Confidence dari model yang lama adalah sebagai berikut:
- mean        0.042950
- min         0.000000
- max         0.679861

*Jadi dari confidence 0 - 0.67 dari model lama dengan semua nya salah di prediksi*

Model BERT terbaru mencoba melakukan prediksi 1607 data tersebut, yang menghasilkan predict yang *benar 1049 data* dan yang *salah 558 data*

Dengan ini model BERT terbaru mampu memprediksi 1049 kalimat yang semua salah di prediksi model lama

Sebelumnya juga model BERT di uji beberapa kali dengan 10rb data, epoch 50, dan mencoba memprediksi data yang sama hanya berhasil benar sekitar 500 data.

Asumsi saat ini: semakin banyak data yang digabungkan (combine) ke data BERT, semakin bagus model tersebut. Kendalanya ada di server, yang beberapa kali terkena SIGKILL, baik karena RAM maupun karena kode yang gagal. Saat ini masih di-tuning dengan implementasi kode yang sederhana, dengan harapan walaupun prosesnya lama, model berhasil di-train.

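Model BERT juga menghasilkan probabilitas/confidence yang berbeda walaupun kalimat yang diinput sama: kalimat seperti "Saya sangat senang hari ini" dapat menghasilkan probabilitas sentimen yang berbeda saat dijalankan beberapa kali dengan model BERT yang sama. Catatan: perbedaan seperti ini umumnya terjadi bila model tidak berada dalam mode evaluasi (`model.eval()`), karena dropout masih aktif saat inferensi; pastikan `model.eval()` dipanggil sebelum melakukan prediksi.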

Hal ini terjadi karena model BERT memerlukan beberapa pemrosesan dan optimisasi sebelum memberikan hasil yang akurat, termasuk dalam hal penyesuaian bobot dan bias dalam jaringan saraf. Selain itu, setiap kali model BERT dijalankan pada kalimat yang sama, ada beberapa faktor yang dapat mempengaruhi hasilnya, seperti variabilitas dalam inisialisasi bobot model, variasi dalam sampling mini-batch pada saat pelatihan, atau ketidaktepatan dalam estimasi gradien selama pelatihan.

Oleh karena itu, perbedaan kecil dalam probabilitas sentimen yang dihasilkan dapat terjadi ketika model BERT dijalankan beberapa kali pada kalimat yang sama. Namun, secara umum, jika model BERT telah dilatih dengan baik dan diuji pada dataset yang memadai, maka probabilitas sentimen yang dihasilkan harus konsisten dalam tingkat kepercayaan yang tinggi.

Training model sudah dilakukan beberapa kali. Untuk model BERT LARGE dengan parameter dan data yang besar, server selalu terkena SIGKILL, jadi saya menggunakan model BERT yang lebih sederhana. Saat ini yang paling tinggi akurasinya adalah indobert-lite-base-p2.

referensi bisa dilihat disini : https://www.indobenchmark.com/leaderboard.html

In [None]:
# from prettytable import PrettyTable

# # Membuat table baru dengan nama "my_table"
# my_table = PrettyTable()

# # Menambahkan kolom ke table
# my_table.add_column("Model", ["indobert-base-p1", "indobert-base-p1", "indobert-lite-base-p2", "indobert-large-p2", "indobert-lite-large-p2"])
# my_table.add_column("Total Data Train", ['10rb', '20rb', '20rb', '10rb', '10rb'])
# my_table.add_column("Epoch", [50, 1, 10, 1, 1])
# my_table.add_column("Hasil", ["Berhasil", "Berhasil", "Berhasil", "Gagal SIGKILL", "Gagal SIGKILL"])

# # Menampilkan table
# print(my_table)

Selanjutnya saya mau kalkulasi confidence level untuk model terbaru, mau training model lebih banyak terlebih dahulu agar mendapatkan hasil perbandingan yg sesuai. Karena model yang lama menggunakan data sekitar kurang lebih 30rb. Model yang baru masih pakai 20rb itu juga 10rb dari BERT dan 10rb dari data internal kita. Mau mencoba seluruh combine data dengan model yang paling bagus, 10rb, 20rb, 35rb, 70rb. Diharapkan dengan bertambahnya data maka model semakin bagus, agar bisa membuat kalkulasi confidence nya

*notes : melihat komputasi server juga, apabila untuk improve model masih gagal, bisa planning kan lagi nambah RAM.

In [None]:
from sklearn import metrics

In [None]:
df

Unnamed: 0,clean_fix,final_sentiment_human,hasil_prediksi_mesin_baru,confidence_new_model
0,sukses sahabat sayang,positive,positive,0.575606
1,acara usaha mikro,positive,positive,0.966490
2,bismilahirahmanirahim melik hajar dewantara ha...,positive,positive,0.999915
3,ninjaxpres ninjagiveaway ayo,positive,positive,0.999913
4,salah huruf makhraj langam latih sifat huruf t...,positive,positive,0.996290
...,...,...,...,...
14995,jelek bener dah jaringan telkomsel perbaiki napa,negative,negative,0.999869
14996,min nanya nih admin gak ngerasa pelit nyh gtuh...,negative,negative,0.992691
14997,udah ngumpulin poin ilangin gimana telkomsel,negative,negative,0.999850
14998,inii gue minum bahaya,negative,negative,0.999910


In [None]:
# Encode the sentiment names as integer ids (positive=0, neutral=1, negative=2)
# in both the human-labelled column and the machine-prediction column.
label_to_id = {'positive': 0, 'neutral': 1, 'negative': 2}

for column in ('final_sentiment_human', 'hasil_prediksi_mesin_baru'):
    df[column] = df[column].replace(label_to_id)

In [None]:
# Drop every row that has a missing value in any column, then renumber the index from 0.
df = df.dropna().reset_index(drop=True)

In [None]:
# Cast the machine-prediction column to int.
# NOTE(review): 'final_sentiment_human' is not cast here — presumably it is already numeric; confirm.
df['hasil_prediksi_mesin_baru'] = df['hasil_prediksi_mesin_baru'].astype(int)

In [None]:
# Overall accuracy of the machine predictions, with the human labels passed as the reference (first argument).
metrics.accuracy_score(df['final_sentiment_human'], df['hasil_prediksi_mesin_baru'])

0.9213895186024803

In [None]:
# Independent copy of df used as the input of the confidence-threshold analysis.
df_conf = df.copy()

In [None]:
# Accuracy of the new model as a function of the minimum confidence threshold.
# For every threshold t in 1.00, 0.99, ..., 0.00 we keep the rows whose
# confidence is >= t and count how many of those the model labelled correctly.

# linspace yields exactly 101 thresholds without the floating-point drift of
# np.arange with a fractional step; rounding keeps the values at 2 decimals.
conf = np.round(np.linspace(1.0, 0.0, 101), 2)

# These two series do not depend on the threshold, so compute them once.
confidence = df_conf['confidence_new_model']
is_correct = df_conf['hasil_prediksi_mesin_baru'] == df_conf['final_sentiment_human']

total_text = []  # rows with confidence >= threshold
true = []        # ... of which prediction == human label
false = []       # ... of which prediction != human label
accuracy = []    # true / total_text in percent

for threshold in conf:
    selected = confidence >= threshold
    n_total = int(selected.sum())
    n_true = int((selected & is_correct).sum())

    total_text.append(n_total)
    true.append(n_true)
    false.append(n_total - n_true)
    # When no row reaches the threshold, report 0 instead of dividing by zero.
    # (An explicit check replaces the former bare `except`, which hid every error.)
    accuracy.append(n_true / n_total * 100 if n_total else 0)


print(total_text)
print(true)
print(false)
print(accuracy)

[0, 10134, 11059, 11526, 11876, 12121, 12327, 12491, 12617, 12760, 12884, 12975, 13073, 13169, 13251, 13340, 13408, 13467, 13515, 13574, 13629, 13683, 13739, 13787, 13838, 13890, 13943, 13998, 14043, 14081, 14111, 14161, 14205, 14238, 14271, 14301, 14352, 14379, 14417, 14459, 14489, 14531, 14571, 14609, 14643, 14675, 14718, 14757, 14796, 14838, 14879, 14902, 14914, 14932, 14943, 14950, 14958, 14964, 14976, 14979, 14986, 14989, 14994, 14995, 14996, 14998, 14998, 14998, 14998, 14998, 14998, 14998, 14998, 14998, 14998, 14998, 14998, 14998, 14998, 14998, 14998, 14998, 14998, 14998, 14998, 14998, 14998, 14998, 14998, 14998, 14998, 14998, 14998, 14998, 14998, 14998, 14998, 14998, 14998, 14998, 14998]
[0, 9989, 10839, 11276, 11598, 11810, 11991, 12129, 12227, 12348, 12445, 12518, 12602, 12684, 12735, 12798, 12848, 12890, 12933, 12977, 13015, 13057, 13098, 13134, 13171, 13199, 13230, 13270, 13295, 13315, 13332, 13365, 13386, 13408, 13430, 13449, 13480, 13497, 13520, 13543, 13555, 13580, 13601,

In [None]:
# One row per confidence threshold, columns in the order:
# conf, total_text, true, false, accuracy.
conf_level = pd.DataFrame(
    dict(
        conf=conf,
        total_text=total_text,
        true=true,
        false=false,
        accuracy=accuracy,
    )
)

In [None]:
conf_level

Unnamed: 0,conf,total_text,true,false,accuracy
0,1.00,0,0,0,0.000000
1,0.99,10134,9989,145,98.569173
2,0.98,11059,10839,220,98.010670
3,0.97,11526,11276,250,97.830991
4,0.96,11876,11598,278,97.659144
...,...,...,...,...,...
96,0.04,14998,13819,1179,92.138952
97,0.03,14998,13819,1179,92.138952
98,0.02,14998,13819,1179,92.138952
99,0.01,14998,13819,1179,92.138952


In [None]:
conf_level[conf_level['conf'] >= 0.8]

Unnamed: 0,conf,total_text,true,false,accuracy
0,1.0,0,0,0,0.0
1,0.99,10134,9989,145,98.569173
2,0.98,11059,10839,220,98.01067
3,0.97,11526,11276,250,97.830991
4,0.96,11876,11598,278,97.659144
5,0.95,12121,11810,311,97.434205
6,0.94,12327,11991,336,97.274276
7,0.93,12491,12129,362,97.101913
8,0.92,12617,12227,390,96.908932
9,0.91,12760,12348,412,96.77116


In [None]:
conf_level[conf_level['conf'] >= 0.9]

Unnamed: 0,conf,total_text,true,false,accuracy
0,1.0,0,0,0,0.0
1,0.99,10134,9989,145,98.569173
2,0.98,11059,10839,220,98.01067
3,0.97,11526,11276,250,97.830991
4,0.96,11876,11598,278,97.659144
5,0.95,12121,11810,311,97.434205
6,0.94,12327,11991,336,97.274276
7,0.93,12491,12129,362,97.101913
8,0.92,12617,12227,390,96.908932
9,0.91,12760,12348,412,96.77116


In [None]:
# Save the per-threshold table as an Excel file (without the index column).
conf_level.to_excel('/content/drive/MyDrive/nolimit/sentiment/sentiment-v3-bert/confidence/confidence_model7.xlsx', index=False)

# Train New Model (code server)

In [None]:
from transformers import pipeline

"""# Load external function"""

import torch

###
# Forward Function
###

# Forward function for sequence classification
def forward_sequence_classification(model, batch_data, i2w, is_test=False, device='cpu', **kwargs):
    """Run one forward pass of a sequence-classification model on a batch.

    Args:
        model: Callable invoked as ``model(subwords, attention_mask=...,
            token_type_ids=..., labels=...)``. The first two items of its
            return value are used as ``(loss, logits)``.
        batch_data: Either ``(subword_batch, mask_batch, label_batch)`` or
            ``(subword_batch, mask_batch, token_type_batch, label_batch)``.
            Each label row is indexed as ``label[0]``.
        i2w: Mapping from label index to label name.
        is_test: Unused; kept for signature compatibility.
        device: ``'cpu'`` (tensors are left where they are) or any CUDA device
            spec such as ``'cuda'``, ``'cuda:1'`` or ``torch.device('cuda')``.
        **kwargs: Ignored.

    Returns:
        Tuple ``(loss, list_hyp, list_label)`` where ``list_hyp`` holds the
        predicted label name and ``list_label`` the gold label name of every
        sample in the batch.

    Raises:
        ValueError: If ``batch_data`` does not have 3 or 4 elements.
    """
    # Unpack batch data
    if len(batch_data) == 3:
        (subword_batch, mask_batch, label_batch) = batch_data
        token_type_batch = None
    elif len(batch_data) == 4:
        (subword_batch, mask_batch, token_type_batch, label_batch) = batch_data
    else:
        raise ValueError(
            'batch_data must have 3 or 4 elements, got {}'.format(len(batch_data)))

    # Prepare input & label
    subword_batch = torch.LongTensor(subword_batch)
    mask_batch = torch.FloatTensor(mask_batch)
    token_type_batch = torch.LongTensor(token_type_batch) if token_type_batch is not None else None
    label_batch = torch.LongTensor(label_batch)

    # Move the tensors for any CUDA device spec, not only the exact string
    # "cuda"; previously 'cuda:1' or a torch.device left them on the CPU.
    if str(device).startswith("cuda"):
        subword_batch = subword_batch.to(device)
        mask_batch = mask_batch.to(device)
        token_type_batch = token_type_batch.to(device) if token_type_batch is not None else None
        label_batch = label_batch.to(device)

    # Forward model
    outputs = model(subword_batch, attention_mask=mask_batch, token_type_ids=token_type_batch, labels=label_batch)
    loss, logits = outputs[:2]

    # generate prediction & label list
    hyp = torch.topk(logits, 1)[1]  # index of the highest logit per sample
    list_hyp = [i2w[hyp[j].item()] for j in range(len(hyp))]
    list_label = [i2w[label_batch[j][0].item()] for j in range(len(hyp))]

    return loss, list_hyp, list_label

# Forward function for word classification
def forward_word_classification(model, batch_data, i2w, is_test=False, device='cpu', **kwargs):
    """Run one forward pass of a word-level (token) classification model.

    Args:
        model: Callable invoked as ``model(subwords, subword_to_word_indices,
            attention_mask=..., token_type_ids=..., labels=...)``. The first
            two items of its return value are used as ``(loss, logits)``.
        batch_data: Either ``(subword_batch, mask_batch,
            subword_to_word_indices_batch, label_batch)`` or the same with
            ``token_type_batch`` inserted as the third element.
        i2w: Mapping from label index to label name.
        is_test: Unused; kept for signature compatibility.
        device: ``'cpu'`` (tensors are left where they are) or any CUDA device
            spec such as ``'cuda'``, ``'cuda:1'`` or ``torch.device('cuda')``.
        **kwargs: Ignored.

    Returns:
        Tuple ``(loss, list_hyps, list_labels)``; for every sequence the lists
        hold the predicted / gold label names up to (not including) the first
        position whose gold label is ``-100``.

    Raises:
        ValueError: If ``batch_data`` does not have 4 or 5 elements.
    """
    # Unpack batch data
    if len(batch_data) == 4:
        (subword_batch, mask_batch, subword_to_word_indices_batch, label_batch) = batch_data
        token_type_batch = None
    elif len(batch_data) == 5:
        (subword_batch, mask_batch, token_type_batch, subword_to_word_indices_batch, label_batch) = batch_data
    else:
        raise ValueError(
            'batch_data must have 4 or 5 elements, got {}'.format(len(batch_data)))

    # Prepare input & label
    subword_batch = torch.LongTensor(subword_batch)
    mask_batch = torch.FloatTensor(mask_batch)
    token_type_batch = torch.LongTensor(token_type_batch) if token_type_batch is not None else None
    subword_to_word_indices_batch = torch.LongTensor(subword_to_word_indices_batch)
    label_batch = torch.LongTensor(label_batch)

    # Move the tensors for any CUDA device spec, not only the exact string
    # "cuda"; previously 'cuda:1' or a torch.device left them on the CPU.
    if str(device).startswith("cuda"):
        subword_batch = subword_batch.to(device)
        mask_batch = mask_batch.to(device)
        token_type_batch = token_type_batch.to(device) if token_type_batch is not None else None
        subword_to_word_indices_batch = subword_to_word_indices_batch.to(device)
        label_batch = label_batch.to(device)

    # Forward model
    outputs = model(subword_batch, subword_to_word_indices_batch, attention_mask=mask_batch, token_type_ids=token_type_batch, labels=label_batch)
    loss, logits = outputs[:2]

    # generate prediction & label list
    list_hyps = []
    list_labels = []
    # Highest-scoring label index for every position of every sequence.
    hyps_list = torch.topk(logits, k=1, dim=-1)[1].squeeze(dim=-1)
    for i in range(len(hyps_list)):
        hyps, labels = hyps_list[i].tolist(), label_batch[i].tolist()
        list_hyp, list_label = [], []
        for j in range(len(hyps)):
            # The first -100 gold label ends this sequence's labelled positions.
            if labels[j] == -100:
                break
            list_hyp.append(i2w[hyps[j]])
            list_label.append(i2w[labels[j]])
        list_hyps.append(list_hyp)
        list_labels.append(list_label)

    return loss, list_hyps, list_labels

# Forward function for sequence multilabel classification
def forward_sequence_multi_classification(model, batch_data, i2w, is_test=False, device='cpu', **kwargs):
    """Run one forward pass of a multi-label sequence-classification model.

    Args:
        model: Callable invoked as ``model(subwords, attention_mask=...,
            token_type_ids=..., labels=...)``. The first two items of its
            return value are used as ``(loss, logits)``, where ``logits`` is
            iterated as one tensor of per-sample scores for each label slot.
        batch_data: Either ``(subword_batch, mask_batch, label_batch)`` or
            ``(subword_batch, mask_batch, token_type_batch, label_batch)``.
        i2w: Mapping from class index to class name (shared by all label slots).
        is_test: Unused; kept for signature compatibility.
        device: ``'cpu'`` (tensors are left where they are) or any CUDA device
            spec such as ``'cuda'``, ``'cuda:1'`` or ``torch.device('cuda')``.
        **kwargs: Ignored.

    Returns:
        Tuple ``(loss, list_hyp, list_label)``; both lists contain, per sample,
        one class name for every label slot.

    Raises:
        ValueError: If ``batch_data`` does not have 3 or 4 elements.
    """
    # Unpack batch data
    if len(batch_data) == 3:
        (subword_batch, mask_batch, label_batch) = batch_data
        token_type_batch = None
    elif len(batch_data) == 4:
        (subword_batch, mask_batch, token_type_batch, label_batch) = batch_data
    else:
        raise ValueError(
            'batch_data must have 3 or 4 elements, got {}'.format(len(batch_data)))

    # Prepare input & label
    subword_batch = torch.LongTensor(subword_batch)
    mask_batch = torch.FloatTensor(mask_batch)
    token_type_batch = torch.LongTensor(token_type_batch) if token_type_batch is not None else None
    label_batch = torch.LongTensor(label_batch)

    # Move the tensors for any CUDA device spec, not only the exact string
    # "cuda"; previously 'cuda:1' or a torch.device left them on the CPU.
    if str(device).startswith("cuda"):
        subword_batch = subword_batch.to(device)
        mask_batch = mask_batch.to(device)
        token_type_batch = token_type_batch.to(device) if token_type_batch is not None else None
        label_batch = label_batch.to(device)

    # Forward model
    outputs = model(subword_batch, attention_mask=mask_batch, token_type_ids=token_type_batch, labels=label_batch)
    loss, logits = outputs[:2] # logits list<tensor(bs, num_label)> ~ list of batch prediction per class

    # generate prediction & label list
    list_hyp = []
    list_label = []
    top_indices = [torch.topk(logit, 1)[1] for logit in logits]  # one top-1 index tensor per label slot
    batch_size = label_batch.shape[0]
    num_label = len(top_indices)
    for i in range(batch_size):
        sample_hyps = [top_indices[j][i].item() for j in range(num_label)]
        sample_labels = label_batch[i, :].tolist()
        list_hyp.append([i2w[idx] for idx in sample_hyps])
        list_label.append([i2w[idx] for idx in sample_labels])

    return loss, list_hyp, list_label

"""# Fine Tuned new Model

## Load Model
"""

# Commented out IPython magic to ensure Python compatibility.
# %cd /content

# Commented out IPython magic to ensure Python compatibility.
# %cd /content/indonlu/utils

import random
import numpy as np
import pandas as pd
import torch
from torch import optim
import torch.nn.functional as F
from tqdm import tqdm

from transformers import BertForSequenceClassification, BertConfig, BertTokenizer
from nltk.tokenize import TweetTokenizer

import sys
sys.path.append('/content/indonlu/utils')

from forward_fn import forward_sequence_classification
# from metrics import document_sentiment_metrics_fn
from data_utils import DocumentSentimentDataset, DocumentSentimentDataLoader

###
# common functions
###
def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random number generators with ``seed``.

    Calls, in order: ``random.seed``, ``np.random.seed``, ``torch.manual_seed``
    and ``torch.cuda.manual_seed``.
    """
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for seeder in seeders:
        seeder(seed)

def count_param(module, trainable=False):
    """Return the total number of elements in ``module``'s parameters.

    With ``trainable=True`` only parameters whose ``requires_grad`` is set are
    counted; otherwise every parameter is counted.
    """
    params = module.parameters()
    if trainable:
        params = (p for p in params if p.requires_grad)
    return sum(p.numel() for p in params)

def get_lr(optimizer):
    """Return the ``'lr'`` of the optimizer's first parameter group.

    Returns ``None`` when the optimizer has no parameter groups.
    """
    first_group = next(iter(optimizer.param_groups), None)
    if first_group is None:
        return None
    return first_group['lr']

def metrics_to_string(metric_dict):
    """Format a metric dict as space-separated ``name:value`` pairs (2 decimals)."""
    return ' '.join(
        '{}:{:.2f}'.format(name, score) for name, score in metric_dict.items()
    )

# Fix all RNG seeds before the model is created.
set_seed(2023)

# Tokenizer, config and model all come from the same pretrained checkpoint.
pretrained_name = 'indobenchmark/indobert-large-p2'

tokenizer = BertTokenizer.from_pretrained(pretrained_name)
config = BertConfig.from_pretrained(pretrained_name)
# The number of output classes is taken from the dataset class.
config.num_labels = DocumentSentimentDataset.NUM_LABELS

# Build the classifier on top of the pretrained checkpoint.
model = BertForSequenceClassification.from_pretrained(pretrained_name, config=config)

model

count_param(model)

"""## Dataset"""

# Locations of the train / validation / test splits.
train_dataset_path = '/content/drive/MyDrive/nolimit/sentiment/sentiment-v3-bert/indonlu/dataset/smsa_doc-sentiment-prosa/train_preprocess.tsv'
valid_dataset_path = '/content/drive/MyDrive/nolimit/sentiment/sentiment-v3-bert/indonlu/dataset/smsa_doc-sentiment-prosa/valid_preprocess.tsv'
test_dataset_path = '/content/drive/MyDrive/nolimit/sentiment/sentiment-v3-bert/indonlu/dataset/smsa_doc-sentiment-prosa/test_preprocess_masked_label.tsv'

# One dataset per split, all built with the same tokenizer and lowercase=True.
train_dataset, valid_dataset, test_dataset = (
    DocumentSentimentDataset(split_path, tokenizer, lowercase=True)
    for split_path in (train_dataset_path, valid_dataset_path, test_dataset_path)
)

# Settings shared by the three loaders; only the training loader shuffles.
loader_options = dict(max_seq_len=512, batch_size=32, num_workers=4)
train_loader = DocumentSentimentDataLoader(dataset=train_dataset, shuffle=True, **loader_options)
valid_loader = DocumentSentimentDataLoader(dataset=valid_dataset, shuffle=False, **loader_options)
test_loader = DocumentSentimentDataLoader(dataset=test_dataset, shuffle=False, **loader_options)

# Label name <-> index lookups provided by the dataset class.
w2i = DocumentSentimentDataset.LABEL2INDEX
i2w = DocumentSentimentDataset.INDEX2LABEL
print(w2i)
print(i2w)

"""## Test model pre-trained"""

# Sanity check: run the freshly loaded model on a few hand-written sentences.
# The classifier head is newly initialised at this point (see the loading
# warning), so the predicted labels are not meaningful before fine-tuning.
sample_texts = [
    'Bahagia hatiku melihat pernikahan putri sulungku yang cantik jelita',
    'Budi pergi ke pondok indah mall membeli cakwe',
    'Dasar anak sialan!! Kurang ajar!!',
]

# Evaluation mode disables dropout, so re-running this cell on the same
# sentence always yields the same probabilities.
model.eval()

for text in sample_texts:
    subwords = tokenizer.encode(text)
    subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        logits = model(subwords)[0]
    label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

    print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-large-p2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'positive': 0, 'neutral': 1, 'negative': 2}
{0: 'positive', 1: 'neutral', 2: 'negative'}
Text: Bahagia hatiku melihat pernikahan putri sulungku yang cantik jelita | Label : positive (74.266%)
Text: Budi pergi ke pondok indah mall membeli cakwe | Label : positive (72.016%)
Text: Dasar anak sialan!! Kurang ajar!! | Label : positive (76.330%)


In [None]:
"""## Train new model"""
import pickle  # required by the final pickle.dump; it was not imported before

print('TRAINING DIMULAIII....')

optimizer = optim.Adam(model.parameters(), lr=3e-6)

from pytorchtools import EarlyStopping

# NOTE(review): the check / should_stop / save_checkpoint API is used exactly as
# in the original cell; confirm it matches the installed `pytorchtools` module.
early_stopping = EarlyStopping(patience=5, mode='min')

# Train
n_epochs = 10
for epoch in range(n_epochs):
    # ---- training pass -------------------------------------------------
    model.train()
    torch.set_grad_enabled(True)

    total_train_loss = 0
    list_hyp, list_label = [], []

    train_pbar = tqdm(train_loader, leave=True, total=len(train_loader))
    for i, batch_data in enumerate(train_pbar):
        # Forward model (the last element of each batch is not passed to the forward function)
        loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cpu')

        # Update model
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        tr_loss = loss.item()
        total_train_loss = total_train_loss + tr_loss

        # Collect predictions / gold labels of this epoch
        list_hyp += batch_hyp
        list_label += batch_label

        train_pbar.set_description("(Epoch {}) TRAIN LOSS:{:.4f} LR:{:.8f}".format((epoch+1),
            total_train_loss/(i+1), get_lr(optimizer)))

    # ---- validation pass -----------------------------------------------
    # Early stopping must look at the loss on held-out data once per epoch.
    # It used to be fed the running *training* loss after every batch, which
    # made `patience=5` mean "5 batches" and never used `valid_loader`.
    model.eval()
    total_valid_loss = 0
    n_valid_batches = 0
    with torch.no_grad():
        for batch_data in tqdm(valid_loader, leave=True, total=len(valid_loader)):
            loss, _, _ = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cpu')
            total_valid_loss += loss.item()
            n_valid_batches += 1
    # Guard against an empty validation loader.
    valid_loss = total_valid_loss / max(n_valid_batches, 1)
    print("(Epoch {}) VALID LOSS:{:.4f}".format((epoch+1), valid_loss))

    # Check early stopping
    early_stopping.check(val_loss=valid_loss, model=model)

    # Save checkpoint if early stopping condition is met
    if early_stopping.should_stop():
        print("Early stopping!")
        early_stopping.save_checkpoint(model=model, epoch=epoch, val_loss=valid_loss)
        break

# Save final model
# NOTE(review): pickling the whole model object ties the file to the exact
# class definitions / library versions; the file format is kept unchanged so
# existing loaders of this .pkl keep working.
filename = 'model_sentiment_bert_10ep_combine_20rb.pkl'
with open(filename, 'wb') as model_file:  # context manager closes the file handle
    pickle.dump(model, model_file)

print('TRAINING SELESAIIII....')

# Test data di cek

In [None]:
import pandas as pd

# Load the CSV of results to be checked; from here on `df` refers to this table.
df = pd.read_csv('/content/drive/MyDrive/nolimit/sentiment/sentiment-v3-bert/hasil_untuk_dicek - hasil_untuk_dicek.csv')

In [None]:
df.head(1)

Unnamed: 0,text,clean_fix,final_sentiment_human,hasil_prediksi_mesin_baru,confidence_new_model,hasil_prediksi_validator
0,-dips! Saran provider yg sinyalnya stabil kalo...,dips saran provider yg sinyalnya stabil kalo u...,negative,negative,0.999976,1


In [None]:
df.hasil_prediksi_validator.value_counts()

1    1501
0     131
2      76
?       2
Name: hasil_prediksi_validator, dtype: int64

In [None]:
# Keep only the rows whose validator verdict is not the placeholder '?',
# then renumber the index from 0.
has_verdict = df['hasil_prediksi_validator'].ne('?')
df = df.loc[has_verdict].reset_index(drop=True)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1710 entries, 0 to 1709
Data columns (total 6 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   text                       1708 non-null   object 
 1   clean_fix                  1706 non-null   object 
 2   final_sentiment_human      1708 non-null   object 
 3   hasil_prediksi_mesin_baru  1707 non-null   object 
 4   confidence_new_model       1707 non-null   float64
 5   hasil_prediksi_validator   1708 non-null   object 
dtypes: float64(1), object(5)
memory usage: 80.3+ KB


In [None]:
import warnings
# NOTE(review): this silences every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Validator codes -> sentiment names: 0 = neutral, 1 = negative, 2 = positive.
# The column contains the non-numeric entry '?', so the CSV reader stores the
# codes as text ('0', '1', '2'); replacing the integers 0/1/2 directly would
# match nothing. Parse the codes as numbers first, then map them.
validator_code_to_label = {0: 'neutral', 1: 'negative', 2: 'positive'}
validator_codes = pd.to_numeric(df['hasil_prediksi_validator'], errors='coerce')
# Values that are not one of the three codes are left exactly as they were.
df['hasil_prediksi_validator'] = (
    validator_codes.map(validator_code_to_label).fillna(df['hasil_prediksi_validator'])
)

In [None]:
# Encode the sentiment names of both prediction columns with the validator's
# integer codes (neutral=0, negative=1, positive=2). Note that this numbering
# differs from the model's own label ids (positive=0, neutral=1, negative=2).
# The stray `0Z` literal (a syntax error) is corrected to 0.
sentiment_to_code = {'positive': 2, 'negative': 1, 'neutral': 0}
prediction_columns = ['hasil_prediksi_mesin_baru', 'hasil_prediksi_validator']
df[prediction_columns] = df[prediction_columns].replace(sentiment_to_code)

In [None]:
df

In [None]:
# Drop every row that has a missing value in any column, then renumber the index from 0.
df = df.dropna().reset_index(drop=True)

In [None]:
# Cast both prediction columns to int so they can be compared directly.
for column in ('hasil_prediksi_mesin_baru', 'hasil_prediksi_validator'):
    df[column] = df[column].astype(int)

In [None]:
# Number of rows where the machine prediction equals the validator's verdict.
df['hasil_prediksi_mesin_baru'].eq(df['hasil_prediksi_validator']).sum()

1491

In [None]:
from sklearn import metrics

# Accuracy of the machine predictions against the validator's verdicts.
# Accuracy is symmetric in its two inputs, so naming the validator column as
# the reference gives the same value as the previous positional call.
metrics.accuracy_score(
    y_true=df['hasil_prediksi_validator'],
    y_pred=df['hasil_prediksi_mesin_baru'],
)

0.8744868035190616

In [None]:
# Rows where the machine prediction disagrees with the validator ("gagal" = failed).
is_mismatch = df['hasil_prediksi_mesin_baru'].ne(df['hasil_prediksi_validator'])
gagal = df[is_mismatch]

In [None]:
gagal[gagal['confidence_new_model'] >= 0.9]

Unnamed: 0,text,clean_fix,final_sentiment_human,hasil_prediksi_mesin_baru,confidence_new_model,hasil_prediksi_validator
12,@03___nakula @Le_MineraleID @KementerianLHK @a...,hahaa lagi dan lagi buzzer sekali pakai berula...,negative,0,0.931022,1
37,@AryPrasetyo_85 @BPOM_RI @sehatAQUA Kalo tau a...,kalo tau apa itu bpa para orang tua pasti akan...,negative,2,0.979902,1
50,"@ask_AXIS izin bertanya min, saya kan beli pak...",izin bertanya min saya kan beli paket owsem di...,neutral,0,0.980813,1
56,@bdngfess gpp sebulan dua bulan mah kalo lbh d...,gpp sebulan dua bulan mah kalo lbh dr kredivo...,neutral,1,0.991422,0
82,"@Bluebirdgroup mau tanya,, emang sopir dikenak...",mau tanya emang sopir dikenakan biaya kalo tel...,negative,0,0.995911,1
...,...,...,...,...,...,...
1690,Ketua Umum (Ketum) Partai Gerindra Prabowo Sub...,ketua umum ketum partai gerindra prabowo subia...,neutral,2,0.952956,0
1696,"Bagaimana ini,, sama Parpol yang membesarkanny...",bagaimana ini sama parpol yang membesarkannya ...,neutral,2,0.992536,1
1697,Hmm.. Ngebet sih ngebet tapi cobalah hargai ya...,hmm ngebet sih ngebet tapi cobalah hargai yang...,neutral,2,0.991545,1
1699,"Menteri Pariwisata dan Ekonomi Kreatif, Sandia...",menteri pariwisata dan ekonomi kreatif sandiag...,neutral,0,0.980988,2


In [None]:
gagal[gagal['confidence_new_model'] >= 0.8]

Unnamed: 0,text,clean_fix,final_sentiment_human,hasil_prediksi_mesin_baru,confidence_new_model,hasil_prediksi_validator
12,@03___nakula @Le_MineraleID @KementerianLHK @a...,hahaa lagi dan lagi buzzer sekali pakai berula...,negative,0,0.931022,1
21,@albiruen @jeanmaryllis @monraquela @oIivianne...,kredivo,neutral,1,0.845085,0
37,@AryPrasetyo_85 @BPOM_RI @sehatAQUA Kalo tau a...,kalo tau apa itu bpa para orang tua pasti akan...,negative,2,0.979902,1
50,"@ask_AXIS izin bertanya min, saya kan beli pak...",izin bertanya min saya kan beli paket owsem di...,neutral,0,0.980813,1
56,@bdngfess gpp sebulan dua bulan mah kalo lbh d...,gpp sebulan dua bulan mah kalo lbh dr kredivo...,neutral,1,0.991422,0
...,...,...,...,...,...,...
1690,Ketua Umum (Ketum) Partai Gerindra Prabowo Sub...,ketua umum ketum partai gerindra prabowo subia...,neutral,2,0.952956,0
1696,"Bagaimana ini,, sama Parpol yang membesarkanny...",bagaimana ini sama parpol yang membesarkannya ...,neutral,2,0.992536,1
1697,Hmm.. Ngebet sih ngebet tapi cobalah hargai ya...,hmm ngebet sih ngebet tapi cobalah hargai yang...,neutral,2,0.991545,1
1699,"Menteri Pariwisata dan Ekonomi Kreatif, Sandia...",menteri pariwisata dan ekonomi kreatif sandiag...,neutral,0,0.980988,2


In [None]:
gagal[gagal['confidence_new_model'] < 0.8]

Unnamed: 0,text,clean_fix,final_sentiment_human,hasil_prediksi_mesin_baru,confidence_new_model,hasil_prediksi_validator
3,.lol.kouta mu mahal2 malah harga naik,lolkouta mu malah harga naik,negative,2,0.694254,1
25,@anju0gerald @alharkan__ atau kalo bahasa disk...,atau kalo bahasa diskusi sama boeng ya balik ...,negative,0,0.728688,1
42,@AryPrasetyo_85 Info cara ngatasi sampah botol...,info cara ngatasi sampah botol aqua dong soaln...,negative,0,0.648455,1
54,@bakul_temp @arsetiawan107 @BisKota_ @gojekind...,gak semua kok yg oknum ojol cuma sisanya sam...,negative,0,0.535148,1
72,@BisKota_ @gojekindonesia @PT_Transjakarta Orm...,ormas ojol,negative,2,0.657741,1
...,...,...,...,...,...,...
1658,YES 2023 MAKIN BURIKï£¿Ã¼Ã²Ã,yes makin burikï£¿ã¼ã²ã,negative,0,0.570694,1
1695,Kenapa sih sama masyarakat aja apa lagi petani...,kenapa sih sama masyarakat aja apa lagi petani...,neutral,2,0.787320,0
1698,Sadaaar paaak.. Pariwisata Indonesia sudah bai...,sadaaar paaak pariwisata indonesia sudah baik ...,neutral,2,0.530324,1
1700,@sandiuno @Kemenparekraf Udah gituu aja?? Kunj...,udah gituu aja kunjungan terus melihathemmmh,neutral,2,0.760144,0
