## Normalizer

In [29]:
import nltk
import re
import numpy as np
import pandas as pd
from nlp_id.tokenizer import Tokenizer
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary
from nlp_id.stopword import StopWord 
from nltk.corpus import stopwords
import words as w

class normalizer():
    def __init__(self):
        nltk.download('stopwords')
        stopwords_sastrawi = StopWordRemoverFactory()
        stopwords_nlpid = StopWord() 
        stopwords_nltk = stopwords.words('indonesian')
        stopwords_github = list(np.array(pd.read_csv("Utils/stopwords.txt", header=None).values).squeeze())
        more_stopword = w.custom_stopwords
        data_stopword = stopwords_sastrawi.get_stop_words() + stopwords_nlpid.get_stopword() + stopwords_github + stopwords_nltk + more_stopword 
        data_stopword = list(set(data_stopword))

        # Only use 'rt' as stopwords
        data_stopword = list(set(data_stopword))

        # Combine slang dictionary
        import json
        with open('Utils/slang.txt') as f:
            data = f.read()
        data_slang = json.loads(data) 

        with open('Utils/sinonim.txt') as f:
            data = f.readlines()
        for line in data:
            word = line.split('=')
            data_slang[word[0].strip()] = word[1].strip()

        # print(data_slang)
        more_dict = w.custom_dict
        data_slang.update(more_dict)

        self.stopwords, self.slang = data_stopword, data_slang
        self.tokenizer = Tokenizer()


    def normalize(self,text):
        text = text.lower()
  
        # Change HTML entities
        text = text.replace('&amp;', 'dan')
        text = text.replace('&gt;', 'lebih dari')
        text = text.replace('&lt;', 'kurang dari')
        
        # Remove url
        text = re.sub(r'http\S+', 'httpurl', text)
        
        # Remove HTML tags
        text = re.sub(r'<.*?>', ' ', text)
        
        # Remove hashtags
        text = re.sub(r'#\w+', ' ', text)
        
        # Replace @mentions with 'user'
        text = re.sub(r'@\w+', 'user', text)

        # Remove non-letter characters
        text = re.sub('[^a-zA-z]', ' ', text)

        # Remove excess space
        text = re.sub(' +', ' ', text)
        text = text.strip()

        result = []
        word_token = self.tokenizer.tokenize(text) # Tokenize words
        for word in word_token:
            word = word.strip().lower() # Case Folding to Lower Case
            if word in self.slang:
                word = self.slang[word]
            if word not in self.stopwords: # Stopwords removal
                result.append(word)
            else:
                continue
        return result

In [30]:
normalize = normalizer()
test_text = "Luar biasa! Coba kita bayangkan apa yg bakal terjadi jika Ketua MK, Ketua MA, Panglima TNI, Jaksa Agung, Ketua KPK, Kepala BIN, dan Kapolri juga dgn menggunakan alasan yg sama ikut cawe2 dlm memenangkan Capres-Cawapres tertentu dlm Pemilu 2024? Itukah maksudnya?#RakyatMonitor#"
print(normalize.normalize(test_text))

[nltk_data] Downloading package stopwords to C:\Users\Bayu Adjie
[nltk_data]     Sidharta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['coba', 'bayangkan', 'ketua', 'mk', 'ketua', 'panglima', 'tni', 'jaksa', 'agung', 'ketua', 'kpk', 'kepala', 'bin', 'kapolri', 'alasan', 'cawe', 'memenangkan', 'capres', 'cawapres', 'pemilu', 'maksud']


## Barasa

In [42]:
from nltk.corpus.reader.wordnet import Synset
from nltk.corpus.reader import WordNetError
from nltk.corpus import wordnet as wn
import nltk
from nlp_id.tokenizer import Tokenizer
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary
from nlp_id.stopword import StopWord 
from nltk.corpus import stopwords
import words as w
import numpy as np
import pandas as pd
import spacy
import re


class SentiSynset:
    def __init__(self, pos_score, neg_score, synset):
        self._pos_score = pos_score
        self._neg_score = neg_score
        self._obj_score = 1.0 - (self._pos_score + self._neg_score)
        self.synset = synset


    def pos_score(self):
        return self._pos_score


    def neg_score(self):
        return self._neg_score


    def obj_score(self):
        return self._obj_score


    def __str__(self):
        """Prints just the Pos/Neg scores for now."""
        s = "<"
        s += self.synset.name() + ": "
        s += "PosScore=%s " % self._pos_score
        s += "NegScore=%s" % self._neg_score
        s += ">"
        return s

    def __repr__(self):
        return "Senti" + repr(self.synset)




class CustomSentiWordNet(object):
    def __init__(self):
        with open("Utils/barasa.txt", "r", encoding="utf-8") as f:
            lines = f.readlines()
        # create empty 2d dict
        synsets = {}
        id_dict = {}
        for line in lines:
            if line.startswith("#"):
                continue
            parts = line.strip().split("\t")
            if len(parts) != 6:
                continue
            synset_id = parts[0]

            if synset_id not in synsets:
                synsets[synset_id] = {}
            
            synset = {}
            id, lang, goodness, lemma, pos, neg = parts
            pos = float(pos)
            neg = float(neg)
            synsets[synset_id][lemma] = (pos, neg, 1 - (pos + neg))
            id_dict[lemma] = synset_id

        self.lemma_dict = id_dict
        self.synsets = synsets
        self.not_found = {}
    
    def _get_synset(self, synset_id):
        # helper function to map synset_id to synset
        synsets = self.synsets[synset_id]
        return synsets
        
        
    
    def _get_pos_file(self, pos):
        # helper function to map WordNet POS tags to file names
        if pos == 'n':
            return 'noun'
        elif pos == 'v':
            return 'verb'
        elif pos == 'a' or pos == 's':
            return 'adj'
        elif pos == 'r':
            return 'adv'
        else:
            raise WordNetError('Unknown POS tag: {}'.format(pos))
    
    
    def senti_synset(self, synset_id):
        pos_score,neg_score,obj_score = self.synsets[synset_id]
        synset = self._get_synset(synset_id)
        return SentiSynset(synset, pos_score, neg_score)
    
    def calculate_sentiment(self,tokens):
        pos = []
        neg = []
        found = []
        for token in tokens:
            if token not in self.lemma_dict:
                self.not_found[token] = self.not_found.get(token, 0) + 1
                continue
            synsets = self.synsets[self.lemma_dict[token]][token]
            pos_score, neg_score, obj_score = synsets
            # print(token)
            print("Found {} with pos {} and neg {}".format(token, pos_score, neg_score))
            pos.append(pos_score)
            neg.append(neg_score)
            found.append(token)
        print("Found {} out of {} tokens".format(len(found), len(tokens)))
        print(", ".join(found))
        print("Unique tokens found: {}".format(len(set(found))))
        return pos, neg
    
    def get_not_found(self):
        return self.not_found

In [43]:
barasa = CustomSentiWordNet()
print(barasa.calculate_sentiment(normalize.normalize(test_text)))

Found coba with pos 0.0 and neg 0.0
Found bayangkan with pos 0.0 and neg 0.0
Found ketua with pos 0.0 and neg 0.0
Found ketua with pos 0.0 and neg 0.0
Found panglima with pos 0.0 and neg 0.0
Found jaksa with pos 0.0 and neg 0.0
Found agung with pos 0.0 and neg 0.0
Found ketua with pos 0.0 and neg 0.0
Found kepala with pos 0.0 and neg 0.0
Found alasan with pos 0.625 and neg 0.0
Found memenangkan with pos 0.125 and neg 0.0
Found maksud with pos 0.0 and neg 0.125
Found 12 out of 21 tokens
coba, bayangkan, ketua, ketua, panglima, jaksa, agung, ketua, kepala, alasan, memenangkan, maksud
Unique tokens found: 10
([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.625, 0.125, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.125])


In [12]:
barasa.get_not_found()

{'mk': 1,
 'tni': 1,
 'kpk': 1,
 'bin': 1,
 'kapolri': 1,
 'cawe': 1,
 'capres': 1,
 'cawapres': 1,
 'pemilu': 1}

## Inset

In [104]:
import pandas as pd
import numpy as np
from nltk import ngrams

def read_inset(path):
    sentiments = {}
    with open(path, 'r') as f:
        lines = f.readlines()
    for line in lines:
        if line.startswith('#'):
            continue
        word, sentiment = line.split('\t')
        sentiments[word] = int(sentiment)
    print(len(sentiments))
    return sentiments

def print_n_grams(unigrams, bigrams, trigrams):
    print('Unigrams: ', ', '.join(unigrams))
    print('Bigrams: ', ', '.join(bigrams))
    print('Trigrams: ', ', '.join(trigrams))

    

class inSet():
    def __init__(self, verbose = False):
        self.pos = read_inset('Utils/Inset/positive.tsv')
        self.neg = read_inset('Utils/Inset/negative.tsv')
        self.verbose = verbose

    def delete_word_from_text(self, text, word):
        text = text.replace(word, '', 1)
        return text
    
    
    def calculate_n_gram(self, text):
        unigrams = ngrams(text.split(), 1)
        bigrams = ngrams(text.split(), 2)
        trigrams = ngrams(text.split(), 3)

        unigrams = [' '.join(grams) for grams in unigrams]
        bigrams = [' '.join(grams) for grams in bigrams]
        trigrams = [' '.join(grams) for grams in trigrams]

        return unigrams, bigrams, trigrams
    
    def recalculate_n_grams(self, text, word):
        text = self.delete_word_from_text(text, word)
        unigrams, bigrams, trigrams = self.calculate_n_gram(text)
        if self.verbose:
            print_n_grams(unigrams, bigrams, trigrams)
        return unigrams, bigrams, trigrams, text

    def calculate_inset_score(self, text):
        text_length = len(text.split())
        unigrams, bigrams, trigrams = self.calculate_n_gram(text)
        pos_score = 0
        neg_score = 0
        found = []
        for trigram in trigrams:
            if trigram in self.pos:
                if self.verbose:
                    print('Hit Trigram Pos ', trigram)
                print('Positive Trigram {} with score {}'.format(trigram, self.pos[trigram]))
                pos_score += self.pos[trigram]
                found.append(trigram)
                unigrams, bigrams, trigrams, text = self.recalculate_n_grams(text, trigram)
            if trigram in self.neg:
                if self.verbose:
                    print('Hit Trigram Neg ', trigram)
                print('Negative Trigram {} with score {}'.format(trigram, self.neg[trigram]))
                neg_score += self.neg[trigram]
                unigrams, bigrams, trigrams, text = self.recalculate_n_grams(text, trigram)
        

        for bigram in bigrams:
            if bigram in self.pos:
                if self.verbose:
                    print('Hit Bigram Pos ', bigram)
                print('Positive Bigram {} with score {}'.format(bigram, self.pos[bigram]))
                pos_score += self.pos[bigram]
                found.append(bigram)
                unigrams, bigrams, trigrams, text = self.recalculate_n_grams(text, bigram)

            if bigram in self.neg:
                if self.verbose:
                    print('Hit Bigram Neg ', bigram)
                print('Negative Bigram {} with score {}'.format(bigram, self.neg[bigram]))
                neg_score += self.neg[bigram]
                unigrams, bigrams, trigrams, text = self.recalculate_n_grams(text, bigram)

        for unigram in unigrams:
            if unigram in self.pos:
                if self.verbose:
                    print('Hit Unigram Pos ', unigram)
                print('Positive Unigram {} with score {}'.format(unigram, self.pos[unigram]))
                pos_score += self.pos[unigram]
                found.append(unigram)
                unigrams, bigrams, trigrams, text = self.recalculate_n_grams(text, unigram)

            if unigram in self.neg:
                if self.verbose:
                    print('Hit Unigram Neg ', unigram)
                print('Negative Unigram {} with score {}'.format(unigram, self.neg[unigram]))   
                neg_score += self.neg[unigram]
                unigrams, bigrams, trigrams, text = self.recalculate_n_grams(text, unigram)
        print("Found {} out of {} tokens".format(len(found), text_length))
        print(", ".join(found))
        print("Unique tokens found: {}".format(len(set(found))))
        return pos_score, neg_score

In [105]:
insett = inSet(verbose=False)
insett.calculate_inset_score(' '.join(normalize.normalize(test_text)))

3609
6607
Positive Unigram coba with score 2
Negative Unigram coba with score -1
Positive Unigram panglima with score 3
Positive Unigram maksud with score 3
Negative Unigram maksud with score -1
Found 3 out of 21 tokens
coba, panglima, maksud
Unique tokens found: 3


(8, -2)

## IndoBERTweet Pred

In [17]:
import json, glob, os, random
import argparse
import logging
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import f1_score, accuracy_score
from transformers import BertTokenizer, BertModel, BertConfig
from transformers import AdamW, get_linear_schedule_with_warmup
import re, emoji
from datetime import datetime



logger = logging.getLogger(__name__)
model_dict = { 'indobertweet': 'indolem/indobertweet-base-uncased',
               'indobert': 'indolem/indobert-base-uncased'}


def find_url(string):
    # with valid conditions for urls in string 
    regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    url = re.findall(regex,string)
    return [x[0] for x in url]

def preprocess_tweet(tweet):
    tweet = emoji.demojize(tweet).lower()
    new_tweet = []
    for word in tweet.split():
        if word[0] == '@' or word == '[username]':
            new_tweet.append('@USER')
        elif find_url(word) != []:
            new_tweet.append('HTTPURL')
        elif word == 'httpurl' or word == '[url]':
            new_tweet.append('HTTPURL')
        else:
            new_tweet.append(word)
    return ' '.join(new_tweet)

def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)


class BertData():
    def __init__(self, args):
        self.tokenizer = BertTokenizer.from_pretrained(model_dict[args.bert_model], do_lower_case=True)
        self.sep_token = '[SEP]'
        self.cls_token = '[CLS]'
        self.pad_token = '[PAD]'
        self.sep_vid = self.tokenizer.vocab[self.sep_token]
        self.cls_vid = self.tokenizer.vocab[self.cls_token]
        self.pad_vid = self.tokenizer.vocab[self.pad_token]
        self.MAX_TOKEN = args.max_token

    def preprocess_one(self, src_txt):
        src_txt = preprocess_tweet(src_txt)
        src_subtokens = [self.cls_token] + self.tokenizer.tokenize(src_txt) + [self.sep_token]        
        src_subtoken_idxs = self.tokenizer.convert_tokens_to_ids(src_subtokens)
        
        if len(src_subtoken_idxs) > self.MAX_TOKEN:
            src_subtoken_idxs = src_subtoken_idxs[:self.MAX_TOKEN]
            src_subtoken_idxs[-1] = self.sep_vid
        else:
            src_subtoken_idxs += [self.pad_vid] * (self.MAX_TOKEN-len(src_subtoken_idxs))
        segments_ids = [0] * len(src_subtoken_idxs)
        assert len(src_subtoken_idxs) == len(segments_ids)
        return src_subtoken_idxs, segments_ids
    
    def preprocess(self, src_txts):
        output = []
        for idx in range(len(src_txts)):
            output.append(self.preprocess_one(src_txts[idx]))
        return output


class Batch():
    def __init__(self, data, idx, batch_size, device):
        cur_batch = data[idx:idx+batch_size]
        src = torch.tensor([x[0] for x in cur_batch])
        seg = torch.tensor([x[1] for x in cur_batch])
        # label = torch.tensor([x[2] for x in cur_batch])
        mask_src = 0 + (src != 0)
        
        self.src = src.to(device)
        self.seg= seg.to(device)
        # self.label = label.to(device)
        self.mask_src = mask_src.to(device)

    def get(self):
        return self.src, self.seg, self.mask_src


class Model(nn.Module):
    def __init__(self, args, device):
        super(Model, self).__init__()
        self.args = args
        self.device = device
        self.tokenizer = BertTokenizer.from_pretrained(model_dict[args.bert_model], do_lower_case=True)
        self.bert = BertModel.from_pretrained(model_dict[args.bert_model])
        self.linear = nn.Linear(self.bert.config.hidden_size, args.vocab_label_size)
        self.dropout = nn.Dropout(0.2)
        self.loss = torch.nn.CrossEntropyLoss(ignore_index=args.vocab_label_size, reduction='sum')


    def forward(self, src, seg, mask_src):
        top_vec, _ = self.bert(input_ids=src, token_type_ids=seg, attention_mask=mask_src, return_dict=False)
        top_vec = self.dropout(top_vec)
        top_vec *= mask_src.unsqueeze(dim=-1).float()
        top_vec = torch.sum(top_vec, dim=1) / mask_src.sum(dim=-1).float().unsqueeze(-1)
        conclusion = self.linear(top_vec).squeeze()
        return conclusion
    
    def get_loss(self, src, seg, label, mask_src):
        output = self.forward(src, seg, mask_src)
        return self.loss(output.view(-1,self.args.vocab_label_size), label.view(-1))

    def predict(self, src, seg, mask_src):
        output = self.forward(src, seg, mask_src)
        batch_size = output.shape[0]
        prediction = torch.argmax(output, dim=-1).data.cpu().numpy().tolist()
        return prediction


def prediction(dataset, model, args):
    preds = []
    # golds = []
    model.eval()
    for j in range(0, len(dataset), args.batch_size):
        src, seg, mask_src = Batch(dataset, j, args.batch_size, args.device).get()
        preds += model.predict(src, seg, mask_src)
        # golds += label.cpu().data.numpy().tolist()
    return preds

def create_vocab(labels):
    unique = np.unique(labels)
    label2id = {}
    id2label = {}
    counter = 0
    for word in unique:
        label2id[word] = counter
        id2label[counter] = word
        counter += 1
    return label2id, id2label

def convert_label2id(label2id, labels):
    return [label2id[x] for x in labels]

def save_df(pred, id2label):
    ids = np.arange(len(pred))
    pred = [id2label[p] for p in pred]
    df = pd.DataFrame()
    df['index']=ids
    df['label']=pred
    df.to_csv('pred_bertW.csv', index=False)

def train(args, train_dataset, dev_dataset, test_dataset, model, id2label):
    """ Train the model """
    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    t_total = len(train_dataset) // args.batch_size * args.num_train_epochs
    args.warmup_steps = int(0.1 * t_total)
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Total optimization steps = %d", t_total)
    logger.info("  Warming up = %d", args.warmup_steps)
    logger.info("  Patience  = %d", args.patience)

    # Added here for reproductibility
    global best_model
    set_seed(args)
    tr_loss = 0.0
    global_step = 1
    best_f1_dev = 0
    cur_patience = 0
    for i in range(int(args.num_train_epochs)):
        random.shuffle(train_dataset)
        epoch_loss = 0.0
        for j in range(0, len(train_dataset), args.batch_size):
            src, seg, label, mask_src = Batch(train_dataset, j, args.batch_size, args.device).get()
            model.train()
            loss = model.get_loss(src, seg, label, mask_src)
            loss = loss.sum()/args.batch_size
            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel (not distributed) training
            loss.backward()

            tr_loss += loss.item()
            epoch_loss += loss.item()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            model.zero_grad()
            global_step += 1
        logger.info("Finish epoch = %s, loss_epoch = %s", i+1, epoch_loss/global_step)
        dev_f1, _ = prediction(dev_dataset, model, args)
        if dev_f1 > best_f1_dev:
            best_f1_dev = dev_f1
            _, test_pred = prediction(test_dataset, model, args)
            save_df(test_pred, id2label)
            #SAVE
            cur_patience = 0
            logger.info("Better, BEST F1 in DEV = %s, SAVE TEST!", best_f1_dev)
            best_model = model.state_dict()
            print(best_model)
          
        else:
            cur_patience += 1
            if cur_patience == args.patience:
                logger.info("Early Stopping Not Better, BEST F1 in DEV = %s", best_f1_dev)
                break
            else:
                logger.info("Not Better, BEST F1 in DEV = %s", best_f1_dev)

    return global_step, tr_loss / global_step, best_f1_dev


args_parser = argparse.ArgumentParser()
args_parser.add_argument('--bert_model', default='indobertweet', choices=['indobert', 'indobertweet'], help='select one of models')
args_parser.add_argument('--data_path', default='./indobert_smsa/data/', help='path to all train/test/dev')
args_parser.add_argument('--output_dir', default='/content/gdrive/MyDrive/TA_Bayu-05111940000172/Indobert/SMsA/Model/', help='path to save model')
args_parser.add_argument('--max_token', type=int, default=128, help='maximum token allowed for 1 instance')
args_parser.add_argument('--batch_size', type=int, default=30, help='batch size')
args_parser.add_argument('--learning_rate', type=float, default=5e-5, help='learning rate')
args_parser.add_argument('--weight_decay', type=int, default=0, help='weight decay')
args_parser.add_argument('--adam_epsilon', type=float, default=1e-8, help='adam epsilon')
args_parser.add_argument('--max_grad_norm', type=float, default=1.0)
args_parser.add_argument('--num_train_epochs', type=int, default=20, help='total epoch')
args_parser.add_argument('--warmup_steps', type=int, default=242, help='warmup_steps, the default value is 10% of total steps')
args_parser.add_argument('--logging_steps', type=int, default=200, help='report stats every certain steps')
args_parser.add_argument('--seed', type=int, default=2021)
args_parser.add_argument('--local_rank', type=int, default=-1)
args_parser.add_argument('--patience', type=int, default=5, help='patience for early stopping')
args_parser.add_argument('--no_cuda', default=False)
args = args_parser.parse_args()




# Setup CUDA, GPU & distributed training
if args.local_rank == -1 or args.no_cuda:
    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()
else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
    torch.cuda.set_device(args.local_rank)
    device = torch.device("cuda", args.local_rank)
    torch.distributed.init_process_group(backend="nccl")
    args.n_gpu = 1
args.device = device

logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
)

set_seed(args)



# Load pretrained model and tokenizer
if args.local_rank not in [-1, 0]:
    # Make sure only the first process in distributed training will download model & vocab
    torch.distributed.barrier()

if args.local_rank == 0:
    # Make sure only the first process in distributed training will download model & vocab
    torch.distributed.barrier()

bertdata = BertData(args)

trainset = pd.read_csv(args.data_path+'train_preprocess.tsv', sep='\t')
df = pd.read_csv('data_with_replies/Final/Result/Jawa_User.csv')
df = df.head(15)
# devset = pd.read_csv(args.data_path+'valid_preprocess.tsv', sep='\t')
# testset = pd.read_csv(args.data_path+'test_preprocess_masked_label.tsv', sep='\t')
xtrain, ytrain = list(trainset['text']), list(trainset['label'])
# xdev, ydev = list(devset['text']), list(devset['label'])
# xtest, ytest = list(testset['text']), list(testset['label'])

label2id, id2label = create_vocab (ytrain)
# ytrain =  convert_label2id (label2id, ytrain)
# ydev =  convert_label2id (label2id, ydev)
# ytest =  convert_label2id (label2id, ytest)
args.vocab_label_size = len(label2id)

model = Model(args, device)
best_model = model.state_dict()

model.to(args.device)
model.load_state_dict(torch.load('indobert_smsa\model_SMSA.pt', map_location=args.device))

print(model)


res = pd.read_csv("indobert_smsa/result.csv")
start_pos = len(res)
now = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
print(now)
print("starting position = ", start_pos)


try:
    for i in range(start_pos,len(df),1000):
        print(i)
        batch = df.iloc[i:i+1000]
        tweets = batch['content'].tolist()
        index = batch['tweetID'].tolist()
        tweets = bertdata.preprocess(tweets)
        pred = prediction(tweets, model, args)

        dataframe = pd.DataFrame({'tweetID': index, 'label': pred})
        res = pd.concat([res, dataframe], ignore_index=True)
        res.to_csv("indobert_smsa/result.csv", index=False)
        now = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
        print("Saved at ",now)
except:
    print("Error at ",i)
    dataframe = pd.DataFrame({'tweetID': index, 'label': pred})
    res = pd.concat([res, dataframe], ignore_index=True)
    res.to_csv("indobert_smsa/result.csv", index=False)
    now = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
    print("Saved at ",now)


usage: ipykernel_launcher.py [-h] [--bert_model {indobert,indobertweet}]
                             [--data_path DATA_PATH] [--output_dir OUTPUT_DIR]
                             [--max_token MAX_TOKEN] [--batch_size BATCH_SIZE]
                             [--learning_rate LEARNING_RATE]
                             [--weight_decay WEIGHT_DECAY]
                             [--adam_epsilon ADAM_EPSILON]
                             [--max_grad_norm MAX_GRAD_NORM]
                             [--num_train_epochs NUM_TRAIN_EPOCHS]
                             [--warmup_steps WARMUP_STEPS]
                             [--logging_steps LOGGING_STEPS] [--seed SEED]
                             [--local_rank LOCAL_RANK] [--patience PATIENCE]
                             [--no_cuda NO_CUDA]
ipykernel_launcher.py: error: unrecognized arguments: --ip=127.0.0.1 --stdin=9003 --control=9001 --hb=9000 --Session.signature_scheme="hmac-sha256" --Session.key=b"281d14dc-e5f0-4f9f-8bd0-fca4b0d97102" 

SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [1]:
import pandas as pd

def get_n_keywords(text):
    total = 0
    for word in text.split():
        if word in keyword['keyword']:
            total = keyword['keyword'][word] + total
            if keyword['keyword'][word] == 1:
                print(word)
    return total

keywords = pd.read_csv('Utils/keyword.csv', sep=';',encoding = 'unicode_escape')
keywords['text'] = keywords['text'].astype(str)
keywords['text'] = keywords['text'].apply(lambda x: x.lower())

keywords = keywords.drop(columns=['count'])
keywords.set_index('text', inplace=True)

keyword = keywords.to_dict()
keyword

get_n_keywords('penerus bangsa kita jokowi dodo jk presiden , nomor 1 diatas segalanya, indonesia')


bangsa
jokowi
presiden
1
indonesia


5

In [2]:
df = pd.read_csv('tagged_joined.csv', sep=';')
df = df.dropna()
df = df[df['tag_overall'] != 5]
df = df[df['tag_overall'] != 4]
df['tag_overall'].value_counts()
df.columns

Index(['id', 'username', 'tweetID', 'content', 'likeCount', 'retweetCount',
       'quoteCount', 'replyCount', 'label', 'weighted_sentiment',
       'pos_sentiword', 'neg_sentiword', 'len', 'pos_inset', 'neg_inset',
       'normalized', 'len_normalized', 'n_keywords', 'tag_owi', 'tag_prab',
       'tag_overall'],
      dtype='object')

In [20]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import numpy as np

pos_sentiword = df['pos_sentiword'].tolist()
neg_sentiword = df['neg_sentiword'].tolist()

y_pred = []

for i in range(len(pos_sentiword)):
    delta = pos_sentiword[i] - neg_sentiword[i]
    if delta > 0.5:
        y_pred.append(3)
    elif delta < -0.5:
        y_pred.append(1)
    else:
        y_pred.append(2)

y_true = df['tag_overall'].tolist()
print(classification_report(y_true, y_pred, target_names=['neg', 'neu', 'pos']))
print(confusion_matrix(y_true, y_pred))

              precision    recall  f1-score   support

         neg       0.54      0.15      0.23       350
         neu       0.32      0.68      0.44       242
         pos       0.27      0.24      0.26       175

    accuracy                           0.34       767
   macro avg       0.38      0.36      0.31       767
weighted avg       0.41      0.34      0.30       767

[[ 52 241  57]
 [ 22 165  55]
 [ 23 110  42]]


In [5]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import numpy as np

pos_sentiword = df['posSentiword'].tolist()
neg_sentiword = df['negSentiword'].tolist()
y_true = df['tag_overall'].tolist()

KeyError: 'posSentiword'

In [30]:
np.linspace(1, len(df), 5, dtype = np.int64) 

array([  1, 192, 384, 575, 767], dtype=int64)

In [11]:
max_acc = 0
best_treshold = 0

for i in range(1, 20):
    treshold = i / 4
    y_pred = []


    for x in range(len(pos_sentiword)):
        delta = pos_sentiword[x] - neg_sentiword[x]
        if delta > treshold:
            y_pred.append(3)
        elif delta < -treshold:
            y_pred.append(1)
        else:
            y_pred.append(2)
    accuracy = np.sum(np.array(y_true) == np.array(y_pred)) / len(y_true)
    print(str(treshold) + ";" + str(accuracy) + " SSS")
    # if accuracy > max_acc:
    #     max_acc = accuracy
    #     best_treshold = treshold
    #     print(classification_report(y_true, y_pred, target_names=['neg', 'neu', 'pos']))
    #     print(confusion_matrix(y_true, y_pred))
    #     print("Max Accuracy = ", max_acc, "Best Treshold = ", best_treshold)
    #     print("=====================================")
    

0.25;0.3224115334207077 SSS
0.5;0.33551769331585846 SSS
0.75;0.32765399737876805 SSS
1.0;0.3237221494102228 SSS
1.25;0.3302752293577982 SSS
1.5;0.3328964613368283 SSS
1.75;0.3197903014416776 SSS
2.0;0.3197903014416776 SSS
2.25;0.31847968545216254 SSS
2.5;0.31716906946264745 SSS
2.75;0.3132372214941022 SSS
3.0;0.3132372214941022 SSS
3.25;0.3132372214941022 SSS
3.5;0.3132372214941022 SSS
3.75;0.3119266055045872 SSS
4.0;0.3119266055045872 SSS
4.25;0.3119266055045872 SSS
4.5;0.3119266055045872 SSS
4.75;0.3119266055045872 SSS


In [3]:
pos_inset = df['pos_inset'].tolist()
neg_inset = df['neg_inset'].tolist()
y_true = df['tag_overall'].tolist()

In [24]:
from sklearn.metrics import accuracy_score
max_acc = 0
best_treshold = 0

for i in range(1, 20, 1):
    treshold = i 
    y_pred = []


    for x in range(len(pos_inset)):
        delta = pos_inset[x] + neg_inset[x]
        if delta > treshold:
            y_pred.append(3)
        elif delta < - treshold:
            y_pred.append(1)
        else:
            y_pred.append(2)
    # print("Treshold = ", treshold)
    accuracy = accuracy_score(y_true, y_pred)
    # print("Accuracy = ", accuracy)
    print(str(treshold) + ";" + str(accuracy) + " SSS")
    if accuracy > max_acc:
        max_acc = accuracy
        best_treshold = treshold
        # print(classification_report(y_true, y_pred, target_names=['neg', 'neu', 'pos']))
        # print(confusion_matrix(y_true, y_pred))
        # print("Max Accuracy = ", max_acc, "Best Treshold = ", best_treshold)
        # print("=====================================")
    

1;0.4198174706649283 SSS
2;0.4315514993481095 SSS
3;0.4302477183833116 SSS
4;0.4315514993481095 SSS
5;0.423728813559322 SSS
6;0.42633637548891784 SSS
7;0.40547588005215124 SSS
8;0.39374185136897 SSS
9;0.3833116036505867 SSS
10;0.37809647979139505 SSS
11;0.37027379400260757 SSS
12;0.3650586701434159 SSS
13;0.37157757496740546 SSS
14;0.37027379400260757 SSS
15;0.36766623207301175 SSS
16;0.3663624511082138 SSS
17;0.35723598435462844 SSS
18;0.35071707953063885 SSS
19;0.34028683181225555 SSS


In [61]:
pos_inset = df['pos_inset'].tolist()
neg_inset = df['neg_inset'].tolist()
y_true = df['tag_overall'].tolist()

In [6]:
y_pred = []

for i in range(len(pos_inset)):
    delta = pos_inset[i] + neg_inset[i]
    if delta > 2:
        y_pred.append(3)
    elif delta < -2:
        y_pred.append(1)
    else:
        y_pred.append(2)

print(classification_report(y_true, y_pred, target_names=['neg', 'neu', 'pos']))
print(confusion_matrix(y_true, y_pred))

              precision    recall  f1-score   support

         neg       0.54      0.50      0.52       350
         neu       0.43      0.23      0.30       242
         pos       0.32      0.57      0.41       175

    accuracy                           0.43       767
   macro avg       0.43      0.43      0.41       767
weighted avg       0.46      0.43      0.43       767

[[175  45 130]
 [100  56  86]
 [ 47  28 100]]


In [62]:
y_pred = []

for i in range(len(pos_inset)):
    delta = pos_inset[i] + neg_inset[i]
    if delta > 4:
        y_pred.append(3)
    elif delta < -4:
        y_pred.append(1)
    else:
        y_pred.append(2)

print(classification_report(y_true, y_pred, target_names=['neg', 'neu', 'pos']))
print(confusion_matrix(y_true, y_pred))

              precision    recall  f1-score   support

         neg       0.55      0.41      0.47       350
         neu       0.40      0.40      0.40       242
         pos       0.34      0.50      0.40       175

    accuracy                           0.43       767
   macro avg       0.43      0.44      0.43       767
weighted avg       0.46      0.43      0.44       767

[[145 100 105]
 [ 77  98  67]
 [ 40  47  88]]


In [28]:
df['pos_inset'].describe()
    

count    767.000000
mean      13.859192
std        9.459878
min        0.000000
25%        7.000000
50%       12.000000
75%       19.000000
max       51.000000
Name: pos_inset, dtype: float64

In [29]:
df['neg_inset'].describe()

count    767.000000
mean     -14.186441
std        9.950159
min      -82.000000
25%      -20.000000
50%      -13.000000
75%       -7.000000
max        0.000000
Name: neg_inset, dtype: float64

In [57]:
pos = df['pos_inset'].quantile([0, 0.25, 0.5, 0.75, 1]).tolist()
neg = df['neg_inset'].quantile([0, 0.25, 0.5, 0.75, 1]).tolist()


for i in range(len(pos) - 1) :
    # print number of values in between each quantile
    print(df[(df['pos_inset'] >= pos[i]) & (df['pos_inset'] < pos[i+1])].shape[0])
          
for i in range(len(neg) - 1) :
    # print number of values in between each quantile
    print(df[(df['neg_inset'] >= neg[i]) & (df['neg_inset'] < neg[i+1])].shape[0])


169
196
199
202
178
176
212
154


In [44]:
y_pred = []

for i in range(len(pos_inset)):
    delta = pos_inset[i] + neg_inset[i]
    if delta > 10:
        y_pred.append(3)
    elif delta < -10:
        y_pred.append(1)
    else:
        y_pred.append(2)

y_true = df['tag_overall'].tolist()
print(classification_report(y_true, y_pred, target_names=['neg', 'neu', 'pos']))
print(confusion_matrix(y_true, y_pred))

              precision    recall  f1-score   support

         neg       0.56      0.23      0.32       349
         neu       0.34      0.69      0.46       239
         pos       0.31      0.25      0.27       175

    accuracy                           0.38       763
   macro avg       0.41      0.39      0.35       763
weighted avg       0.44      0.38      0.35       763

[[ 79 208  62]
 [ 40 166  33]
 [ 21 111  43]]


In [4]:
df['posSentiword'].describe()

count    763.000000
mean       0.510976
std        0.522797
min        0.000000
25%        0.125000
50%        0.375000
75%        0.750000
max        3.875000
Name: posSentiword, dtype: float64

In [6]:
df['negSentiword'].describe()

count    763.000000
mean       0.422182
std        0.599809
min        0.000000
25%        0.000000
50%        0.250000
75%        0.625000
max        6.750000
Name: negSentiword, dtype: float64

In [48]:
pos = df['pos_sentiword'].quantile([0, 0.25, 0.5, 0.75, 1]).tolist()
neg = df['neg_sentiword'].quantile([0, 0.25, 0.5, 0.75, 1]).tolist()


In [56]:
neg

[0.0, 0.0, 0.25, 0.625, 6.75]

In [55]:
for i in range(len(pos) - 1) :
    # print number of values in between each quantile
    print(df[(df['pos_sentiword'] >= pos[i]) & (df['pos_sentiword'] < pos[i+1])].shape[0])
          
for i in range(len(neg) - 1) :
    # print number of values in between each quantile
    print(df[(df['neg_sentiword'] >= neg[i]) & (df['neg_sentiword'] < neg[i+1])].shape[0])


163
166
215
222
0
353
203
210


In [44]:
for i in range(len(neg)):
    # print number of data in each quantile range
    if i == len(neg) - 1:
        print(df[(df['neg_sentiword'] >= neg[i])].shape[0])
    else:  
        print(df[(df['neg_sentiword'] >= neg[i]) & (df['neg_sentiword'] <= neg[i+1])].shape[0])


449
241
211


In [7]:
df.head()

Unnamed: 0,id,username,tweetID,content,likes,retweets,quotes,replies,BERTlabel,weightedBERTlabel,posSentiword,negSentiword,posInset,negInset,n_keywords,tag_overall
0,0,Singhasari1982,1074194773559304194,@marierteman @prabowo PRABOWO AJA\nPRABOWO AJA...,2,0,0,0,-1,-1,0.0,0.0,0,0,23,3.0
1,1,NurSyahbana9,1067997009859141632,ULAMA PEWARIS NABI\nApa bedanya antara Ulama p...,87,24,2,13,-1,-67,0.0,0.0,42,-2,19,1.0
2,2,didienAZHAR,1115833133964910597,Lima kueri tertinggi di antaranya yakni kampan...,1,1,0,0,0,0,0.125,0.0,18,0,18,3.0
3,3,fariji_lacak,1052383624698314753,@YCH7168 @mochamadarip @NaneDianti @GunRomli *...,4,2,0,0,-1,-4,0.5,0.125,37,-19,17,1.0
4,4,fariji_lacak,1052382472174886912,"*Hanya di-Era Jokowi, Ada Ulama KHUSUS*\n1.Ula...",3,0,0,1,-1,-1,0.5,0.125,37,-19,17,1.0


In [None]:
from sklearn.metrics import accuracy_score
max_acc = 0
best_treshold = 0

for i in range(1, 20, 1):
    treshold = i 
    y_pred = []


    for x in range(len(pos_inset)):
        delta = pos_inset[x] + neg_inset[x]
        if delta > treshold:
            y_pred.append(3)
        elif delta < - treshold:
            y_pred.append(1)
        else:
            y_pred.append(2)
    # print("Treshold = ", treshold)
    accuracy = accuracy_score(y_true, y_pred)
    # print("Accuracy = ", accuracy)
    if accuracy > max_acc:
        max_acc = accuracy
        best_treshold = treshold
        print(classification_report(y_true, y_pred, target_names=['neg', 'neu', 'pos']))
        print(confusion_matrix(y_true, y_pred))
        print("Max Accuracy = ", max_acc, "Best Treshold = ", best_treshold)
        print("=====================================")
    

In [8]:
wrong_answer_barasa = df

In [9]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import numpy as np

pos_sentiword = df['pos_sentiword'].tolist()
neg_sentiword = df['neg_sentiword'].tolist()

y_pred = []

for i in range(len(pos_sentiword)):
    delta = pos_sentiword[i] - neg_sentiword[i]
    if delta > 0.5:
        y_pred.append(3)
    elif delta < -0.5:
        y_pred.append(1)
    else:
        y_pred.append(2)

y_true = df['tag_overall'].tolist()
print(classification_report(y_true, y_pred, target_names=['neg', 'neu', 'pos']))
print(confusion_matrix(y_true, y_pred))

              precision    recall  f1-score   support

         neg       0.54      0.15      0.23       350
         neu       0.32      0.68      0.44       242
         pos       0.27      0.24      0.26       175

    accuracy                           0.34       767
   macro avg       0.38      0.36      0.31       767
weighted avg       0.41      0.34      0.30       767

[[ 52 241  57]
 [ 22 165  55]
 [ 23 110  42]]


In [10]:
wrong_answer_barasa['Barasa'] = y_pred

In [11]:
wrong_answer_barasa = wrong_answer_barasa[wrong_answer_barasa['tag_overall'] != wrong_answer_barasa['Barasa']]

In [18]:
wrong_answer_barasa.to_csv('wrong_answer_barasa.csv', index=False, sep='|')

In [101]:
with open('teks.txt', 'r') as f:
    teks = f.read()

print(" ".join(normalize.normalize(teks)))
pos, neg = barasa.calculate_sentiment(normalize.normalize(teks))
print(np.round(np.sum(pos), 4), np.round(np.sum(neg), 4))
pos = np.round(np.sum(pos), 4)
neg = np.round(np.sum(neg), 4)
if pos - neg > 0.5:
    print("Positif")
elif pos - neg < -0.5:
    print("Negatif")
else:
    print("Netral")

dpr korupsi pdip turoechan asy ari dprd izedrik emir moeis dpr agus chondro prayitno dpr max moein anggota dpr rusman lumbantoruan dpr poltak sitorus dpr panda nababan dpr engelina patiasina dpr m iqbal dpr budiningsih dpr effri tongas dpr mariani dpr
Found korupsi with pos 0.125 and neg 0.125
Found emir with pos 0.0 and neg 0.0
Found agus with pos 0.0 and neg 0.125
Found anggota with pos 0.0 and neg 0.0
Found panda with pos 0.0 and neg 0.0
Found 5 out of 41 tokens
korupsi, emir, agus, anggota, panda
Unique tokens found: 5
0.125 0.25
Netral


In [65]:
wrong_answer_inset = df
wrong_answer_inset['inset'] =  y_pred

In [68]:
wrong_answer_inset = wrong_answer_inset[wrong_answer_inset['tag_overall'] != wrong_answer_inset['inset']]

In [69]:
wrong_answer_inset

Unnamed: 0,id,username,tweetID,content,likeCount,retweetCount,quoteCount,replyCount,label,weighted_sentiment,...,pos_inset,neg_inset,normalized,len_normalized,n_keywords,tag_owi,tag_prab,tag_overall,Barasa,inset
0,0,Singhasari1982,1074194773559304194,@marierteman @prabowo PRABOWO AJA\nPRABOWO AJA...,2,0,0,0,-1.0,-1.0,...,0,0,prabowo prabowo prabowo prabowo prabowo prabow...,183,23,4.0,3.0,3.0,2,2
1,1,NurSyahbana9,1067997009859141632,ULAMA PEWARIS NABI\nApa bedanya antara Ulama p...,87,24,2,13,-1.0,-67.5,...,42,-2,ulama pewaris nabi beda ulama pendukung presid...,195,19,3.0,1.0,1.0,2,3
3,3,fariji_lacak,1052383624698314753,@YCH7168 @mochamadarip @NaneDianti @GunRomli *...,4,2,0,0,-1.0,-4.0,...,37,-19,era jokowi ulama ulama instan ulama yutub meds...,172,17,2.0,4.0,1.0,2,3
4,4,fariji_lacak,1052382472174886912,"*Hanya di-Era Jokowi, Ada Ulama KHUSUS*\n1.Ula...",3,0,0,1,-1.0,-1.5,...,37,-19,era jokowi ulama ulama instan ulama yutub meds...,172,17,2.0,4.0,1.0,2,3
5,5,Ars_sand,1097878146861346816,@putrabanten80 @ZAEffendy @RamliRizal Kalau ha...,1,0,0,0,-1.0,-0.5,...,37,-10,kerja kerja kerja mikir kerja kerja kerja utan...,133,17,1.0,2.0,1.0,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
757,762,NurSyahbana9,1108712236048998401,Komitmen NAHDHATUL ULAMA untuk menjaga dan mem...,298,178,24,43,1.0,327.0,...,12,-9,komitmen nahdhatul ulama menjaga membentengi n...,209,8,4.0,4.0,3.0,2,2
759,764,PRFMnews,1099676060600258562,#DiskusiPRFM KPK menyambut baik langkah KPU ya...,23,5,1,14,0.0,0.0,...,26,-17,kpk menyambut langkah kpu mengumumkan tambahan...,209,8,4.0,4.0,2.0,2,3
760,765,erna_st,1059458181275955200,"Tim Kmpanye Nasional Jokowi-Ma’ruf, yg dipimpi...",15,13,3,13,0.0,0.0,...,3,-14,tim kmpanye nasional jokowi ruf dipimpin erick...,209,8,4.0,4.0,2.0,2,1
762,767,hendriabidin2,1113122672416743425,@tody_mt1 @cahw_i @McKayAudy @J_Aryoko @_SEKNA...,1,0,0,2,-1.0,-0.5,...,16,-12,tong penurunan zaman sby dibantu blt subsidi b...,208,8,4.0,4.0,1.0,3,2


In [108]:
with open('teks.txt', 'r') as f:
    teks = f.read()

print(" ".join(normalize.normalize(teks)))
pos, neg = insett.calculate_inset_score(" ".join(normalize.normalize(teks)))
print(np.round(np.mean(pos), 4), np.round(np.mean(neg), 4))
if pos + neg > 4: 
    print("Positif")
elif pos + neg < -4:
    print("Negatif")
else:
    print("Netral")


dpr korupsi pdip turoechan asy ari dprd izedrik emir moeis dpr agus chondro prayitno dpr max moein anggota dpr rusman lumbantoruan dpr poltak sitorus dpr panda nababan dpr engelina patiasina dpr m iqbal dpr budiningsih dpr effri tongas dpr mariani dpr
Negative Unigram korupsi with score -4
Positive Unigram emir with score 4
Negative Unigram anggota with score -3
Found 1 out of 41 tokens
emir
Unique tokens found: 1
4.0 -7.0
Netral


In [110]:
label_indobert = df['label'].tolist()
y_true = df['tag_overall'].tolist()

In [112]:
label_indobert = [int(i + 2) for i in label_indobert]

In [113]:
print(classification_report(y_true, label_indobert, target_names=['neg', 'neu', 'pos']))
print(confusion_matrix(y_true, label_indobert))

              precision    recall  f1-score   support

         neg       0.71      0.94      0.81       350
         neu       0.80      0.53      0.64       242
         pos       0.73      0.62      0.67       175

    accuracy                           0.74       767
   macro avg       0.75      0.69      0.71       767
weighted avg       0.75      0.74      0.72       767

[[328  11  11]
 [ 86 128  28]
 [ 46  21 108]]


In [114]:
wrong_answer_indobert = df
wrong_answer_indobert['indobert'] = label_indobert

In [115]:
wrong_answer_indobert = wrong_answer_indobert[wrong_answer_indobert['tag_overall'] != wrong_answer_indobert['indobert']]


In [117]:
wrong_answer_indobert.to_csv('wrong_answer_indobert.csv', index=False, sep=';')