# Pelabelan Data Menggunakan Pretrained Model
Menggunakan Transformer dengan Model Indonesian RoBERTa Base Sentiment Classifier

## Import Library

In [1]:
import pandas as pd
import nltk
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.tokenize import word_tokenize
from tqdm.notebook import tqdm
from wordcloud import WordCloud
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from transformers import pipeline

2022-08-21 12:24:14.654218: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


## Import Data dari Proses Sebelumnya

In [2]:
# Read the cleaned, tokenized tweets written by the previous notebook step.
tweets_csv = 'data/tweets_clean.csv'
df = pd.read_csv(tweets_csv)
df.head()

Unnamed: 0,tweet
0,"['menangani', 'kekerasan', 'seksual', 'disahka..."
1,"['menangani', 'kekerasan', 'seksual', 'disahka..."
2,"['wakil', 'mpr', 'ri', 'mahasiswa', 'kawal', '..."
3,"['fadel', 'muhammad', 'mahasiswa', 'kawal', 'i..."
4,"['tanggal', 'chatnya', 'april', 'dijerat']"


## Detokenizing
### Mengubah kata-kata yang telah ditokenisasi kembali menjadi kalimat biasa

In [3]:
# Not referenced anywhere in the visible notebook; kept so the namespace is unchanged.
temp_detokenize = []

def detokenize(text):
    """Turn a stringified token list back into a plain sentence.

    Parameters
    ----------
    text : str
        String form of a Python list of tokens, e.g. "['wakil', 'mpr', 'ri']".
        A string that is not a list literal (for instance an already
        detokenized sentence) is handled by the comma-split fallback.

    Returns
    -------
    str
        The tokens joined by ``TreebankWordDetokenizer``. Tokens are trimmed
        and blank tokens are dropped first, so an empty list yields ``""``.
    """
    import ast  # local import: the cell stays runnable without touching the import cell

    try:
        # Parse the list literal properly so quotes, commas or brackets that
        # are part of a token survive instead of being stripped away.
        tokens = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        tokens = None

    if not isinstance(tokens, (list, tuple)):
        # Fallback: the original character-stripping split on commas.
        stripped = text.replace(']', '').replace('[', '')
        tokens = stripped.replace('"', '').replace("'", "").split(",")

    # Splitting "a, b" on "," leaves " b"; trim each token and drop blanks so
    # the joined sentence has no stray or doubled spaces.
    cleaned = [str(token).strip() for token in tokens]
    return TreebankWordDetokenizer().detokenize([token for token in cleaned if token])

# Cast every entry to text first, then rebuild a sentence from each token list.
tweet_strings = df['tweet'].astype('U')
df['tweet'] = tweet_strings.apply(detokenize)

df.head()

Unnamed: 0,tweet
0,menangani kekerasan seksual disahkan enam ...
1,menangani kekerasan seksual disahkan enam ...
2,wakil mpr ri mahasiswa kawal implementasi
3,fadel muhammad mahasiswa kawal implementasi
4,tanggal chatnya april dijerat


## Mempersiapkan Model

In [4]:
# Model identifier, passed to the pipeline as both the model and the tokenizer.
pretrained_name = "w11wo/indonesian-roberta-base-sentiment-classifier"

# Build the sentiment-analysis pipeline once; later cells call it as `nlp(text)`.
pipeline_kwargs = {"model": pretrained_name, "tokenizer": pretrained_name}
nlp = pipeline("sentiment-analysis", **pipeline_kwargs)

2022-08-21 12:24:20.985874: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-21 12:24:21.006783: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-21 12:24:21.006882: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-21 12:24:21.007212: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the approp

## Polarity Scoring and Labelling
### Menilai polaritas yang terdapat pada setiap tweet dan melabelinya berdasarkan nilai polaritasnya

In [5]:
# Score every tweet once and collect the rows in a list; the frame is built a
# single time afterwards instead of being re-concatenated on each iteration
# (which was quadratic and left every row with index 0).
records = []
for tweet_text in tqdm(df['tweet'], desc='Scoring and Labelling..'):
    # The recorded run of this cell failed inside the embedding layer with an
    # empty `input_ids` tensor (shape (1, 0)). Text without any content is the
    # likely trigger, so such rows are skipped rather than sent to the model.
    # NOTE(review): confirm that blank tweets were the cause.
    if not isinstance(tweet_text, str) or not tweet_text.strip():
        continue

    # One pipeline call per tweet; label and score come from the same result.
    prediction = nlp(tweet_text)[0]
    records.append({
        "tweet": tweet_text,
        "sentimen": prediction['label'],
        "skor": prediction['score'],
    })

# `columns=` keeps the three expected columns even when nothing was scored.
df2 = pd.DataFrame(records, columns=['tweet', 'sentimen', 'skor'])
    
def change_languange(text):
    """Translate an English sentiment label into its Indonesian name.

    "neutral" -> "Netral", "positive" -> "Positif", "negative" -> "Negatif".
    Any other value returns ``None``.
    """
    translations = (
        ("neutral", "Netral"),
        ("positive", "Positif"),
        ("negative", "Negatif"),
    )
    for english, indonesian in translations:
        if text == english:
            return indonesian
    return None

# Replace the English labels in the `sentimen` column with their Indonesian names.
translated_labels = df2['sentimen'].apply(change_languange)
df2['sentimen'] = translated_labels

Scoring and Labelling..:   0%|          | 0/15632 [00:00<?, ?it/s]

InvalidArgumentError: Exception encountered when calling layer "embeddings" (type TFRobertaEmbeddings).

Value for attr 'Tindices' of float is not in the list of allowed values: int32, int64
	; NodeDef: {{node ResourceGather}}; Op<name=ResourceGather; signature=resource:resource, indices:Tindices -> output:dtype; attr=batch_dims:int,default=0; attr=validate_indices:bool,default=true; attr=dtype:type; attr=Tindices:type,allowed=[DT_INT32, DT_INT64]; is_stateful=true> [Op:ResourceGather]

Call arguments received by layer "embeddings" (type TFRobertaEmbeddings):
  • input_ids=tf.Tensor(shape=(1, 0), dtype=float32)
  • position_ids=None
  • token_type_ids=tf.Tensor(shape=(1, 0), dtype=int32)
  • inputs_embeds=None
  • past_key_values_length=0
  • training=False

In [None]:
# Show the labelled tweets (columns: tweet, sentimen, skor).
df2

## Visualization

### Total tweet positif, negatif, atau netral

In [None]:
# Print how many tweets carry each sentiment label.
for label_name in ("Positif", "Netral", "Negatif"):
    print(label_name + " :", len(df2[df2.sentimen == label_name]), " tweet")

In [None]:
# with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.width', 30):
#      with open('data/daftar_kata.txt', 'w') as f:
#         print(df2['tweet'].str.split(expand=True).stack().value_counts(), file=f)

### Pie chart dari data labelling

In [None]:
# One wedge per sentiment label; the "Netral" wedge is pulled out of the pie.
mylabels = ['Positif', 'Netral', 'Negatif']
mycolors = ['lightblue', 'lightgreen', 'orange']
myexplode = [0, 0.2, 0]

# Wedge sizes: number of tweets per label, in the same order as `mylabels`.
y = np.array([len(df2[df2.sentimen == wedge_label]) for wedge_label in mylabels])

plt.rcParams['text.color'] = 'black'
plt.pie(
    y,
    labels=mylabels,
    colors=mycolors,
    explode=myexplode,
    autopct='%1.0f%%',
    shadow=True,
)
plt.show()

### Wordcloud semua data, positif, netral, dan negatif

In [None]:
# Image array handed to WordCloud as its `mask` argument.
mask = np.array(Image.open("data/cloud.png"))

def plot_cloud(title, text):
    """Draw a word cloud for a collection of strings.

    Parameters
    ----------
    title : str
        Title placed above the figure.
    text : iterable of str
        Strings that are joined with single spaces to form the corpus.
    """
    corpus = str(" ".join(text))
    cloud_builder = WordCloud(
        scale=3,
        max_words=100,
        font_path="data/font/GothamMedium.ttf",
        background_color='white',
        mask=mask,
        contour_color='black',
        contour_width=1,
    )
    wc = cloud_builder.generate(corpus)

    # Large canvas, image only (axes hidden), titled, then rendered.
    plt.figure(figsize=(40, 30))
    plt.imshow(wc)
    plt.axis("off")
    plt.title(title)
    plt.show()

In [None]:
# Word cloud over every labelled tweet.
text = df2.tweet.astype('U')
plot_cloud("Semua Data", text)

In [None]:
# Word cloud restricted to tweets labelled "Positif".
is_positive = df2.sentimen == "Positif"
text_pos = df2[is_positive].tweet.astype('U')
plot_cloud("Data Positif", text_pos)

In [None]:
# Word cloud restricted to tweets labelled "Netral".
is_neutral = df2.sentimen == "Netral"
text_net = df2[is_neutral].tweet.astype('U')
plot_cloud("Data Netral", text_net)

In [None]:
# Word cloud restricted to tweets labelled "Negatif".
is_negative = df2.sentimen == "Negatif"
text_neg = df2[is_negative].tweet.astype('U')
plot_cloud("Data Negatif", text_neg)

## Export data

### Ekspor data per kata positif, negatif, atau netral

In [None]:
# Per-label subsets of the labelled tweets.
df2_positif = df2[df2.sentimen == "Positif"]
df2_netral = df2[df2.sentimen == "Netral"]
df2_negatif = df2[df2.sentimen == "Negatif"]

# (output file, frame) pairs: one word-frequency listing per frame.
word_count_targets = [
    ('data/temp/daftar_kata_all_roberta.txt', df2),
    ('data/temp/daftar_kata_positif_roberta.txt', df2_positif),
    ('data/temp/daftar_kata_netral_roberta.txt', df2_netral),
    ('data/temp/daftar_kata_negatif_roberta.txt', df2_negatif),
]

# Lift pandas' display limits so the printed listing is not truncated.
with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.width', 30):
    for target_path, frame in word_count_targets:
        with open(target_path, 'w') as f:
            # Split on whitespace, flatten to one word per row, count occurrences.
            print(frame['tweet'].str.split(expand=True).stack().value_counts(), file=f)

### Ekspor data labelling untokenized

In [None]:
# Save the labelled tweets (sentences still as plain text) without the index column.
labelled_csv = "data/tweets_labelled_roberta.csv"
df2.to_csv(labelled_csv, index=False)

### Ekspor data labelling tokenized

In [None]:
# Fetch the 'punkt' resource before tokenizing (presumably needed by word_tokenize).
nltk.download('punkt')

def word_tokenize_wrapper(text):
    """Return the word tokens NLTK's `word_tokenize` produces for `text`."""
    tokens = word_tokenize(text)
    return tokens

# Swap each sentence for its token list, then save the tokenized variant.
# NOTE: this overwrites df2['tweet'] in place, so cells that expect sentence
# strings in that column should not be re-run after this one.
tokenized_tweets = df2['tweet'].apply(word_tokenize_wrapper)
df2['tweet'] = tokenized_tweets
df2.to_csv("data/tweets_labelled_tokenized_roberta.csv", index=False)