# Inference

In [158]:
from keras.models import load_model
import pickle
import torch
import joblib
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re
from nltk.tokenize import word_tokenize
import requests
import ast
from nltk.corpus import stopwords
import string
import numpy as np

In [159]:
# Run on the GPU when PyTorch reports CUDA as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [160]:
def getStopWordList(url, timeout=10):
    """Download a plain-text stopword list and return it as a list of lines.

    Args:
        url: Address of a text resource; each line becomes one list entry.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout `requests.get` can block forever on a stalled connection.

    Returns:
        list[str]: The lines of the response body, in order.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within `timeout`.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # One entry per line of the downloaded text.
    return response.text.splitlines()

# Extra stopwords fetched from a public list. The first line of the file is
# discarded (presumably a header or unwanted entry -- TODO confirm), and a few
# project-specific words are appended.
stopwords_tambahan = getStopWordList('https://raw.githubusercontent.com/yasirutomo/python-sentianalysis-id/master/data/feature_list/stopwordsID.txt')
del stopwords_tambahan[0]
stopwords_tambahan.extend(['gojek', 'driver', 'aplikasi', 'iya'])

In [161]:
def getSlangWordList(url, timeout=10):
    """Download a slang dictionary written as a Python/JSON-style dict literal.

    Args:
        url: Address of a text resource whose whole body is one dict literal.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout `requests.get` can block forever on a stalled connection.

    Returns:
        dict: The parsed mapping.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within `timeout`.
        ValueError / SyntaxError: If the body is not a valid literal, or the
            literal is not a dict.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # literal_eval only evaluates literals, so remote content cannot run code.
    data_dict = ast.literal_eval(response.text)
    if not isinstance(data_dict, dict):
        # Callers index the result by word; fail here with a clear message
        # rather than later with an obscure one.
        raise ValueError(
            "Expected a dict literal at %s, got %s" % (url, type(data_dict).__name__)
        )
    return data_dict

# Slang -> standard-word mapping downloaded from a public resource, then
# extended (or overridden) with a handful of project-specific entries.
slangwords = getSlangWordList('https://raw.githubusercontent.com/louisowen6/NLP_bahasa_resources/master/combined_slang_words.txt')
slangwords.update({
    'gak': 'tidak',
    'ga': 'tidak',
    'bagu': 'bagus',
    'gk': 'tidak',
    'udh': 'sudah',
    'sdh': 'sudah',
})

### Load model and tokenizer/vectorizer

BILSTM

In [162]:
# Restore the tokenizer that accompanies the BiLSTM model.
# NOTE: pickle.load executes arbitrary code from the file -- only load
# artifacts that come from a trusted source.
with open('model_tokenizer_bilstm_logres/tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer_bilstm = pickle.load(tokenizer_file)

# Restore the saved Keras BiLSTM model.
model_bilstm = load_model('model_tokenizer_bilstm_logres/bilstm_model.keras')

FINETUNED cahya/distilbert-base-indonesian

In [163]:
# Load the fine-tuned classifier (3 labels) from the local "saved_model"
# directory and move it to the selected device.
model_distilbert_finetuned = AutoModelForSequenceClassification.from_pretrained(
    "saved_model", num_labels=3
).to(device)

# The matching tokenizer is stored in the same directory.
tokenizer_distilbert_finetuned = AutoTokenizer.from_pretrained("saved_model")

# Switch to evaluation mode; as the last expression this also displays the model.
model_distilbert_finetuned.eval()

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


**Model tidak disertakan secara langsung dalam repositori ini karena ukurannya yang besar. Untuk menggunakan atau mencoba model, dapat langsung memuat model dari Hugging Face Hub menggunakan transformers atau sentence-transformers dengan repo ID yang tersedia.**

In [None]:
# Alternative to the local "saved_model" directory: load the fine-tuned
# tokenizer and model straight from the Hugging Face Hub. Uncomment the lines
# below to use it when the local model files are not available.
#tokenizer_distilbert_finetuned = AutoTokenizer.from_pretrained("PetaniHandal/distilbert-base-id-finetuned-sentiment")
#model_distilbert_finetuned = AutoModelForSequenceClassification.from_pretrained(
#    "PetaniHandal/distilbert-base-id-finetuned-sentiment",
#    num_labels=3
#)
#model_distilbert_finetuned.to(device)
#model_distilbert_finetuned.eval()

#### Logistic Regression with TF-IDF

In [164]:
# Restore the fitted TF-IDF vectorizer.
# NOTE: pickle.load / joblib.load execute arbitrary code from the file -- only
# load artifacts that come from a trusted source.
with open('model_tokenizer_bilstm_logres/tfidf_vectorizer.pkl', 'rb') as vectorizer_file:
    tfidf = pickle.load(vectorizer_file)

# Restore the trained logistic regression classifier.
model_logres = joblib.load('model_tokenizer_bilstm_logres/logistic_regression_model.pkl')

### Pipeline

#### Pipeline Processing Text

In [165]:
def _nltk_stopwords():
    """Return the NLTK Indonesian + English stopwords, read from disk only once."""
    cached = getattr(_nltk_stopwords, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words('indonesian')) | frozenset(stopwords.words('english'))
        _nltk_stopwords._cache = cached
    return cached


def processingTextGeneral(text):
    """Clean, normalise and filter a raw text for the BiLSTM and LogRes pipelines.

    Steps, in order: strip @mentions, #hashtags, "RT" markers, links, digits
    and punctuation; lowercase; glue a detached "nya" onto the preceding word;
    tokenize; replace slang tokens using `slangwords`; drop stopwords.

    Args:
        text: Raw input string.

    Returns:
        str: The surviving tokens joined by single spaces.
    """
    # Cleaning Text
    text = re.sub(r'@[A-Za-z0-9]+', '', text)
    text = re.sub(r'#[A-Za-z0-9]+', '', text)
    text = re.sub(r'RT[\s]', '', text)
    text = re.sub(r"http\S+", '', text)
    text = re.sub(r'[0-9]+', '', text)
    text = re.sub(r'[^\w\s]', '', text)

    text = text.replace('\n', ' ')
    # The regex above keeps "_" (it is a \w character); this pass removes it.
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.strip(' ')

    # Casefolding Text
    text = text.lower()

    # Join a stand-alone "nya" onto the word before it ("app nya" -> "appnya").
    text = re.sub(r'\b(\w+)\s+nya\b', r'\1nya', text)

    # Tokenizing Text
    text_tokenize = word_tokenize(text)

    # The NLTK corpora are cached; the extra list is merged on every call so
    # later changes to `stopwords_tambahan` are still honoured.
    listStopwords = _nltk_stopwords().union(stopwords_tambahan)

    # Slang is fixed first, so the stopword check sees the normalised token.
    filtered = []
    for txt in text_tokenize:
        token_fixed = slangwords.get(txt, txt)
        if token_fixed not in listStopwords:
            filtered.append(token_fixed)

    return ' '.join(filtered)

def processingTextForBert(text):
    """Lightly clean a raw text before it is tokenized for DistilBERT.

    Removes @mentions, #hashtags, "RT" markers, links and digits, replaces
    newlines with spaces, trims surrounding spaces, lowercases, and then
    discards every whitespace-separated token that is a key of `slangwords`.
    Punctuation is left in place.

    Args:
        text: Raw input string.

    Returns:
        str: The remaining tokens joined by single spaces.
    """
    # Same patterns, applied in the same order as a chain of re.sub calls.
    removal_patterns = (
        r'@[A-Za-z0-9]+',
        r'#[A-Za-z0-9]+',
        r'RT[\s]',
        r"http\S+",
        r'[0-9]+',
    )
    for pattern in removal_patterns:
        text = re.sub(pattern, '', text)

    text = text.replace('\n', ' ').strip(' ').lower()

    # NOTE(review): slang tokens are dropped here, not replaced by their
    # standard form (processingTextGeneral replaces them). Confirm this matches
    # the preprocessing used when the model was fine-tuned before changing it.
    kept_words = [word for word in text.split() if word not in slangwords]
    return ' '.join(kept_words)


#### Pipeline BILSTM

In [166]:
def tokenizing_bilstm(text):
    """Turn one preprocessed text into a padded integer sequence for the BiLSTM.

    The text is encoded with `tokenizer_bilstm`, then padded or truncated at
    the end to a fixed length of 35.

    Args:
        text: Preprocessed input string.

    Returns:
        The padded sequence batch produced by `pad_sequences` for this one text.
    """
    sequences = tokenizer_bilstm.texts_to_sequences([text])
    return pad_sequences(sequences, maxlen=35, padding='post', truncating='post')

def predicting_bilstm(token):
    """Run the BiLSTM model and return the highest-scoring class per row.

    Args:
        token: Padded sequence batch, as returned by `tokenizing_bilstm`.

    Returns:
        Array with the argmax over axis 1 of the model's predictions.
    """
    scores = model_bilstm.predict(token)
    return np.argmax(scores, axis=1)

def inference_bilstm(text):
    """Classify one text with the BiLSTM pipeline and print the result.

    Prints the original text, then the label: class 0 -> "Negatif",
    class 1 -> "Netral", anything else -> "Positif".

    NOTE(review): inference_distilbert maps class 0 to "Positif" and the
    fallback to "Negatif"; confirm each mapping against the label encoding
    used to train the respective model.

    Args:
        text: Raw input string.
    """
    result = predicting_bilstm(tokenizing_bilstm(processingTextGeneral(text)))

    print("Teks :", text)
    label = "Negatif" if result == 0 else ("Netral" if result == 1 else "Positif")
    print(label)
    

#### Pipeline DistilBERT Finetuned

In [167]:
def tokenizing_distilbert(text):
    """Encode one text with the fine-tuned DistilBERT tokenizer.

    The encoding is padded to, and truncated at, a length of 35.

    Args:
        text: Preprocessed input string.

    Returns:
        The tokenizer's encoding for `text`.
    """
    encode_options = {"padding": "max_length", "truncation": True, "max_length": 35}
    return tokenizer_distilbert_finetuned(text, **encode_options)

def predicting_distilbert(inputs):
    """Run the fine-tuned DistilBERT classifier and return predicted class ids.

    The model is called once with gradients disabled. (The forward pass used
    to be written twice in a row, which doubled the cost for the same result.)

    Args:
        inputs: Keyword arguments for the model call, e.g. the dict of batched
            tensors built in `inference_distilbert`.

    Returns:
        numpy.ndarray: Index of the largest logit along dim 1 for each row.
    """
    with torch.no_grad():
        logits = model_distilbert_finetuned(**inputs).logits
        # Same arg-max selection as used during training.
        _, preds = torch.max(logits, dim=1)

    return preds.cpu().numpy()

def inference_distilbert(text):
    """Classify one text with the fine-tuned DistilBERT pipeline and print it.

    Prints the original text, then the label: class 0 -> "Positif",
    class 1 -> "Netral", anything else -> "Negatif".

    NOTE(review): this mapping is the reverse of the one in inference_bilstm
    and inference_logres; confirm it against the label encoding used during
    fine-tuning.

    Args:
        text: Raw input string.
    """
    encoded = tokenizing_distilbert(processingTextForBert(text))

    # Convert each field to a tensor, add a batch dimension, move to the device.
    batch = {}
    for field, values in encoded.items():
        batch[field] = torch.tensor(values).unsqueeze(0).to(device)

    result = predicting_distilbert(batch)

    print("Teks :", text)
    label = "Positif" if result == 0 else ("Netral" if result == 1 else "Negatif")
    print(label)

#### Pipeline LogRes

In [168]:
def vectorizing_logres(text):
    """Transform one preprocessed text with the fitted `tfidf` vectorizer.

    Args:
        text: Preprocessed input string.

    Returns:
        The result of `tfidf.transform` for a one-element list holding `text`.
    """
    return tfidf.transform([text])

def predicting_logres(vector):
    """Predict with the logistic regression model.

    Args:
        vector: Feature input, as returned by `vectorizing_logres`.

    Returns:
        Whatever `model_logres.predict` returns for `vector`.
    """
    return model_logres.predict(vector)

def inference_logres(text):
    """Classify one text with the TF-IDF + logistic regression pipeline and print it.

    Prints the original text, then the label: class 0 -> "Negatif",
    class 1 -> "Netral", anything else -> "Positif".

    Args:
        text: Raw input string.
    """
    result = predicting_logres(vectorizing_logres(processingTextGeneral(text)))

    print("Teks :", text)
    label = "Negatif" if result == 0 else ("Netral" if result == 1 else "Positif")
    print(label)

### Cek Hasil

In [169]:
# Sample reviews used to compare the three pipelines side by side.
texts = [
    "app nya bagus, tampilannya jelas dan menarik, mudah digunakan jg",
    "apalah developer, buat tuh aplikasi yang berguna, malah buat sampah yang bahkan gak jalan sama sekali, hadeuh",
    "kurang suka bagian tampilannya, gak menarik. lumayan lah"
]

# For every sample run each pipeline in turn, printing a separator after each;
# the last separator of a sample carries an extra newline.
for sample in texts:
    for title, run_inference, tail in (
        ("BILSTM", inference_bilstm, ""),
        ("DISTILBERT FINETUNED", inference_distilbert, ""),
        ("LOGISTIC REGRESSION", inference_logres, "\n"),
    ):
        print(title)
        run_inference(sample)
        print("======================================" + tail)

BILSTM
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 424ms/step
Teks : app nya bagus, tampilannya jelas dan menarik, mudah digunakan jg
Positif
DISTILBERT FINETUNED
Teks : app nya bagus, tampilannya jelas dan menarik, mudah digunakan jg
Positif
LOGISTIC REGRESSION
Teks : app nya bagus, tampilannya jelas dan menarik, mudah digunakan jg
Positif

BILSTM
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
Teks : apalah developer, buat tuh aplikasi yang berguna, malah buat sampah yang bahkan gak jalan sama sekali, hadeuh
Netral
DISTILBERT FINETUNED
Teks : apalah developer, buat tuh aplikasi yang berguna, malah buat sampah yang bahkan gak jalan sama sekali, hadeuh
Negatif
LOGISTIC REGRESSION
Teks : apalah developer, buat tuh aplikasi yang berguna, malah buat sampah yang bahkan gak jalan sama sekali, hadeuh
Netral

BILSTM
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
Teks : kurang suka bagian tampilannya, gak menarik. lumayan lah
P