# Import Libraries and Datasets

*Install Repositories*
* https://github.com/marcotcr/lime

In [None]:
# Install the third-party packages imported in the next cell (NLTK, Sastrawi, LIME).
# NOTE(review): versions are unpinned; consider `%pip install pkg==x.y.z` so the
# install targets the running kernel's environment and the notebook is reproducible.
!pip install nltk
!pip install Sastrawi
!pip install lime

*Import Related Libraries*

In [None]:
import nltk
import re
import string

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.stem import WordNetLemmatizer

import pandas as pd

import lime
from lime import lime_text
from lime.lime_text import LimeTextExplainer

*Import Datasets*

In [None]:
# Load the tweet dataset; later cells read its 'Tweet' and 'HS' columns.
data = pd.read_csv('re_dataset.csv', encoding='latin-1')

NOTE: Raw tweet dataset.

In [None]:
# Read the slang dictionary without treating the first row as a header, then
# give the two positional columns descriptive names.
alay_dict = (
    pd.read_csv('new_kamusalay.csv', header=None, encoding='latin-1')
    .rename(columns={0: 'original', 1: 'replacement'})
)

NOTE: Dictionary of slang and mistyped words.

In [None]:
# Read the stopword list without a header row and name its single column.
id_stopword_dict = (
    pd.read_csv('stopwordbahasa.csv', header=None)
    .rename(columns={0: 'stopword'})
)

NOTE: Dictionary of stopwords, taken from https://www.kaggle.com/datasets/oswinrh/indonesian-stoplist

# Preprocessing


In [None]:
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
# Create one Sastrawi stemmer instance; stemming() below calls its .stem() method.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def lowercase(text):
    """Return `text` converted to lower case."""
    lowered = text.lower()
    return lowered

def remove_unnecessary_char(text):
    """Strip tweet artefacts from `text` and collapse repeated spaces.

    Removes newlines, the standalone tokens ``rt`` (retweet marker) and ``user``
    (username placeholder), and URLs. Each removed piece is replaced by a space,
    and runs of two or more spaces are then squeezed to one.

    The ``rt`` and ``user`` patterns are anchored with word boundaries so that
    only whole tokens are removed. Without the anchors the letters were also cut
    out of ordinary words (e.g. "mempertahankan" became "mempe ahankan").

    Args:
        text: The string to clean. The ``rt`` / ``user`` patterns are
            case-sensitive, so the text should already be lower-cased.

    Returns:
        The cleaned string. Leading/trailing single spaces are not stripped.
    """
    text = re.sub(r'\n', ' ', text)  # Remove every newline
    text = re.sub(r'\brt\b', ' ', text)  # Remove the retweet marker (whole token only)
    text = re.sub(r'\buser\b', ' ', text)  # Remove the username placeholder (whole token only)
    # `https?://` already covers both http and https links.
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', ' ', text)  # Remove every URL
    text = re.sub('  +', ' ', text)  # Remove extra spaces
    return text

def remove_nonaplhanumeric(text):
    """Replace every run of characters other than 0-9, a-z and A-Z with one space."""
    non_alnum_run = '[^0-9a-zA-Z]+'
    return re.sub(non_alnum_run, ' ', text)

# Lookup table from each 'original' entry of the slang dictionary to its 'replacement'.
alay_dict_map = dict(zip(alay_dict['original'], alay_dict['replacement']))
def normalize_alay(text):
    """Replace each space-separated word of `text` that is a key of `alay_dict_map`.

    Words without an entry are kept unchanged; the words are re-joined with
    single spaces.
    """
    normalized_words = []
    for token in text.split(' '):
        normalized_words.append(alay_dict_map.get(token, token))
    return ' '.join(normalized_words)

def remove_stopword(text):
    """Drop every space-separated word of `text` listed in `id_stopword_dict`.

    Args:
        text: The string to filter; words are delimited by single spaces.

    Returns:
        The remaining words joined by single spaces, with surrounding
        whitespace stripped.
    """
    # Build a set once per call: testing `word in <ndarray>` for every word
    # scans the whole stopword array each time, while a set lookup is O(1).
    stopword_set = set(id_stopword_dict.stopword.values)
    kept_words = [word for word in text.split(' ') if word not in stopword_set]
    text = ' '.join(kept_words)
    text = re.sub('  +', ' ', text) # Remove extra spaces
    text = text.strip()
    return text

def stemming(text):
    """Return the result of the module-level `stemmer.stem` applied to `text`."""
    stemmed_text = stemmer.stem(text)
    return stemmed_text

In [None]:
def preprocess(text):
    """Run the tweet-cleaning pipeline on one tweet and return the cleaned string.

    Steps, in order: lower-case, remove newlines / retweet marker / username
    placeholder / URLs, replace non-alphanumeric runs with spaces, normalise
    slang words, remove stopwords. Stemming is currently disabled.

    URL removal must run *before* the non-alphanumeric step: once ':', '/' and
    '.' have been turned into spaces, the URL patterns in
    `remove_unnecessary_char` can no longer match and link fragments would stay
    in the text as ordinary words.

    Args:
        text: The raw tweet text.

    Returns:
        The cleaned tweet as a single space-separated string.
    """
    text = lowercase(text) # 1
    text = remove_unnecessary_char(text) # 2
    text = remove_nonaplhanumeric(text) # 3
    text = normalize_alay(text) # 4
    #text = stemming(text) # 5 (disabled)
    text = remove_stopword(text) # 6
    return text

In [None]:
# Clean every tweet with preprocess(); this overwrites the 'Tweet' column in place,
# so the raw text is no longer available from `data` after this cell runs.
data['Tweet'] = data['Tweet'].apply(preprocess)

**Classification and Evaluation**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer with default settings.
vectorizer = TfidfVectorizer()

# Features: TF-IDF matrix of the cleaned tweets; target: the 'HS' column.
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test split, so its vocabulary and IDF weights also see the test tweets.
X = vectorizer.fit_transform(data['Tweet'])
y = data['HS']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
import xgboost as xgb

# XGBoost classifier with library-default hyperparameters, fitted on the training split.
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report

# Predict labels for the held-out test split; `y_pred` is ordered like X_test.
y_pred = xgb_model.predict(X_test)

# Per-class precision / recall / F1 and support.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.91      0.86      1516
           1       0.86      0.70      0.77      1118

    accuracy                           0.82      2634
   macro avg       0.83      0.81      0.81      2634
weighted avg       0.83      0.82      0.82      2634



In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Predict on the held-out split and summarise with weighted-average scores.
y_pred = xgb_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

# Report in the order: accuracy, precision, recall, F1.
for _label, _value in zip(
    ("Accuracy: ", "Precision: ", "Recall: ", "F1-score: "),
    (accuracy, precision, recall, f1),
):
    print(_label, _value)

Accuracy:  0.8230827638572513
Precision:  0.8268166871498865
Recall:  0.8230827638572513
F1-score:  0.8198672365659038


In [None]:
# Classify the exact tweet that is printed. `y_pred` holds predictions for the
# rows of the held-out test split (in split order), so `y_pred[1164]` is not the
# prediction for dataset row 1164; the tweet is vectorized and scored directly.
# NOTE(review): row 1164 may belong to the training split, so this is a
# demonstration, not an out-of-sample check.
tweet = data['Tweet'][1164]
print("Tweet : ", tweet)

prediction = xgb_model.predict(vectorizer.transform([tweet]))[0]
if prediction == 0:
  print("The tweet is not a hate speech")
else:
  print("The tweet is a hate speech")

Tweet :  jokowi staf ahli khusus mempe ahankan aset nasional diktator jomlo lokal
The tweet is not a hate speech


In [None]:
# Classify the exact tweet that is printed. `y_pred` holds predictions for the
# rows of the held-out test split (in split order), so `y_pred[1062]` is not the
# prediction for dataset row 1062; the tweet is vectorized and scored directly.
# NOTE(review): row 1062 may belong to the training split, so this is a
# demonstration, not an out-of-sample check.
tweet = data['Tweet'][1062]
print("Tweet : ", tweet)

prediction = xgb_model.predict(vectorizer.transform([tweet]))[0]
if prediction == 0:
  print("The tweet is not a hate speech")
else:
  print("The tweet is a hate speech")

Tweet :  bodoh bangsa gue ujian semester uniform resource locator
The tweet is a hate speech


In [None]:
import numpy as np
import tensorflow as tf

# Seed TensorFlow (weight initialisation) and a local NumPy generator (batch
# shuffling) so repeated runs of this cell are comparable.
tf.random.set_seed(42)
rng = np.random.default_rng(42)

# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert the labels to NumPy arrays *after* the split. train_test_split hands
# back pandas Series that keep the original row labels, so indexing them with
# shuffled positions would look rows up by label instead of by position.
y_train = np.asarray(y_train)
y_val = np.asarray(y_val)

# Dense float32 copy of the validation features for model.evaluate().
# NOTE(review): batches are densified so the cell does not depend on whether the
# installed Keras accepts SciPy sparse input; this costs rows x features x 4 bytes.
X_val_dense = X_val.astype(np.float32).toarray()

# Define the model architecture
input_dim = X_train.shape[1]  # Number of TF-IDF features
num_classes = len(np.unique(y_train))
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(input_dim,)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(num_classes, activation='softmax')
])

# Compile the model. The labels are integer class ids (not one-hot vectors), so
# the sparse variant of categorical cross-entropy is the matching loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Specify the training parameters
batch_size = 32
num_epochs = 10

# Training loop
num_train_rows = X_train.shape[0]
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")

    # Draw a fresh row order each epoch. X_train / y_train themselves are left
    # untouched so re-running the cell starts from the same data.
    row_order = rng.permutation(num_train_rows)

    # Mini-batch training
    for start in range(0, num_train_rows, batch_size):
        batch_rows = row_order[start:start + batch_size]
        X_batch = X_train[batch_rows].astype(np.float32).toarray()
        y_batch = y_train[batch_rows]

        # One optimisation step. Separate names are used so the `accuracy`
        # value computed in the metrics cell is not overwritten.
        batch_loss, batch_accuracy = model.train_on_batch(X_batch, y_batch)

    # Evaluate the model on the validation set
    val_loss, val_accuracy = model.evaluate(X_val_dense, y_val, verbose=0)
    print(f"Validation loss: {val_loss:.4f} - Validation accuracy: {val_accuracy:.4f}")

In [None]:
from sklearn.pipeline import make_pipeline

# No earlier cell defines `c`, so the cell failed on a fresh kernel. Chain the
# fitted TF-IDF vectorizer and the fitted XGBoost model so that text can be
# scored directly.
# NOTE(review): assumes `c` was meant to wrap `xgb_model` — confirm.
c = make_pipeline(vectorizer, xgb_model)

# The vectorizer expects an iterable of documents, so the single tweet is
# wrapped in a list rather than passed as a bare string.
print(c.predict_proba([data['Tweet'][0]]))

NOTES:
* https://www.kaggle.com/code/bavalpreet26/explainable-ai-lime/notebook
* https://towardsdatascience.com/lime-how-to-interpret-machine-learning-models-with-python-94b0e7e4432e

