## PREPROCESSING

### Installing and Importing Libraries/Packages

In [None]:
# Install TensorFlow, pinned to 2.17.1 so reruns use the same release
%pip install tensorflow==2.17.1

In [None]:

# Install LIME, whose text explainer is imported further down (version not pinned)
%pip install lime

In [None]:
import tensorflow as tf
# Report how many GPUs TensorFlow can see and which TensorFlow version is active
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
print(tf.__version__)

In [None]:
import re
import nltk
import gensim
import string
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, LSTM, Embedding, Bidirectional
from tensorflow.keras.utils import plot_model
from sklearn.metrics import classification_report


# Fetch the NLTK resources used below: the stop-word list plus the WordNet and OMW corpora
nltk.download("stopwords")
nltk.download('wordnet')
nltk.download('omw-1.4')


# English stop words as a set, used for membership tests by the helpers and the EDA cells
stop_words = set(stopwords.words("english"))
# Module-level lemmatizer instance (note: lemmatization() below creates its own)
lemmatizer= WordNetLemmatizer()


#Lime
from lime import lime_text
from lime.lime_text import LimeTextExplainer
from lime.lime_text import IndexedString,IndexedCharacters
from lime.lime_base import LimeBase
from lime.lime_text import explanation

### Gathering Data

In [None]:
# Load the three splits; every file is ';'-separated and has no header row,
# so the column names are supplied here.
_read_options = dict(names=['Text', 'Emotion'], sep=';')
data_train = pd.read_csv('../datasets/raw/train.txt', **_read_options)
data_val = pd.read_csv('../datasets/raw/val.txt', **_read_options)
data_test = pd.read_csv('../datasets/raw/test.txt', **_read_options)

In [None]:
# Switch every split to the lowercase column names used by the rest of the notebook
_column_names = {'Text': 'text', 'Emotion': 'label'}
data_train = data_train.rename(columns=_column_names)
data_val = data_val.rename(columns=_column_names)
data_test = data_test.rename(columns=_column_names)

In [None]:
# Preview the first five rows of the training split
data_train.head()

In [None]:
# Show the (rows, columns) shape of the training split
print(data_train.shape)

In [None]:
# Preview the first five rows of the validation split
data_val.head()

In [None]:
# Show the (rows, columns) shape of the validation split
print(data_val.shape)

In [None]:
# Preview the first five rows of the test split
data_test.head()

In [None]:
# Show the (rows, columns) shape of the test split
print(data_test.shape)

### Assessing Data

#### Melihat informasi sederhana dari data

In [None]:
# Show a basic summary of data_train (columns, non-null counts, dtypes)
data_train.info()

In [None]:
# Show a basic summary of data_val (columns, non-null counts, dtypes)
data_val.info()

In [None]:
# Show a basic summary of data_test (columns, non-null counts, dtypes)
data_test.info()

#### Melihat Missing Value

In [None]:
# Count missing values per column in each split
print("\nJumlah Nan Value pada data_train \n",data_train.isna().sum())
print("\nJumlah Nan Value pada data_val \n",data_val.isna().sum())
print("\nJumlah Nan Value pada data_test \n",data_test.isna().sum())

#### Mengecek duplikat data

In [None]:
# Count fully duplicated rows (every column equal to an earlier row) in each split
print("\nJumlah Duplikat Data pada data_train \n",data_train.duplicated().sum())
print("\nJumlah Duplikat Data pada data_val \n",data_val.duplicated().sum())
print("\nJumlah Duplikat Data pada data_test \n",data_test.duplicated().sum())

In [None]:
# Count training rows whose text repeats an earlier row, whatever their label
print("\nJumlah Duplikat Data pada data_train \n",data_train['text'].duplicated().sum())

#### Mengecek detail di setiap kolom dataframe

In [None]:
# Descriptive statistics for the columns of the training split
data_train.describe()

In [None]:
# Descriptive statistics for the columns of the validation split
data_val.describe()

In [None]:
# Descriptive statistics for the columns of the test split
data_test.describe()

In [None]:
# List the training rows whose text already appeared in an earlier row (whatever their label)
data_train.loc[data_train['text'].duplicated()]

In [None]:
# List the validation rows whose text already appeared in an earlier row (whatever their label)
data_val.loc[data_val['text'].duplicated()]

In [None]:
# List the test rows whose text already appeared in an earlier row (whatever their label)
data_test.loc[data_test['text'].duplicated()]

### Cleaning Data

#### Melakukan Drop Duplikat Data

In [None]:
# Number of fully duplicated training rows before they are dropped
print("\nJumlah Duplikat Data pada data_train \n",data_train.duplicated().sum())

In [None]:
# Drop rows that repeat an earlier row in every column, then renumber the index from 0
data_train = data_train.drop_duplicates().reset_index(drop=True)

In [None]:
# Keep only the first training row for each distinct text, then renumber the index from 0
data_train = data_train.drop_duplicates(subset=['text']).reset_index(drop=True)

In [None]:
# Keep only the first validation row for each distinct text, then renumber the index from 0
data_val = data_val.drop_duplicates(subset=['text']).reset_index(drop=True)

### EDA

#### Mengecek distribusi data di setiap label

In [None]:
# Percentage of training rows carrying each label
data_train['label'].value_counts().mul(100).div(len(data_train))

In [None]:
# Plot how many training rows carry each emotion label
fig = px.histogram(data_train, x="label", title="Distribusi Emosi")
fig.show()


In [None]:
# Percentage of validation rows carrying each label
data_val['label'].value_counts().mul(100).div(len(data_val))

In [None]:
# Plot how many validation rows carry each emotion label
fig = px.histogram(data_val, x="label", title="Distribusi Emosi")
fig.show()


In [None]:
# Percentage of test rows carrying each label
data_test['label'].value_counts().mul(100).div(len(data_test))

In [None]:
# Plot how many test rows carry each emotion label
fig = px.histogram(data_test, x="label", title="Distribusi Emosi")
fig.show()


#### Mengecek distribusi stopwords dalam train

In [None]:
# For every training text, count how many whitespace-separated tokens are English stop words,
# then show how often each count occurs
data_train['stop_words'] = data_train['text'].apply(
    lambda sentence: sum(token in stop_words for token in sentence.split())
)
data_train['stop_words'].value_counts()

In [None]:
# Plot the distribution of the per-text stop-word counts in the training split
fig = px.histogram(data_train, x="stop_words", title="Distribusi Stop Word dalam data train")
fig.show()

In [None]:
# For every validation text, count how many whitespace-separated tokens are English stop words,
# then show how often each count occurs
data_val['stop_words'] = data_val['text'].apply(
    lambda sentence: sum(token in stop_words for token in sentence.split())
)
data_val['stop_words'].value_counts()

In [None]:
# Plot the distribution of the per-text stop-word counts in the validation split
fig = px.histogram(data_val, x="stop_words", title="Distribusi Stop Word dalam data validasi")
fig.show()

In [None]:
# For every test text, count how many whitespace-separated tokens are English stop words,
# then show how often each count occurs
data_test['stop_words'] = data_test['text'].apply(
    lambda sentence: sum(token in stop_words for token in sentence.split())
)
data_test['stop_words'].value_counts()

In [None]:
# Plot the distribution of the per-text stop-word counts in the test split
fig = px.histogram(data_test, x="stop_words", title="Distribusi Stop Word dalam data uji")
fig.show()

### Functions to Be Imported

In [None]:
def lemmatization(text):
    """Lemmatize every whitespace-separated token of `text`.

    Args:
        text: Input string.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    wordnet = WordNetLemmatizer()
    return " ".join(wordnet.lemmatize(token) for token in text.split())



def remove_stop_words(text):
    """Drop every token of `text` that is in the module-level `stop_words` set.

    Args:
        text: Input value; it is converted with ``str()`` before splitting.

    Returns:
        The remaining tokens joined by single spaces.
    """
    kept_tokens = filter(lambda token: token not in stop_words, str(text).split())
    return " ".join(kept_tokens)



def Removing_numbers(text):
    """Return `text` with every digit character removed.

    Args:
        text: Input string.

    Returns:
        The string without the characters for which ``str.isdigit`` is true;
        all other characters (including spaces) are kept in place.
    """
    return ''.join(char for char in text if not char.isdigit())



def lower_case(text):
    """Lowercase `text` token by token.

    Because the string is split on whitespace and re-joined, runs of
    whitespace are collapsed to single spaces and the ends are trimmed.

    Args:
        text: Input string.

    Returns:
        The lowercased tokens joined by single spaces.
    """
    return " ".join(map(str.lower, text.split()))



def Removing_punctuations(text):
    """Replace punctuation with spaces and normalise whitespace.

    Every character of the punctuation set below (ASCII punctuation plus the
    Arabic comma and question mark) becomes a space, the Arabic semicolon is
    deleted outright, and the result is collapsed to single spaces with no
    leading or trailing whitespace.

    Args:
        text: Input string.

    Returns:
        The cleaned string.
    """
    # Raw string: the set contains a literal backslash before ']'. In a normal
    # string literal that is an invalid escape sequence and triggers a warning.
    punctuation = r"""!"#$%&'()*+,،-./:;<=>؟?@[\]^_`{|}~"""
    text = re.sub('[%s]' % re.escape(punctuation), ' ', text)
    text = text.replace('؛', "")

    # split()/join collapses every whitespace run to one space and trims both ends,
    # which covers the regex substitution and the strip() the function used to repeat.
    return " ".join(text.split())



def Removing_urls(text):
    """Delete URL-like tokens from `text`.

    A token is removed when it starts with ``http://``, ``https://`` or
    ``www.``; the match runs up to the next whitespace character.

    Args:
        text: Input string.

    Returns:
        The string with the matched spans removed (surrounding spaces stay).
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)



def remove_small_sentences(df):
    """Blank out, in place, every text that has fewer than three words.

    Args:
        df: DataFrame with a ``text`` column. Rows whose text splits into
            fewer than three whitespace-separated tokens get ``NaN`` in that
            column; rows that are already missing are left untouched.

    Returns:
        None. ``df`` is modified in place.
    """
    # Missing texts yield NaN counts, and NaN < 3 is False, so they are skipped
    # instead of crashing on .split().
    word_counts = df['text'].str.split().str.len()
    too_short = (word_counts < 3).to_numpy()
    # A single .loc assignment writes to the frame itself. The previous
    # `df.text.iloc[i] = ...` was chained assignment, which can write to a
    # temporary copy and is ignored under pandas copy-on-write.
    df.loc[too_short, 'text'] = np.nan



def normalize_text(df):
    """Clean the ``text`` column of `df` with the sentence-level pipeline.

    Args:
        df: DataFrame with a ``text`` column of strings.

    Returns:
        The same DataFrame object, whose ``text`` column has been replaced by
        the normalized strings (the frame is modified in place).
    """
    # Delegate to normalized_sentence so that DataFrame rows and single
    # sentences always go through exactly the same steps.
    df['text'] = df['text'].apply(normalized_sentence)
    return df

def normalized_sentence(sentence):
    """Normalize one sentence.

    Steps, in order: lowercase, remove URLs, remove stop words, remove
    digits, remove punctuation, lemmatize.

    Args:
        sentence: Input string.

    Returns:
        The normalized string.
    """
    sentence = lower_case(sentence)
    # URLs must be removed while ':', '/' and '.' are still present. After
    # Removing_punctuations has run, the URL pattern can no longer match.
    sentence = Removing_urls(sentence)
    sentence = remove_stop_words(sentence)
    sentence = Removing_numbers(sentence)
    sentence = Removing_punctuations(sentence)
    sentence = lemmatization(sentence)
    return sentence


In [None]:
# Disabled shell command that would unzip the WordNet corpus in place; presumably a workaround
# for environments where NLTK cannot read the zipped corpus — TODO confirm or delete.
# !unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

### Membuat Model

#### Mengaplikasikan fungsi normalisasi untuk teks

In [None]:
# Run the text-cleaning pipeline on every split (normalize_text also rewrites the 'text' column of the frame it is given)
data_train = normalize_text(data_train)
data_test = normalize_text(data_test)
data_val = normalize_text(data_val)

In [None]:
# Separate the model input (cleaned text) from the target (emotion label) for each split
X_train = data_train['text']
y_train = data_train['label']

X_test = data_test['text']
y_test = data_test['label']

X_val = data_val['text']
y_val = data_val['label']

#### Encode Labels

In [None]:
# Fit the encoder on the training labels only, then reuse the same
# label-to-integer mapping for the test and validation labels
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)
y_val = le.transform(y_val)

In [None]:
# Convert the integer class vectors to one-hot matrices.
# num_classes is passed explicitly: by default to_categorical sizes its output from the
# largest label it sees, so a split lacking the highest-numbered class would come out
# narrower than the training targets and fail in fit/evaluate.
num_classes = len(le.classes_)
y_train = to_categorical(y_train, num_classes=num_classes)
y_test = to_categorical(y_test, num_classes=num_classes)
y_val = to_categorical(y_val, num_classes=num_classes)

In [None]:
MAX_LENGTH = 500

# Build the vocabulary from the training texts only; words not seen there map to the 'UNK' token
tokenizer = Tokenizer(oov_token='UNK')
tokenizer.fit_on_texts(X_train)


def _encode_and_pad(texts):
    """Return (integer sequences, padded array) for `texts` using the fitted tokenizer.

    Sequences are brought to MAX_LENGTH; longer ones are cut from the front
    (truncating='pre').
    """
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(sequences, maxlen=MAX_LENGTH, truncating='pre')
    return sequences, padded


X_train_seq, X_train_pad = _encode_and_pad(X_train)
X_test_seq, X_test_pad = _encode_and_pad(X_test)
X_val_seq, X_val_pad = _encode_and_pad(X_val)

In [None]:
# Train Word2Vec on the whitespace-tokenised training texts
EMBEDDING_DIM = 200
sentences = [text.split() for text in X_train]
w2v_model = gensim.models.Word2Vec(sentences, vector_size=EMBEDDING_DIM, window=5, min_count=5, workers=4)

# Row i of the matrix holds the Word2Vec vector of the tokenizer word with index i;
# rows for words that have no Word2Vec vector stay all-zero.
word_vectors = w2v_model.wv
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, EMBEDDING_DIM))
for word, i in tokenizer.word_index.items():
    if word not in word_vectors:
        continue
    embedding_matrix[i] = word_vectors[word]

In [None]:
# Build neural network architecture

adam = Adam(learning_rate=0.005)

# Vocabulary size: one row per tokenizer index plus one extra row for index 0,
# which pad_sequences uses as the padding value.
vocabSize = len(tokenizer.word_index) + 1
# Read the embedding width from the pretrained matrix and the number of outputs from
# the label encoder, so neither can drift from the values used earlier in the notebook.
embedding_dim = embedding_matrix.shape[1]
num_classes = len(le.classes_)

model = Sequential()
# Embedding layer initialised with the Word2Vec vectors
model.add(Embedding(vocabSize, embedding_dim, weights=[embedding_matrix]))
model.add(Bidirectional(LSTM(256, dropout=0.2,recurrent_dropout=0.2, return_sequences=True)))
model.add(Bidirectional(LSTM(128, dropout=0.2,recurrent_dropout=0.2)))
# One softmax unit per emotion class
model.add(Dense(num_classes, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

model.summary()

In [None]:
# Stop training once val_loss has gone 4 epochs without improving,
# and restore the weights from the best epoch
callback = EarlyStopping(
    monitor="val_loss",
    patience=4,
    restore_best_weights=True,
)

In [None]:
# Fit model: at most 10 epochs in batches of 256, evaluated on the validation split;
# the early-stopping callback may end training sooner
history = model.fit(X_train_pad,
                    y_train,
                    validation_data=(X_val_pad, y_val),
                    verbose=1,
                    batch_size=256,
                    epochs=10,
                    callbacks=[callback]
                   )

In [None]:
# Draw the model's layer graph, including tensor shapes
plot_model(model, show_shapes=True, dpi=80)

In [None]:
# Compute the loss and accuracy on the test split
model.evaluate(X_test_pad, y_test)

In [None]:
# Number of test rows per emotion label
data_test['label'].value_counts()

In [None]:
# Predict class probabilities for the test split and take the most likely class per row
predicted = model.predict(X_test_pad)
y_pred = predicted.argmax(axis=-1)

# Label the report rows with the emotion names instead of the encoded integers.
# `labels` pins the rows to the encoder's classes, so the names stay aligned even if
# a class is absent from the test labels or predictions.
print(classification_report(le.transform(data_test['label']),
                            y_pred,
                            labels=np.arange(len(le.classes_)),
                            target_names=le.classes_))

In [None]:
# Visualize Loss & Accuracy

training_log = history.history
acc, val_acc = training_log['accuracy'], training_log['val_accuracy']
loss, val_loss = training_log['loss'], training_log['val_loss']

epochs = range(len(acc))

# One entry per figure: (training values, training label, validation values, validation label, title)
curves = [
    (acc, 'Training accuracy', val_acc, 'Validation accuracy', 'Training and validation accuracy'),
    (loss, 'Training Loss', val_loss, 'Validation Loss', 'Training and validation loss'),
]
for position, (train_values, train_label, val_values, val_label, title) in enumerate(curves):
    if position:
        # Every plot after the first goes into a fresh figure
        plt.figure()
    plt.plot(epochs, train_values, 'r', label=train_label)
    plt.plot(epochs, val_values, 'b', label=val_label)
    plt.title(title)
    plt.legend()

plt.show()