In [35]:
import pandas as pd # read the csv
import re # regex to detect username, url, html entity 
import nltk # to use word tokenize (split the sentence into words)
import numpy as np
from nltk.corpus import stopwords # to remove the stopwords
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

from keras.utils import to_categorical
from keras import backend as K

In [36]:
# Load the labelled tweets from the CSV file into a DataFrame.
dataset_path = "./labeled_data.csv"
df = pd.read_csv(dataset_path)
df.head()

# Report the DataFrame shape as (rows, columns); the row count is the number of tweets.
print(f"num of tweets: {df.shape}")

# Pull the raw tweet texts and their class labels out as plain Python lists.
tweet, labels = (list(df[column]) for column in ('tweet', 'class'))

num of tweets: (24783, 7)


### Steps:
1. preprocessing
2. splitting
3. tokenize & padding
4. Create model & train
5. evaluate

---

Preprocessing (cleaning the dataset):
- remove HTML entities
- replace user tags (@xxx -> user)
- remove URLs
- remove hashtags
- remove unnecessary symbols (`"`, `'`, `!`, `` ` ``, `..`, `/`, `\`, `|`) -> they cause a lot of noise in the dataset
- remove stopwords

In [37]:
## Note: every cleaning function below takes one text at a time.
# English stopwords plus "rt", the retweet marker, which is noise in this dataset.
stop_words = set(stopwords.words('english')) | {"rt"}

def remove_entity(raw_text):
    """Strip HTML entities (e.g. ``&amp;``) from a single text.

    An entity is matched as ``&`` followed by one or more characters that are
    neither whitespace nor ``;``, and terminated by ``;``.

    Args:
        raw_text: The text to clean.

    Returns:
        The text with every matched entity deleted.
    """
    return re.sub(r"&[^\s;]+;", "", raw_text)

def change_user(raw_text):
    """Replace every user mention (``@name``) in a single text with ``user``.

    A mention runs from ``@`` up to the next whitespace character. Stopping at
    any whitespace (not only a plain space) keeps text that follows a mention
    on the next line or after a tab from being swallowed into the mention.

    Args:
        raw_text: The text to clean.

    Returns:
        The text with each mention replaced by the literal word ``user``.
    """
    return re.sub(r"@\S+", "user", raw_text)

def remove_hashtag(raw_text):
    """Delete every hashtag (``#tag``) from a single text.

    A hashtag runs from ``#`` up to the next whitespace character. Stopping at
    any whitespace (not only a plain space) keeps text that follows a hashtag
    on the next line or after a tab from being deleted along with it.

    Args:
        raw_text: The text to clean.

    Returns:
        The text with each hashtag removed; surrounding whitespace is kept.
    """
    return re.sub(r"#\S+", "", raw_text)


def remove_url(raw_text):
    """Delete URLs from a single text.

    The pattern is case-insensitive and recognises links that start with
    ``http://`` / ``https://``, with ``www.`` (optionally ``www`` plus up to
    three digits), or with a ``domain.tld/`` prefix. Trailing punctuation such
    as a final ``.`` is not treated as part of the URL.

    Args:
        raw_text: The text to clean.

    Returns:
        The text with every matched URL removed.
    """
    pattern = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    return re.sub(pattern, '', raw_text)

def remove_noise_symbols(raw_text):
    """Delete noisy punctuation from a single text.

    The symbols are removed one after another, in this exact order:
    ``"``, ``'``, ``!``, a backtick, ``..``, ``/``, a backslash and ``|``.
    The order matters for ``..``: dots that only become adjacent after a later
    removal (for example ``./.``) are left in place.

    Args:
        raw_text: The text to clean.

    Returns:
        The text with the listed symbols removed.
    """
    cleaned = raw_text
    for symbol in ('"', "'", "!", "`", "..", "/", "\\", "|"):
        cleaned = cleaned.replace(symbol, '')
    return cleaned

def remove_stopwords(raw_text):
    """Drop stopwords from a single text.

    The text is split with ``nltk.word_tokenize``; a token is discarded when
    its lower-cased form is in the module-level ``stop_words`` set. The
    surviving tokens keep their original casing and are joined with single
    spaces.

    Args:
        raw_text: The text to clean.

    Returns:
        The remaining tokens joined by a space (an empty string if none remain).
    """
    kept = []
    for token in nltk.word_tokenize(raw_text):
        if token.lower() not in stop_words:
            kept.append(token)
    return " ".join(kept)

def preprocess(datas):
    """Clean every text in ``datas`` using the helper functions defined above.

    Each text goes through the cleaning steps in this fixed order:

    1. ``change_user``          - replace ``@xxx`` mentions with ``user``
    2. ``remove_entity``        - remove HTML entities
    3. ``remove_url``           - remove URLs
    4. ``remove_hashtag``       - remove hashtags
    5. ``remove_noise_symbols`` - remove noisy punctuation
    6. ``remove_stopwords``     - remove stopwords

    Args:
        datas: An iterable of raw texts.

    Returns:
        A new list with one cleaned text per input text, in the same order.
    """
    pipeline = (
        change_user,
        remove_entity,
        remove_url,
        remove_hashtag,
        remove_noise_symbols,
        remove_stopwords,
    )

    cleaned = []
    for text in datas:
        for step in pipeline:
            text = step(text)
        cleaned.append(text)

    return cleaned

In [38]:
# Run the full cleaning pipeline over the raw tweets; `clean_tweet` holds one
# cleaned string per input tweet, in the same order as `tweet`.
clean_tweet = preprocess(tweet)

In [39]:
# Drop tweets that became empty after cleaning, together with their labels.
# New lists are built instead of calling pop() inside an enumerate() loop:
# popping shifts the remaining items left, so the element right after a removed
# one would never be examined and consecutive empty tweets would survive.
# The comprehension variables also stay local, so the raw `tweet` list loaded
# earlier is not overwritten by a loop variable of the same name.
non_empty = [(text, label) for text, label in zip(clean_tweet, labels) if text]
clean_tweet = [text for text, _ in non_empty]
labels = [label for _, label in non_empty]

# check the final len (both lists must have the same length)
print(len(labels))
print(len(clean_tweet))

24777
24777


Saving the clean tweets to a file so they can be read back later

In [40]:
## Persist the cleaned tweets to "clean_tweet.txt", one tweet per line.
# The encoding is set explicitly: without it open() falls back to the platform
# default, which can fail on tweets that contain non-ASCII characters.
with open("clean_tweet.txt", 'w', encoding="utf-8") as file:
    file.write('\n'.join(clean_tweet))

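Splitting the dataset into train and held-out sets (70:30); the held-out part is intended to be divided further into validation and test (20:10 of the full dataset)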

In [42]:
# Hold out 30% of the cleaned tweets; the fixed random_state makes the split reproducible.
# NOTE(review): only two subsets are produced here - no separate validation set is created.
X_train, X_test, y_train, y_test = train_test_split(
    clean_tweet,
    labels,
    test_size=0.3,
    random_state=42,
)

In [44]:
# Integer third of the held-out set size.
# NOTE(review): `split` is not referenced by any later cell visible in this notebook.
split = len(y_test)//3
print(split)

2478


Tokenizing and Padding

In [7]:
## Tokenizing -> turn each text into a sequence of integer word ids, which is
## the numeric form the embedding layer consumes.
tokenizer = Tokenizer()
# The vocabulary is learned from the training texts only.
tokenizer.fit_on_texts(X_train)
# Convert both the train and the test texts to integer sequences.
X_train, X_test = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Vocabulary size (number of unique words + 1) -> used by the embedding layer.
# presumably the extra slot is for the padding id 0 - confirm against the Keras docs
vocab_size = 1 + len(tokenizer.word_index)

In [8]:
## Padding -> bring every sequence to one common length
# The common length is that of the longest training sequence.
max_length = max(map(len, X_train))

# Report test sequences that are longer than anything seen in the train set.
for x in (seq for seq in X_test if len(seq) > max_length):
    print(f"an outlier detected: {x}")

# Pad both sets to max_length.
X_train, X_test = (
    pad_sequences(seqs, maxlen=max_length) for seqs in (X_train, X_test)
)

In [9]:
# One-hot encode the integer class labels (3 classes) so they match the
# 3-unit softmax output and the categorical_crossentropy loss of the model.
y_train = to_categorical(y_train, num_classes=3)
y_test = to_categorical(y_test, num_classes=3)

In [10]:
# Sanity check: number of samples in the test and the train label arrays
print(f"num test tweet: {len(y_test)}")
print(f"num train tweet: {len(y_train)}")

num test tweet: 4957
num train tweet: 19826


Model building

In [11]:
# Early stopping picks the number of epochs automatically: training halts once
# val_loss has not improved for 5 epochs, and the best weights are restored.
early_stopping = EarlyStopping(
    restore_best_weights=True,
    patience=5,
    monitor='val_loss',
)

# evaluation metrics: recall, precision and f1
def recall(y_true, y_pred):
    """Recall metric built from Keras backend ops.

    True positives are counted by clipping ``y_true * y_pred`` to [0, 1],
    rounding and summing; possible positives are counted the same way from
    ``y_true`` alone. ``K.epsilon()`` in the denominator avoids division by zero.

    Args:
        y_true: Ground-truth targets.
        y_pred: Model predictions.

    Returns:
        true positives / (possible positives + epsilon).
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    return hits / (actual_positives + K.epsilon())

def precision(y_true, y_pred):
    """Precision metric built from Keras backend ops.

    True positives are counted by clipping ``y_true * y_pred`` to [0, 1],
    rounding and summing; predicted positives are counted the same way from
    ``y_pred`` alone. ``K.epsilon()`` in the denominator avoids division by zero.

    Args:
        y_true: Ground-truth targets.
        y_pred: Model predictions.

    Returns:
        true positives / (predicted positives + epsilon).
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    flagged_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return hits / (flagged_positives + K.epsilon())

def f1(y_true, y_pred):
    """F1 metric: harmonic mean of ``precision`` and ``recall``.

    ``K.epsilon()`` is added to the denominator so the result stays defined
    when both precision and recall are zero.

    Args:
        y_true: Ground-truth targets.
        y_pred: Model predictions.

    Returns:
        2 * (precision * recall) / (precision + recall + epsilon).
    """
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    return 2*((p*r)/(p+r+K.epsilon()))

In [12]:
# Size of each word embedding vector; change this to experiment.
output_dim = 200

# Architecture: Embedding -> LSTM -> Dense (ReLU) -> softmax over 3 classes
model = Sequential()
# Embedding: maps each word id to a trainable vector of length output_dim.
model.add(Embedding(vocab_size, output_dim, input_length=max_length))
# LSTM: reads the embedded sequence, with input and recurrent dropout.
model.add(LSTM(1024, dropout=0.3, recurrent_dropout=0.3))
# Dropout to reduce overfitting.
model.add(Dropout(0.5))
# Fully connected layer on top of the LSTM output.
model.add(Dense(2048, activation="relu"))
# Dropout to reduce overfitting.
model.add(Dropout(0.5))
# Output layer: one softmax unit per class (0, 1, 2).
model.add(Dense(3, activation="softmax"))

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy', f1, precision, recall],
)


In [13]:
# Print the layer-by-layer summary (output shapes and parameter counts)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 26, 200)           3734200   
                                                                 
 lstm (LSTM)                 (None, 1024)              5017600   
                                                                 
 dropout (Dropout)           (None, 1024)              0         
                                                                 
 dense (Dense)               (None, 2048)              2099200   
                                                                 
 dropout_1 (Dropout)         (None, 2048)              0         
                                                                 
 dense_1 (Dense)             (None, 3)                 6147      
                                                                 
Total params: 10,857,147
Trainable params: 10,857,147
No

In [14]:
# Train the model. `epochs` is only an upper bound: the early-stopping callback
# is expected to end training much sooner.
# NOTE(review): the test split doubles as validation data here, so early
# stopping is tuned on the same data that is later used for evaluation.
model_history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=1000,
    batch_size=32,
    callbacks=[early_stopping],
)

Epoch 1/1000

KeyboardInterrupt: 

In [None]:
from sklearn.metrics import confusion_matrix

# Predict class probabilities for the padded test sequences
# (the variable is `X_test`; `x_test` was never defined).
y_prediction = model.predict(X_test)

# confusion_matrix expects class indices, but y_test is one-hot encoded and
# y_prediction holds softmax probabilities, so both are reduced with argmax.
# Create the confusion matrix and normalize it over predicted (columns).
result = confusion_matrix(
    np.argmax(y_test, axis=1),
    np.argmax(y_prediction, axis=1),
    normalize='pred',
)

NameError: name 'model' is not defined

In [None]:
# Print the best epoch (1-based, lowest validation loss).
# It is read from the recorded training history instead of being derived as
# `stopped_epoch - 4`: that offset only holds for patience=5 and yields a
# negative number when early stopping never triggers (stopped_epoch == 0).
val_losses = model_history.history['val_loss']
best_epoch = int(np.argmin(val_losses)) + 1
print("Best Epoch:", best_epoch)

Best Epoch: 2


: 

#### test 1 batch size 32
##### output_dim = 100 (best epoch = 1, based on val loss)
- loss: 0.4318
- accuracy: 0.8523
- f1: 0.8356
- precision: 0.8592
- recall: 0.8167
- val_loss: 0.3283
- val_accuracy: 0.8864
- val_f1: 0.8851
- val_precision: 0.9080
- val_recall: 0.8641

##### output_dim = 200 (best epoch = 1, based on val loss)
- loss: 0.4174
- accuracy: 0.8545
- f1: 0.8421
- precision: 0.8706
- recall: 0.8185
- val_loss: 0.3044
- val_accuracy: 0.8913
- val_f1: 0.8906
- val_precision: 0.9078
- val_recall: 0.8745

##### LSTM (128), dense (256)
- loss: 0.3906
- accuracy: 0.8656
- f1: 0.8559
- precision: 0.8769
- recall: 0.8384
- val_loss: 0.2915
- val_accuracy: 0.8987
- val_f1: 0.8985
- val_precision: 0.9111
- val_recall: 0.8866

##### LSTM (256), dense (512)
- loss: 0.3818
- accuracy: 0.8678
- f1: 0.8617
- precision: 0.8783
- recall: 0.8466
- val_loss: 0.3016
- val_accuracy: 0.8943
- val_f1: 0.8944
- val_precision: 0.9017
- val_recall: 0.8874

##### LSTM (512), dense (1024)
- loss: 0.3763
- accuracy: 0.8688
- f1: 0.8632
- precision: 0.8813
- recall: 0.8476
- val_loss: 0.2958
- val_accuracy: 0.8965
- val_f1: 0.8965
- val_precision: 0.9084
- val_recall: 0.8852

##### LSTM (1024), dense (2048)
- loss: 0.2217
- accuracy: 0.9233
- f1: 0.9223
- precision: 0.9281
- recall: 0.9168
- val_loss: 0.3078
- val_accuracy: 0.8935
- val_f1: 0.8936
- val_precision: 0.9040
- val_recall: 0.8836