In [1]:
import os

IN_COLAB = False if os.getenv("COLAB_RELEASE_TAG") is None else True
print(IN_COLAB)

False


##### Copy project to Colab Engine

In [2]:
if IN_COLAB:
    project_root = "/content/drive/MyDrive/Colab-Notebooks/LLM/sentiment-analysis"
    data_folder = "/content/drive/MyDrive/dataset/LLM/sentiment140"  # os.path.join(project_root, "data")
    INPUT_FILE_COLAB = "data/training.1600000.processed.noemoticon.csv"

In [3]:
if IN_COLAB:
    ## --------Mount google drive to Colab --------
    import os
    import sys
    from google.colab import drive

    root = "/content"
    drive.mount(os.path.join(root, "drive/"), force_remount=True)

    # -------- copy the project code and data----------------
    project_tag = (
        os.path.basename(os.path.dirname(project_root))
        + "_"
        + os.path.basename(project_root)
    )
    project_tag
    %cd $root
    %rm -rfv project

    # import the project code to workspace
    import os

    %cd $root
    %rm -rfv project
    %mkdir project
    !cp -r $project_root/* project
    %cd project

    #  copy the project data locally
    !rm  -r data/
    !cp  -r "$data_folder/" data/
    !ls data

    # extract the zip data
    !unzip data/*.zip -d data/
    !unzip data/*/*.zip -d data/*/

    # show the copied data
    %ls data/**/**

#### input variables

In [4]:
INPUT_FILE = "artifacts/data_ingestion/training.1600000.processed.noemoticon.csv"

In [5]:
if IN_COLAB:
    INPUT_FILE = INPUT_FILE_COLAB

In [6]:
# DATASET
DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]
DATASET_ENCODING = "ISO-8859-1"
TRAIN_SIZE = 0.8

# TEXT CLENAING
TEXT_CLEANING_RE = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

# WORD2VEC
W2V_SIZE = 300
W2V_WINDOW = 7
W2V_EPOCH = 32
W2V_MIN_COUNT = 10

# KERAS
SEQUENCE_LENGTH = 300
EPOCHS = 8
BATCH_SIZE = 1024

# SENTIMENT
POSITIVE = "POSITIVE"
NEGATIVE = "NEGATIVE"
NEUTRAL = "NEUTRAL"
SENTIMENT_THRESHOLDS = (0.4, 0.7)

# EXPORT
KERAS_MODEL = "models/model.keras"
WORD2VEC_MODEL = "models/model.w2v"
TOKENIZER_MODEL = "models/tokenizer.pkl"
ENCODER_MODEL = "models/encoder.pkl"

#### setup the workspace variables

In [7]:
%cd /app
import sys

sys.path.append("src")

/app


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


#### install packages

In [20]:
# # %%capture
# %pip install gensim==4.3.3 #--upgrade
# %pip install keras==3.7.0 #--upgrade
# %pip install pandas==2.2.3 #--upgrade

# Project: Twitter Sentiment Analysis

### load the needed libraries

In [23]:
# DataFrame
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer

# Keras
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import (
    Activation,
    Dense,
    Dropout,
    Embedding,
    Flatten,
    Conv1D,
    MaxPooling1D,
    LSTM,
)
from keras import utils
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

# nltk
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# Word2vec
import gensim
from gensim.models import Word2Vec

# Utility
import re
import numpy as np
import os
from collections import Counter
import logging
import time
import pickle
import itertools

# Set log
logging.basicConfig(
    format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO
)

In [24]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

## Data preparation

### Loaded dataset
The loaded datset schema is :
* **target**: the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
* **ids**: The id of the tweet ( 2087)
* **date**: the date of the tweet (Sat May 16 23:58:44 UTC 2009)
* **flag**: The query (lyx). If there is no query, then this value is NO_QUERY.
* **user**: the user that tweeted (robotickilldozr)
* **text**: the text of the tweet (Lyx is cool)

In [None]:
print("Open file:", INPUT_FILE)
df = pd.read_csv(INPUT_FILE, encoding=DATASET_ENCODING, names=DATASET_COLUMNS)

In [None]:
print("Dataset size:", len(df))

In [None]:
df.head(5)

### Map target label to String
* **0** -> **NEGATIVE**
* **2** -> **NEUTRAL**
* **4** -> **POSITIVE**

In [25]:
decode_map = {0: "NEGATIVE", 2: "NEUTRAL", 4: "POSITIVE"}


def decode_sentiment(label):
    return decode_map[int(label)]

In [None]:
%%time
df.target = df.target.apply(lambda x: decode_sentiment(x))

In [None]:
target_cnt = Counter(df.target)

plt.figure(figsize=(16, 8))
plt.bar(target_cnt.keys(), target_cnt.values())
plt.title("Dataset labels distribuition")

## Data Cleaning and pre-Processing

### clean unvalid texts (stemmer language -> english)

In [26]:
stop_words = stopwords.words("english")
stemmer = SnowballStemmer("english")

In [27]:
def preprocess(text, stem=False):
    # Remove link,user and special characters
    text = re.sub(TEXT_CLEANING_RE, " ", str(text).lower()).strip()
    tokens = []
    for token in text.split():
        if token not in stop_words:
            if stem:
                tokens.append(stemmer.stem(token))
            else:
                tokens.append(token)
    return " ".join(tokens)

In [None]:
%%time
df.text = df.text.apply(lambda x: preprocess(x))

### Split train and test

In [None]:
df_train, df_test = train_test_split(df, test_size=1 - TRAIN_SIZE, random_state=42)
print("TRAIN size:", len(df_train))
print("TEST size:", len(df_test))

TRAIN size: 1280000
TEST size: 320000


## Data Transformtion

### Word2Vec

#### prepare the corpus

In [None]:
%%time
documents = [_text.split() for _text in df_train.text]

CPU times: user 7.46 s, sys: 844 ms, total: 8.3 s
Wall time: 8.44 s


In [None]:
w2v_model = gensim.models.word2vec.Word2Vec(
    vector_size=W2V_SIZE, window=W2V_WINDOW, min_count=W2V_MIN_COUNT, workers=8
)

In [None]:
# Load pre-trained Word2Vec model if any.
if os.path.exists(WORD2VEC_MODEL):
    w2v_model = gensim.models.Word2Vec.load(WORD2VEC_MODEL)
    print(
        f"- the pretrained model was loadded successfully from the path {WORD2VEC_MODEL}"
    )
else:
    print(f"- no pretrained model was found in path {WORD2VEC_MODEL}")

- the pretrained model was loadded successfully from the path models/model.w2v


In [None]:
w2v_model.build_vocab(documents)



IndexError: index 30459 is out of bounds for axis 0 with size 30369

In [None]:
words = list(w2v_model.wv.index_to_key)
vocab_size = len(words)
print("Vocab size", vocab_size)

#### train the Word2Vec model

In [None]:
%%time
w2v_model.train(documents, total_examples=len(documents), epochs=W2V_EPOCH)

# save the trained w2v_model
w2v_model.save(WORD2VEC_MODEL)

In [None]:
w2v_model.wv.most_similar("love")

### Tokenization

In [None]:
%%time
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_train.text)

vocab_size = len(tokenizer.word_index) + 1
print("Total words", vocab_size)

In [None]:
%%time
x_train = pad_sequences(
    tokenizer.texts_to_sequences(df_train.text), maxlen=SEQUENCE_LENGTH
)
x_test = pad_sequences(
    tokenizer.texts_to_sequences(df_test.text), maxlen=SEQUENCE_LENGTH
)

### Label Encoder

In [None]:
labels = df_train.target.unique().tolist()
labels.append(NEUTRAL)
labels

In [None]:
encoder = LabelEncoder()
encoder.fit(df_train.target.tolist())

y_train = encoder.transform(df_train.target.tolist())
y_test = encoder.transform(df_test.target.tolist())

y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)

print("y_train", y_train.shape)
print("y_test", y_test.shape)

In [None]:
print("x_train", x_train.shape)
print("y_train", y_train.shape)
print()
print("x_test", x_test.shape)
print("y_test", y_test.shape)

In [None]:
y_train[:10]

## Model training

### Build embeddings layer and classification model

#### Embedding layer

In [None]:
embedding_matrix = np.zeros((vocab_size, W2V_SIZE))
for word, i in tokenizer.word_index.items():
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]
print(embedding_matrix.shape)

In [None]:
embedding_layer = Embedding(
    vocab_size,
    W2V_SIZE,
    weights=[embedding_matrix],
    input_length=SEQUENCE_LENGTH,
    trainable=False,
)

#### Classification Model

In [None]:
model = Sequential()
model.add(embedding_layer)
model.add(Dropout(0.5))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation="sigmoid"))

model.summary()

### Model training

#### Compile model

In [None]:
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

#### Callbacks

In [None]:
callbacks = [
    ReduceLROnPlateau(monitor="val_loss", patience=5, cooldown=0),
    EarlyStopping(monitor="val_accuracy", min_delta=1e-4, mode="max", patience=5),
]

#### Train

In [None]:
# Load pre-trained classification model if any.
if os.path.exists(KERAS_MODEL):
    model = tf.keras.models.load_model(KERAS_MODEL)
    print(
        f"- the pretrained model was loadded successfully from the path {WORD2VEC_MODEL}"
    )
else:
    print(f"- no pretrained model was found in path {KERAS_MODEL}")

In [None]:
# %%time
# history = model.fit(x_train, y_train,
#                     batch_size=BATCH_SIZE,
#                     epochs=1,
#                     validation_split=0.1,
#                     verbose=1,
#                     callbacks=callbacks)

# # save trained model
# model.save(KERAS_MODEL)

#### Save trained model

In [None]:
%cd $root/project
%mkdir  models

In [None]:
# model.save(KERAS_MODEL)
# w2v_model.save(WORD2VEC_MODEL)
# pickle.dump(tokenizer, open(TOKENIZER_MODEL, "wb"), protocol=0)
# pickle.dump(encoder, open(ENCODER_MODEL, "wb"), protocol=0)

### Model Evaluation

#### Training performance

In [None]:
%%time
# score = model.evaluate(x_test, y_test, batch_size=BATCH_SIZE)
# print("\nACCURACY:",score[1])
# print("LOSS:",score[0])

In [None]:
try:
    acc = history.history["accuracy"]
    val_acc = history.history["val_accuracy"]
    loss = history.history["loss"]
    val_loss = history.history["val_loss"]

    epochs = range(len(acc))

    plt.plot(epochs, acc, "b", label="Training acc")
    plt.plot(epochs, val_acc, "r", label="Validation acc")
    plt.title("Training and validation accuracy")
    plt.legend()

    plt.figure()

    plt.plot(epochs, loss, "b", label="Training loss")
    plt.plot(epochs, val_loss, "r", label="Validation loss")
    plt.title("Training and validation loss")
    plt.legend()

    plt.show()
except Exception as e:
    print(
        f"\n - The training performance is not avaialble!!\
  \n because {e}"
    )

#### Confusion Matrix

In [None]:
# %%time
# y_pred_1d = []
# y_test_1d = list(df_test.target)
# scores = model.predict(x_test, verbose=1, batch_size=8000)
# y_pred_1d = [decode_sentiment(score, include_neutral=False) for score in scores]

In [None]:
def plot_confusion_matrix(cm, classes, title="Confusion matrix", cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """

    cm = cm.astype("float") / cm.sum(axis=1)[:, np.newaxis]

    plt.imshow(cm, interpolation="nearest", cmap=cmap)
    plt.title(title, fontsize=30)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90, fontsize=22)
    plt.yticks(tick_marks, classes, fontsize=22)

    fmt = ".2f"
    thresh = cm.max() / 2.0
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(
            j,
            i,
            format(cm[i, j], fmt),
            horizontalalignment="center",
            color="white" if cm[i, j] > thresh else "black",
        )

    plt.ylabel("True label", fontsize=25)
    plt.xlabel("Predicted label", fontsize=25)

In [None]:
# %%time
# cnf_matrix = confusion_matrix(y_test_1d, y_pred_1d)
# plt.figure(figsize=(12,12))
# plot_confusion_matrix(cnf_matrix, classes=df_train.target.unique(), title="Confusion matrix")
# plt.show()

#### Classification Report

In [None]:
# print(classification_report(y_test_1d, y_pred_1d))

#### Accuracy Score

In [None]:
# accuracy_score(y_test_1d, y_pred_1d)

### Export the trained model to Google Drive

In [107]:
if IN_COLAB:
    %cd $root/project
    %mkdir -fp $project_root/models
    %cp -rfv models $project_root/
    %cp -rfv artifacts/models $project_root/

/content/project
mkdir: invalid option -- 'f'
Try 'mkdir --help' for more information.
'models/model.h5' -> '/content/drive/MyDrive/Colab-Notebooks/LLM/sentiment-analysis/models/model.h5'
'models/model.w2v' -> '/content/drive/MyDrive/Colab-Notebooks/LLM/sentiment-analysis/models/model.w2v'
'models/tokenizer.pkl' -> '/content/drive/MyDrive/Colab-Notebooks/LLM/sentiment-analysis/models/tokenizer.pkl'
'models/encoder.pkl' -> '/content/drive/MyDrive/Colab-Notebooks/LLM/sentiment-analysis/models/encoder.pkl'
'models/model.keras' -> '/content/drive/MyDrive/Colab-Notebooks/LLM/sentiment-analysis/models/model.keras'
^C
cp: cannot stat 'artifacts/models': No such file or directory


In [108]:
# ## Exit colab
# if IN_COLAB:
# 	from google.colab import runtime
# 	runtime.unassign()

## Model Inference and Prediction

In [13]:
%pip install ipywidgets

Collecting ipywidgets
  Downloading ipywidgets-8.1.5-py3-none-any.whl.metadata (2.3 kB)
Collecting widgetsnbextension~=4.0.12 (from ipywidgets)
  Downloading widgetsnbextension-4.0.13-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab-widgets~=3.0.12 (from ipywidgets)
  Downloading jupyterlab_widgets-3.0.13-py3-none-any.whl.metadata (4.1 kB)
Downloading ipywidgets-8.1.5-py3-none-any.whl (139 kB)
Downloading jupyterlab_widgets-3.0.13-py3-none-any.whl (214 kB)
Downloading widgetsnbextension-4.0.13-py3-none-any.whl (2.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
Installing collected packages: widgetsnbextension, jupyterlab-widgets, ipywidgets
Successfully installed ipywidgets-8.1.5 jupyterlab-widgets-3.0.13 widgetsnbextension-4.0.13
Note: you may need to restart the kernel to use updated packages.


#### load the trained tokenizer

In [8]:
import pickle

# Load tokenizer from the pickle file
with open(TOKENIZER_MODEL, "rb") as file:
    loaded_tokenizer = pickle.load(file)

# Test loaded tokenizer
text = "Hello"
tokenized_text = loaded_tokenizer.texts_to_sequences(text)
print(tokenized_text)

2024-12-14 21:32:21.832367: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1734211943.010495      21 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1734211943.221657      21 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-14 21:32:25.522180: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


[[1129], [475], [499], [499], []]


In [9]:
# Load tokenizer from the pickle file
with open(ENCODER_MODEL, "rb") as file:
    loaded_encoder = pickle.load(file)

# Test loaded tokenizer
example_labels = [k for k in range(len(loaded_encoder.classes_))]
encoded_labels = loaded_encoder.inverse_transform(example_labels)

assert set(loaded_encoder.classes_) == set(encoded_labels)
print(
    f"- example_labels = {example_labels}\
      \n- encoded_classes = {encoded_labels}"
)

- example_labels = [0, 1]      
- encoded_classes = ['NEGATIVE' 'POSITIVE']


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


#### load the trained models

In [10]:
import gensim

w2v_model = gensim.models.Word2Vec.load(WORD2VEC_MODEL)

In [11]:
import tensorflow as tf

loaded_clf_model = tf.keras.models.load_model(KERAS_MODEL)

2024-12-14 21:32:58.792620: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)
2024-12-14 21:32:58.957983: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 348502800 exceeds 10% of free system memory.
2024-12-14 21:32:59.355470: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 348502800 exceeds 10% of free system memory.
2024-12-14 21:32:59.609171: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 348502800 exceeds 10% of free system memory.
2024-12-14 21:33:07.233856: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 348502800 exceeds 10% of free system memory.
  saveable.load_own_variables(weights_store.get(inner_path))


#### prediction functions

In [36]:
!pip install ipywidgets

[0m

In [30]:
# decode_map = {0: "NEGATIVE", 2: "NEUTRAL", 4: "POSITIVE"}
# def decode_sentiment(label):
#     return decode_map[int(label)]
import ipywidgets as widgets


def decode_sentiment(score, include_neutral=True):
    if include_neutral:
        label = NEUTRAL
        if score <= SENTIMENT_THRESHOLDS[0]:
            label = NEGATIVE
        elif score >= SENTIMENT_THRESHOLDS[1]:
            label = POSITIVE

        return label
    else:
        return NEGATIVE if score < 0.5 else POSITIVE


def show_sentiment_thermometer(label, score):
    if score < 0.5:
        color = {"bar_color": "#cc0000"}
    elif score == 0.5:
        color = {"bar_color": "#555753"}
    else:
        color = {"bar_color": "#00ff00"}

    thermometer = widgets.FloatProgress(
        value=score,
        min=0,
        max=1.0,
        description=label,
        bar_style="info",
        style=color,
        orientation="horizontal",
    )

    display(thermometer)

    return thermometer

In [31]:
from tensorflow.keras.preprocessing.sequence import pad_sequences


def predict(text, tokenizer, model, include_neutral=True):
    import time

    start_at = time.time()
    # clean and preprocess the input text
    text = (lambda x: preprocess(x))(text)

    # Tokenize text
    x_test = pad_sequences(tokenizer.texts_to_sequences([text]), maxlen=SEQUENCE_LENGTH)
    # Predict
    score = model.predict([x_test])[0]
    score = np.squeeze(score)
    # Decode sentiment
    label = decode_sentiment(score, include_neutral=include_neutral)

    # print the predictions
    print(f"\n- processed text : {text}")
    print(f"\n- predicted sentiment : {label} (score={100*score:.1f} %)")
    # show thermometer
    show_sentiment_thermometer(label, score)

    # get results
    result = {
        "label": label,
        "score": float(score),
        "elapsed_time": time.time() - start_at,
    }
    return result

#### prediction examples

In [32]:
result = predict("I love the latest @RoKy music", loaded_tokenizer, loaded_clf_model)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 146ms/step

- processed text : love latest roky music

- predicted sentiment : POSITIVE (score=98.8 %)


FloatProgress(value=0.9880579710006714, bar_style='info', description='POSITIVE', max=1.0, style=ProgressStyle…

In [37]:
result = predict("I hate the rain", loaded_tokenizer, loaded_clf_model)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 146ms/step

- processed text : hate rain

- predicted sentiment : NEGATIVE (score=1.2 %)


FloatProgress(value=0.011537362821400166, bar_style='info', description='NEGATIVE', max=1.0, style=ProgressSty…

In [38]:
result = predict("i don't know what i'm doing", loaded_tokenizer, loaded_clf_model)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 128ms/step

- processed text : know

- predicted sentiment : NEGATIVE (score=38.4 %)


FloatProgress(value=0.38427892327308655, bar_style='info', description='NEGATIVE', max=1.0, style=ProgressStyl…

In [39]:
result = predict("i am neutral", loaded_tokenizer, loaded_clf_model)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 155ms/step

- processed text : neutral

- predicted sentiment : POSITIVE (score=76.3 %)


FloatProgress(value=0.7627895474433899, bar_style='info', description='POSITIVE', max=1.0, style=ProgressStyle…