# LSTM Model Training
Below we train an LSTM network on the training data and validate it on the validation set.

## Set up environment

In [None]:
# %pip install tensorflow pandas numpy matplotlib seaborn scikit-learn

        [38;2;0;0;0m▄[38;2;0;0;0m[48;2;189;148;115m▀[0m[38;2;0;0;0m[48;2;189;148;115m▀[0m[38;2;0;0;0m▄              
      [38;2;0;0;0m▄[38;2;0;0;0m[48;2;189;148;115m▀[0m[38;2;65;65;65m[48;2;65;65;65m▀[0m[38;2;189;148;115m[48;2;189;148;115m▀[0m[38;2;189;148;115m[48;2;189;148;115m▀[0m[38;2;65;65;65m[48;2;65;65;65m▀[0m[38;2;0;0;0m[48;2;189;148;115m▀[0m[38;2;0;0;0m▄            
      [38;2;0;0;0m▀[38;2;189;148;115m[48;2;0;0;0m▀[0m[38;2;65;65;65m[48;2;65;65;65m▀[0m[38;2;189;148;115m[48;2;189;148;115m▀[0m[38;2;156;115;82m[48;2;65;65;65m▀[0m[38;2;189;148;115m[48;2;189;148;115m▀[0m[38;2;189;148;115m[48;2;0;0;0m▀[0m[38;2;0;0;0m▀      [38;2;0;0;0m▄[38;2;0;0;0m▄[38;2;0;0;0m▄   
     [38;2;0;0;0m▄[38;2;0;0;0m[48;2;189;148;115m▀[0m[38;2;189;148;115m[48;2;189;148;115m▀[0m[38;2;189;148;115m[48;2;156;115;82m▀[0m[38;2;189;148;115m[48;2;65;65;65m▀[0m[38;2;189;148;115m[48;2;189;148;115m▀[0m[38;2;189;148;115m[48;2;189;148;115m▀[0m[38;

In [None]:
# Import necessary libraries for data handling, preprocessing, and modeling
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# For text preprocessing and tokenization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# For building the model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout

# For model evaluation
from sklearn.metrics import classification_report, confusion_matrix

2025-03-17 17:19:24.172180: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-17 17:19:24.180578: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742228364.189488  113381 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742228364.192316  113381 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1742228364.199770  113381 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

## Load Data

In [1]:
# Read the training split and the held-out split from the local data folder
train_path = './data/sent_train.csv'
test_path = './data/sent_test.csv'
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

NameError: name 'pd' is not defined

In [None]:
# Quick look at the training data: the first rows, then how many
# examples fall into each sentiment class
first_rows = train_df.head()
print(first_rows)
sentiment_counts = train_df['sentiment'].value_counts()
print(sentiment_counts)

## Preprocessing

In [None]:
# Preprocess the text data
def clean_text(text):
    """Normalise a piece of text before tokenization.

    The text is lower-cased and every character that is not an ASCII letter
    or whitespace is dropped. Note that this removes digits as well as
    punctuation.

    Args:
        text: The raw text. Non-string values (for example the float NaN
            that pandas uses for a missing cell, or None) are treated as
            empty text instead of raising.

    Returns:
        The cleaned string; an empty string for non-string input.
    """
    # Missing cells reach this function as NaN/None when it is applied to a
    # DataFrame column, and those have no .lower() method.
    if not isinstance(text, str):
        return ''
    text = text.lower()  # convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # keep only letters and whitespace
    return text

# Add a cleaned copy of the tweet text to both splits (training first, then test)
for split_df in (train_df, test_df):
    split_df['clean_text'] = split_df['tweet'].apply(clean_text)

## Tokenization

In [None]:
# Tokenization / padding hyperparameters:
#   max_vocab  - maximum number of words to consider
#   max_length - maximum length of a tweet in terms of word count
max_vocab, max_length = 5000, 50

# Build the vocabulary from the training text only; '<OOV>' is the token
# configured for out-of-vocabulary words
tokenizer = Tokenizer(oov_token='<OOV>', num_words=max_vocab)
tokenizer.fit_on_texts(train_df['clean_text'])

# Shared padding settings so both splits end up with the same sequence length
pad_options = dict(maxlen=max_length, padding='post')

# Training split: text -> integer sequences -> padded array
train_sequences = tokenizer.texts_to_sequences(train_df['clean_text'])
X_train = pad_sequences(train_sequences, **pad_options)

# Held-out split: same tokenizer, same padding
valid_sequences = tokenizer.texts_to_sequences(test_df['clean_text'])
X_valid = pad_sequences(valid_sequences, **pad_options)

# Prepare target labels (one-hot).
# Both splits are encoded against the class list taken from the training data,
# so the one-hot columns line up even when a class is absent from one split.
# A label that never occurs in the training data becomes an all-zero row.
# The result is cast to float32 because recent pandas versions return boolean
# dummy columns by default.
sentiment_classes = sorted(train_df['sentiment'].dropna().unique())
sentiment_dtype = pd.CategoricalDtype(categories=sentiment_classes)
y_train = pd.get_dummies(train_df['sentiment'].astype(sentiment_dtype)).to_numpy(dtype='float32')
y_valid = pd.get_dummies(test_df['sentiment'].astype(sentiment_dtype)).to_numpy(dtype='float32')

The tokenizer converts words to integers, and padding ensures each sequence is of uniform length.

## LSTM Model

In [None]:
# Define model hyperparameters
embedding_dim = 64
lstm_units = 64

# Build the model.
# The sequence length is declared with an explicit Input layer instead of the
# Embedding `input_length` argument, which current Keras releases deprecate.
# Declaring the input shape up front also means the model is built here, so
# model.summary() can report output shapes and parameter counts.
# NOTE(review): recurrent_dropout > 0 may prevent the fused cuDNN LSTM kernel
# from being used on GPU - confirm against the Keras LSTM documentation if
# training speed matters.
model = Sequential([
    Input(shape=(max_length,)),
    Embedding(input_dim=max_vocab, output_dim=embedding_dim),
    LSTM(lstm_units, dropout=0.2, recurrent_dropout=0.2),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(3, activation='softmax')  # 3 classes: Bearish, Bullish, Neutral
])

# Compile the model: categorical cross-entropy matches the one-hot targets
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Summary of the model architecture
model.summary()

The network starts with an embedding layer, followed by an LSTM to capture sequential dependencies, and ends with dense layers to output probabilities over the 3 sentiment classes.

In [None]:
# Train the model, scoring the validation data after every epoch.
# The History object returned by fit() is kept: it holds the per-epoch metrics
# needed for the training curves, and assigning it stops the cell from echoing
# the object's repr as its output.
history = model.fit(
    X_train, y_train,
    epochs=10,  # adjust epochs as necessary
    batch_size=32,
    validation_data=(X_valid, y_valid)
)

## Evaluate Model

In [None]:
# Per-epoch metrics are stored in the `history` dict of the History object that
# Keras attaches to the model during fit(). The History object itself is not
# subscriptable, so the dict must be indexed rather than `model.history`.
training_log = model.history.history

# One figure, two side-by-side panels: accuracy on the left, loss on the right
fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Plot training & validation accuracy values
acc_ax.plot(training_log['accuracy'], label='Train Accuracy')
acc_ax.plot(training_log['val_accuracy'], label='Validation Accuracy')
acc_ax.set_title('Model Accuracy')
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend()

# Plot training & validation loss values
loss_ax.plot(training_log['loss'], label='Train Loss')
loss_ax.plot(training_log['val_loss'], label='Validation Loss')
loss_ax.set_title('Model Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

plt.show()

# Score the trained model on the validation data and report both numbers
# to four decimal places
val_loss, val_accuracy = model.evaluate(x=X_valid, y=y_valid, verbose=0)
score_line = f'Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}'
print(score_line)

# Display names for the classes, defined once and shared by the report and the
# heatmap.
# NOTE(review): assumes one-hot columns 0, 1, 2 correspond to Bearish, Bullish,
# Neutral in that order - confirm against the label encoding.
target_names = ['Bearish', 'Bullish', 'Neutral']
target_ids = list(range(len(target_names)))

# Generate predictions and collapse probabilities / one-hot rows to class ids
y_pred = model.predict(X_valid)
y_pred_labels = np.argmax(y_pred, axis=1)
y_true_labels = np.argmax(y_valid, axis=1)

# Passing `labels` explicitly keeps the report aligned with `target_names`
# even if some class appears in neither the true nor the predicted labels.
print(classification_report(y_true_labels, y_pred_labels, labels=target_ids, target_names=target_names))

# Confusion matrix: rows are true classes, columns are predicted classes.
# `labels` fixes it at one row/column per class so the tick labels always match.
cm = confusion_matrix(y_true_labels, y_pred_labels, labels=target_ids)
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=target_names, yticklabels=target_names, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')
plt.show()