# LSTM Model Training
Below we train an LSTM network on the training data and validate it on the validation set.

## Set up environment

In [None]:
# %pip install tensorflow pandas numpy matplotlib seaborn scikit-learn

In [None]:
# Import necessary libraries for data handling, preprocessing, and modeling
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# For text preprocessing and tokenization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# For building the model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# For model evaluation
from sklearn.metrics import classification_report, confusion_matrix

## Load Data

In [None]:
# Load the training and held-out CSV files.
# Paths are resolved relative to DATA_DIR instead of an absolute, user-specific
# location so the notebook runs on any machine.
# NOTE(review): assumes the notebook is run from the folder that contains
# `data/` — adjust DATA_DIR if the files live elsewhere.
DATA_DIR = Path('data')
train_df = pd.read_csv(DATA_DIR / 'sent_train.csv')
test_df = pd.read_csv(DATA_DIR / 'sent_test.csv')

In [None]:
# Explore the data: preview rows, count missing values, and inspect the
# class distribution of the target column.
print(train_df.head())
# Missing values per column; a missing tweet or label affects the
# cleaning and label-encoding steps further down.
print(train_df.isnull().sum())
print(train_df['sentiment'].value_counts())

## Preprocessing

In [None]:
# Clean Data
import re

def clean_text(text):
    """Lowercase a tweet and keep only ASCII letters and whitespace.

    Every other character (punctuation, digits, symbols, non-ASCII letters)
    is deleted without inserting a space, so "don't" becomes "dont".

    Args:
        text: Raw tweet text. Non-string values (e.g. NaN produced by an
            empty CSV cell, or None) are treated as empty text instead of
            raising AttributeError.

    Returns:
        str: The cleaned text; '' for non-string input.
    """
    if not isinstance(text, str):
        return ''
    lowered = text.lower()
    return re.sub(r'[^a-zA-Z\s]', '', lowered)

# Add a 'clean_text' column to both splits, derived from the raw 'tweet' column.
for frame in (train_df, test_df):
    frame['clean_text'] = frame['tweet'].apply(clean_text)

## Tokenization

In [None]:
# Set hyperparameters for tokenization and padding
max_vocab = 5000  # maximum number of words to consider
max_length = 50   # maximum length of a tweet in terms of word count

# Fit the vocabulary on the training text only; words unseen at fit time
# are mapped to the '<OOV>' token.
tokenizer = Tokenizer(num_words=max_vocab, oov_token='<OOV>')
tokenizer.fit_on_texts(train_df['clean_text'])

# Convert text to sequences of integer word indices
train_sequences = tokenizer.texts_to_sequences(train_df['clean_text'])
valid_sequences = tokenizer.texts_to_sequences(test_df['clean_text'])

# Pad (with zeros, at the end) so every sequence has exactly max_length entries
X_train = pad_sequences(train_sequences, maxlen=max_length, padding='post')
X_valid = pad_sequences(valid_sequences, maxlen=max_length, padding='post')

# Prepare one-hot target labels.
# Both splits are encoded against the SAME ordered class list taken from the
# training data, so column i means the same class in y_train and y_valid even
# if a class is absent from one split. A label that appears only in test_df
# becomes an all-zero row.
label_dtype = pd.CategoricalDtype(categories=sorted(train_df['sentiment'].dropna().unique()))
y_train = pd.get_dummies(train_df['sentiment'].astype(label_dtype), dtype='float32').values
y_valid = pd.get_dummies(test_df['sentiment'].astype(label_dtype), dtype='float32').values

The tokenizer converts words to integers, and padding ensures each sequence is of uniform length.

## LSTM Model

In [None]:
# Define model hyperparameters
embedding_dim = 64
lstm_units = 64
# One output unit per one-hot label column, so the output layer always
# matches the encoded targets.
num_classes = y_train.shape[1]

# Build the model
model = Sequential([
    # mask_zero=True tells downstream layers to skip index 0, which is the
    # value pad_sequences appends; without it the LSTM's final state is
    # computed after a run of padding tokens.
    Embedding(input_dim=max_vocab, output_dim=embedding_dim, input_length=max_length, mask_zero=True),
    LSTM(lstm_units, dropout=0.2, recurrent_dropout=0.2),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')  # probabilities over the sentiment classes
])

# Compile the model; categorical cross-entropy expects one-hot targets
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Summary of the model architecture
model.summary()

The network starts with an embedding layer, followed by an LSTM to capture sequential dependencies, and ends with dense layers to output probabilities over the 3 sentiment classes.

In [None]:
# Train the model, then evaluate it on the held-out split.
# X_train / y_train and X_valid / y_valid are the padded sequences and one-hot
# labels built in the tokenization cell, so the validation text goes through
# exactly the same cleaning, tokenizer and padding as the training text.
epochs = 10       # number of passes over the training data (tunable)
batch_size = 32   # samples per gradient update (tunable)

history = model.fit(
    X_train, y_train,
    validation_data=(X_valid, y_valid),  # reported after every epoch
    epochs=epochs,
    batch_size=batch_size,
)

# Evaluate the trained model on validation data
loss, accuracy = model.evaluate(X_valid, y_valid)
print('Validation Loss:', loss)
print('Validation Accuracy:', accuracy)