# LSTM Model Training
Below we train an LSTM network on the training data and then evaluate it on the validation set.

In [None]:
# %pip install tensorflow pandas numpy matplotlib seaborn scikit-learn

In [None]:
# Import necessary libraries for data handling, preprocessing, and modeling
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# For text preprocessing and tokenization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# For one-hot encoding the class labels
from tensorflow.keras.utils import to_categorical

# For building the model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# For model evaluation
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Load the training and test splits into DataFrames.
# NOTE: both paths are absolute and machine-specific; adjust them when running elsewhere.
train_csv_path = 'c:\\Users\\manue\\University\\3\\NaturalLanguageProcessing\\sesion9\\data\\sent_train.csv'
test_csv_path = 'c:\\Users\\manue\\University\\3\\NaturalLanguageProcessing\\sesion9\\data\\sent_test.csv'
train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

In [None]:
# Explore the data: preview rows, check for null values, and inspect the class balance.
print(train_df.head())
print(train_df.isnull().sum())  # number of missing values per column
# The target is read from 'label', the same column the tokenization and
# validation cells use for the class ids.
print(train_df['label'].value_counts())

In [None]:
# Clean Data
import re

def clean_text(text):
    """Return a lowercased copy of *text* containing only ASCII letters and whitespace.

    Everything that is not ``a-z`` or whitespace is dropped after lowercasing,
    which removes punctuation but also digits and non-ASCII letters.
    Whitespace is kept as-is (it is not collapsed or trimmed).

    Args:
        text: The raw text. Non-string values (for example the ``NaN`` or
            ``None`` pandas uses for missing cells) are treated as empty text.

    Returns:
        The cleaned string; ``''`` for non-string input.
    """
    if not isinstance(text, str):
        # Missing cells would otherwise fail on .lower() and abort the whole
        # DataFrame.apply call.
        return ''
    lowered = text.lower()
    return re.sub(r'[^a-zA-Z\s]', '', lowered)

# Add a cleaned copy of the text to both splits. The raw text is read from the
# 'text' column, the same column the tokenization and validation cells use.
train_df['clean_text'] = train_df['text'].apply(clean_text)
test_df['clean_text'] = test_df['text'].apply(clean_text)

In [None]:
# Convert text and labels to appropriate forms
# NOTE(review): this reads the raw 'text' column; the 'clean_text' column built
# in the cleaning cell is not used here - confirm that is intended.
texts = train_df['text'].values
labels = train_df['label'].values

# Tokenize the texts. Words that were not seen while fitting map to '<OOV>'.
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(texts)  # build the word -> index dictionary from the training texts
sequences = tokenizer.texts_to_sequences(texts)  # each text becomes a list of word indices

# Pad at the end ('post') so that all sequences share the same length.
padded_sequences = pad_sequences(sequences, padding='post')

# One-hot encode the labels to match the softmax / categorical cross-entropy model.
num_classes = 3  # Bearish, Bullish, Neutral
labels_categorical = to_categorical(labels, num_classes=num_classes)

In [None]:
# Build the LSTM model
# Embedding table size: one row per known word, plus one extra index.
vocab_size = len(tokenizer.word_index) + 1

# NOTE(review): the Embedding layer is created without mask_zero, so the
# post-padded positions are fed to the LSTM like any other step - confirm
# this is intended.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=64),  # word index -> vector of size 64
    LSTM(64),  # recurrent layer with 64 units
    Dense(num_classes, activation='softmax'),  # one output unit per class
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train the model on the padded training sequences for 3 epochs.
model.fit(padded_sequences, labels_categorical, epochs=3, batch_size=32)

In [None]:
# Prepare validation data
# The held-out split was loaded as `test_df`. It is encoded with the tokenizer
# fitted on the training texts and padded to the training sequence length so
# its shape matches what the model was trained on.
valid_texts = test_df['text'].values
valid_labels = test_df['label'].values
valid_sequences = tokenizer.texts_to_sequences(valid_texts)
valid_padded = pad_sequences(valid_sequences, maxlen=padded_sequences.shape[1], padding='post')
valid_labels_categorical = to_categorical(valid_labels, num_classes)

# Evaluate the model on validation data
loss, accuracy = model.evaluate(valid_padded, valid_labels_categorical)
print('Validation Loss:', loss)
print('Validation Accuracy:', accuracy)