# In [None]:
# %% [markdown]
# # Twitter Sentiment Analysis using LSTM
# 
# In this project, we build an LSTM model that classifies tweets as positive or negative, using the Sentiment140 dataset.

# %%
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

nltk.download('stopwords')
from nltk.corpus import stopwords

# %%
# Load dataset
data_path = "../data/training.1600000.processed.noemoticon.csv"

# The file is read without a header row. Only column 0 (the sentiment label)
# and column 5 (the tweet text) are used, so parse just those two instead of
# loading every column and discarding the rest afterwards.
df = pd.read_csv(data_path, encoding='latin-1', header=None, usecols=[0, 5])
df.columns = ["sentiment", "text"]

# Keep only 0 (negative) and 4 (positive) sentiments.
# Selecting the wanted labels explicitly (instead of merely excluding 2) means
# the mapping below can never produce NaN for an unexpected label, and .copy()
# makes the filtered frame independent of the original so that the column
# assignment cannot raise SettingWithCopyWarning.
df = df[df['sentiment'].isin([0, 4])].copy()
df['sentiment'] = df['sentiment'].map({0: 0, 4: 1})

# Subsample to speed up training. The size is capped at the number of rows
# available so that a smaller input file does not make sample() raise.
sample_size = min(100000, len(df))
df = df.sample(sample_size, random_state=42).reset_index(drop=True)

# %%
# English stop words from the NLTK corpus, kept in a set for the membership
# tests performed while preprocessing each tweet
stop_words = set()
stop_words.update(stopwords.words('english'))

# Compiled once at import time; preprocess() is applied to every row.
_URL_MENTION_HASHTAG_RE = re.compile(r"http\S+|@\S+|#\S+")
_NON_LETTER_RE = re.compile(r"[^A-Za-z\s]")


def preprocess(text):
    """Return a cleaned, lower-cased, stop-word-free version of *text*.

    Steps:
      1. Remove URLs (``http...``), ``@mentions`` and ``#hashtags``.
      2. Delete apostrophes so contractions stay a single token
         ("don't" -> "dont").
      3. Replace every other non-letter character with a space, so words
         separated only by punctuation or digits stay separate tokens
         ("good,bad" -> "good bad" rather than "goodbad").
      4. Lower-case, split on whitespace and drop tokens found in the
         module-level ``stop_words`` set.

    Args:
        text: The raw tweet as a ``str``.

    Returns:
        The remaining tokens joined by single spaces; an empty string when
        nothing is left.
    """
    text = _URL_MENTION_HASHTAG_RE.sub(" ", text)
    text = text.replace("'", "")
    text = _NON_LETTER_RE.sub(" ", text)
    # split() with no argument already discards leading, trailing and
    # repeated whitespace, so no separate strip() is needed.
    tokens = text.lower().split()
    return " ".join(t for t in tokens if t not in stop_words)

# Clean every tweet element-wise and store the result in a new column
df["cleaned"] = df["text"].map(preprocess)

# %%
# Tokenization
max_words = 10000
max_len = 100

y = df["sentiment"].values

# Decide which rows belong to the train and test sets *before* fitting the
# tokenizer, so the vocabulary is learned from training tweets only and no
# information from the test set leaks into the features. Splitting an index
# array with the same test_size and random_state is expected to select the
# same rows as splitting X and y directly, since the shuffle depends only on
# the number of rows and the seed.
train_idx, test_idx = train_test_split(
    np.arange(len(df)), test_size=0.2, random_state=42
)

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(df["cleaned"].iloc[train_idx])

# Words not seen in the training tweets are encoded as the "<OOV>" token.
sequences = tokenizer.texts_to_sequences(df["cleaned"])
X = pad_sequences(sequences, maxlen=max_len, padding="post", truncating="post")

# %%
# Train-test split
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

# %%
# Build the LSTM model
model = Sequential()
# Declare the input shape explicitly instead of passing `input_length` to the
# Embedding layer (that argument is deprecated in Keras 3); this also lets
# model.summary() report output shapes and parameter counts before training.
model.add(tf.keras.Input(shape=(max_len,)))
# mask_zero=True makes the LSTM skip padded timesteps. The sequences are
# post-padded to max_len with pad_sequences' default value 0, and the Keras
# Tokenizer never assigns index 0 to a word, so without masking the LSTM's
# final state would be computed after a long run of padding.
model.add(Embedding(input_dim=max_words, output_dim=128, mask_zero=True))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
# Single sigmoid unit: probability of the positive class (label 1)
model.add(Dense(1, activation="sigmoid"))

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
model.summary()

# %%
# Train the model
# Validation uses a slice held back from the training arrays
# (validation_split) rather than the test set, so X_test / y_test stay unseen
# until the final evaluation and the reported test accuracy is not influenced
# by decisions made while watching the validation curve.
history = model.fit(
    X_train,
    y_train,
    epochs=3,
    batch_size=128,
    validation_split=0.1,
)

# %%
# Evaluate the model
# The model was compiled with one metric, so evaluate() yields the loss
# followed by the accuracy
test_scores = model.evaluate(X_test, y_test)
loss, accuracy = test_scores
print(f"Test Accuracy: {accuracy:.4f}")

# %%
# Predictions and confusion matrix
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 class labels
probabilities = model.predict(X_test)
y_pred = (probabilities > 0.5).astype("int32")

conf_matrix = confusion_matrix(y_test, y_pred)
print(classification_report(y_test, y_pred))

# Draw the matrix as an annotated heatmap and label it through the returned axes
ax = sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
plt.show()
