In [None]:
# ------------------------
# 1. PREPROCESSING
# ------------------------
import pandas as pd
import string
import nltk
nltk.download('stopwords')

import tensorflow as tf
from tensorflow.keras.layers import (
    TextVectorization, Embedding, Input,
    LayerNormalization, MultiHeadAttention,
    Dense, Dropout, GlobalAveragePooling1D
)
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping

# Load the training set; the code below reads its 'text', 'id' and 'review' columns.
df = pd.read_csv('train_augmented_expanded.csv')

# --- CLEAN TEXT FUNCTION ---
# Translation table that deletes every ASCII punctuation character. It is built
# once here instead of on every clean_text() call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalize one raw review string.

    The value is lower-cased, ASCII punctuation is deleted (not replaced by a
    space), runs of whitespace are collapsed to single spaces, and tokens that
    consist only of digits are dropped.

    Args:
        text: The raw value of a text cell. Non-string values are converted
            with ``str()``; ``None`` and float NaN are treated as empty text.

    Returns:
        The cleaned text as a ``str`` (possibly empty).
    """
    # A missing cell would otherwise be stringified into the literal token
    # "none" / "nan". `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower().translate(_PUNCTUATION_TABLE)
    return ' '.join(word for word in text.split() if not word.isdigit())

# Normalize the review text. The 'id' column is dropped because only 'text'
# and 'review' are used to build the training data below.
df['text'] = df['text'].apply(clean_text)
df.drop(columns=['id'], inplace=True)

# Map each sentiment label to an integer class id (0-4, worst to best)
sentiment_to_int = {'Very bad': 0, 'Bad': 1, 'Good': 2, 'Very good': 3, 'Excellent': 4}

# Series.map() turns any label that is not a key of the mapping (typo,
# different casing, empty cell) into NaN, and the integer cast further down
# would then fail with an opaque casting error. Fail here instead, naming the
# offending labels.
encoded_reviews = df['review'].map(sentiment_to_int)
unmapped_rows = encoded_reviews.isna()
if unmapped_rows.any():
    unknown_labels = sorted(df.loc[unmapped_rows, 'review'].astype(str).unique())
    raise ValueError(
        f"{int(unmapped_rows.sum())} row(s) in column 'review' have a label "
        f"outside {list(sentiment_to_int)}: {unknown_labels}"
    )
df['review'] = encoded_reviews

# Extract training data as plain Python lists
train_texts = df['text'].astype(str).tolist()
train_labels = df['review'].astype(int).tolist()

# ------------------------
# 2. TEXT VECTORIZER
# ------------------------
# Upper bound on the vocabulary size learned by the vectorizer.
max_tokens = 20000
# Fixed number of token ids produced per review; kept short because the
# dataset is small.
max_len = 100

vectorizer = TextVectorization(
    max_tokens=max_tokens,
    output_sequence_length=max_len,
)
# Learn the vocabulary from the cleaned training texts.
vectorizer.adapt(train_texts)

# ------------------------
# 3. TRANSFORMER BLOCK
# ------------------------
def transformer_block(x, num_heads=2, embed_dim=64, ff_dim=128, rate=0.3):
    """Apply one self-attention + feed-forward encoder block to ``x``.

    Both sub-layers are wrapped in a residual connection followed by layer
    normalization (post-norm).

    Args:
        x: Input Keras tensor. Because of the residual additions its last
            dimension has to be compatible with ``embed_dim``.
        num_heads: Number of attention heads.
        embed_dim: Passed as ``key_dim`` to the attention layer and used as
            the output width of the feed-forward sub-layer.
        ff_dim: Hidden width of the feed-forward sub-layer.
        rate: Dropout rate used inside the attention layer, on the attention
            output and inside the feed-forward sub-layer.

    Returns:
        The output tensor of the second layer normalization.
    """
    # --- Self-attention sub-layer (query and value are both x) ---
    self_attention = MultiHeadAttention(
        num_heads=num_heads,
        key_dim=embed_dim,
        dropout=rate,  # dropout on the attention weights themselves
    )
    attended = self_attention(x, x)
    attended = Dropout(rate)(attended)
    attention_norm = LayerNormalization(epsilon=1e-6)
    normed_attention = attention_norm(x + attended)

    # --- Position-wise feed-forward sub-layer ---
    # Only the expanding projection carries the L2 weight penalty.
    expand = Dense(
        ff_dim,
        activation="relu",
        kernel_regularizer=tf.keras.regularizers.l2(1e-3),
    )
    hidden = expand(normed_attention)
    hidden = Dropout(rate)(hidden)
    projected = Dense(embed_dim)(hidden)

    output_norm = LayerNormalization(epsilon=1e-6)
    return output_norm(normed_attention + projected)

# ------------------------
# 4. MODEL ARCHITECTURE
# ------------------------
# Hyperparameters of the (deliberately small) classifier.
num_classes = 5   # one output unit per sentiment label
embed_dim = 32    # token embedding width
num_heads = 2     # attention heads in the transformer block
ff_dim = 64       # hidden width of the block's feed-forward part
drop_rate = 0.4   # dropout rate used inside the transformer block

# The model takes raw strings; vectorization happens inside the graph.
inputs = Input(shape=(1,), dtype=tf.string)
x = vectorizer(inputs)
x = Embedding(input_dim=max_tokens, output_dim=embed_dim)(x)

# Single transformer encoder block
x = transformer_block(
    x,
    num_heads=num_heads,
    embed_dim=embed_dim,
    ff_dim=ff_dim,
    rate=drop_rate,
)

# Average over the sequence axis, then regularize heavily before the head.
x = GlobalAveragePooling1D()(x)
x = Dropout(rate=0.5)(x)

# Class probabilities
outputs = Dense(units=num_classes, activation="softmax")(x)

model = Model(inputs=inputs, outputs=outputs)
model.compile(
    loss="sparse_categorical_crossentropy",  # labels are integer class ids
    optimizer="adam",
    metrics=["accuracy"],
)

# ------------------------
# 5. TRAINING WITH VALIDATION
# ------------------------
import numpy as np
from sklearn.model_selection import train_test_split

# Hold out 20% of the data for validation, stratified on the labels, with a
# fixed seed so the split is reproducible.
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    train_texts,
    train_labels,
    stratify=train_labels,
    test_size=0.2,
    random_state=42,
)

# Turn the Python lists into numpy arrays; dtype=object stores the texts as
# Python string objects.
X_train_split = np.asarray(X_train_split, dtype=object)
X_val_split = np.asarray(X_val_split, dtype=object)
y_train_split = np.asarray(y_train_split)
y_val_split = np.asarray(y_val_split)

# Stop once val_loss has not improved for 3 epochs and roll back to the best
# weights seen.
early_stop = EarlyStopping(
    monitor="val_loss",
    patience=3,
    restore_best_weights=True,
)

history = model.fit(
    x=X_train_split,
    y=y_train_split,
    batch_size=32,
    epochs=20,  # upper bound only; early stopping usually ends training sooner
    shuffle=True,
    validation_data=(X_val_split, y_val_split),
    callbacks=[early_stop],
)

# ------------------------
# 6. TEST PREDICTION
# ------------------------
test_df = pd.read_csv("test.csv")

# Apply the same cleaning that was used on the training texts.
test_df["text"] = test_df["text"].apply(clean_text)

# The model takes an object array of strings.
test_texts = test_df["text"].astype(str).tolist()
X_test = np.asarray(test_texts, dtype=object)

# Class probabilities per row, then the most probable class id per row.
pred_probs = model.predict(X_test, batch_size=32)
pred_labels = pred_probs.argmax(axis=1)

# Invert the label encoding to get the sentiment names back.
int_to_sentiment = {class_id: name for name, class_id in sentiment_to_int.items()}
pred_sentiments = [int_to_sentiment[class_id] for class_id in pred_labels]

ids = test_df["id"]

# Write the submission file: one row per test id with its predicted label.
submission = pd.DataFrame({"id": ids, "review": pred_sentiments})
submission.to_csv("submission.csv", index=False)

print("Submission file saved as submission.csv")
