# In [None]:
# ------------------------
# 1. PREPROCESSING
# ------------------------
import pandas as pd
import string
import nltk
# Fetch the NLTK stop-word corpus so that it is available when
# stopwords.words() is called just below.
nltk.download('stopwords')
from nltk.corpus import stopwords
# English stop words, stored as a set for fast membership tests in clean_text().
stop_words = set(stopwords.words('english'))

import tensorflow as tf
from tensorflow.keras.layers import (
    TextVectorization, Embedding, Input,
    LayerNormalization, MultiHeadAttention,
    Dense, Dropout, GlobalAveragePooling1D
)
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping

# Training data; the code below reads its 'id', 'text' and 'review' columns.
df = pd.read_csv('train_augmented_expanded.csv')

# Translation table that deletes every ASCII punctuation character. Built once
# here instead of on every clean_text() call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalise one raw review before it is vectorised.

    The text is lower-cased, ASCII punctuation is removed, and then
    whitespace-separated tokens that are stop words or consist only of
    digits are dropped.

    Args:
        text: The raw review. Non-string values are converted with ``str()``;
            ``None`` and float NaN (how pandas represents a missing cell) are
            treated as empty text.

    Returns:
        The remaining tokens joined by single spaces; ``''`` when nothing
        is left or the input was missing.
    """
    # Without this guard a missing value would be stringified and fed to the
    # model as the literal token "none" / "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower().translate(_PUNCTUATION_TABLE)
    # NOTE(review): punctuation is stripped before the stop-word test, so
    # apostrophised stop words (e.g. "don't" -> "dont") no longer match the
    # stop-word set and are kept -- confirm this is intended.
    return ' '.join(
        word for word in text.split()
        if word not in stop_words and not word.isdigit()
    )

# Clean every review text in place.
df['text'] = df['text'].apply(clean_text)
# The 'id' column is not used for training below.
df = df.drop(columns=['id'])

# Ordinal encoding of the five sentiment classes (0 = worst, 4 = best).
sentiment_to_int = {'Very bad': 0, 'Bad': 1, 'Good': 2, 'Very good': 3, 'Excellent': 4}
encoded_reviews = df['review'].map(sentiment_to_int)

# Series.map() yields NaN for any label missing from the mapping. Fail here
# with the offending labels instead of an opaque integer-cast error later.
unmapped = encoded_reviews.isna()
if unmapped.any():
    unknown_labels = sorted(df.loc[unmapped, 'review'].astype(str).unique())
    raise ValueError(
        f"Unmapped 'review' labels: {unknown_labels}; "
        f"expected one of {list(sentiment_to_int)}"
    )
df['review'] = encoded_reviews.astype(int)


train_texts = df['text'].astype(str).tolist()
train_labels = df['review'].astype(int).tolist()

# ------------------------
# 2. TEXT VECTORIZER
# ------------------------

# Upper bound on the vocabulary size (number of distinct tokens).
max_tokens = 20000
# Length, in tokens, of every vectorised text.
max_len = 100
vectorizer = TextVectorization(
    max_tokens=max_tokens,
    output_sequence_length=max_len,
)
# Build the vocabulary from the cleaned training texts.
vectorizer.adapt(train_texts)

# ------------------------
# 3. TRANSFORMER BLOCK
# ------------------------

# num_heads: number of attention heads (e.g. one head might focus on grammar, another on sentiment).
# embed_dim: embedding dimension; it is also passed to MultiHeadAttention as key_dim.
# ff_dim: hidden size of the feed-forward network.
# rate: dropout rate, i.e. the fraction of units randomly "turned off" during training to prevent overfitting (default 0.3 = 30%).

def transformer_block(x, num_heads=2, embed_dim=64, ff_dim=128, rate=0.3):
    """Apply one Transformer encoder block to ``x`` and return the result.

    The block is self-attention followed by a two-layer feed-forward
    network; each sub-layer is wrapped in a residual connection and a
    LayerNormalization.

    Args:
        x: Input tensor. Assumed to be (batch, seq_len, embed_dim) so that
            the residual additions line up -- TODO confirm against caller.
        num_heads: Number of attention heads.
        embed_dim: Passed to MultiHeadAttention as ``key_dim`` and used as
            the output width of the feed-forward network.
        ff_dim: Width of the hidden feed-forward layer.
        rate: Dropout rate used inside the attention layer and after each
            sub-layer's first stage.

    Returns:
        The layer-normalised output of the second residual connection.
    """
    # --- Self-attention sub-layer: x is both query and value ---
    self_attention = MultiHeadAttention(
        num_heads=num_heads, key_dim=embed_dim, dropout=rate
    )
    attended = self_attention(x, x)
    attended = Dropout(rate)(attended)

    # Residual connection, then normalisation.
    attention_norm = LayerNormalization(epsilon=1e-6)
    normed_attention = attention_norm(x + attended)

    # --- Feed-forward sub-layer: widen to ff_dim, project back ---
    expand = Dense(
        ff_dim,
        activation="relu",
        kernel_regularizer=tf.keras.regularizers.l2(1e-3),  # L2 penalty on the kernel
    )
    hidden = expand(normed_attention)
    hidden = Dropout(rate)(hidden)
    projected = Dense(embed_dim)(hidden)

    # Second residual connection, then normalisation.
    output_norm = LayerNormalization(epsilon=1e-6)
    return output_norm(normed_attention + projected)

# ------------------------
# 4. MODEL ARCHITECTURE
# ------------------------
num_classes = 5      # one output unit per sentiment class
embed_dim = 32       # length of each token's embedding vector
num_heads = 2        # attention heads in the transformer block
ff_dim = 64          # hidden width of the transformer's feed-forward layer
drop_rate = 0.4      # dropout rate used inside the transformer block

# The model takes raw strings; tokenisation happens inside the graph.
inputs = Input(shape=(1,), dtype=tf.string)
token_ids = vectorizer(inputs)
embedded = Embedding(max_tokens, embed_dim)(token_ids)

# Single transformer encoder block.
encoded = transformer_block(
    embedded,
    num_heads=num_heads,
    embed_dim=embed_dim,
    ff_dim=ff_dim,
    rate=drop_rate,
)

# Average over the sequence axis to get one vector per text, then regularise.
pooled = GlobalAveragePooling1D()(encoded)
pooled = Dropout(0.5)(pooled)

# Softmax head: one probability per class.
outputs = Dense(num_classes, activation="softmax")(pooled)

model = Model(inputs, outputs)
# Labels are integer class ids, hence the sparse cross-entropy loss.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# ------------------------
# 5. TRAINING WITH VALIDATION
# ------------------------
import numpy as np
from sklearn.model_selection import train_test_split

# Hold out 20% of the data for validation, stratified by label.
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    train_texts,
    train_labels,
    test_size=0.2,
    random_state=42,
    stratify=train_labels,
)

# Convert the Python lists to NumPy arrays before passing them to model.fit;
# the texts are stored as Python str objects (dtype=object).
X_train_split, X_val_split = (
    np.array(texts, dtype=object) for texts in (X_train_split, X_val_split)
)
y_train_split, y_val_split = (
    np.array(labels) for labels in (y_train_split, y_val_split)
)

# Stop once val_loss has not improved for 3 consecutive epochs and roll the
# model back to its best weights.
early_stop = EarlyStopping(
    monitor="val_loss",
    patience=3,
    restore_best_weights=True,
)

history = model.fit(
    X_train_split,
    y_train_split,
    validation_data=(X_val_split, y_val_split),
    epochs=20,
    batch_size=32,
    shuffle=True,
    callbacks=[early_stop],
)

# ------------------------
# 6. TEST PREDICTION
# ------------------------
test_df = pd.read_csv("test.csv")

# Apply the same cleaning that was used on the training texts.
test_df["text"] = test_df["text"].apply(clean_text)

# Object array of Python strings, the same form the model was trained on.
test_texts = test_df["text"].astype(str).tolist()
X_test = np.array(test_texts, dtype=object)

# Class probabilities per row; the predicted class is the most probable one.
pred_probs = model.predict(X_test, batch_size=32)
pred_labels = pred_probs.argmax(axis=1)

# Translate integer class ids back to their sentiment names.
int_to_sentiment = {code: name for name, code in sentiment_to_int.items()}
pred_sentiments = [int_to_sentiment[label] for label in pred_labels]

ids = test_df["id"]

submission = pd.DataFrame({"id": ids, "review": pred_sentiments})
submission.to_csv("submission.csv", index=False)

print("Submission file saved as submission.csv")
