<a href="https://colab.research.google.com/github/2303A510H5/batch30/blob/main/NLP%20project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ===========================================
# ✅ FINAL — LSTM Seq2Seq Translation + BLEU + ROUGE
# ===========================================
import os, sys, subprocess, numpy as np, pandas as pd, tensorflow as tf, nltk
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import corpus_bleu

# ---------- Install required packages ----------
# Maps each pip distribution name to the module name that is actually imported.
# The two differ for rouge-score (module ``rouge_score``); probing the pip name
# with __import__ can never succeed, which forced a reinstall on every run.
REQUIRED_PACKAGES = {
    "rouge-score": "rouge_score",
    "openpyxl": "openpyxl",
    "nltk": "nltk",
}
for pkg, module_name in REQUIRED_PACKAGES.items():
    try:
        __import__(module_name)
    except ImportError:
        # Install into the same interpreter that is running this script.
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])

from rouge_score import rouge_scorer

# ---------- Download NLTK tokenizers ----------
# Fetch the tokenizer resources quietly; nltk.word_tokenize is called later
# during evaluation.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource, quiet=True)

# ---------- Configuration ----------
# Excel workbook that holds the parallel corpus.
DATA_PATH = "/content/sru_dataset_nmt_sequencetosequence_model(1).xlsx"  # <-- update path if needed

# Markers wrapped around every target sentence.
START_TOKEN = "<start>"
END_TOKEN = "<end>"

# Vocabulary cap handed to both Keras tokenizers.
NUM_WORDS = 20000

# Sequence lengths used when padding and as the decoding step limit.
MAX_ENCODER_SEQ_LEN = 60
MAX_DECODER_SEQ_LEN = 60

# Network sizes: embedding width and LSTM state width.
EMBED_DIM = 256
LATENT_DIM = 512

# Training schedule and the fraction of rows held out by train_test_split.
BATCH_SIZE = 64
EPOCHS = 30
TEST_SIZE = 0.1

# ---------- Load dataset ----------
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"Dataset not found: {DATA_PATH}")

df = pd.read_excel(DATA_PATH)
# Header cells can be non-strings (e.g. numbers), so coerce before normalising.
df.columns = [str(c).lower().strip() for c in df.columns]

# Prefer the named columns; otherwise fall back to the first two columns.
if "english" in df.columns and "telugu" in df.columns:
    SRC, TGT = "english", "telugu"
elif len(df.columns) >= 2:
    SRC, TGT = df.columns[0], df.columns[1]
else:
    raise ValueError(
        f"Dataset needs a source and a target column, found: {list(df.columns)}"
    )

# Drop only the rows that lack a source or target sentence; a missing value in
# an unrelated column must not discard an otherwise usable sentence pair.
df.dropna(subset=[SRC, TGT], inplace=True)

# The Keras tokenizer expects text, so make sure numeric cells become strings.
df[SRC] = df[SRC].astype(str)
# Wrap every target sentence in the start/end markers used by the decoder.
df[TGT] = df[TGT].apply(lambda x: f"{START_TOKEN} {str(x).strip()} {END_TOKEN}")

# ---------- Tokenization ----------
src_tok = Tokenizer(num_words=NUM_WORDS, oov_token="<oov>")
src_tok.fit_on_texts(df[SRC])
src_seq = src_tok.texts_to_sequences(df[SRC])

# Keras' default filter list contains "<" and ">", which would turn the
# "<start>"/"<end>" markers into the ordinary words "start"/"end". Use the
# default list minus the angle brackets so the markers survive intact.
TGT_FILTERS = '!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n'
tgt_tok = Tokenizer(num_words=NUM_WORDS, oov_token="<oov>", filters=TGT_FILTERS)
tgt_tok.fit_on_texts(df[TGT])
tgt_seq = tgt_tok.texts_to_sequences(df[TGT])

enc_in = pad_sequences(src_seq, maxlen=MAX_ENCODER_SEQ_LEN, padding="post")
# Decoder input is the target without its last token and the decoder target is
# the target without its first token (teacher forcing, shifted by one step).
# Truncate over-long sentences at the end: the default ("pre") would cut the
# start marker off the decoder input.
dec_in = pad_sequences(
    [s[:-1] for s in tgt_seq],
    maxlen=MAX_DECODER_SEQ_LEN,
    padding="post",
    truncating="post",
)
dec_tg = pad_sequences(
    [s[1:] for s in tgt_seq],
    maxlen=MAX_DECODER_SEQ_LEN,
    padding="post",
    truncating="post",
)

X_enc_tr, X_enc_te, X_dec_tr, X_dec_te, y_dec_tr, y_dec_te = train_test_split(
    enc_in, dec_in, dec_tg, test_size=TEST_SIZE, random_state=42
)

# Size of the id space the models must cover: index 0 is padding, and the
# tokenizers never emit ids at or above NUM_WORDS.
num_encoder_tokens = min(NUM_WORDS, len(src_tok.word_index) + 1)
num_decoder_tokens = min(NUM_WORDS, len(tgt_tok.word_index) + 1)

# ---------- Build Seq2Seq Model ----------
# Encoder: embed the source ids and keep only the LSTM's final hidden/cell state.
enc_inputs = Input(shape=(None,))
enc_embedding = Embedding(num_encoder_tokens, EMBED_DIM, mask_zero=True)
enc_lstm = LSTM(LATENT_DIM, return_state=True)
_, state_h, state_c = enc_lstm(enc_embedding(enc_inputs))
enc_states = [state_h, state_c]

# Decoder: the layers are kept in named variables because the inference
# decoder built later reuses the same (trained) layer objects.
dec_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(num_decoder_tokens, EMBED_DIM, mask_zero=True)
dec_lstm_layer = LSTM(LATENT_DIM, return_sequences=True, return_state=True)
dec_dense_layer = Dense(num_decoder_tokens, activation="softmax")

# The decoder LSTM starts from the encoder's final states.
dec_hidden_seq, _, _ = dec_lstm_layer(dec_emb_layer(dec_inputs), initial_state=enc_states)
dec_out = dec_dense_layer(dec_hidden_seq)

# Full training model: (source ids, shifted target ids) -> per-step token distribution.
model = Model(inputs=[enc_inputs, dec_inputs], outputs=dec_out)
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# ---------- Training ----------
# Append a trailing axis of size 1 to the integer targets.
train_y = y_dec_tr[..., np.newaxis]
val_y = y_dec_te[..., np.newaxis]

# Stop once validation loss has not improved for 5 epochs and roll back to the
# best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=5,
    restore_best_weights=True,
)
callbacks = [early_stopping]

# The held-out split doubles as the validation set.
model.fit(
    x=[X_enc_tr, X_dec_tr],
    y=train_y,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=([X_enc_te, X_dec_te], val_y),
    callbacks=callbacks,
)

# ---------- Inference Models ----------
# Encoder inference: source ids -> [hidden state, cell state].
enc_model = Model(inputs=enc_inputs, outputs=enc_states)

# Decoder inference: the LSTM states become explicit inputs so the decoder can
# be stepped one token at a time, feeding each step's states into the next.
dec_states_inputs = [Input(shape=(LATENT_DIM,)) for _ in range(2)]
dec_state_input_h, dec_state_input_c = dec_states_inputs

# Reuse the trained embedding, LSTM and dense layers from the training graph.
dec_out2, dec_h2, dec_c2 = dec_lstm_layer(
    dec_emb_layer(dec_inputs), initial_state=dec_states_inputs
)
dec_out2 = dec_dense_layer(dec_out2)
dec_states2 = [dec_h2, dec_c2]

dec_model = Model(
    inputs=[dec_inputs, dec_state_input_h, dec_state_input_c],
    outputs=[dec_out2, dec_h2, dec_c2],
)

# ---------- Reverse dictionary ----------
# Invert the target tokenizer's word -> id table into id -> word.
rev_tgt = dict(zip(tgt_tok.word_index.values(), tgt_tok.word_index.keys()))
# Id 0 is the padding id and maps to the empty string.
rev_tgt[0] = ""

# ---------- Decoding ----------
def decode_sequence(seq):
    """Greedily translate one encoded source sentence into target text.

    The start/end marker ids are resolved by running the markers through
    ``tgt_tok`` itself instead of looking up the literal strings in
    ``word_index``: depending on the tokenizer's filters the markers may be
    stored without their angle brackets, in which case a literal lookup
    silently falls back to the OOV id and the end marker is never detected.

    Args:
        seq: Padded source token ids for a single sentence, as produced by
            slicing the encoder input array (e.g. ``X_enc_te[i:i+1]``).

    Returns:
        The predicted words joined by single spaces. Decoding stops at the end
        marker, at a token that maps to the empty string, or after
        ``MAX_DECODER_SEQ_LEN`` steps.
    """

    def _token_id(token):
        # Id the tokenizer assigns to ``token`` or None if it is filtered away.
        ids = tgt_tok.texts_to_sequences([token])[0]
        return ids[0] if ids else None

    start_id = _token_id(START_TOKEN)
    if start_id is None:
        start_id = 1  # same fallback the literal lookup used before
    end_id = _token_id(END_TOKEN)

    # verbose=0: otherwise Keras prints one progress bar per predict call,
    # i.e. one per generated token, which floods the output.
    states = enc_model.predict(seq, verbose=0)
    target_seq = np.array([[start_id]])
    result = []
    for _ in range(MAX_DECODER_SEQ_LEN):
        output_tokens, h, c = dec_model.predict([target_seq, *states], verbose=0)
        # Greedy choice: most probable token at the last (only) time step.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_word = rev_tgt.get(sampled_token_index, "")
        if sampled_token_index == end_id or sampled_word in (END_TOKEN, ""):
            break
        result.append(sampled_word)
        # Feed the prediction and the updated states into the next step.
        target_seq = np.array([[sampled_token_index]])
        states = [h, c]
    return " ".join(result)

# ---------- Evaluation ----------
from nltk.translate.bleu_score import SmoothingFunction


class _WhitespaceTokenizer:
    """Minimal rouge-score tokenizer that splits on whitespace only.

    NOTE: rouge-score's default tokenizer keeps only ASCII letters and digits,
    which removes every token of a non-Latin target language such as Telugu
    and drives all ROUGE scores to zero. The texts scored here were already
    tokenized, so a plain split is sufficient.
    """

    def tokenize(self, text):
        return text.split()


# Ids of the sentence markers as the target tokenizer actually stores them
# (its filters may strip the angle brackets), so they can be dropped from the
# references by id rather than by comparing against the literal strings.
special_ids = {
    ids[0]
    for ids in tgt_tok.texts_to_sequences([START_TOKEN, END_TOKEN])
    if ids
}

refs, hyps = [], []
for i, ref_seq in enumerate(y_dec_te):
    dec = decode_sequence(X_enc_te[i:i+1])
    # Rebuild the reference sentence, skipping padding (id 0) and the markers.
    ref_words = [
        rev_tgt.get(int(t), "")
        for t in ref_seq
        if t > 0 and int(t) not in special_ids
    ]
    # One reference per hypothesis, kept as a one-element list of references.
    refs.append([nltk.word_tokenize(" ".join(w for w in ref_words if w))])
    hyps.append(nltk.word_tokenize(dec))

if not hyps:
    raise ValueError("Test split is empty; nothing to evaluate.")

# corpus_bleu expects, per hypothesis, a LIST of reference token lists, which
# is exactly the shape of ``refs``. Passing the bare token lists would make it
# treat every single word as a separate reference. Smoothing keeps the score
# from collapsing to 0 whenever some higher-order n-gram has no match.
bleu = corpus_bleu(refs, hyps, smoothing_function=SmoothingFunction().method1) * 100
print(f"\n🟩 BLEU Score: {bleu:.2f}")

# A custom tokenizer replaces rouge-score's default one, so use_stemmer is not
# passed here (TODO confirm: stemming appears to belong to the default
# tokenizer only).
scorer = rouge_scorer.RougeScorer(
    ["rouge1", "rouge2", "rougeL"], tokenizer=_WhitespaceTokenizer()
)
r1 = r2 = rl = 0
for r, h in zip(refs, hyps):
    ref = " ".join(r[0])
    hyp = " ".join(h)
    scores = scorer.score(ref, hyp)
    r1 += scores["rouge1"].fmeasure
    r2 += scores["rouge2"].fmeasure
    rl += scores["rougeL"].fmeasure
n = len(refs)
print(f"🟩 ROUGE-1: {r1/n*100:.2f}, ROUGE-2: {r2/n*100:.2f}, ROUGE-L: {rl/n*100:.2f}")
print("\n✅ Training and Evaluation Completed Successfully.")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
