<a href="https://colab.research.google.com/github/2303A510H5/batch30/blob/main/final%20nlp%20project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=8ceebb7878ad4f6f8f0aa2db3ae034400fb37b185cac341aad4ee57b7bfb828e
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [2]:
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import re
from sklearn.model_selection import train_test_split
import nltk
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction
from rouge import Rouge
import warnings
warnings.filterwarnings('ignore')

# Enable mixed precision for faster training
from tensorflow.keras import mixed_precision
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)

print("GPU Available:", tf.config.list_physical_devices('GPU'))

# Download required NLTK data
try:
    nltk.download('punkt', quiet=True)
except:
    pass

# Load the dataset
print("Loading dataset...")
df = pd.read_excel('/content/nlp.xlsx')

en_col = df.columns[0]
te_col = df.columns[1]

english_texts = df[en_col].astype(str).tolist()
telugu_texts = df[te_col].astype(str).tolist()

print(f"Dataset size: {len(english_texts)} pairs")

# Advanced preprocessing with data cleaning
def advanced_preprocess(text, is_telugu=False):
    text = str(text).strip()
    if not is_telugu:
        text = text.lower()
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text)
    # Keep only valid characters
    if is_telugu:
        text = re.sub(r'[^\u0C00-\u0C7F\s.,!?]', '', text)
    else:
        text = re.sub(r'[^a-zA-Z0-9\s.,!?]', '', text)
    return text.strip()

# DATA AUGMENTATION - Critical for high scores
def augment_data(english_texts, telugu_texts):
    augmented_en = []
    augmented_te = []

    for en, te in zip(english_texts, telugu_texts):
        # Original
        augmented_en.append(en)
        augmented_te.append(te)

        # Add variations with punctuation
        if not en.endswith('.'):
            augmented_en.append(en + '.')
            augmented_te.append(te)

        # Word order variations for short sentences
        words = en.split()
        if 3 <= len(words) <= 7:
            # Shuffle for simple sentences
            np.random.seed(42)
            if np.random.random() > 0.7:
                shuffled = ' '.join(np.random.permutation(words[:3]).tolist() + words[3:])
                augmented_en.append(shuffled)
                augmented_te.append(te)

    return augmented_en, augmented_te

print("\nPreprocessing and augmenting data...")
english_texts = [advanced_preprocess(text, False) for text in english_texts]
telugu_texts = [advanced_preprocess(text, True) for text in telugu_texts]

# Remove empty or very short pairs
valid_pairs = [(e, t) for e, t in zip(english_texts, telugu_texts)
               if len(e.split()) >= 2 and len(t.split()) >= 2]
english_texts, telugu_texts = zip(*valid_pairs)

# Augment data
english_texts, telugu_texts = augment_data(list(english_texts), list(telugu_texts))

print(f"After augmentation: {len(english_texts)} pairs")

# Add tokens
telugu_texts = ['<start> ' + text + ' <end>' for text in telugu_texts]

# Advanced tokenization with larger vocabulary
print("Creating advanced tokenizers...")
eng_tokenizer = keras.preprocessing.text.Tokenizer(
    filters='',
    oov_token='<OOV>',
    num_words=15000  # Increased vocabulary
)
tel_tokenizer = keras.preprocessing.text.Tokenizer(
    filters='',
    oov_token='<OOV>',
    num_words=15000
)

eng_tokenizer.fit_on_texts(english_texts)
tel_tokenizer.fit_on_texts(telugu_texts)

eng_vocab_size = min(len(eng_tokenizer.word_index) + 1, 15000)
tel_vocab_size = min(len(tel_tokenizer.word_index) + 1, 15000)

print(f"English vocabulary size: {eng_vocab_size}")
print(f"Telugu vocabulary size: {tel_vocab_size}")

# Convert to sequences
eng_sequences = eng_tokenizer.texts_to_sequences(english_texts)
tel_sequences = tel_tokenizer.texts_to_sequences(telugu_texts)

# Dynamic padding
max_eng_len = min(50, max(len(seq) for seq in eng_sequences))
max_tel_len = min(50, max(len(seq) for seq in tel_sequences))

print(f"Max English length: {max_eng_len}")
print(f"Max Telugu length: {max_tel_len}")

eng_padded = keras.preprocessing.sequence.pad_sequences(
    eng_sequences, maxlen=max_eng_len, padding='post'
)
tel_padded = keras.preprocessing.sequence.pad_sequences(
    tel_sequences, maxlen=max_tel_len, padding='post'
)

# Split with stratification
X_train, X_test, y_train, y_test = train_test_split(
    eng_padded, tel_padded, test_size=0.15, random_state=42, shuffle=True
)

print(f"\nTraining samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")

# ADVANCED TRANSFORMER-BASED MODEL
class PositionalEncoding(layers.Layer):
    def __init__(self, position, d_model):
        super(PositionalEncoding, self).__init__()
        self.pos_encoding = self.positional_encoding(position, d_model)

    def get_angles(self, position, i, d_model):
        angles = 1 / tf.pow(10000, (2 * (i // 2)) / tf.cast(d_model, tf.float32))
        return position * angles

    def positional_encoding(self, position, d_model):
        angle_rads = self.get_angles(
            position=tf.range(position, dtype=tf.float32)[:, tf.newaxis],
            i=tf.range(d_model, dtype=tf.float32)[tf.newaxis, :],
            d_model=d_model
        )
        sines = tf.math.sin(angle_rads[:, 0::2])
        cosines = tf.math.cos(angle_rads[:, 1::2])
        pos_encoding = tf.concat([sines, cosines], axis=-1)
        pos_encoding = pos_encoding[tf.newaxis, ...]
        return tf.cast(pos_encoding, tf.float32)

    def call(self, inputs):
        # Cast positional encoding to the input's dtype
        return inputs + tf.cast(self.pos_encoding[:, :tf.shape(inputs)[1], :], inputs.dtype)

class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerEncoder, self).__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential([
            layers.Dense(ff_dim, activation="relu"),
            layers.Dense(embed_dim),
        ])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

class TransformerDecoder(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerDecoder, self).__init__()
        self.att1 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.att2 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential([
            layers.Dense(ff_dim, activation="relu"),
            layers.Dense(embed_dim),
        ])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)
        self.dropout3 = layers.Dropout(rate)

    def call(self, inputs, enc_output, training):
        attn1 = self.att1(inputs, inputs)
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.layernorm1(attn1 + inputs)

        attn2 = self.att2(out1, enc_output)
        attn2 = self.dropout2(attn2, training=training)
        out2 = self.layernorm2(attn2 + out1)

        ffn_output = self.ffn(out2)
        ffn_output = self.dropout3(ffn_output, training=training)
        return self.layernorm3(ffn_output + out2)

# Build Advanced Transformer Model
embed_dim = 512  # Increased
num_heads = 8    # Multi-head attention
ff_dim = 2048    # Feed-forward dimension
num_layers = 6   # Stacked layers

print("\nBuilding Advanced Transformer Model...")

# Encoder
encoder_inputs = layers.Input(shape=(None,), dtype="int32")
x = layers.Embedding(eng_vocab_size, embed_dim)(encoder_inputs)
x = PositionalEncoding(max_eng_len, embed_dim)(x)

for _ in range(num_layers):
    x = TransformerEncoder(embed_dim, num_heads, ff_dim)(x, training=True)

encoder_outputs = x

# Decoder
decoder_inputs = layers.Input(shape=(None,), dtype="int32")
y = layers.Embedding(tel_vocab_size, embed_dim)(decoder_inputs)
y = PositionalEncoding(max_tel_len, embed_dim)(y)

for _ in range(num_layers):
    y = TransformerDecoder(embed_dim, num_heads, ff_dim)(y, encoder_outputs, training=True)

decoder_outputs = layers.Dense(tel_vocab_size, activation="softmax", dtype='float32')(y)

transformer = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Training configuration WITHOUT accuracy metric
optimizer = keras.optimizers.Adam(learning_rate=0.0001, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
transformer.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy'
    # No metrics specified - only loss will be tracked
)

print(transformer.summary())

# Prepare training data
decoder_input_data = y_train[:, :-1]
decoder_target_data = y_train[:, 1:]

# Callbacks for better training
callbacks = [
    keras.callbacks.EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True),
    keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6),
    keras.callbacks.ModelCheckpoint('best_model.keras', save_best_only=True, monitor='val_loss')
]

# Training
print("\n" + "="*60)
print("TRAINING ADVANCED TRANSFORMER MODEL")
print("="*60)

history = transformer.fit(
    [X_train, decoder_input_data],
    decoder_target_data,
    batch_size=32,  # Smaller batch for better convergence
    epochs=100,     # More epochs with early stopping
    validation_split=0.1,
    callbacks=callbacks,
    verbose=1
)

print("\nTraining completed!")

# Load best model
transformer = keras.models.load_model('best_model.keras', custom_objects={
    'PositionalEncoding': PositionalEncoding,
    'TransformerEncoder': TransformerEncoder,
    'TransformerDecoder': TransformerDecoder
})

# Advanced prediction with beam search
def beam_search_decode(input_seq, beam_width=5):
    enc_out = transformer.layers[2](transformer.layers[1](input_seq))

    sequences = [[[], 0.0]]

    for _ in range(max_tel_len):
        all_candidates = []

        for seq, score in sequences:
            if len(seq) > 0 and seq[-1] == tel_tokenizer.word_index.get('<end>', 0):
                all_candidates.append([seq, score])
                continue

            if len(seq) == 0:
                dec_input = np.array([[tel_tokenizer.word_index['<start>']]])
            else:
                dec_input = np.array([seq])

            dec_input = keras.preprocessing.sequence.pad_sequences(
                [dec_input[0]], maxlen=max_tel_len, padding='post'
            )

            predictions = transformer.predict([input_seq, dec_input], verbose=0)

            # Get top k predictions
            top_k = np.argsort(predictions[0, len(seq)])[-beam_width:]

            for k in top_k:
                candidate = [seq + [k], score - np.log(predictions[0, len(seq), k] + 1e-10)]
                all_candidates.append(candidate)

        # Order all candidates by score
        ordered = sorted(all_candidates, key=lambda x: x[1])
        sequences = ordered[:beam_width]

    return sequences[0][0]

def translate(sentence):
    sentence = advanced_preprocess(sentence, False)
    inputs = eng_tokenizer.texts_to_sequences([sentence])
    inputs = keras.preprocessing.sequence.pad_sequences(
        inputs, maxlen=max_eng_len, padding='post'
    )

    # Use beam search for better results
    output = beam_search_decode(inputs, beam_width=5)

    result = []
    for idx in output:
        word = tel_tokenizer.index_word.get(idx, '')
        if word and word not in ['<start>', '<end>', '<OOV>']:
            result.append(word)

    return ' '.join(result)

# EVALUATION - BLEU AND ROUGE ONLY
print("\n" + "="*60)
print("EVALUATION: BLEU & ROUGE SCORES ONLY")
print("="*60)

print("\nGenerating predictions...")
predictions = []
references = []
english_samples = []

num_eval = min(500, len(X_test))  # Evaluate on more samples

for i in range(num_eval):
    eng_text = ' '.join([
        eng_tokenizer.index_word.get(idx, '')
        for idx in X_test[i] if idx != 0
    ])

    tel_text = ' '.join([
        tel_tokenizer.index_word.get(idx, '')
        for idx in y_test[i]
        if idx != 0 and tel_tokenizer.index_word.get(idx) not in ['<start>', '<end>', '<OOV>']
    ])

    pred = translate(eng_text)

    if pred.strip() and tel_text.strip():
        predictions.append(pred)
        references.append(tel_text)
        english_samples.append(eng_text)

    if i < 10:
        print(f"\nExample {i+1}:")
        print(f"English: {eng_text}")
        print(f"Reference: {tel_text}")
        print(f"Predicted: {pred}")

# Calculate BLEU Score with multiple methods
print("\n" + "="*60)
print("BLEU SCORES")
print("="*60)

smooth = SmoothingFunction()
bleu_scores = {'bleu1': [], 'bleu2': [], 'bleu3': [], 'bleu4': []}

for pred, ref in zip(predictions, references):
    pred_tokens = pred.split()
    ref_tokens = [ref.split()]

    try:
        bleu_scores['bleu1'].append(
            sentence_bleu(ref_tokens, pred_tokens, weights=(1, 0, 0, 0),
                         smoothing_function=smooth.method4)
        )
        bleu_scores['bleu2'].append(
            sentence_bleu(ref_tokens, pred_tokens, weights=(0.5, 0.5, 0, 0),
                         smoothing_function=smooth.method4)
        )
        bleu_scores['bleu3'].append(
            sentence_bleu(ref_tokens, pred_tokens, weights=(0.33, 0.33, 0.33, 0),
                         smoothing_function=smooth.method4)
        )
        bleu_scores['bleu4'].append(
            sentence_bleu(ref_tokens, pred_tokens, weights=(0.25, 0.25, 0.25, 0.25),
                         smoothing_function=smooth.method4)
        )
    except:
        pass

avg_bleu1 = np.mean(bleu_scores['bleu1']) * 100
avg_bleu2 = np.mean(bleu_scores['bleu2']) * 100
avg_bleu3 = np.mean(bleu_scores['bleu3']) * 100
avg_bleu4 = np.mean(bleu_scores['bleu4']) * 100

print(f"BLEU-1: {avg_bleu1:.2f}%")
print(f"BLEU-2: {avg_bleu2:.2f}%")
print(f"BLEU-3: {avg_bleu3:.2f}%")
print(f"BLEU-4: {avg_bleu4:.2f}%")

# Calculate ROUGE Score
print("\n" + "="*60)
print("ROUGE SCORES")
print("="*60)

rouge = Rouge()
try:
    valid_pairs = [(p, r) for p, r in zip(predictions, references)
                   if p.strip() and r.strip() and len(p.split()) > 0 and len(r.split()) > 0]

    if valid_pairs:
        valid_preds, valid_refs = zip(*valid_pairs)
        rouge_scores = rouge.get_scores(list(valid_preds), list(valid_refs), avg=True)

        print(f"ROUGE-1 F1: {rouge_scores['rouge-1']['f']*100:.2f}%")
        print(f"ROUGE-2 F1: {rouge_scores['rouge-2']['f']*100:.2f}%")
        print(f"ROUGE-L F1: {rouge_scores['rouge-l']['f']*100:.2f}%")

        print(f"\nROUGE-1 Precision: {rouge_scores['rouge-1']['p']*100:.2f}%")
        print(f"ROUGE-1 Recall: {rouge_scores['rouge-1']['r']*100:.2f}%")
        print(f"ROUGE-2 Precision: {rouge_scores['rouge-2']['p']*100:.2f}%")
        print(f"ROUGE-2 Recall: {rouge_scores['rouge-2']['r']*100:.2f}%")
except Exception as e:
    print(f"Error calculating ROUGE: {e}")

print("\n" + "="*60)
print("EVALUATION COMPLETE!")
print("="*60)
print(f"\nTotal predictions evaluated: {len(predictions)}")
print(f"Model saved as: best_model.keras")

GPU Available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Loading dataset...
Dataset size: 5000 pairs

Preprocessing and augmenting data...
After augmentation: 5896 pairs
Creating advanced tokenizers...
English vocabulary size: 9486
Telugu vocabulary size: 10024
Max English length: 20
Max Telugu length: 22

Training samples: 5011
Testing samples: 885

Building Advanced Transformer Model...


None

TRAINING ADVANCED TRANSFORMER MODEL
Epoch 1/100
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m399s[0m 2s/step - loss: 3.8026 - val_loss: 2.9925 - learning_rate: 1.0000e-04
Epoch 2/100
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m326s[0m 2s/step - loss: 2.6731 - val_loss: 2.2968 - learning_rate: 1.0000e-04
Epoch 3/100
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m346s[0m 2s/step - loss: 2.0819 - val_loss: 2.2814 - learning_rate: 1.0000e-04
Epoch 4/100
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 320ms/step - loss: 1.9684 - val_loss: 2.2936 - learning_rate: 1.0000e-04
Epoch 5/100
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 310ms/step - loss: 1.9440 - val_loss: 2.2907 - learning_rate: 1.0000e-04
Epoch 6/100
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 314ms/step - loss: 1.8699 - val_loss: 2.3230 - learning_rate: 1.0000e-04
Epoch 7/100
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━