In [87]:
#Library calls
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import os
os.environ["TF_USE_LEGACY_KERAS"] = "1"
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from tensorflow.keras.layers import GRU
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Bidirectional
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
from gensim.models import Word2Vec
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input
from nltk.translate.bleu_score import sentence_bleu





Question no 1


In [3]:
def preprocess_text(text):
    text = re.sub(r'[^\w\s]', '', text)  # eliminate punctuation
    text = re.sub(r'\bpladge\b', '', text, flags=re.IGNORECASE)  # remove the word "pladge"
    return text.strip()

In [4]:
data = pd.read_csv("/content/Input.tsv", sep="\t", names=["text", "label"])#data loading

data['text'] = data['text'].apply(preprocess_text)#preprocessing on text column
data = data[data['label'].isin(['P', 'N'])]# row Filtering with labels not in the desired set
encoder = LabelEncoder()#labels to numerical format: P -> 1, N -> 0
data['label'] = encoder.fit_transform(data['label'])
# Spliting the dataset
X_train, X_test, y_train, y_test = train_test_split(
    data['text'], data['label'], test_size=0.25, random_state=42
)
print(data.head())


                                                text  label
1  میں نے ایٹم بم بنایا ھے او بھائی ایٹم بمب کوٹ ...      1
2  چندے سے انقلاب اور عمران خان وزیر اعظم نہیں بن...      0
4  سرچ انجن گوگل کے نائب صدر نے فضا میں  130000 ف...      1
5      ابھی تک اسکی لہریں کبھی کبھی آ جاتی ہیں یار أ      1
6  گندی زبان اور گٹر جیسے دماغ والے جاهل جیالے هو...      0


In [5]:
def assess_binary_model(model, X_data, true_labels):
    predicted_probs = model.predict(X_data)
    predicted_classes = (predicted_probs > 0.5).astype("int32")
    return {
        "Accuracy": accuracy_score(true_labels, predicted_classes),
        "Precision": precision_score(true_labels, predicted_classes),
        "Recall": recall_score(true_labels, predicted_classes),
        "F1-Score": f1_score(true_labels, predicted_classes)
    }

# Evaluation BERT
def assess_transformer_model(model, test_data_loader, true_labels):
    output_logits = model.predict(test_data_loader).logits
    predicted_classes = np.argmax(output_logits, axis=1)

    return {
        "Accuracy": accuracy_score(true_labels, predicted_classes),
        "Precision": precision_score(true_labels, predicted_classes),
        "Recall": recall_score(true_labels, predicted_classes),
        "F1-Score": f1_score(true_labels, predicted_classes)
    }


# Text tokenizer and padding

In [6]:
text_tokenizer = Tokenizer(oov_token="<OOV>")#out-of-vocabulary words
text_tokenizer.fit_on_texts(X_train)
vocab_size = len(text_tokenizer.word_index) + 1
train_sequences = text_tokenizer.texts_to_sequences(X_train)
test_sequences = text_tokenizer.texts_to_sequences(X_test)
max_sequence_length = 50
X_train_padded = pad_sequences(train_sequences, maxlen=max_sequence_length, padding='post')
X_test_padded = pad_sequences(test_sequences, maxlen=max_sequence_length, padding='post')

evaluation_results = {}

ALL models

In [7]:
# Build a simple RNN model for binary classification
rnn_model = Sequential([
    Embedding(vocab_size, 128),
    SimpleRNN(64, return_sequences=False),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

rnn_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Train the model for 25 epochs with a batch size of 32 and 10% validation data
rnn_model.fit(X_train_padded, y_train, epochs=25, batch_size=32, validation_split=0.1)
evaluation_metrics = assess_binary_model(rnn_model, X_test_padded, y_test)#evaltating model on test data
evaluation_results["RNN"] = evaluation_metrics
print(evaluation_results["RNN"])


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
{'Accuracy': 0.5959183673469388, 'Precision': 0.5540540540540541, 'Recall': 0.7130434782608696, 'F1-Score': 0.623574144486692}


In [27]:
# Build the LSTM-based model for binary classification
lstm_model = Sequential([
    Embedding(vocab_size, 128),
    LSTM(64),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])
lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
lstm_model.fit(X_train_padded, y_train, epochs=25, batch_size=32, validation_split=0.1)
lstm_metrics = assess_binary_model(lstm_model, X_test_padded, y_test)
evaluation_results["LSTM"] = lstm_metrics

print(evaluation_results["LSTM"])

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
{'Accuracy': 0.6163265306122448, 'Precision': 0.5668789808917197, 'Recall': 0.7739130434782608, 'F1-Score': 0.6544117647058824}


In [12]:
# Build the GRU-based model for binary classification
gru_model = Sequential([
    Embedding(vocab_size, 128),
    GRU(64),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])
gru_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
gru_model.fit(X_train_padded, y_train, epochs=25, batch_size=32, validation_split=0.1)
gru_metrics = assess_binary_model(gru_model, X_test_padded, y_test)
evaluation_results["GRU"] = gru_metrics
print(evaluation_results["GRU"])

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
{'Accuracy': 0.46938775510204084, 'Precision': 0.46938775510204084, 'Recall': 1.0, 'F1-Score': 0.6388888888888888}


In [30]:
# Build the BiLSTM-based model for binary classification
bilstm_model = Sequential([
    Embedding(vocab_size, 128),
    Bidirectional(LSTM(64)),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])
bilstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
bilstm_model.fit(X_train_padded, y_train, epochs=25, batch_size=32, validation_split=0.1)
bilstm_metrics = assess_binary_model(bilstm_model, X_test_padded, y_test)
evaluation_results["BILSTM"] = bilstm_metrics
print(evaluation_results["BILSTM"])

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
{'Accuracy': 0.6244897959183674, 'Precision': 0.5851851851851851, 'Recall': 0.6869565217391305, 'F1-Score': 0.632}


In [32]:
# Initialize the multilingual BERT model and tokenizer
bert_model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
multilingual_model = TFAutoModelForSequenceClassification.from_pretrained(bert_model_name, num_labels=2)
train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(list(X_test), truncation=True, padding=True, max_length=128)
optimizer = Adam(learning_rate=2e-5)
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train)).batch(16)# Convert data into TensorFlow datasets
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test)).batch(16)
multilingual_model.compile(optimizer=optimizer,
                           loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                           metrics=['accuracy'])
multilingual_model.fit(train_dataset, epochs=5, validation_data=test_dataset)
mbert_metrics = assess_transformer_model(multilingual_model, test_dataset, y_test)
evaluation_results["mBERT"] = mbert_metrics
print(evaluation_results["mBERT"])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
{'Accuracy': 0.6489795918367347, 'Precision': 0.8372093023255814, 'Recall': 0.3130434782608696, 'F1-Score': 0.45569620253164556}


In [10]:
# Initialize XLM-RoBERTa model and tokenizer
xlm_model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(xlm_model_name)
xlm_model = TFAutoModelForSequenceClassification.from_pretrained(xlm_model_name, num_labels=2)
train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(list(X_test), truncation=True, padding=True, max_length=128)
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train)).batch(16)
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test)).batch(16)
xlm_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy'])
xlm_model.fit(train_dataset, epochs=3, validation_data=test_dataset)
xlm_metrics = assess_transformer_model(xlm_model, test_dataset, y_test)
evaluation_results["XLMBERT"] = xlm_metrics
print(evaluation_results["XLMBERT"])

All PyTorch model weights were used when initializing TFXLMRobertaForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFXLMRobertaForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Epoch 2/3
Epoch 3/3
{'Accuracy': 0.5387755102040817, 'Precision': 0.6666666666666666, 'Recall': 0.034782608695652174, 'F1-Score': 0.06611570247933884}


In [15]:
df_results = pd.DataFrame(evaluation_results[1:], columns=evaluation_results[0])
print(df_results)

     Model  Accuracy  Precision  Recall  F1-Score
0  XLMBERT    0.5388     0.6667  0.0348    0.0661
1    mBERT    0.6490     0.8372  0.3130    0.4557
2   BILSTM    0.6245     0.5852  0.6870    0.6320
3      GRU    0.4694     0.4694  1.0000    0.6389
4     LSTM    0.6163     0.5669  0.7739    0.6544
5      RNN    0.5469     0.5156  0.5739    0.5432


In [8]:
# Urdu text normalization for embedding preparation
def preprocess_text(text):
    # Eliminate digits
    text = re.sub(r"\d+", " ", text)
    # Strip out common English punctuation marks
    text = re.sub(r"""[!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~]+""", " ", text)
    # Remove Urdu-specific punctuation symbols
    text = re.sub(r"[:؛؟’‘٭ء،۔]+", " ", text)
    # Delete Arabic numerals
    text = re.sub(r"[٠‎١‎٢‎٣‎٤‎٥‎٦‎٧‎٨‎٩]+", " ", text)
    # Filter non-word characters (excluding spaces)
    text = re.sub(r"[^\w\s]", " ", text)
    # Strip English letters and digits
    text = re.sub(r"[a-zA-Z0-9]+", " ", text)
    # Normalize extra white spaces
    text = re.sub(r"\s+", " ", text)
    return text.strip()


Question no 2

In [15]:
def load_and_clean_data(filepath):
    processed_sentences = []
    data = pd.read_csv(filepath, sep='\t')

    for sentence in data['Tweet']:
        cleaned = preprocess_text(sentence)
        processed_sentences.append(cleaned.split())

    return processed_sentences
def train_w2v(sentences, filename):
    w2v_model = Word2Vec(
        sentences=sentences,
        vector_size=100,
        window=5,
        min_count=5,
        workers=4,
        sg=0  # CBOW model
    )
    w2v_model.save(filename)
    return w2v_model
def get_word_vector(model, word):
    if word in model.wv:
        print(f"Embedding for '{word}':\n{model.wv[word]}")
    else:
        print(f"'{word}' not found in the model vocabulary.")


In [12]:
sentences = load_and_clean_data("/content/Input.tsv")
w2v_model = train_w2v(sentences, "w2v_model")
get_word_vector(w2v_model, 'آپ')

Embedding for 'آپ':
[-0.12803465  0.23627098  0.0536944   0.04548621  0.07707908 -0.31422073
  0.09123978  0.3726027  -0.11469957 -0.09597608 -0.07373054 -0.22217876
 -0.02202859  0.02562775  0.03884988 -0.10320386  0.10772666 -0.29465017
 -0.07978034 -0.40121683  0.03057197  0.1258434   0.14919293 -0.11433229
 -0.04719877  0.04385486 -0.15139683 -0.13392672 -0.17738768  0.12932323
  0.22466369  0.03454817  0.0121672  -0.11035147 -0.17571992  0.2537493
 -0.00504002 -0.14013863 -0.09821153 -0.34341547  0.1038917  -0.16273597
 -0.0898766   0.00554703  0.20079497 -0.08291562 -0.09857216 -0.11621027
  0.08492552  0.02501482  0.16610302 -0.13555522 -0.08198652 -0.02869631
 -0.18323438  0.10242663  0.08131876  0.04286406 -0.21250407  0.05173036
  0.04203551  0.05517119  0.02658407 -0.07634305 -0.18427299  0.15360023
  0.17831989  0.16955064 -0.2443832   0.25011253 -0.12086792  0.03534761
  0.06896741  0.01984213  0.19519942  0.12309249 -0.01295527  0.0180031
 -0.13097537  0.09084111 -0.06110

In [19]:
word2vec = Word2Vec.load("w2v_model")

embedding_dim = 100
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, index in text_tokenizer.word_index.items():
    if word in word2vec.wv:
        embedding_matrix[index] = word2vec.wv[word]
model_w2v = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], trainable=True),
    LSTM(64),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])



In [23]:
Q2_results={}
model_w2v.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_w2v.fit(X_train_padded, y_train, epochs=25, batch_size=32, validation_split=0.1)
metrics = assess_binary_model(model_w2v, X_test_padded, y_test)
Q2_results["W2V"] = metrics

print(Q2_results["W2V"])

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
{'Accuracy': 0.5918367346938775, 'Precision': 0.5490196078431373, 'Recall': 0.7304347826086957, 'F1-Score': 0.6268656716417911}


In [29]:
# Function to read custom GloVe embeddings
def load_custom_glove_embeddings(file_path):
    embeddings = {}
    with open(file_path, 'r', encoding='utf8') as file:
        for line in file:
            split_line = line.strip().split()
            word = split_line[0]
            vector = np.array(split_line[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings
# Load GloVe vectors for Urdu
urdu_glove = load_custom_glove_embeddings("/content/glove.6B.100d.txt")
# Prepare embedding matrix
embedding_dim = 100
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, idx in text_tokenizer.word_index.items():
    if word in urdu_glove:
        embedding_matrix[idx] = urdu_glove[word]
# Create LSTM model with GloVe embeddings
model_glove = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix],
              input_length=max_sequence_length, trainable=True),
    LSTM(64),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])
model_glove.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_glove.fit(X_train_padded, y_train, epochs=25, batch_size=32, validation_split=0.1)



Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tf_keras.src.callbacks.History at 0x7e69de354a50>

In [31]:
# Evaluation
metrics = assess_binary_model(model_glove, X_test_padded, y_test)
Q2_results["Glove"] = metrics
print(Q2_results["Glove"])

{'Accuracy': 0.6244897959183674, 'Precision': 0.592, 'Recall': 0.6434782608695652, 'F1-Score': 0.6166666666666667}


In [35]:
import fasttext

# Function to clean and write Urdu text data to a file for FastText training
def prepare_fasttext_input(dataframe, filename):
    with open(filename, 'w', encoding='utf8') as file:
        for entry in dataframe['text']:
            cleaned_text = preprocess_text(entry)
            file.write(cleaned_text + '\n')
prepare_fasttext_input(data, 'urdu_fasttext.txt')
ft_model = fasttext.train_unsupervised('urdu_fasttext.txt', model='skipgram')
vector = ft_model.get_word_vector('آپ')
print(f"Embedding for the word 'آپ':\n{vector}")


Embedding for the word 'آپ':
[ 0.08756117  0.27233404 -0.03027672  0.02105954  0.28861082  0.06018716
 -0.10943206  0.09837036  0.26761922  0.2535413  -0.14851332  0.08834928
 -0.19490933  0.2747523  -0.08996201 -0.01164204 -0.00234025  0.02624479
  0.03076512  0.04537058 -0.04344109  0.10226315  0.10441438 -0.10251853
  0.01428516 -0.03792332  0.10909264  0.02143142 -0.08700453  0.07883111
 -0.00170354 -0.18016663 -0.2038048  -0.18591598 -0.18866086 -0.10308241
 -0.060835   -0.03287821 -0.00256075 -0.04365546 -0.01296125  0.13208042
  0.02172818  0.30531967  0.11016957 -0.18350549  0.04197414  0.07431116
 -0.13104574  0.24267495 -0.26439863  0.03081083 -0.15196662  0.10380402
 -0.02171432 -0.09979446  0.0901449   0.23734155  0.02337755  0.00050589
  0.04686249  0.06690668  0.3902374   0.00760218  0.03881198 -0.07111625
 -0.09715712  0.08002835 -0.10601524  0.08299984 -0.24419083  0.04942542
 -0.09706189 -0.04694813 -0.05761055  0.07260178  0.15999132  0.12502673
  0.10071829  0.040858

In [39]:
embedding_dim = 100
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, idx in text_tokenizer.word_index.items():
    embedding_matrix[idx] = ft_model.get_word_vector(word)

# Construct the LSTM model using FastText embeddings
model_fasttext = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim,
              weights=[embedding_matrix], input_length=max_sequence_length, trainable=True),
    LSTM(64),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])
model_fasttext.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_fasttext.fit(X_train_padded, y_train, epochs=25, batch_size=32, validation_split=0.1)


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tf_keras.src.callbacks.History at 0x7e69dceb6cd0>

In [41]:
metrics = assess_binary_model(model_fasttext, X_test_padded, y_test)
Q2_results["FastText"] = metrics

print(Q2_results["FastText"])

{'Accuracy': 0.6244897959183674, 'Precision': 0.5804195804195804, 'Recall': 0.7217391304347827, 'F1-Score': 0.6434108527131783}


In [53]:

data = pd.read_csv("/content/Input.tsv", sep="\t", names=["text", "label"])
data['text'] = data['text'].apply(lambda t: re.sub(r'[^\w\s]', '', t).strip())
data = data[data['label'].isin(['P', 'N'])]

encoder = LabelEncoder()
data['label'] = encoder.fit_transform(data['label'])  # 'P' → 1, 'N' → 0
elmo_model = Embedder('177')
tokenized_texts = data['text'].tolist()
elmo_vectors = elmo_model.sents2elmo(tokenized_texts)

X = np.array([np.mean(emb, axis=0) for emb in elmo_vectors])
y = np.array(data['label'], dtype=np.float32)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

X_train_lstm = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test_lstm = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

model_elmo = Sequential([
    LSTM(64, input_shape=(1, X.shape[1])),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])
model_elmo.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model_elmo.fit(X_train_lstm, y_train, epochs=25, batch_size=32, validation_split=0.1)

def evaluate(model, X, y_true):
    y_prob = model.predict(X)
    y_pred = (y_prob > 0.5).astype(int)
    return {
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred),
        "Recall": recall_score(y_true, y_pred),
        "F1-Score": f1_score(y_true, y_pred)
    }



Q2_results["ELMO"] = evaluate(model_elmo, X_test_lstm, y_test)
print(Q2_results["ELMO"])

{'Accuracy': 0.5183673469387755, 'Precision': 0.49032258064516127, 'Recall': 0.6608695652173913, 'F1-Score': 0.562962962962963}


In [56]:
df = pd.DataFrame(Q2_results).T
df = df.T
df = df.loc[['F1-Score', 'Accuracy', 'Precision', 'Recall']]
print(df)

                W2V     Glove  FastText      ELMO
F1-Score   0.626866  0.616667  0.643411  0.562963
Accuracy   0.591837  0.624490  0.624490  0.518367
Precision  0.549020  0.592000  0.580420  0.490323
Recall     0.730435  0.643478  0.721739  0.660870


# Part 2

# Q3

In [63]:


with open('/content/english.txt', 'r', encoding='utf-8') as f:
    english_sentences = f.readlines()

with open('/content/urdu.txt', 'r', encoding='utf-8') as f:
    urdu_sentences = f.readlines()

english_sentences = [line.strip() for line in english_sentences]
urdu_sentences = [line.strip() for line in urdu_sentences]

df = pd.DataFrame({
    'english': english_sentences,
    'urdu': urdu_sentences
})

df = df[['english', 'urdu']].dropna().reset_index(drop=True)

print(df.head())

df['urdu'] = df['urdu'].apply(lambda x: 'start ' + x + ' end')

eng_tokenizer = Tokenizer()
eng_tokenizer.fit_on_texts(df['english'])

urdu_tokenizer = Tokenizer()
urdu_tokenizer.fit_on_texts(df['urdu'])
input_sequences = eng_tokenizer.texts_to_sequences(df['english'])
target_sequences = urdu_tokenizer.texts_to_sequences(df['urdu'])

max_input_len = max(len(seq) for seq in input_sequences)
max_output_len = max(len(seq) for seq in target_sequences)

input_sequences = pad_sequences(input_sequences, maxlen=max_input_len, padding='post')
target_sequences = pad_sequences(target_sequences, maxlen=max_output_len, padding='post')
X_train, X_val, y_train, y_val = train_test_split(input_sequences, target_sequences, test_size=0.1)

input_vocab_size = len(eng_tokenizer.word_index) + 1
output_vocab_size = len(urdu_tokenizer.word_index) + 1




                english                       urdu
0   is zain your nephew      زین تمہارا بھتیجا ہے۔
1  i wish youd trust me  کاش تم مجھ پر بھروسہ کرتے
2      did he touch you      کیا اس نے آپ کو چھوا؟
3      its part of life         اس کی زندگی کا حصہ
4        zain isnt ugly        زین بدصورت نہیں ہے۔


In [65]:
def evaluate_bleu(model, X_train, X_val, y_train, y_val, urdu_tokenizer, max_output_len):
    smooth = SmoothingFunction().method1
    scores = []

    # Combine first 90 from train and first 10 from val
    eval_inputs = np.concatenate([X_train[:90], X_val[:10]], axis=0)
    eval_targets = np.concatenate([y_train[:90], y_val[:10]], axis=0)

    for i in range(len(eval_inputs)):
        input_seq = eval_inputs[i:i+1]
        decoder_input = np.zeros((1, max_output_len - 1))
        decoder_input[0, 0] = urdu_tokenizer.word_index['start']

        for t in range(1, max_output_len):
            output_tokens = model.predict(input_seq, verbose=0)  # Make sure the input shape aligns with the model's input requirements
            sampled_token_index = np.argmax(output_tokens[0, t - 1, :])
            decoder_input[0, t] = sampled_token_index
            if sampled_token_index == urdu_tokenizer.word_index['end']:
                break

        predicted = decoder_input[0][1:t]
        reference = eval_targets[i][1:]  # Avoid 'start' token in the reference
        scores.append(sentence_bleu([reference], predicted, smoothing_function=smooth))

    return np.mean(scores)

In [73]:
def create_rnn_model():
    # Encoder
    encoder_input = Input(shape=(max_input_len,))
    enc_emb = Embedding(input_vocab_size, 256)(encoder_input)
    encoder_output, state_h = SimpleRNN(256, return_state=True)(enc_emb)

    # Decoder
    decoder_input = Input(shape=(max_output_len - 1,))
    dec_emb = Embedding(output_vocab_size, 256)(decoder_input)  # Shape: (batch_size, max_output_len-1, 256)
    decoder_output = SimpleRNN(256, return_sequences=True)(dec_emb, initial_state=state_h)
    output = Dense(output_vocab_size, activation='softmax')(decoder_output)  # Shape: (batch_size, max_output_len-1, output_vocab_size)

    model = Model([encoder_input, decoder_input], output)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

rnn_model = create_rnn_model()
rnn_model.fit([X_train, y_train[:, :-1]], y_train[:, 1:, np.newaxis], epochs=50, batch_size=64,
              validation_data=([X_val, y_val[:, :-1]], y_val[:, 1:, np.newaxis]))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [74]:
trans_models = {}

In [75]:

trans_models['RNN_Model'] =evaluate_bleu(rnn_model)

print(trans_models['RNN_Model'])

0.0017900319764852235


In [81]:
def create_birnn_model():
    encoder_input = Input(shape=(max_input_len,))
    enc_emb = Embedding(input_vocab_size, 256)(encoder_input)

    bi_rnn = Bidirectional(SimpleRNN(256, return_state=True), merge_mode=None)
    forward_output, backward_output, forward_h, backward_h = bi_rnn(enc_emb)

    state_h = tf.keras.layers.Concatenate()([forward_h, backward_h])

    decoder_input = Input(shape=(max_output_len-1,))
    dec_emb = Embedding(output_vocab_size, 512)(decoder_input)
    decoder_output = SimpleRNN(512, return_sequences=True)(dec_emb, initial_state=state_h)

    output = Dense(output_vocab_size, activation='softmax')(decoder_output)

    model = Model([encoder_input, decoder_input], output)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
    return model



birnn_model = create_birnn_model()
birnn_model.fit([X_train, y_train[:, :-1]], y_train[:, 1:, np.newaxis], epochs=50, batch_size=64, validation_data=([X_val, y_val[:, :-1]], y_val[:, 1:, np.newaxis]))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [82]:
trans_models['BIRNN_Model'] = evaluate_bleu(birnn_model)

print(trans_models['BIRNN_Model'])

0.05376326907496855


In [83]:
def create_lstm_model():
    encoder_input = Input(shape=(max_input_len,))
    enc_emb = Embedding(input_vocab_size, 256)(encoder_input)
    encoder_output, state_h, state_c = LSTM(256, return_state=True)(enc_emb)

    decoder_input = Input(shape=(max_output_len-1,))
    dec_emb = Embedding(output_vocab_size, 256)(decoder_input)
    decoder_output = LSTM(256, return_sequences=True)(dec_emb, initial_state=[state_h, state_c])
    output = Dense(output_vocab_size, activation='softmax')(decoder_output)

    model = Model([encoder_input, decoder_input], output)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
    return model



lstm_model = create_lstm_model()
lstm_model.fit([X_train, y_train[:, :-1]], y_train[:, 1:, np.newaxis], epochs=50, batch_size=64, validation_data=([X_val, y_val[:, :-1]], y_val[:, 1:, np.newaxis]))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [84]:
trans_models['LSTM_Model'] =evaluate_bleu(lstm_model)

print(trans_models['LSTM_Model'])

0.05732336259749588


In [89]:


def evaluate_model_bleu(model, tokenizer_src, tokenizer_tgt, input_texts, target_texts, max_input_len, max_output_len):
    bleu_scores = []
    for src, tgt in zip(input_texts, target_texts):
        src_seq = tokenizer_src.texts_to_sequences([src])
        src_seq = tf.keras.preprocessing.sequence.pad_sequences(src_seq, maxlen=max_input_len, padding='post')
        tgt_input_seq = [tokenizer_tgt.word_index['start']]

        for _ in range(max_output_len):
            tgt_seq = tf.keras.preprocessing.sequence.pad_sequences([tgt_input_seq], maxlen=max_output_len-1, padding='post')
            predictions = model.predict([src_seq, tgt_seq], verbose=0)
            next_word_id = predictions[0, len(tgt_input_seq)-1].argmax()
            tgt_input_seq.append(next_word_id)
            if tokenizer_tgt.index_word[next_word_id] == 'end':
                break
        translated = [tokenizer_tgt.index_word.get(idx, '') for idx in tgt_input_seq if idx != 0]
        reference = tgt.split()
        bleu_scores.append(sentence_bleu([reference], translated))
    return sum(bleu_scores) / len(bleu_scores)


In [90]:
class PositionalEncodingLayer(tf.keras.layers.Layer):
    def __init__(self, max_len, d_model):
        super(PositionalEncodingLayer, self).__init__()
        position = np.arange(max_len)[:, np.newaxis]
        dimension = np.arange(d_model)[np.newaxis, :]
        rates = 1 / np.power(10000, (2 * (dimension // 2)) / np.float32(d_model))
        angles = position * rates

        # Applying sine to even indices and cosine to odd indices
        angles[:, 0::2] = np.sin(angles[:, 0::2])
        angles[:, 1::2] = np.cos(angles[:, 1::2])

        self.positional_encoding = tf.cast(angles[np.newaxis, ...], dtype=tf.float32)

    def call(self, x):
        return x + self.positional_encoding[:, :tf.shape(x)[1], :]

class TransformerDecoderLayer(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerDecoderLayer, self).__init__()
        self.self_attention = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.encoder_attention = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.feed_forward = tf.keras.Sequential([
            Dense(ff_dim, activation='relu'),
            Dense(embed_dim)
        ])
        self.layer_norm1 = LayerNormalization(epsilon=1e-6)
        self.layer_norm2 = LayerNormalization(epsilon=1e-6)
        self.layer_norm3 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)
        self.dropout3 = Dropout(rate)

    def call(self, x, encoder_output, training):
        # Creating a mask to prevent attention to future tokens
        look_ahead_mask = tf.linalg.band_part(tf.ones((tf.shape(x)[1], tf.shape(x)[1])), -1, 0)

        self_attention_output = self.self_attention(x, x, attention_mask=look_ahead_mask)
        self_attention_output = self.dropout1(self_attention_output, training=training)
        norm1_output = self.layer_norm1(x + self_attention_output)

        encoder_attention_output = self.encoder_attention(norm1_output, encoder_output)
        encoder_attention_output = self.dropout2(encoder_attention_output, training=training)
        norm2_output = self.layer_norm2(norm1_output + encoder_attention_output)

        ff_output = self.feed_forward(norm2_output)
        ff_output = self.dropout3(ff_output, training=training)
        return self.layer_norm3(norm2_output + ff_output)

def create_transformer(input_vocab_size, output_vocab_size, max_input_len, max_output_len):
    embed_dim = 256
    num_heads = 4
    ff_dim = 512
    encoder_inputs = Input(shape=(max_input_len,))
    encoder_emb = Embedding(input_vocab_size, embed_dim)(encoder_inputs)
    encoder_pos_emb = PositionalEncodingLayer(max_input_len, embed_dim)(encoder_emb)
    encoder_output = TransformerDecoderLayer(embed_dim, num_heads, ff_dim)(encoder_pos_emb, encoder_pos_emb, training=True)  # Using one decoder block for simplicity
    encoder_output = LayerNormalization()(encoder_output)
    decoder_inputs = Input(shape=(max_output_len-1,))
    decoder_emb = Embedding(output_vocab_size, embed_dim)(decoder_inputs)
    decoder_pos_emb = PositionalEncodingLayer(max_output_len, embed_dim)(decoder_emb)
    decoder_output = TransformerDecoderLayer(embed_dim, num_heads, ff_dim)(decoder_pos_emb, encoder_output, training=True)
    decoder_output = LayerNormalization()(decoder_output)
    final_output = Dense(output_vocab_size, activation="softmax")(decoder_output)
    transformer_model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=final_output)
    transformer_model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

    return transformer_model


In [91]:

model = create_transformer(input_vocab_size, output_vocab_size, max_input_len, max_output_len)

model.fit(
    [X_train, y_train[:, :-1]],
    y_train[:, 1:, np.newaxis],
    epochs=50,
    batch_size=64,
    validation_data=([X_val, y_val[:, :-1]], y_val[:, 1:, np.newaxis])
)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [92]:

english_sentences = df['english'].values
urdu_sentences = df['urdu'].values

transformer_bleu_score = evaluate_model_bleu(
    transformer_model,
    eng_tokenizer,
    urdu_tokenizer,
    english_sentences[:100],
    urdu_sentences[:100],
    max_input_len,
    max_output_len
)


trans_models['Transformer'] =transformer_bleu_score


print(trans_models['Transformer'])


0.5158021472779767


In [93]:
blue_scores = pd.DataFrame(list(trans_models.items()), columns=['Model', 'BLEU Score'])
blue_scores['BLEU Score'] = blue_scores['BLEU Score'].map(lambda x: round(x, 4))
print(blue_scores.to_string(index=False))

      Model  BLEU Score
  RNN_Model      0.0018
BIRNN_Model      0.0538
 LSTM_Model      0.0573
Transformer      0.5158


In [94]:
#Load GloVe embeddings from a file
def load_glove_embeddings(filepath, embedding_dim=100):
    """Loads pre-trained GloVe embeddings from a file and returns them as a dictionary."""
    embeddings_index = {}
    with open(filepath, encoding='utf-8') as file:
        for line in file:
            values = line.strip().split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = vector
    return embeddings_index

#Generate the embedding matrix for the tokenizer's word index
def create_embedding_matrix(word_index, embeddings_index, embedding_dim=100):
    """Creates an embedding matrix using GloVe embeddings for each word in the word_index."""
    embedding_matrix = np.random.uniform(-0.05, 0.05, (len(word_index) + 1, embedding_dim))  # Random initialization
    for word, index in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector  # Replace with the GloVe vector
    return embedding_matrix

# Build an RNN-based sequence-to-sequence model
def build_rnn_model(embedding_matrix=None, embedding_dim=100, max_input_len=50, max_output_len=50, input_vocab_size=None, output_vocab_size=None):
    """
    Builds an RNN model with the option to include pre-trained word embeddings.
    If embedding_matrix is provided, it will use that for the encoder.
    """
    # Encoder
    encoder_input = Input(shape=(max_input_len,))
    if embedding_matrix is not None:
        enc_emb = Embedding(input_dim=embedding_matrix.shape[0],
                            output_dim=embedding_dim,
                            weights=[embedding_matrix],
                            input_length=max_input_len,
                            trainable=False)(encoder_input)
    else:
        enc_emb = Embedding(input_vocab_size, embedding_dim)(encoder_input)

    encoder_output, state_h = SimpleRNN(256, return_state=True)(enc_emb)

    # Decoder
    decoder_input = Input(shape=(max_output_len - 1,))
    dec_emb = Embedding(output_vocab_size, 256)(decoder_input)
    decoder_output = SimpleRNN(256, return_sequences=True)(dec_emb, initial_state=state_h)

    # Output layer
    output = Dense(output_vocab_size, activation='softmax')(decoder_output)

    # Compile the model
    model = Model([encoder_input, decoder_input], output)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=["accuracy"])

    return model


In [95]:

rnn_model_random = build_rnn_model()

rnn_model_random.fit(
    [X_train, y_train[:, :-1]],
    y_train[:, 1:, np.newaxis],
    epochs=50,
    batch_size=64,
    validation_data=([X_val, y_val[:, :-1]], y_val[:, 1:, np.newaxis])
)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [97]:
RNN_models = {}

In [98]:
RNN_models['RNN_Random'] =evaluate_bleu(rnn_model_random)

print("RNN Random Model Blue Score: ",RNN_models['RNN_Random'])

RNN Random Model Blue Score:  0.0015748820939085696


In [101]:

glove_path = '/content/glove.6B.100d.txt'
embedding_dim = 100
glove_index = load_glove_embeddings(glove_path, embedding_dim)
embedding_matrix = create_embedding_matrix(eng_tokenizer.word_index, glove_index, embedding_dim)

rnn_model_glove = build_rnn_model(embedding_matrix, embedding_dim)
rnn_model_glove.fit([X_train, y_train[:, :-1]], y_train[:, 1:, np.newaxis],
                    epochs=50, batch_size=64,
                    validation_data=([X_val, y_val[:, :-1]], y_val[:, 1:, np.newaxis]))


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50



In [102]:
RNN_models['RNN_Glove'] = evaluate_bleu(rnn_model_glove)

print("RNN Glove Model Blue Score: ",RNN_models['RNN_Glove'])

RNN Glove Model Blue Score:  0.0018692093576922583


In [103]:
rnn_blue_scores = pd.DataFrame(list(RNN_models.items()), columns=['Model', 'BLEU Score'])
rnn_blue_scores['BLEU Score'] = rnn_blue_scores['BLEU Score'].map(lambda x: round(x, 4))
print(rnn_blue_scores.to_string(index=False))

     Model  BLEU Score
RNN_Random      0.0016
 RNN_Glove      0.0019
