In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Bidirectional, Dropout, LayerNormalization, LeakyReLU
import random
from tensorflow.keras.callbacks import EarlyStopping

# Choose the distribution strategy: use a TPU when one can be resolved and
# initialised, otherwise fall back to TensorFlow's default strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)
except ValueError:
    # Any ValueError raised during TPU setup lands here.
    strategy = tf.distribute.get_strategy()
    print("Running on CPU/GPU")
else:
    print("Running on TPU")


# --- Load data -------------------------------------------------------------
# The training frame supplies the 'SENTENCES' column used below.
train_df = pd.read_csv("Train Data.csv")
test_df = pd.read_csv("Test Datas.csv")


# --- Fit the tokenizer on the training sentences ---------------------------
MAX_VOCAB_SIZE = 30000
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(train_df['SENTENCES'])
word_index = tokenizer.word_index


# Tokenizer ids start at 1 (0 is reserved and is used for padding below), so
# the ids that can appear in a sequence are 0..len(word_index), capped at
# MAX_VOCAB_SIZE - 1 by `num_words`. The number of distinct ids is therefore
# len(word_index) + 1; without the +1 the highest id would fall outside the
# embedding table / softmax whenever the corpus has fewer than MAX_VOCAB_SIZE
# distinct words.
vocab_size = min(len(word_index) + 1, MAX_VOCAB_SIZE)


# --- Encode and pad --------------------------------------------------------
train_sequences = tokenizer.texts_to_sequences(train_df['SENTENCES'])


# Every sequence is padded with trailing zeros (or truncated) to max_len ids.
max_len = 256
train_padded = pad_sequences(train_sequences, maxlen=max_len, padding='post')


# Build the masked-word training set: for every non-padding token in every
# padded sentence, emit one sample whose input is the sentence with that token
# replaced by 0 and whose label is the token id that was removed.
#
# `train_padded` already holds each sentence padded/truncated to `max_len`, so
# the rows are reused instead of re-padding every sentence one at a time.
# np.nonzero walks the array in row-major order, so samples come out grouped by
# sentence and ordered by position within the sentence.
# NOTE(review): the masked position uses id 0, the same value as padding.
rows, cols = np.nonzero(train_padded)
sample_ids = np.arange(rows.size)

# Fancy indexing copies, so each sample gets its own row to modify.
X = train_padded[rows]
# Read the labels out (as a copy) before blanking the masked positions.
y = X[sample_ids, cols]
X[sample_ids, cols] = 0


# Parse the GloVe text file into {word: vector}. Each line is a word followed
# by its whitespace-separated float components.
embeddings_index = {}

# Explicit UTF-8 keeps decoding independent of the platform's default locale,
# and the context manager closes the file even if a line fails to parse.
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Must match the dimensionality of the vectors in the file loaded above.
embedding_dim = 100
embedding_matrix = np.zeros((vocab_size, embedding_dim))

# Copy the pretrained vector into the row of each in-vocabulary word id.
# Rows for words missing from GloVe (and row 0) stay all-zero.
for word, i in word_index.items():
    if i < vocab_size:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

# Build and compile the model inside the strategy scope so its variables are
# created under the selected distribution strategy.
with strategy.scope():
    # Embedding table initialised from the pretrained matrix and fine-tuned
    # during training (trainable=True).
    embedding_layer = Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        trainable=True
    )

    # The matrix is also assigned explicitly after building the layer, in
    # addition to the `weights=` argument above.
    embedding_layer.build((1,))
    embedding_layer.set_weights([embedding_matrix])

    model = Sequential([
        embedding_layer,
        # Two stacked bidirectional GRU layers that keep the full sequence.
        Bidirectional(GRU(256,activation = 'tanh', return_sequences=True)),
        LayerNormalization(),
        Dropout(0.2),

        Bidirectional(GRU(128,activation = 'tanh', return_sequences=True)),
        LayerNormalization(),
        Dropout(0.2),

        # Final GRU collapses the sequence to a single vector.
        GRU(128),
        Dropout(0.2),

        Dense(128),
        LeakyReLU(alpha=0.01),
        Dropout(0.2),

        # One probability per vocabulary id; labels are integer ids, hence
        # the sparse categorical cross-entropy loss below.
        Dense(vocab_size, activation='softmax')
    ])

    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                  metrics=['accuracy'])

# Stop once validation loss has not improved for 3 epochs and roll back to the
# best weights seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model.fit(
    X,
    y,
    epochs=10,
    batch_size=64,
    verbose=1,
    validation_split=0.1,
    # The callback must be passed to fit() to take effect; previously it was
    # created but never used.
    callbacks=[early_stopping]
)
model.save('gru_model.h5')

# --- Inference on the test set ---------------------------------------------
test_df = pd.read_csv('Test Datas.csv')

# Encode the masked sentences exactly like the training sentences.
# NOTE(review): training marks the hidden word with id 0; confirm that the
# placeholder used in 'MASKED SENTENCES' is tokenized compatibly.
test_sequences = tokenizer.texts_to_sequences(test_df['MASKED SENTENCES'])
test_padded = pad_sequences(test_sequences, maxlen=max_len, padding='post')

predictions = model.predict(test_padded)

# Build the id -> word lookup once instead of scanning word_index for every
# prediction.
index_to_word = {idx: word for word, idx in word_index.items()}

# Highest-probability id per test row; ids with no word (e.g. 0) map to
# 'Unknown'.
predicted_indices = np.argmax(predictions, axis=1)
predicted_words = [
    index_to_word.get(int(predicted_index), 'Unknown')
    for predicted_index in predicted_indices
]

submission_df = pd.DataFrame({
    'IDS': test_df['IDS'],
    'PREDICTED WORDS': predicted_words
})

submission_df.to_csv('submission.csv', index=False)


Running on CPU/GPU




Epoch 1/10
[1m3373/6868[0m [32m━━━━━━━━━[0m[37m━━━━━━━━━━━[0m [1m6:11:49[0m 6s/step - accuracy: 0.0631 - loss: 7.5756

In [4]:
!wget https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
!unzip -q glove.6B.zip

--2025-03-16 19:23:18--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2025-03-16 19:25:58 (5.12 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]



In [None]:
# Colab-only helper for uploading files from the local machine.
from google.colab import files

# Interactive upload; the result is kept in `uploaded`.
# NOTE(review): presumably used to provide the CSV files read by the training
# cell -- confirm, and run before that cell on a fresh runtime.
uploaded = files.upload()

KeyboardInterrupt: 

In [None]:
!pip  install tensorflow


Collecting tensorflow
  Downloading tensorflow-2.19.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Downloading flatbuffers-25.2.10-py2.py3-none-any.whl.metadata (875 bytes)
Collecting google-pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-py2.py3-none-manylinux2010_x86_64.whl.metadata (5.2 kB)
Collecting tensorboard~=2.19.0 (from tensorflow)
  Downloading tensorboard-2.19.0-py3-none-any.whl.metadata (1.8 kB)
Collecting tensorflow-io-gcs-filesystem>=0.23.1 (from tensorflow)
  Downloading tensorflow_io_gcs_filesystem-0.37.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (14 kB)
Collecting wheel<1.0,>=0.23.0 (from astunparse>=1.6.0->tensorflow