In [1]:
import io
import math
import gzip
import nltk
import time
import random
import pandas as pd
import numpy as np
import tensorflow as tf
import gensim.downloader as api
import tensorflow_datasets as tfds
nltk.download('stopwords')

from collections import Counter
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import skipgrams
from keras.preprocessing import text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
#reading dataset
file= open("interview_ds.txt", "r")
wiki = file.read()
wiki=wiki[:50000]

##Data pre-processing

In [6]:
import re
# Tokenization
words = re.findall(r'\b\w+\b', wiki)

# Convert to lowercase
words = [word.lower() for word in words]

# Build Vocabulary
vocabulary = set(words)
word_to_index = {word: i for i, word in enumerate(vocabulary)}
index_to_word = {i: word for i, word in enumerate(vocabulary)}

# Generate Word Pairs
window_size = 2
word_pairs = []
for i, word in enumerate(words):
    for j in range(max(0, i - window_size), min(len(words), i + window_size + 1)):
        if i != j:
            context_word = words[j]
            word_pairs.append((word, context_word))

# Assign Labels
labeled_data = [(pair, 1) for pair in word_pairs]


In [None]:
# Print the pre-processed data
print("Pre-processed Word Pairs:")
for data in labeled_data:
    print(data)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
(('just', 'him'), 1)
(('just', 'when'), 1)
(('just', 'she'), 1)
(('when', 'him'), 1)
(('when', 'just'), 1)
(('when', 'she'), 1)
(('when', 'thought'), 1)
(('she', 'just'), 1)
(('she', 'when'), 1)
(('she', 'thought'), 1)
(('she', 'she'), 1)
(('thought', 'when'), 1)
(('thought', 'she'), 1)
(('thought', 'she'), 1)
(('thought', 'would'), 1)
(('she', 'she'), 1)
(('she', 'thought'), 1)
(('she', 'would'), 1)
(('she', 'never'), 1)
(('would', 'thought'), 1)
(('would', 'she'), 1)
(('would', 'never'), 1)
(('would', 'find'), 1)
(('never', 'she'), 1)
(('never', 'would'), 1)
(('never', 'find'), 1)
(('never', 'true'), 1)
(('find', 'would'), 1)
(('find', 'never'), 1)
(('find', 'true'), 1)
(('find', 'he'), 1)
(('true', 'never'), 1)
(('true', 'find'), 1)
(('true', 'he'), 1)
(('true', 'popped'), 1)
(('he', 'find'), 1)
(('he', 'true'), 1)
(('he', 'popped'), 1)
(('he', 'out'), 1)
(('popped', 'true'), 1)
(('popped', 'he'), 1)
(('popped', 'out')

##Model training

In [8]:
from tensorflow.keras.layers import Dot, Dense, Embedding, Input
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K


# Parameters
embedding_dim = 100
window_size = 2
num_neg_samples = 5
vocab_size = len(vocabulary)

# Generate training data
def generate_training_data(labeled_data, word_to_index, window_size, num_neg_samples):
    for data in labeled_data:
        target_word, context_word = data[0]
        label = data[1]
        context_index = word_to_index[context_word]
        target_index = word_to_index[target_word]
        yield (context_index, target_index, label)
        for _ in range(num_neg_samples):
            neg_word = np.random.choice(list(vocabulary))
            while neg_word == context_word:
                neg_word = np.random.choice(list(vocabulary))
            neg_index = word_to_index[neg_word]
            yield (context_index, neg_index, 0)

# Define the Skip-Gram with Negative Sampling (SGNS) model
input_target = Input((1,))
input_context = Input((1,))

embedding = Embedding(vocab_size, embedding_dim, input_length=1, name='embedding')

target = embedding(input_target)
context = embedding(input_context)

dot_product = Dot(axes=2, normalize=False)([target, context])
dot_product = Dense(1, activation='sigmoid')(dot_product)

model = Model(inputs=[input_target, input_context], outputs=dot_product)

# Define the custom loss function based on the negative sampling objective
def sgns_loss(y_true, y_pred):
    return K.mean(tf.math.log1p(y_pred) + K.mean(tf.math.log1p(1 - y_pred), axis=-1), axis=-1)

# Compile the model with the custom loss function
model.compile(loss=sgns_loss, optimizer='adam')

# Generate training data
training_data = list(generate_training_data(labeled_data, word_to_index, window_size, num_neg_samples))

# Extract the data for training
target_data = np.array([data[0] for data in training_data], dtype="int32")
context_data = np.array([data[1] for data in training_data], dtype="int32")
label_data = np.array([data[2] for data in training_data], dtype="float32")

# Reshape the data
#target_data = np.reshape(target_data, (-1, 1))
#context_data = np.reshape(context_data, (-1, 1))
#label_data = np.reshape(label_data, (-1, 1))

# Train the model
model.fit([target_data, context_data], label_data, batch_size=64, epochs=8)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.src.callbacks.History at 0x7a065f182470>

In [None]:
# Evaluate the trained model
evaluation = model.evaluate([target_data, context_data], label_data)



In [None]:
evaluation

0.6931766271591187

##Hyper parameter tuning

In [None]:
pip install keras==2.12.0

Collecting keras==2.12.0
  Downloading keras-2.12.0-py2.py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: keras
  Attempting uninstall: keras
    Found existing installation: keras 2.14.0
    Uninstalling keras-2.14.0:
      Successfully uninstalled keras-2.14.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.14.0 requires keras<2.15,>=2.14.0, but you have keras 2.12.0 which is incompatible.[0m[31m
[0mSuccessfully installed keras-2.12.0


In [None]:
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Dot, Input
from tensorflow.keras.models import Model
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.optimizers import Adam

# Define the function to create the model
def create_model(embedding_dim, learning_rate):
    # Define the model architecture here
    input_target = Input((1,))
    input_context = Input((1,))

    embedding = Embedding(vocab_size, embedding_dim, input_length=1)

    target = embedding(input_target)
    context = embedding(input_context)

    dot_product = Dot(axes=2, normalize=False)([target, context])
    dot_product = Dense(1, activation='sigmoid')(dot_product)

    model = Model(inputs=[input_target, input_context], outputs=dot_product)
    model.compile(loss=sgns_loss, optimizer=Adam(learning_rate=learning_rate))
    return model

# Wrap the Keras model
keras_model = KerasClassifier(build_fn=create_model, epochs=5, batch_size=64, verbose=0)

# Define the hyperparameters to tune
param_grid = {
    'embedding_dim': [50, 100, 200],
    'learning_rate': [0.001, 0.01, 0.1]
}

# Convert data to numpy arrays if necessary
target_data_np = np.array(target_data)
context_data_np = np.array(context_data)
label_data_np = np.array(label_data)
# Perform grid search
grid = GridSearchCV(estimator=keras_model, param_grid=param_grid, cv=3)
print(len(target_data))
print(len(context_data))
print(len(label_data))
grid_result = grid.fit([target_data, context_data], label_data)

# Summarize results
print(f"Best: {grid_result.best_score_} using {grid_result.best_params_}")
means = grid_result.cv_results_['mean_test_score']
params = grid_result.cv_results_['params']
for mean, param in zip(means, params):
    print(f"Mean: {mean}, Params: {param}")

240804
240804
240804


  keras_model = KerasClassifier(build_fn=create_model, epochs=5, batch_size=64, verbose=0)


ValueError: ignored

###As I am facing some issues with keras fit method in GridsearchCV,I tuned the by rerunning it with different values of parameters and finally used best one.

## Model testing

In [None]:
#reading 2nd datset for testing
file= open("interview_ds_2.txt", "r")
test = file.read()
#test=test[:500000]

In [None]:
test

'\nTina and her mom were walking down the street one day when suddenly the ground started to shake. Tina was scared and her mom was scared too. They both knew what it was: an earthquake!\nTina\'s mom held tightly onto her hand and said, "Let\'s hurry home, honey. Don\'t be scared." But it was hard to walk because the ground was shaking so much. Shoulders hunched, they struggled to make their way home.\nWhen they got there, Tina\'s mom tightly hugged her, saying, "We made it, sweetie! Now let\'s sit here and wait until it\'s over."\nTina was impatient and asked, "When will it end, mommy?"\nHer mom replied, "I\'m not sure, honey. We just have to wait and see."\nSo they sat quietly and held each other until the trembling stopped. Then they smiled and hugged each other, feeling glad that the struggle was over.Once upon a time, there was a small boy named Timmy. Timmy loved to eat oatmeal for breakfast every day. One day, Timmy\'s mom asked him if he wanted to help her make oatmeal. Timmy w

##Data preprocessing

In [None]:
import re
# Tokenization
words = re.findall(r'\b\w+\b', test)

# Convert to lowercase
words = [word.lower() for word in words]

# Build Vocabulary
vocabulary = set(words)
word_to_index = {word: i for i, word in enumerate(vocabulary)}
index_to_word = {i: word for i, word in enumerate(vocabulary)}

# Generate Word Pairs
window_size = 2
word_pairs = []
for i, word in enumerate(words):
    for j in range(max(0, i - window_size), min(len(words), i + window_size + 1)):
        if i != j:
            context_word = words[j]
            word_pairs.append((word, context_word))

# Assign Labels
test_labeled_data = [(pair, 1) for pair in word_pairs]

##Model testing

In [None]:
# Generate test data
test_data = list(generate_training_data(test_labeled_data, word_to_index, window_size, num_neg_samples))

# Extract the data for testing
test_target_data = np.array([data[0] for data in test_data], dtype="int32")
test_context_data = np.array([data[1] for data in test_data], dtype="int32")
test_label_data = np.array([data[2] for data in test_data], dtype="float32")

# Evaluate the model on test data
test_loss = model.evaluate([test_target_data, test_context_data], test_label_data)

print("Test loss:", test_loss)

Test loss: 0.6932780146598816
