<a href="https://colab.research.google.com/github/AdiTheRipper/DL-Lab/blob/main/Ass5(smol).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
from tensorflow import keras
from keras.preprocessing import text
from keras.utils import to_categorical
from keras.preprocessing import sequence
from keras.utils import pad_sequences
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import euclidean_distances

# Data preparation
data = """Deep learning (also known as deep structured learning) is part of a broader family of machine learning methods based on artificial neural networks with representation learning. Learning can be supervised, semi-supervised or unsupervised.
Deep-learning architectures such as deep neural networks, deep belief networks, deep reinforcement learning, recurrent neural networks, convolutional neural networks and Transformers have been applied to fields including computer vision, speech recognition, natural language processing, machine translation, bioinformatics, drug design, medical image analysis, climate science, material inspection and board game programs, where they have produced results comparable to and in some cases surpassing human expert performance.
"""
dl_data = data.split()

# Tokenization
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(dl_data)
word2id = tokenizer.word_index

word2id['PAD'] = 0
id2word = {v: k for k, v in word2id.items()}
wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in dl_data]

vocab_size = len(word2id)
embed_size = 100
window_size = 2

# Generate training data
def generate_context_word_pairs(corpus, window_size=2, vocab_size=None):
    context_length = window_size * 2

    for words in corpus:
        sentence_length = len(words)

        for index, word in enumerate(words):
            context_words = []
            label_word = []
            start = index - window_size
            end = index + window_size + 1

            context_words.append([words[i] for i in range(start, end) if 0 <= i < sentence_length and i != index])
            label_word.append(word)

            x = pad_sequences(context_words, maxlen=context_length)
            y = to_categorical(label_word, num_classes=vocab_size)

            yield (x, y)

# Model building
cbow = keras.models.Sequential()
cbow.add(keras.layers.Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=window_size*2))
cbow.add(keras.layers.Lambda(lambda x: keras.backend.mean(x, axis=1), output_shape=(embed_size,)))
cbow.add(keras.layers.Dense(vocab_size, activation='softmax'))
cbow.compile(loss='categorical_crossentropy', optimizer='rmsprop')

# Training the model
for epoch in range(1, 6):
    loss = 0.
    i = 0
    for x, y in generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size):
        i += 1
        loss += cbow.train_on_batch(x, y)
        if i % 100000 == 0:
            print('Processed {} (context, word) pairs'.format(i))

    print('Epoch:', epoch, '\tLoss:', loss)
    print()

# Output
weights = cbow.get_weights()[0]
weights = weights[1:]
print(weights.shape)

distance_matrix = euclidean_distances(weights)
print(distance_matrix.shape)

similar_words = {search_term: [id2word[idx] for idx in distance_matrix[word2id[search_term]-1].argsort()[1:6]+1]
                   for search_term in ['deep']}

print(similar_words)


Epoch: 1 	Loss: 433.1032280921936

Epoch: 2 	Loss: 428.7868821620941

Epoch: 3 	Loss: 425.5364544391632

Epoch: 4 	Loss: 422.5021929740906

Epoch: 5 	Loss: 420.14970874786377

(74, 100)
(74, 74)
{'deep': ['on', 'including', 'human', 'family', 'speech']}
