In [1]:
import itertools
import pandas as pd
import numpy as np
import re
import os
from tqdm import tqdm
import matplotlib.pyplot as plt

# Deep learning:
# from tensorflow.python.keras.models import Input

from keras.layers import Input, Dense, Dropout, Flatten, Conv2D, MaxPooling2D

from keras.models import Model

# from keras.models import Input, Model

from scipy import sparse


In [2]:
document1 = "Machine learning (ML) is a field of inquiry devoted to understanding and building methods that 'learn', that is, methods that leverage data to improve performance on some set of tasks.[1] It is seen as a part of artificial intelligence. Machine learning algorithms build a model based on sample data, known as training data, in order to make predictions or decisions without being explicitly programmed to do so.[2] Machine learning algorithms are used in a wide variety of applications, such as in medicine, email filtering, speech recognition, agriculture, and computer vision, where it is difficult or unfeasible to develop conventional algorithms to perform the needed tasks.[3][4] A subset of machine learning is closely related to computational statistics, which focuses on making predictions using computers, but not all machine learning is statistical learning. The study of mathematical optimization delivers methods, theory and application domains to the field of machine learning. Data mining is a related field of study, focusing on exploratory data analysis through unsupervised learning.[6][7] Some implementations of machine learning use data and neural networks in a way that mimics the working of a biological brain.[8][9] In its application across business problems, machine learning is also referred to as predictive analytics."

In [3]:
texts = pd.read_csv("E:\\College\\FCAI-4th Year\\First Term\\Generative Adversarial Networks\\Lab Codes\\TextNLP.csv")
texts

Unnamed: 0,text
0,The future king is the prince
1,Daughter is the princess
2,Son is the prince
3,Only a man can be a king
4,Only a woman can be a queen
5,The princess will be a queen
6,Queen and king rule the realm
7,The prince is a strong man
8,The princess is a beautiful woman
9,The royal family is the king and queen and the...


In [4]:
texts = [x for x in texts['text']]
print(len(texts))
print(texts)

12
['The future king is the prince', 'Daughter is the princess ', 'Son is the prince', 'Only a man can be a king ', 'Only a woman can be a queen', 'The princess will be a queen', 'Queen and king rule the realm', 'The prince is a strong man', 'The princess is a beautiful woman ', 'The royal family is the king and queen and their children', 'Prince is only a boy now', 'A boy will be a man']


In [5]:
def text_preprocessing(
        text: list,
        punctuations=r'''!()-[]{};:'"\,<>./?@#$%^&*_“~''',
        stop_words=['and', 'a', 'is', 'the', 'in', 'be', 'will', 'was', 'but', 'this', 'were', 'with', 'of', 'also',
                    'on', '.', 'for', 'any', 'its', 'and', 'are', 'from', 'both', 'as']
) -> list:
    """
    A method to preproces text
    """
    for x in text.lower():
        if x in punctuations:
            text = text.replace(x, "")

    # Removing words that have numbers in them
    text = re.sub(r'\w*\d\w*', '', text)

    # Removing digits
    text = re.sub(r'[0-9]+', '', text)

    # Cleaning the whitespaces
    text = re.sub(r'\s+', ' ', text).strip()

    # Setting every word to lower
    text = text.lower()

    # Converting all our text to a list 
    text = text.split(' ')

    # Droping empty strings
    text = [x for x in text if x != '']

    # Droping stop words
    text = [x for x in text if x not in stop_words]

    return text

In [6]:
window = 4
# Creating a placeholder for the scanning of the word list
word_lists = []
all_text = []
for text in texts:

    # Cleaning the text
    text = text_preprocessing(text)
    # print (text)

    # Appending to the all text list
    all_text += text

    # Creating a context dictionary
    for i, word in enumerate(text):
        for w in range(window):
            # Getting the context that is ahead by *window* words
            if i + 1 + w < len(text):
                word_lists.append([word] + [text[(i + 1 + w)]])
            # Getting the context that is behind by *window* words
            if i - w - 1 >= 0:
                word_lists.append([word] + [text[(i - w - 1)]])

In [7]:
def create_unique_word_dict(text: list) -> dict:
    """
    A method that creates a dictionary where the keys are unique words
    and key values are indices
    """
    # Getting all the unique words from our text and sorting them alphabetically
    words = list(set(text))
    words.sort()

    # Creating the dictionary for the unique words
    unique_word_dict = {}
    for i, word in enumerate(words):
        unique_word_dict.update({
            word: i
        })

    return unique_word_dict

In [8]:
unique_word_dict = create_unique_word_dict(all_text)
# Defining the number of features (unique words)
n_words = len(unique_word_dict)
print(unique_word_dict)

# Getting all the unique words
words = list(unique_word_dict.keys())
print(words)

# Creating the X and Y matrices using one hot encoding
print(n_words)
X = []
Y = []
for i, word_list in tqdm(enumerate(word_lists)):
    # Getting the indices
    print(word_list)
    main_word_index = unique_word_dict.get(word_list[0])
    context_word_index = unique_word_dict.get(word_list[1])
    # print (word_list)
    print(word_list[0], main_word_index)
    print(word_list[1], context_word_index)

    # Creating the placeholders
    X_row = np.zeros(n_words)
    Y_row = np.zeros(n_words)

    # One hot encoding the main word
    X_row[main_word_index] = 1

    # One hot encoding the Y matrix words
    Y_row[context_word_index] = 1

    # Appending to the main matrices
    X.append(X_row)
    Y.append(Y_row)


{'beautiful': 0, 'boy': 1, 'can': 2, 'children': 3, 'daughter': 4, 'family': 5, 'future': 6, 'king': 7, 'man': 8, 'now': 9, 'only': 10, 'prince': 11, 'princess': 12, 'queen': 13, 'realm': 14, 'royal': 15, 'rule': 16, 'son': 17, 'strong': 18, 'their': 19, 'woman': 20}
['beautiful', 'boy', 'can', 'children', 'daughter', 'family', 'future', 'king', 'man', 'now', 'only', 'prince', 'princess', 'queen', 'realm', 'royal', 'rule', 'son', 'strong', 'their', 'woman']
21


102it [00:00, 52185.78it/s]

['future', 'king']
future 6
king 7
['future', 'prince']
future 6
prince 11
['king', 'prince']
king 7
prince 11
['king', 'future']
king 7
future 6
['prince', 'king']
prince 11
king 7
['prince', 'future']
prince 11
future 6
['daughter', 'princess']
daughter 4
princess 12
['princess', 'daughter']
princess 12
daughter 4
['son', 'prince']
son 17
prince 11
['prince', 'son']
prince 11
son 17
['only', 'man']
only 10
man 8
['only', 'can']
only 10
can 2
['only', 'king']
only 10
king 7
['man', 'can']
man 8
can 2
['man', 'only']
man 8
only 10
['man', 'king']
man 8
king 7
['can', 'king']
can 2
king 7
['can', 'man']
can 2
man 8
['can', 'only']
can 2
only 10
['king', 'can']
king 7
can 2
['king', 'man']
king 7
man 8
['king', 'only']
king 7
only 10
['only', 'woman']
only 10
woman 20
['only', 'can']
only 10
can 2
['only', 'queen']
only 10
queen 13
['woman', 'can']
woman 20
can 2
['woman', 'only']
woman 20
only 10
['woman', 'queen']
woman 20
queen 13
['can', 'queen']
can 2
queen 13
['can', 'woman']
can 2




In [9]:
print(X)
print(Y)

[array([0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 0., 0.]), array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 0., 0.]),

In [10]:
import tensorflow as tf
# X= sparse.csr_matrix(X)
# Y = sparse.csr_matrix(Y)
# print (X)
# print(X[0, :].toarray())
# print (X.todense())
# a = X.todense()
# print (a[0])
XX = tf.convert_to_tensor(X, dtype=tf.float32)
YY = tf.convert_to_tensor(Y, dtype=tf.float32)
print(XX.shape)
print(YY.shape)

(102, 21)
(102, 21)


In [11]:
def CreateModel():
    # Defining the size of the embedding
    embed_size = 2
    # Defining the neural network

    # inp = Input(shape=(X.shape[1],))
    inp = Input(shape=XX.shape[1])  # 21
    x = Dense(units=embed_size, activation='linear')(inp)
    # x = Dense(units=21, activation='softmax')(x)
    x = Dense(units=YY.shape[1], activation='softmax')(x)

    model = Model(inputs=inp, outputs=x)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    model.summary()
    return model

In [12]:
model = CreateModel()
# Optimizing the network weights
model.fit(x=XX, y=YY, batch_size=10, epochs=500)

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 21)]              0         
                                                                 
 dense (Dense)               (None, 2)                 44        
                                                                 
 dense_1 (Dense)             (None, 21)                63        
                                                                 
Total params: 107
Trainable params: 107
Non-trainable params: 0
_________________________________________________________________
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/50

<keras.callbacks.History at 0x2a1ba304940>

In [13]:
weights = model.get_weights()[0] #21*2
print(weights.shape)
print(weights[1][1])
print(weights)

(21, 2)
-1.3235352
[[-1.613885    0.17449677]
 [ 1.4671553  -1.3235352 ]
 [ 0.1650999  -0.9315504 ]
 [ 0.85392195  0.8085883 ]
 [-1.9027175   1.2868837 ]
 [ 0.5412559   1.0760299 ]
 [ 0.11879098 -1.3585513 ]
 [ 0.95871425  0.77462554]
 [ 0.10023533 -0.7401052 ]
 [ 1.9945936  -1.0369549 ]
 [-0.20707697 -1.1891186 ]
 [ 0.8307448  -0.03291896]
 [-1.3777212  -0.8106481 ]
 [-0.46663186  0.8495313 ]
 [ 0.21958198  0.70498455]
 [ 1.080116    0.8149148 ]
 [ 0.15476292  0.7409237 ]
 [ 1.0004083  -1.4575133 ]
 [ 1.8120517  -2.2360523 ]
 [ 0.3620179   0.89346546]
 [-0.9493005  -0.21866085]]


In [14]:
embedding_dict = {}
for word in words: #to pick the a row of weight of two values for each unique word since weights = 21*2
    embedding_dict.update({
        word: weights[unique_word_dict.get(word)]
    })

print(embedding_dict)

{'beautiful': array([-1.613885  ,  0.17449677], dtype=float32), 'boy': array([ 1.4671553, -1.3235352], dtype=float32), 'can': array([ 0.1650999, -0.9315504], dtype=float32), 'children': array([0.85392195, 0.8085883 ], dtype=float32), 'daughter': array([-1.9027175,  1.2868837], dtype=float32), 'family': array([0.5412559, 1.0760299], dtype=float32), 'future': array([ 0.11879098, -1.3585513 ], dtype=float32), 'king': array([0.95871425, 0.77462554], dtype=float32), 'man': array([ 0.10023533, -0.7401052 ], dtype=float32), 'now': array([ 1.9945936, -1.0369549], dtype=float32), 'only': array([-0.20707697, -1.1891186 ], dtype=float32), 'prince': array([ 0.8307448 , -0.03291896], dtype=float32), 'princess': array([-1.3777212, -0.8106481], dtype=float32), 'queen': array([-0.46663186,  0.8495313 ], dtype=float32), 'realm': array([0.21958198, 0.70498455], dtype=float32), 'royal': array([1.080116 , 0.8149148], dtype=float32), 'rule': array([0.15476292, 0.7409237 ], dtype=float32), 'son': array([ 1.

In [18]:
embedding_dict['king']

array([0.95871425, 0.77462554], dtype=float32)

In [19]:
from scipy.spatial.distance import cosine

def document_similarity(doc1, doc2):
    doc1 = list(text_preprocessing(doc1))
    doc2 = list(text_preprocessing(doc2))
    sims = np.zeros(len(doc2))
    summ = 0
    for i in range(len(doc1)):
        x = embedding_dict[doc1[i]]
        for j in range(len(doc2)):
            y = embedding_dict[doc2[j]]
            sims[j] = cosine(x, y) - 1
        summ += np.max(sims)
    return summ/len(doc1)

In [20]:
doc1 = "The future king is the prince"
doc2 = "Daughter is the princess"
print(document_similarity(doc1, doc2))

0.8230771025021871
