In [9]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import tensorflow as tf

ModuleNotFoundError: No module named 'tensorflow'

In [3]:
# Location of the pre-trained GloVe vectors; change this to wherever the file lives locally.
GLOVE_PATH = r"F:\Downloads\glove.6B.200d.txt"

# The file is read as space-separated fields with the first field used as the index (the word).
# quoting=3 is csv.QUOTE_NONE, so quote characters are treated as ordinary text.
# keep_default_na=False stops pandas from converting words such as "nan", "null" or "na"
# into float NaN, which would otherwise make those words unusable as dictionary keys.
glove = pd.read_csv(
    GLOVE_PATH,
    sep=" ",
    quoting=3,
    header=None,
    index_col=0,
    keep_default_na=False,
)

# Map every word to its vector (one NumPy array per word).
glove_embedding = {key: val.values for key, val in glove.T.items()}

In [6]:
def create_embedding_matrix(word_index, embedding_dict, dimension,
                            special_tokens=("<UNK>", "<PAD>", "<SOS>", "<EOS>"),
                            num_flag_dims=5):
    """
    Build the embedding matrix for a vocabulary.

    Vectors of ordinary words are extended with ``num_flag_dims`` zeros, while
    vectors of the special tokens are copied unchanged because they are expected
    to already carry their flag values. Words missing from ``embedding_dict``
    keep an all-zero row.

    :param word_index: dict mapping each word to its row index in the matrix.
    :param embedding_dict: dict mapping each word to its embedding vector.
    :param dimension: int, width of every row of the returned matrix.
    :param special_tokens: words whose vectors are stored at full width already.
    :param num_flag_dims: int, number of zeros appended to ordinary word vectors.
    :return: numpy array of shape (len(word_index) + 1, dimension).
    :raises ValueError: if a vector does not have exactly ``dimension`` entries
        after the flag zeros have been appended.
    """
    embedding_matrix = np.zeros((len(word_index) + 1, dimension))
    zero_flags = [0] * num_flag_dims

    for word, index in word_index.items():
        if word not in embedding_dict:
            # No pre-trained vector available: the row stays all zeros.
            continue

        vector = embedding_dict[word]
        if word not in special_tokens:
            vector = [*vector, *zero_flags]

        # Checked explicitly so that a short vector is never silently broadcast
        # across the whole row by NumPy.
        if len(vector) != dimension:
            raise ValueError(
                "Embedding for %r has %d values, expected %d" % (word, len(vector), dimension)
            )

        embedding_matrix[index] = vector
    return embedding_matrix
# Width of the raw GloVe vectors, taken from the "<unk>" entry.
# NOTE(review): this assumes the loaded file contains a "<unk>" row - confirm, otherwise this raises KeyError.
original_embedding_len = len(glove_embedding["<unk>"])

# Every special token gets five trailing flag values with a single 1 in a distinct position.
# <UNK> reuses the GloVe "<unk>" vector; the other tokens use an all-zero vector of the same width.
# The zero vector has to be built with [0] * n: the expression n * 0 is just the integer 0
# and would produce a 5-element list instead of a full-width embedding.
glove_embedding["<UNK>"] = [*glove_embedding["<unk>"], 0, 1, 0, 0, 0]
glove_embedding["<PAD>"] = [0] * original_embedding_len + [0, 0, 0, 0, 1]
glove_embedding["<SOS>"] = [0] * original_embedding_len + [0, 0, 0, 1, 0]
glove_embedding["<EOS>"] = [0] * original_embedding_len + [0, 0, 1, 0, 0]

In [None]:
class MyVocab:
    """
    Vocabulary that maps tokens to integer indices and back.

    The vocabulary is built from the words that pass the frequency filters of
    ``build_vocab`` and is then used to tokenize any given sentence for the
    RNN model.

    Pre-defined tokens (always present, indices 0 to 3):
        <PAD>: Used to pad any given sentence to a uniform length, making it
               easier for the RNN model to handle.
        <SOS>: Inserted at the start of each sentence.
        <EOS>: Appended at the end of each sentence.
        <UNK>: Stands in for any word that is not part of the vocabulary.
    """

    def __init__(self):
        # Register the pre-defined tokens first so that they always own indices 0-3.
        self.index_to_tokens = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}

        # Inverse of the dictionary above.
        self.tokens_to_index = {value: key for key, value in self.index_to_tokens.items()}

    def __len__(self):
        """
        :return: int, the number of stored tokens (pre-defined tokens included).
        """
        return len(self.index_to_tokens)

    def build_vocab(self, sentence_list, min_count=1, max_count=None, max_features=None):
        """
        Count the words of the given sentences and add the ones that pass the
        filters to the vocabulary. Words that are already part of the vocabulary
        keep their index, so the method can be called more than once.

        :param sentence_list: An iterable container with all the sentences.
        :param min_count: Minimum number of times a word must appear to be kept (None disables the filter).
        :param max_count: Maximum number of times a word may appear to be kept (None disables the filter).
        :param max_features: Number of words to keep, starting from the most frequent (None keeps all).
        :return: None
        """
        # Word frequencies of this call only; replaces any previous counter.
        self.frequency_counter = {}
        for sentence in sentence_list:
            for word in self.tokenize(sentence):
                self.frequency_counter[word] = self.frequency_counter.get(word, 0) + 1

        # Filtering
        if min_count is not None:
            self.frequency_counter = {
                word: count for word, count in self.frequency_counter.items() if count >= min_count
            }

        if max_count is not None:
            self.frequency_counter = {
                word: count for word, count in self.frequency_counter.items() if count <= max_count
            }

        if max_features is not None:
            most_frequent = sorted(
                self.frequency_counter.items(), key=lambda item: item[-1], reverse=True
            )
            self.frequency_counter = dict(most_frequent[:max_features])

        # Extend the token -> index mapping. A word that is already registered must
        # keep its index: reassigning it would leave its new index equal to the
        # mapping size, and the next new word would then receive the same index.
        for word in self.frequency_counter:
            if word not in self.tokens_to_index:
                self.tokens_to_index[word] = len(self.tokens_to_index)

        # Rebuild the index -> token mapping from scratch.
        self.index_to_tokens = {index: token for token, index in self.tokens_to_index.items()}

    def sentence_to_index(self, sentence, max_len=20, add_special_tokens=False):
        """
        Convert a sentence to a list of word indices, truncated to ``max_len``
        words. Words that are not in the vocabulary map to the <UNK> index.

        :param sentence: string, the sentence to convert.
        :param max_len: int or None, maximum number of words kept (None disables truncation).
        :param add_special_tokens: bool, when True the <SOS> index is put in front
            of the result and the <EOS> index behind it. These two entries are added
            after truncation, so the result can hold up to ``max_len + 2`` indices.
            The default (False) returns the word indices only.
        :return: list of int
        """
        tokenized_sentence = self.tokenize(sentence)
        if max_len is not None:
            tokenized_sentence = tokenized_sentence[:max_len]

        unk_index = self.tokens_to_index["<UNK>"]
        indices = [self.tokens_to_index.get(word, unk_index) for word in tokenized_sentence]

        if add_special_tokens:
            indices = [self.tokens_to_index["<SOS>"], *indices, self.tokens_to_index["<EOS>"]]
        return indices

    def index_to_sentence(self, indices):
        """
        Convert indices back to words, used for visualization.

        :param indices: A list of word indices, e.g. [2,3,6,9,10,...]
        :return: A list with the word for each index, e.g. ["today","good","date",...];
            an index that is not in the vocabulary yields None.
        """
        return [self.index_to_tokens.get(index) for index in indices]

    @staticmethod
    def tokenize(content):
        """
        Split a string into lower-cased tokens with the tokenizer of the
        module-level ``SPACY_OBJ``, which has to be defined before this is called.

        :param content: string, the text to tokenize.
        :return: list of str
        """
        tokens = [token.text.lower() for token in SPACY_OBJ.tokenizer(content)]
        return tokens


In [None]:
# Small smoke test: build a vocabulary from two sentences and create its embedding matrix.
text = ["The cat sat on mat", "we can play with model"]

word_dict = MyVocab()
word_dict.build_vocab(text)
print(word_dict.tokens_to_index)

# The matrix width has to match the vectors stored in glove_embedding (GloVe values plus
# the flag values), so it is read from a stored special-token vector instead of being hard-coded.
embedding_dimension = len(glove_embedding["<UNK>"])
embedding_matrix = create_embedding_matrix(
    word_dict.tokens_to_index,
    embedding_dict=glove_embedding,
    dimension=embedding_dimension,
)
print(embedding_matrix.shape)

array([-5.1113e-01, -4.7518e-01,  2.2871e-01,  8.6524e-03, -4.3737e-01,
       -2.8747e-01,  2.3416e-01, -2.0332e-02,  5.0697e-01, -2.4367e-01,
       -2.8646e-01, -2.4133e-02, -9.6845e-06,  4.8092e-02, -2.4467e-01,
       -1.2121e-01,  1.3644e-01, -1.6190e-01,  9.9349e-02,  3.6545e-02,
       -3.1657e-02, -8.4172e-01,  2.3022e-01,  3.0332e-02, -6.7638e-01,
       -2.9399e-01,  1.3298e-01, -3.7917e-02, -1.0970e-01,  2.6541e-03,
       -4.6669e-01,  1.2329e-01, -4.0373e-03, -2.8782e-01, -1.6733e-01,
        2.0938e-01,  4.9163e-01, -6.0818e-02,  1.4326e-01, -1.3628e-01,
       -1.5650e-01, -8.3060e-02, -8.5820e-02, -1.9864e-01,  3.6016e-01,
       -1.6752e-01, -6.4389e-03,  2.3173e-01, -1.6636e-01, -1.7120e-01,
        1.8400e-01, -5.2889e-01, -7.0440e-02, -3.7772e-01, -6.6473e-02,
        3.8519e-01,  2.6262e-01, -3.9148e-02, -2.8182e-01, -1.6908e-01,
       -4.9491e-01, -1.4138e-01,  4.1732e-01, -6.8889e-02,  1.0125e-01,
        1.8499e-02, -6.6256e-02,  1.9328e-01, -2.4784e-01,  2.50