In [1]:
import re
import pickle
import collections
import numpy as np
import tensorflow as tf
from unidecode import unidecode

### TEXT MINING

In [2]:
# NOTE(review): both corpus reads assign to `text`, so only the second file
# ('caperucita_roja') is actually used by the cells below. Keep whichever
# corpus you want to train on as the last read.
# If decoding fails on accented characters, try passing encoding="ISO-8859-1".
with open('data/cuatro_corazones_con_freno_y_marcha_atras', 'r') as file_obj:
    text = file_obj.read()

with open('data/caperucita_roja', 'r') as file_obj:
    text = file_obj.read()

with open('data/stop_words', 'r') as file_obj:
    # readlines() would keep the trailing '\n' on every entry, so membership
    # tests such as `word not in stopwords` would never match. Strip each
    # line and skip blank ones (assumes one stopword per line).
    stopwords = [line.strip() for line in file_obj if line.strip()]

In [3]:
class data_preparation(object):

    '''
    Text-mining helpers that turn a raw text into word2vec training data.

    Typical order of use: make_disintegration -> get_sentences ->
    get_dictionary -> get_word_list.
    '''

    # A token is kept only if, once transliterated to ASCII by unidecode,
    # it is made of letters exclusively.
    _WORD_PATTERN = re.compile('^[a-zA-Z]*$')


    @staticmethod
    def _normalize_stopwords(stopwords):

        '''
        Return the stopwords as a set with surrounding whitespace removed.

        Stopword lists read with file.readlines() keep a trailing newline on
        every entry, which would make every membership test fail.
        '''

        return {stopword.strip() for stopword in stopwords}


    def make_disintegration(self, text):

        '''
        Convert a text to a "plain text" made of lower-case words and stops.

        input :  real text
        output : plain text, lower-cased, where every punctuation mark or
                 line break has been turned into a single stop ('. ')
        '''

        text = re.sub(r'\n+','\n', text)      # collapse runs of line breaks
        text = re.sub(r'<.*?>',' ', text)     # drop markup-like tags
        # NOTE(review): as written this pattern only matches the literal text
        # '[a-zA-Z' at the very start of the input. It may have been meant to
        # strip a leading backslash command -- confirm the intent.
        text = re.sub('^\\[a-zA-Z]*',' ', text)

        # Every punctuation mark (and line break) becomes a stop.
        text = re.sub(r',|;|\n|—|-|“|”|:|\"','.', text)
        text = re.sub(r'\?|¿|!|¡','.', text)
        text = re.sub(r'\)|\(','.',text)
        text = re.sub(r' \.','.', text)

        # Collapse repeated stops and repeated blanks.
        text = re.sub(r'\.+','. ', text)
        text = re.sub(' +|\t',' ', text)

        return text.lower()


    def get_sentences(self, text):

        '''
        Split a plain text into tokenised sentences.

        text: plain text with only lower letters and stops.

        returns: list of sentences, each one a list of words. Chunks with no
                 words (e.g. after the final stop) yield an empty list.
        '''

        return [sentence.split() for sentence in text.split('.')]


    def get_dictionary(self, text, stopwords,vocab_size):

        '''
        Build an index representation for the words in the text.

        Only the "vocab_size - 1" most frequent words get an index
        (1 .. vocab_size-1); index 0 is reserved for unknown words.

        text: plain text with only lower letters and stops.
        stopwords: iterable of words to ignore (surrounding whitespace in
                   each entry is ignored).
        vocab_size: total number of vocabulary slots, unknown slot included.

        returns: {'w2i': word -> index, 'i2w': index -> word}

        Side effect: the dictionary is pickled to "model/dicc.pkl"; the
        "model" directory is created when missing.
        '''

        import os  # local import: only needed to create the output directory

        stopwords = self._normalize_stopwords(stopwords)

        words = []

        for word in text.split(' '):

            # Remove the stop attached to the last word of each sentence.
            word = word.replace('.', '')

            if word and (word not in stopwords) and self._WORD_PATTERN.match(unidecode(word)):
                words.append(word)

        # The -1 keeps one vocabulary slot free for unknown words.
        count = collections.Counter(words).most_common(vocab_size-1)

        # Indexes start at 1 because index 0 is reserved for unknown words.
        dicc_w2i = {word: index for index, (word, _) in enumerate(count, start=1)}
        dicc_i2w = {index: word for word, index in dicc_w2i.items()}

        dicc = {'w2i' : dicc_w2i, 'i2w' : dicc_i2w}

        os.makedirs("model", exist_ok=True)
        with open("model/dicc.pkl","wb") as file:
            pickle.dump(dicc,file)

        return (dicc)


    def get_word_list(self, sentences, stopwords, window_size = 2):

        '''
        Given a list of tokenised sentences, pair each word with the
        "window_size" words on each side of it.

        sentences: [[word1, word2, word3, ...], [...], ...]
        stopwords: iterable of words to ignore (surrounding whitespace in
                   each entry is ignored).
        window_size: number of neighbours taken on each side of the word.

        returns: [[word2, [word1, word3, ...]], ...]. Every neighbour list has
                 at least 2*window_size entries: when the window is cut short
                 by a sentence boundary it is padded with the word itself.
        '''

        stopwords = self._normalize_stopwords(stopwords)

        data = []

        for sentence in sentences:
            sentence = [word for word in sentence
                        if (word.lower() not in stopwords) and self._WORD_PATTERN.match(unidecode(word))]

            for word_index, word in enumerate(sentence):
                word = word.lower()

                first = max(word_index - window_size, 0)
                last = min(word_index + window_size, len(sentence)) + 1
                neighbourhood_words = [neighbour.lower() for neighbour in sentence[first:last]
                                       if neighbour.lower() != word]

                # Pad short windows with the centre word so every sample
                # carries the same number of context entries.
                missing = 2*window_size - len(neighbourhood_words)
                if missing > 0:
                    neighbourhood_words.extend([word] * missing)

                data.append([word, neighbourhood_words])

        return(data)

In [4]:
class word2vec(object):

    '''
    Object for implementing the word2vec algorithm on a dataset with the
    required structure.

    Requires:

        - The dataset ([[word, [neighbour, neighbour, ...]], ...])
        - Dictionary of words and indexes, pickled in ./model/dicc.pkl
        - Parameters

    Saves in local:

        - Tensorflow graph (./model/model)
        - Tensors W1 and b1 as np.array files, used by the encoder and decoder.
    '''

    def __init__(self,vocab_size,embedding_dim):

        '''
        Feed-forward network: a linear embedding layer followed by a softmax
        output layer over the vocabulary.

        input: word vector in one-hot-encoding representation
        label: multi-hot vector marking the words of the window

        The vector representation of the word is the tensor "encoder".
        '''

        # DIMENSIONS
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.optimizer_step = 0.01


        # NEURALNET
        self.input_data = tf.placeholder(tf.float32, shape=(None, vocab_size), name = 'input_data')
        self.output_data = tf.placeholder(tf.float32, shape=(None, vocab_size), name = 'output_data')

        self.W1 = tf.Variable(tf.random_normal([vocab_size, embedding_dim]), name = 'W1')
        self.b1 = tf.Variable(tf.random_normal([embedding_dim]), name = 'b1')
        self.vector = tf.add(tf.matmul(self.input_data,self.W1), self.b1, name = 'encoder')

        self.W2 = tf.Variable(tf.random_normal([embedding_dim, vocab_size]), name = 'W2')
        self.b2 = tf.Variable(tf.random_normal([vocab_size]), name = 'b2')
        logits = tf.add(tf.matmul(self.vector, self.W2), self.b2)
        self.prediction = tf.nn.softmax(logits, name = 'prediction')


        # OPTIMIZATION
        # log_softmax on the logits is the numerically stable form of
        # log(softmax(logits)): the latter yields -inf/NaN as soon as one
        # probability underflows to 0.
        self.cross_entropy_loss = tf.reduce_mean(
            -tf.reduce_sum(self.output_data * tf.nn.log_softmax(logits), reduction_indices=[1]))
        self.train_step = tf.train.GradientDescentOptimizer(self.optimizer_step).minimize(self.cross_entropy_loss)


    def to_one_hot(self, data_point_index):

        '''
        Return a vector of length vocab_size with ones at data_point_index.

        data_point_index may be a single index (one-hot) or a list of
        indexes (multi-hot).
        '''

        temp = np.zeros(self.vocab_size)
        temp[data_point_index] = 1
        return temp


    def training_data(self, data):

        '''
        First it transforms the word data to the index representation.
        Then it transforms the index representation to one-hot-encoding
        representation. Words missing from the dictionary map to index 0.

        data: training structure [[word, [word, word, ...]], ...]; the second
              element may also be a single word.

        returns: (input_train, output_train) as np.arrays with one row per
                 sample and vocab_size columns.
        '''

        # NOTE: pickle can execute arbitrary code; only load a dicc.pkl that
        # was produced by this notebook.
        with open('./model/dicc.pkl','rb') as file:
            dicc_w2i = pickle.load(file)['w2i']

        input_train = []
        output_train = []

        for data_word in data:

            # The input is always a single word.
            input_train.append(self.to_one_hot(dicc_w2i.get(data_word[0], 0)))

            # The output may be one word or a list of words, hence the flattening.
            output_index = [dicc_w2i.get(word, 0) for word in np.array(data_word[1]).reshape(-1)]
            output_train.append(self.to_one_hot(output_index))

        input_train = np.asarray(input_train)
        output_train = np.asarray(output_train)

        return (input_train,output_train)


    def train(self, x_train, y_train, batch_size = 256, epochs = 1):

        '''
        Train the tensorflow graph with mini-batch gradient descent.

        x_train, y_train: arrays as returned by training_data.
        batch_size: number of samples per gradient step; the last batch of an
                    epoch may be smaller.
        epochs: number of passes over the whole dataset.

        returns: (W1, b1) as np.arrays.

        Side effects: saves the graph in ./model/model and the arrays in
        model/W1.npy and model/b1.npy (the "model" directory is created when
        missing).
        '''

        import os  # local import: only needed to create the output directory

        n_data = len(x_train)

        with tf.Session() as sess:

            sess.run(tf.global_variables_initializer())

            for _epoch in range(epochs):

                # Walk the data batch by batch; slicing by the running offset
                # also covers the final, possibly shorter, batch.
                for start in range(0, n_data, batch_size):

                    x = x_train[start:start + batch_size]
                    y = y_train[start:start + batch_size]

                    sess.run(self.train_step, feed_dict={self.input_data: x, self.output_data: y})

            os.makedirs('model', exist_ok=True)

            saver = tf.train.Saver()
            saver.save(sess, "./model/model")

            W1 = sess.run(self.W1)
            b1 = sess.run(self.b1)

            np.save('model/W1.npy', W1)
            np.save('model/b1.npy', b1)

        return (W1, b1)


    def encoder(self, words):

        '''
        Return the vector representation of each word, using the saved
        W1 and b1 arrays. Unknown words use index 0.

        words: list of words.
        returns: list of vectors (lists of floats), one per word.
        '''

        W1 = np.load('model/W1.npy')
        b1 = np.load('model/b1.npy')

        # NOTE: pickle can execute arbitrary code; only load a dicc.pkl that
        # was produced by this notebook.
        with open('./model/dicc.pkl','rb') as file:
            dicc_w2i = pickle.load(file)['w2i']

        indexes = np.asarray([dicc_w2i.get(word, 0) for word in words], dtype=int)

        # Multiplying a one-hot row by W1 just selects the matching row of W1.
        vectors = W1[indexes] + b1

        return vectors.tolist()


    def decoder(self, vectors):

        '''
        Return the nearest word (euclidean distance) in the word
        representation for each of the given vectors.

        vectors: list of vectors, as returned by encoder.
        returns: list of words; '' when the nearest entry is the unknown slot
                 (index 0) or an index with no word in the dictionary.
        '''

        W1 = np.load('model/W1.npy')
        b1 = np.load('model/b1.npy')

        # NOTE: pickle can execute arbitrary code; only load a dicc.pkl that
        # was produced by this notebook.
        with open('./model/dicc.pkl','rb') as file:
            dicc_i2w = pickle.load(file)['i2w']

        # Row i is the vector of the word with index i.
        vocab_vectors = W1+b1

        words = []

        for vector in vectors:

            difference = vocab_vectors - np.asarray(vector, dtype=float)
            distances = np.sqrt(np.sum(difference**2, axis=1))
            nearest_index = int(np.argmin(distances))

            # The vocabulary may have fewer words than slots, so the nearest
            # row is not guaranteed to have a dictionary entry.
            nearest_word = '' if nearest_index == 0 else dicc_i2w.get(nearest_index, '')

            words.append(nearest_word)

        return words

In [5]:
# Hyperparameters shared by the dictionary builder and the network.
vocab_size = 200  # number of vocabulary slots, including index 0 reserved for unknown words
embedding_dim = 10  # length of the vector learned for each word

In [6]:
# Data-preparation pipeline: raw text -> plain text -> sentences -> dictionary -> (word, neighbours) pairs.
prepare = data_preparation()

text = prepare.make_disintegration(text)  # NOTE: overwrites the raw text loaded above
sent = prepare.get_sentences(text)
dicc = prepare.get_dictionary(text, stopwords, vocab_size)  # also pickles the dictionary to model/dicc.pkl
data = prepare.get_word_list(sent, stopwords,window_size =1)

In [7]:
# Build the graph, encode the (word, neighbours) pairs as one-hot arrays and train.
model = word2vec(vocab_size, embedding_dim)
x_train,y_train = model.training_data(data)  # reads the dictionary pickled by get_dictionary
# train() returns (W1, b1); they are discarded here because train() also saves them under model/.
# NOTE(review): assigning to `_` shadows IPython's "last output" variable in this kernel.
_ = model.train(x_train,y_train)

In [9]:
# Round-trip check: encode three words, then decode each vector back to its nearest vocabulary word.
vectors = model.encoder(['caperucita','lobo','abuela'])
palabras = model.decoder(vectors)
print(palabras)

['caperucita', 'lobo', 'abuela']
