**Curso de Inteligencia Artificial y Aprendizaje Profundo**


In [None]:
import tensorflow as tf
# Print the installed TensorFlow version so results can be tied to a release.
print(tf.__version__)

# Optional dependency, intentionally left disabled: !pip install -q tensorflow-datasets

## Introducción


Gran conjunto de datos basado en sonetos de Shakespeare.


## Librerías


In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
from tensorflow.keras.utils import to_categorical
import numpy as np 

from tensorflow.keras.utils import plot_model

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize 
from nltk.tokenize import TweetTokenizer

# diccionarios especiales para puntuación y palabras vacias
nltk.download('punkt') # Manejo de puntuación
nltk.download('stopwords')

# wordnet
nltk.download('wordnet')

from nltk.corpus import stopwords

# lematizador basado en WordNet de nltk
from nltk.stem import WordNetLemmatizer 

# NLTK stemmer (reduces words to their root); currently disabled
#from nltk.stem import SnowballStemmer
import pandas as pd
import numpy as np
import statistics as st 
import matplotlib.pyplot as plt
import seaborn as sb
import tensorflow as tf
import re

import gensim
from gensim.parsing.preprocessing import STOPWORDS

## Lee los datos

In [None]:
# Mount Google Drive at /content/gdrive so the corpus and saved models are reachable.
# force_remount=True requests a fresh mount even if one already exists.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

In [None]:
# Location of the training text inside the mounted Drive.
# NOTE(review): this path is relative while the Drive is mounted at the absolute
# '/content/gdrive'; it presumably resolves only when the working directory is
# /content -- confirm before running outside Colab's default setup.
ruta = 'gdrive/My Drive/Colab Notebooks/Reto 5/cuentos.txt'

In [None]:
# Read the whole text file. The context manager closes the file handle as soon
# as the content is in memory (the bare open().read() left it open).
with open(ruta, encoding="utf8") as archivo:
    data = archivo.read()

# Strip commas and guillemets so they do not end up attached to tokens.
for simbolo in (',', '«', '»'):
    data = data.replace(simbolo, '')

# Lower-case the text and split it on periods for a first look at the content.
corpus = data.lower().split(".")


In [None]:
# Disabled GPU availability check, kept for reference:
#nombre_gpu = tf.test.gpu_device_name()
#if nombre_gpu != '/device:GPU:0':
#  raise SystemError('GPU no encontrada')
#print('GPU encontrada: {}'.format(nombre_gpu))

# Show only the first fragments: printing the entire corpus floods the cell
# output and bloats the saved notebook.
print(corpus[:10])

## Tokeniza el texto


In [None]:
# Build the word -> integer-id vocabulary from the text.
tokenizer = Tokenizer()

# Re-split the cleaned text on newlines: each line becomes one training sentence.
corpus = data.lower().split("\n")
tokenizer.fit_on_texts(corpus)
# Word ids start at 1; the extra slot accounts for the reserved index 0.
total_words = len(tokenizer.word_index) + 1

#print('Dictionary: ',tokenizer.word_index,'\n')
# .get() keeps the cell running (printing None) when 'verdad' is not in the
# vocabulary, instead of aborting with a KeyError.
print('index de verdad = ',tokenizer.word_index.get('verdad'))
print('\nTotal de palabras en le Tokenizer (+OOV) =',total_words,'\n')
print('Primeras Lineas del corpus: \n')
corpus[0:10]

## Crea n-Grams para datos de entrenamiento y etiquetas


In [None]:
# Expand every corpus line into its growing prefixes (n-grams): for the token
# ids [a, b, c] this yields [a, b] and [a, b, c]. Lines with fewer than two
# tokens contribute nothing.
input_sequences = []
for sentence in corpus:
    token_ids = tokenizer.texts_to_sequences([sentence])[0]
    input_sequences.extend(
        token_ids[:end] for end in range(2, len(token_ids) + 1)
    )

print('input_sequences[0:10] = ',input_sequences[:10],'\n')

# Length of the longest n-gram produced above
max_seq_test_len = max(map(len, input_sequences))
print('\nlong. n-grama más largo:',max_seq_test_len)


In [None]:
# Left-pad every n-gram to the length of the longest one so they stack into a
# single 2-D array.
max_sequence_len = max(map(len, input_sequences))
padded = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
input_sequences = np.asarray(padded)

# All columns but the last are the model input; the last column is the word to predict.
predictors = input_sequences[:, :-1]
next_word_ids = input_sequences[:, -1]

# One-hot encode the target word over the whole vocabulary.
label = to_categorical(next_word_ids, num_classes=total_words)

In [None]:
# Display the one-hot encoded targets produced by to_categorical.
label

## Crea el modelo


In [None]:
# Next-word prediction network: embedding -> bidirectional LSTM -> LSTM ->
# dense bottleneck -> softmax over the vocabulary.
model = Sequential()
# Input is an n-gram prefix of max_sequence_len-1 token ids (the last id is the label).
model.add(Embedding(total_words, 50, input_length=max_sequence_len-1))
# return_sequences=True so the following LSTM receives the full sequence.
model.add(Bidirectional(LSTM(100, return_sequences = True)))
model.add(Dropout(0.2))
model.add(LSTM(50))
# `units` must be an integer: total_words/2 is a float in Python 3, which newer
# Keras versions reject. Floor division yields the same size older versions
# obtained by truncating the float.
model.add(Dense(total_words // 2, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
# One probability per vocabulary entry.
model.add(Dense(total_words, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()


## Entrena

In [None]:
#model = tf.keras.models.load_model('gdrive/My Drive/Colab Notebooks/Reto 5/TextGeneratorModel2.h5')
#print(model.summary())

### Intenté correr este modelo unas 15 veces y nunca logré evitar que se rompiera cuando faltaba un epoch por correr

In [None]:
import timeit

def entrenamiento_gpu(epochs=20, device='/device:GPU:0'):
  """Train the global `model` on `predictors`/`label` under a device scope.

  Args:
    epochs: Number of training epochs. Defaults to 20, the value that was
      previously hard-coded.
    device: TensorFlow device string passed to `tf.device`. Defaults to the
      first GPU, as before.

  Returns:
    None. The fit results remain available through `model.history`.
  """
  with tf.device(device):
    model.fit(predictors, label, epochs=epochs, verbose=1)

# Wall-clock seconds for one full training run. Passing the callable directly
# avoids the fragile "from __main__ import ..." setup string.
gpu_time = timeit.timeit(entrenamiento_gpu, number=1)

#history = model.fit(predictors, label, epochs=10, verbose=1)

In [None]:
# Persist the trained model to Drive; the .h5 extension presumably selects the
# legacy HDF5 format -- confirm against the installed Keras version.
model.save('gdrive/My Drive/Colab Notebooks/Reto 5/TextGeneratorModel3.h5')

In [None]:
import matplotlib.pyplot as plt

# Per-epoch metrics recorded by the most recent model.fit() call.
training_history = model.history.history
acc = training_history['accuracy']
loss = training_history['loss']

epochs = range(len(acc))

# Accuracy curve. The legend was missing here even though the line had a label.
fig_acc, ax_acc = plt.subplots()
ax_acc.plot(epochs, acc, 'b', label='Training accuracy')
ax_acc.set_title('Training accuracy')
ax_acc.set_xlabel('Epoch')
ax_acc.set_ylabel('Accuracy')
ax_acc.legend()

# Loss curve on its own figure.
fig_loss, ax_loss = plt.subplots()
ax_loss.plot(epochs, loss, 'b', label='Training Loss')
ax_loss.set_title('Training loss')
ax_loss.set_xlabel('Epoch')
ax_loss.set_ylabel('Loss')
ax_loss.legend()

plt.show()

## Genera texto automáticamente

In [None]:
# Greedy text generation: repeatedly feed the text so far to the model and
# append the most probable next word.
seed_text = "Hoy me levante pensando"
next_words = 50

for _ in range(next_words):
    # Encode the current text and left-pad it to the model's input length.
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    probabilities = model.predict(token_list, verbose=0)
    # Take the scalar id of the single batch row instead of comparing a
    # 1-element array against ints.
    predicted = int(np.argmax(probabilities, axis=-1)[0])
    # index_word is the tokenizer's id -> word mapping, so this is a direct
    # lookup rather than a scan of word_index on every step. Ids with no word
    # (e.g. the reserved 0) fall back to an empty string, as before.
    output_word = tokenizer.index_word.get(predicted, "")
    seed_text += " " + output_word
print(seed_text)