In [4]:
import tensorflow as tf
import numpy as np
import os
from os.path import exists
import csv
import pandas as pd
from string import punctuation
from keras.utils import np_utils
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Dropout, Bidirectional, GlobalMaxPooling1D
from tensorflow.keras import regularizers
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import load_model

# Collect the lines of every poem stored for a theme.
def ObtainTexts(theme):
    """Read every poem file of ``theme`` and return all their lines.

    Files are looked up in ``PoemTopics/<theme>/`` relative to the current
    working directory, decoded as UTF-8 and visited in the order returned
    by ``os.listdir``.

    Args:
        theme: Name of the sub-directory of ``PoemTopics`` to read.

    Returns:
        A flat list with each line of each file, line endings included.

    Raises:
        OSError: If the theme directory or one of its entries cannot be
            opened (e.g. ``FileNotFoundError``).
    """
    TextLine = []
    repo = "PoemTopics/" + str(theme) + "/"
    for filename in os.listdir(repo):
        path = str(repo) + str(filename)
        # The context manager closes the handle as soon as the file is read.
        with open(path, encoding="utf8") as poem_file:
            # Iterating a text file yields its lines one by one.
            TextLine.extend(poem_file)
    return TextLine

# Deletion table for the characters removed by clean_text: comma, straight
# and curly quotes, parentheses, newline, full stop, semicolon, colon, hyphen.
_CHARACTERS_TO_STRIP = str.maketrans('', '', ',\'"()\n“”’.;:-')


def clean_text(text):
    """Return ``text`` with a fixed set of punctuation characters deleted.

    The removed characters are ``, ' " ( ) “ ” ’ . ; : -`` and the newline.
    Every other character (including en/em dashes) is kept untouched.

    Args:
        text: The string to clean.

    Returns:
        A new string without the characters listed above.
    """
    # A single translate pass replaces thirteen successive regex substitutions.
    return text.translate(_CHARACTERS_TO_STRIP)


def dataCleaning(TextLine):
    """Normalise every entry of ``TextLine`` in place and return that list.

    Each line has its newline characters removed, is lower-cased and is
    finally passed through ``clean_text``.

    Args:
        TextLine: Mutable list of strings; it is modified in place.

    Returns:
        The same list object that was passed in.
    """
    for position, raw_line in enumerate(TextLine):
        # Drop the line breaks first, then fold the case.
        lowered = raw_line.replace('\n', '').lower()
        TextLine[position] = clean_text(lowered)
    return TextLine

def tokenize(TextLine):
    """Fit a Keras ``Tokenizer`` on the corpus.

    Args:
        TextLine: Iterable of text lines used to build the vocabulary.

    Returns:
        A ``(vocab_size, tokenizer)`` tuple where ``vocab_size`` is the
        number of entries in the tokenizer's word index plus one.
    """
    # Upper bound on the number of words the tokenizer keeps.
    vocabulary_cap = 1000000
    fitted_tokenizer = Tokenizer(num_words=vocabulary_cap)
    fitted_tokenizer.fit_on_texts(TextLine)

    # One extra slot on top of the indexed words.
    # NOTE(review): presumably for index 0, which Keras reserves — confirm.
    distinct_words = len(fitted_tokenizer.word_index)
    return distinct_words + 1, fitted_tokenizer

# Turn each sentence into its token ids and expand it into growing prefixes,
# e.g. [3, 67, 2] becomes [3, 67] and [3, 67, 2].
def sentToSeq(TextLine,tokenizer):
    """Build the n-gram training sequences for every line.

    Args:
        TextLine: Iterable of text lines.
        tokenizer: Object exposing ``texts_to_sequences`` (called once per
            line with a one-element list).

    Returns:
        A list holding, for each line, every prefix of its token list that
        contains at least two tokens; lines with fewer tokens add nothing.
    """
    encoded_lines = (tokenizer.texts_to_sequences([line])[0] for line in TextLine)
    return [
        tokens[:prefix_end]
        for tokens in encoded_lines
        for prefix_end in range(2, len(tokens) + 1)
    ]

# Length of the longest sequence, needed to pad all sequences to one size.
def maxLenSequence(input_seq):
    """Return the length of the longest element of ``input_seq``.

    Raises:
        ValueError: If ``input_seq`` is empty.
    """
    return max(map(len, input_seq))

# Pad the sequences to a common length and convert the result to an array.
def padAndArray(input_seq,max_seq_length):
    """Left-pad every sequence to ``max_seq_length`` and return a NumPy array.

    Args:
        input_seq: List of integer token sequences.
        max_seq_length: Target length passed to ``pad_sequences`` as ``maxlen``.

    Returns:
        The padded sequences wrapped in ``np.array``.
    """
    padded = pad_sequences(input_seq, maxlen=max_seq_length, padding='pre')
    return np.array(padded)

# Split the padded sequences into model inputs and the word to predict:
# a row [0, 0, ..., a, b, x] gives the input [0, 0, ..., a, b] and label x.
def XAndLabel(input_seq):
    """Separate each row into its leading tokens and its final token.

    Args:
        input_seq: Two-dimensional array indexed as ``[row, column]``.

    Returns:
        A ``(xs, labels)`` tuple: ``xs`` holds every column but the last,
        ``labels`` holds the last column only.
    """
    last_column = -1
    # Everything before the final token is the context fed to the model;
    # the final token is the target that gets one-hot encoded afterwards.
    return input_seq[:, :last_column], input_seq[:, last_column]

# One-hot encode the labels according to the vocabulary size.

# Each label becomes a row of length vocab_size that contains a single 1 at
# the position of that label and 0 everywhere else.
def categorical(labels,vocab_size):
    """Return the one-hot encoding of ``labels`` over ``vocab_size`` classes.

    Args:
        labels: Integer class ids to encode.
        vocab_size: Number of classes, forwarded as ``num_classes``.

    Returns:
        Whatever ``to_categorical`` produces for these arguments.
    """
    one_hot_labels = to_categorical(labels, num_classes=vocab_size)
    return one_hot_labels


def textGeneratorModel(vocab_size,max_seq_length,xs,ys,theme):
    """Build, train and save the next-word prediction model for ``theme``.

    The network is an embedding layer, a bidirectional LSTM, dropout, a
    second LSTM, a regularised dense layer and a softmax over the
    vocabulary. It is trained for 125 epochs and written to
    ``models/model_<theme>.h5``.

    Args:
        vocab_size: Number of output classes (size of the softmax layer).
        max_seq_length: Length of the padded sequences; the model input is
            one token shorter because the last token is the label.
        xs: Training inputs.
        ys: One-hot encoded training targets.
        theme: Theme name used to build the file name of the saved model.

    Returns:
        The history object returned by ``model.fit``.
    """
    model = Sequential()
    model.add(Embedding(vocab_size, 100, input_length=max_seq_length-1))
    model.add(Bidirectional(LSTM(150, return_sequences = True)))
    model.add(Dropout(0.2))
    model.add(LSTM(100))
    # A layer width must be an integer: "/" would produce a float, which is
    # at best truncated and is rejected by recent Keras releases.
    model.add(Dense(vocab_size // 2, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
    model.add(Dense(vocab_size, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    r = model.fit(xs,ys,epochs=125)

    repo = "models/model_"+str(theme)+".h5"
    # Saving fails when the target directory is missing, so create it first.
    os.makedirs(os.path.dirname(repo), exist_ok=True)
    model.save(repo)
    return r

def DisplayAccuracy(r):
    """Plot the per-epoch training accuracy stored in a fit history.

    Args:
        r: Object whose ``history`` mapping has an ``'accuracy'`` entry.
    """
    # Imported lazily so matplotlib is only needed when a plot is requested.
    from matplotlib import pyplot as plt

    accuracy_per_epoch = r.history['accuracy']
    plt.plot(accuracy_per_epoch)
    
def predict_words(seed, no_words,model,tokenizer,maxLenInputSeq,theme):
    """Extend ``seed`` word by word, print the text and save it to disk.

    At each step the current text is tokenized, left-padded to the model's
    input length and fed to ``model``; the most probable word is appended.
    The final text is printed and written as UTF-8 to
    ``./outputs/<theme>.txt`` (the directory is created when missing).

    Args:
        seed: Beginning of the text to continue.
        no_words: Number of words to generate.
        model: Trained model exposing ``predict``.
        tokenizer: Tokenizer exposing ``word_index`` and
            ``texts_to_sequences``.
        maxLenInputSeq: Padded sequence length used at training time; the
            model input is one token shorter.
        theme: Theme name used to build the output file name.
    """
    # Invert the vocabulary once so every predicted index is resolved with a
    # dictionary lookup instead of a scan over the whole word index.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(no_words):
        token_list = tokenizer.texts_to_sequences([seed])[0]
        token_list = pad_sequences([token_list], maxlen=maxLenInputSeq-1, padding='pre')
        # A single sequence is predicted, so only the first row is relevant.
        predicted = int(np.argmax(model.predict(token_list), axis=1)[0])
        # An index without a matching word contributes an empty string.
        seed += " " + index_to_word.get(predicted, '')
    print("Generated text: \n", seed)
    filename = "./outputs/"+str(theme) + ".txt"
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    # The corpus is read as UTF-8, so the generated text is written the same
    # way rather than with the platform's default encoding.
    with open(filename, 'w', encoding="utf8") as txtfile:
        txtfile.write(seed)
    
    
def Menu():
    """Ask the user to pick a poem topic and return its name.

    The prompt is repeated until a valid index into the topic list is
    entered. Non-numeric input prints a hint and asks again.

    Returns:
        The chosen topic name.

    Raises:
        EOFError: If standard input is closed before a valid choice is made.
    """
    choice = ['depression','evil','funny','god','love','moon','mother','romance','war','wedding']
    position = -1
    # Bounds follow the list itself so both cannot drift apart.
    while not 0 <= position < len(choice):
        try:
            position = int(input("Choose 1 topic :0. depression, 1. evil, 2. funny, 3. god, 4. love, 5. moon, 6. mother, 7. romance, 8. war, 9. wedding \n"))
        except ValueError:
            # Only a failed int() conversion is retried; Ctrl-C and a closed
            # stdin must propagate instead of looping forever.
            print("Choose a number")
    print("You have chosen:",str(choice[position]),"\n")
    return choice[position]
        
def main():
    """Run the interactive poem generator from topic choice to output.

    The corpus of the chosen theme is loaded, cleaned, tokenized and turned
    into padded n-gram sequences. A model is trained only when no saved
    model exists for the theme; the saved model is then loaded and used to
    continue a sentence typed by the user.
    """
    theme = Menu()

    # Corpus preparation: raw lines -> cleaned lines -> padded id sequences.
    corpus_lines = dataCleaning(ObtainTexts(theme))
    vocab_size, tokenizer = tokenize(corpus_lines)
    sequences = sentToSeq(corpus_lines, tokenizer)
    longest_sequence = maxLenSequence(sequences)
    padded_sequences = padAndArray(sequences, longest_sequence)
    xs, labels = XAndLabel(padded_sequences)
    ys = categorical(labels, vocab_size)

    # Train only when this theme has no saved model yet.
    model_path = "models/model_"+str(theme)+".h5"
    if not exists(model_path):
        history = textGeneratorModel(vocab_size, longest_sequence, xs, ys, theme)
        DisplayAccuracy(history)

    model = load_model(model_path)
    # Beginning of the sentence to continue.
    seed_text = input('Write the begining of a sentence. (3 words)')
    # Number of words to generate.
    words_to_generate = 300
    predict_words(seed_text, words_to_generate, model, tokenizer, longest_sequence, theme)

# Start the interactive generator only when this file is run directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Choose 1 topic :0. depression, 1. evil, 2. funny, 3. god, 4. love, 5. moon, 6. mother, 7. romance, 8. war, 9. wedding 
1
You have chosen: evil 

Write the begining of a sentence. (3 words)I am evil
Generated text: 
 I am evil that i wish to protect out the evil in ten sleep crying requests to testify and lives in arkansas a foolish furrow dead for your honours invitation inside he life seems this children they caught called in my spirit dies my children dwelled for if you cannot neglect change my sin displays but hunger in this campus one of the street men a woman made you make you pay – ive omitted forever was in a screw and you the war and in my spirit dies is indeed but refs decision fazio drowned the forest men you met them you do not show respect her pages not to find if they die— told can never be returned eight dull satan my spirit displays and im wins as long if the diary of her mind in the heart of parents if him if the vision of a woman you walk alone of all at its soil my pa