In [2]:
import tensorflow as tf
import numpy as np
import os
from os.path import exists
import csv
import pandas as pd
from string import punctuation
from keras.utils import np_utils
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Dropout, Bidirectional, GlobalMaxPooling1D
from tensorflow.keras import regularizers
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import load_model

def ObtainTexts(theme):
    """Collect every line of every poem file stored for one topic.

    Each entry of ``PoemTopics/<theme>/`` is opened as a UTF-8 text file and
    its lines (newline terminators included) are appended to one flat list,
    following the order in which ``os.listdir`` returns the entries.

    Args:
        theme: Topic name; converted with ``str`` to build the folder path.

    Returns:
        A list with the raw lines of all files found in the topic folder.

    Raises:
        OSError: If the folder cannot be listed or an entry cannot be opened.
    """
    TextLine = []
    repo = "PoemTopics/" + str(theme) + "/"
    for filename in os.listdir(repo):
        path = repo + str(filename)
        # The context manager closes each handle as soon as the file has been
        # read, so handles no longer accumulate across the whole folder.
        with open(path, encoding="utf8") as poem:
            TextLine.extend(poem)
    return TextLine

# Translation table that deletes every character clean_text strips:
# comma, apostrophe, double quote, parentheses, newline, curly quotes,
# typographic apostrophe, period, semicolon, colon and hyphen.
_STRIPPED_CHARS = str.maketrans('', '', ',\'"()\n“”’.;:-')


def clean_text(text):
    """Remove a fixed set of punctuation marks and newlines from a string.

    The characters are deleted (not replaced by a space), so the words on
    either side of a hyphen or apostrophe are joined together. All other
    characters, including en dashes, ``!`` and ``?``, are left untouched.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` without the stripped characters.
    """
    # One pass over the string instead of thirteen separate regex passes.
    return text.translate(_STRIPPED_CHARS)


def dataCleaning(TextLine):
    """Normalise every line of the corpus in place.

    Each element has its newline characters removed, is lower-cased and is
    then passed through ``clean_text``.

    Args:
        TextLine: Mutable list of strings; its elements are overwritten.

    Returns:
        The same list object, now holding the cleaned lines.
    """
    for position, raw_line in enumerate(TextLine):
        # Strip newlines first, then lower-case, then drop punctuation.
        TextLine[position] = clean_text(raw_line.replace('\n', '').lower())
    return TextLine

def tokenize(TextLine):
    """Fit a Keras ``Tokenizer`` on the corpus and report the vocabulary size.

    Args:
        TextLine: Iterable of text lines used to build the word index.

    Returns:
        A ``(vocab_size, tokenizer)`` tuple, where ``vocab_size`` is the
        number of entries in ``tokenizer.word_index`` plus one.
    """
    # Upper bound on the number of words the tokenizer keeps.
    vocabulary_cap = 1000000
    fitted = Tokenizer(num_words=vocabulary_cap)
    fitted.fit_on_texts(TextLine)

    # One extra slot on top of the indexed words (Keras documents index 0 as
    # reserved and never assigned to a word).
    return len(fitted.word_index) + 1, fitted

def sentToSeq(TextLine, tokenizer):
    """Expand every line into its growing token prefixes (n-gram sequences).

    A line tokenised as ``[3, 67, 2]`` contributes ``[3, 67]`` and
    ``[3, 67, 2]``. Lines with fewer than two tokens contribute nothing.

    Args:
        TextLine: Iterable of text lines.
        tokenizer: Object exposing ``texts_to_sequences``; called once per
            line with a one-element list.

    Returns:
        A list of token-id lists, in line order and by increasing length.
    """
    tokenised_lines = (
        tokenizer.texts_to_sequences([line])[0] for line in TextLine
    )
    return [
        tokens[:stop]
        for tokens in tokenised_lines
        # Prefixes start at length 2: at least one context word plus a target.
        for stop in range(2, len(tokens) + 1)
    ]

def maxLenSequence(input_seq):
    """Return the length of the longest sequence, used as the padding width.

    Args:
        input_seq: Non-empty iterable of sized sequences.

    Returns:
        The largest ``len`` found among the sequences.

    Raises:
        ValueError: If ``input_seq`` is empty.
    """
    lengths = map(len, input_seq)
    return max(lengths)

def padAndArray(input_seq, max_seq_length):
    """Pad all sequences to a common length and return them as a NumPy array.

    Args:
        input_seq: List of token-id sequences.
        max_seq_length: Target length passed to ``pad_sequences`` as ``maxlen``.

    Returns:
        A NumPy array built from the result of ``pad_sequences`` called with
        ``padding='pre'``.
    """
    padded = pad_sequences(input_seq, maxlen=max_seq_length, padding='pre')
    return np.array(padded)

def XAndLabel(input_seq):
    """Split padded sequences into model inputs and target words.

    Args:
        input_seq: Two-dimensional array with one padded sequence per row.

    Returns:
        A ``(xs, labels)`` tuple: ``xs`` holds every column except the last
        (the context words) and ``labels`` holds the last column (the word to
        predict, one-hot encoded in a later step).
    """
    context_columns = slice(None, -1)
    return input_seq[:, context_columns], input_seq[:, -1]

def categorical(labels, vocab_size):
    """One-hot encode the label ids over the whole vocabulary.

    Args:
        labels: Integer word ids, one per training sample.
        vocab_size: Number of classes, i.e. the width of each encoded row.

    Returns:
        The result of ``to_categorical``: one row per label, with a single 1
        in the column of that label and 0 everywhere else.
    """
    one_hot = to_categorical(labels, num_classes=vocab_size)
    return one_hot


def textGeneratorModel(vocab_size, max_seq_length, xs, ys, theme, epochs=125):
    """Build, train and save the next-word prediction network for one topic.

    The network is an embedding layer, a bidirectional LSTM, dropout, a
    second LSTM, an L2-regularised dense layer and a softmax over the
    vocabulary. The trained model is written to ``models/model_<theme>.h5``.

    Args:
        vocab_size: Number of output classes and size of the embedding input.
        max_seq_length: Length of the padded sequences; the network input is
            one token shorter because the last token is the label.
        xs: Training inputs.
        ys: One-hot encoded training targets.
        theme: Topic name used in the saved file name.
        epochs: Number of training epochs (defaults to the original 125).

    Returns:
        The object returned by ``model.fit``.
    """
    repo = "models/model_" + str(theme) + ".h5"
    # Create the target folder before training so a missing directory cannot
    # make the save fail once the long training run has finished.
    os.makedirs(os.path.dirname(repo), exist_ok=True)

    # Dense needs an integer unit count; `vocab_size/2` yielded a float, which
    # newer Keras versions reject. Keep at least one unit for tiny vocabularies.
    hidden_units = max(1, vocab_size // 2)

    model = Sequential()
    model.add(Embedding(vocab_size, 100, input_length=max_seq_length-1))
    model.add(Bidirectional(LSTM(150, return_sequences = True)))
    model.add(Dropout(0.2))
    model.add(LSTM(100))
    model.add(Dense(hidden_units, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
    model.add(Dense(vocab_size, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    r = model.fit(xs, ys, epochs=epochs)

    model.save(repo)
    return r

def DisplayAccuracy(r):
    """Plot the per-epoch training accuracy stored in a fit result.

    Args:
        r: Object whose ``history`` mapping has an ``'accuracy'`` entry.

    Note:
        Only ``plot`` is called; showing the figure is left to the
        environment (no ``show`` call is made here).
    """
    # Imported lazily so matplotlib is only needed when a plot is requested.
    from matplotlib import pyplot

    accuracy_curve = r.history['accuracy']
    pyplot.plot(accuracy_curve)
    
def predict_words(seed, no_words, model, tokenizer, maxLenInputSeq, theme):
    """Extend a seed sentence word by word and store the generated text.

    On every step the current text is tokenised, pre-padded to
    ``maxLenInputSeq - 1`` tokens and fed to the model; the word whose index
    has the highest predicted score is appended. The final text is printed
    and written to ``./outputs/<theme>.txt``.

    Args:
        seed: Beginning of the sentence.
        no_words: Number of words to generate.
        model: Trained model exposing ``predict``.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences`` and
            ``word_index``.
        maxLenInputSeq: Padded sequence length used during training.
        theme: Topic name used in the output file name.

    Returns:
        The generated text (seed plus the appended words).
    """
    # Invert the vocabulary once instead of scanning it for every new word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(no_words):
        token_list = tokenizer.texts_to_sequences([seed])[0]
        token_list = pad_sequences([token_list], maxlen=maxLenInputSeq-1, padding='pre')
        # A single row goes in, so take its one predicted class as a plain int.
        predicted = int(np.argmax(model.predict(token_list), axis=1)[0])
        # An index without a word (e.g. 0) appends an empty word, as before.
        seed += " " + index_to_word.get(predicted, '')

    print(seed)
    os.makedirs("./outputs", exist_ok=True)
    filename = "./outputs/" + str(theme) + ".txt"
    # The corpus is read as UTF-8, so write UTF-8 too instead of relying on
    # the platform default, which may not encode every generated character.
    with open(filename, 'w', encoding="utf8") as txtfile:
        txtfile.write(seed)
    return seed
    
    
def Menu():
    """Ask the user to pick a poem topic by number until the answer is valid.

    Returns:
        The name of the chosen topic.

    Raises:
        EOFError: If standard input is closed while waiting for an answer.
    """
    choice = ['depression','evil','funny','god','love','moon','mother','romance','war','wedding']
    prompt = "Choose 1 topic :0. depression, 1. evil, 2. funny, 3. god, 4. love, 5. moon, 6. mother, 7. romance, 8. war, 9. wedding \n"
    while True:
        try:
            position = int(input(prompt))
        except ValueError:
            # Only a non-numeric answer is retried; Ctrl-C and a closed stdin
            # propagate instead of being swallowed into an endless loop.
            print("Choose a number")
            continue
        if 0 <= position < len(choice):
            return choice[position]
        print("Choose a number between 0 and " + str(len(choice) - 1))
        
def main():
    """Run the interactive pipeline: pick a topic, prepare data, generate text.

    The corpus of the chosen topic is loaded, cleaned, tokenised and turned
    into padded n-gram sequences. A model is trained and saved only when no
    saved model exists for the topic; the saved model is then loaded and used
    to extend a sentence typed by the user.
    """
    topic = Menu()
    corpus = dataCleaning(ObtainTexts(topic))
    vocab_size, tokenizer = tokenize(corpus)

    sequences = sentToSeq(corpus, tokenizer)
    longest = maxLenSequence(sequences)
    padded = padAndArray(sequences, longest)
    xs, labels = XAndLabel(padded)
    ys = categorical(labels, vocab_size)

    model_path = "models/model_" + str(topic) + ".h5"
    if not exists(model_path):
        # Train only when this topic has no saved model yet, then plot accuracy.
        DisplayAccuracy(textGeneratorModel(vocab_size, longest, xs, ys, topic))

    model = load_model(model_path)
    # Beginning of the sentence to extend.
    seed_text = input('Write the begining of a sentence. (3 words)')
    # Number of words to generate.
    next_words = 300
    predict_words(seed_text, next_words, model, tokenizer, longest, topic)

# Start the interactive pipeline only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Choose 1 topic :0. depression, 1. evil, 2. funny, 3. god, 4. love, 5. moon, 6. mother, 7. romance, 8. war, 9. wedding 
1
Write the begining of a sentence. (3 words)does the evil
does the evil in your mate hell not many ripped and ragged rugs 3 years hunger goes you grow as long as the law of matches go off in your hand she in every faith is instead i see you make breakups of turning of a sudden of your wife by parents along the next day i will eat late for turning around you is this decaying evil soul the streets trying to begin drowned the cornland off never a woman made you pay at america need this phenomenon the divil wid a stablefork bedivillin their tails my memory of a box – preferably on a bed thinking if they could sing to aim i try her knuckles rubbishing each other and because old final words born but the towers side made to work that water is one must walk alone of if god can never throw in the field and street around the countries on strife your good man is his cries hate i