In [16]:
import tensorflow as tf
import numpy as np
import os
import pandas as pd
from string import punctuation
from keras.utils import np_utils
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Dropout, Bidirectional, GlobalMaxPooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import load_model

# Get the lines of every poem
def ObtainTexts():
    """Read every line of every file found in the ``war/`` directory.

    Files are visited in the order returned by ``os.listdir`` and are decoded
    as UTF-8.

    Returns:
        list: all lines of all files in reading order; each line keeps its
        trailing newline when it had one.
    """
    LigneTexte = []
    for filename in os.listdir("war/"):
        path = os.path.join("war", str(filename))
        # The context manager closes each file as soon as it has been read,
        # so no handle stays open after the loop.
        with open(path, encoding="utf8") as poem_file:
            LigneTexte.extend(poem_file)
    return LigneTexte

# Deletion table for clean_text: comma, apostrophe, double quote, parentheses,
# newline, curly quotes (“ ” ’), period, semicolon, colon and hyphen.
_CLEAN_TEXT_TABLE = str.maketrans('', '', ',\'"()\n“”’.;:-')


def clean_text(text):
    """Return ``text`` with a fixed set of punctuation characters deleted.

    The deleted characters are ``, ' " ( ) “ ” ’ . ; : -`` and the newline.
    They are removed outright (not replaced by a space), so ``"a-b"`` becomes
    ``"ab"``. Every other character is kept unchanged.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    # One translate pass instead of thirteen successive regex substitutions.
    return text.translate(_CLEAN_TEXT_TABLE)


def dataCleaning(LigneTexte):
    """Normalise every entry of ``LigneTexte`` in place and return the list.

    Each line has its newlines removed, is lower-cased and is then passed
    through ``clean_text``. The list given as argument is modified and is
    also the value returned.
    """
    for position, raw_line in enumerate(LigneTexte):
        # Remove the newlines, then lower-case before the punctuation cleanup
        without_newlines = raw_line.replace('\n', '')
        LigneTexte[position] = clean_text(without_newlines.lower())
    return LigneTexte

def tokenize(LigneTexte):
    """Fit a ``Tokenizer`` on the given lines.

    Args:
        LigneTexte: the texts the tokenizer is fitted on.

    Returns:
        A ``(vocab_size, tokenizer)`` tuple, where ``vocab_size`` is the
        number of entries in ``tokenizer.word_index`` plus one.
    """
    # Upper bound on the number of words the tokenizer keeps
    tokenizer = Tokenizer(num_words=1000000)
    tokenizer.fit_on_texts(LigneTexte)

    # One more than the number of known words
    # (presumably to leave room for the 0 value used as padding - confirm).
    return len(tokenizer.word_index) + 1, tokenizer

# We will turn the sentences to sequences line by line and create n_gram sequences
def sentToSeq(LigneTexte,tokenizer):
    """Build every n-gram prefix (length >= 2) of every tokenized line.

    Each line is converted with ``tokenizer.texts_to_sequences``; a line with
    tokens ``[a, b, c]`` contributes ``[a, b]`` and ``[a, b, c]``. Lines with
    fewer than two tokens contribute nothing.

    Returns:
        list: the prefixes, grouped by line and ordered by increasing length.
    """
    # Lines are tokenized lazily, one at a time, as the comprehension runs
    tokenized_lines = (tokenizer.texts_to_sequences([line])[0] for line in LigneTexte)
    return [
        tokens[:end]
        for tokens in tokenized_lines
        for end in range(2, len(tokens) + 1)
    ]

# Getting the maximum length of sequence for padding purpose
def maxLenSequence(input_seq):
    """Return the length of the longest sequence in ``input_seq``."""
    return max(map(len, input_seq))

# Padding the sequences and converting them to array
def padAndArray(input_seq,max_seq_length):
    """Pad the sequences at the front to ``max_seq_length`` and return an array.

    Args:
        input_seq: the sequences to pad.
        max_seq_length: the ``maxlen`` handed to ``pad_sequences``.

    Returns:
        The padded sequences wrapped in ``np.array``.
    """
    padded = pad_sequences(input_seq, maxlen=max_seq_length, padding='pre')
    return np.array(padded)

# Taking xs and labels to train the model.
def XAndLabel(input_seq):
    """Split each row of a 2-D array into model inputs and a label.

    Returns:
        A ``(xs, labels)`` tuple: ``xs`` holds every column except the last
        one, and ``labels`` holds the last column, i.e. the value to predict
        from the preceding ones.
    """
    features = input_seq[:, :-1]
    targets = input_seq[:, -1]
    return features, targets

# one-hot encoding the labels according to the vocab size

# The matrix is square matrix of the size of vocab_size. Each row will denote a label and it will have 
# a single +ve value(i.e 1) for that label and other values will be zero. 
def categorical(labels,vocab_size):
    """One-hot encode ``labels`` over ``vocab_size`` classes.

    Thin wrapper around ``to_categorical`` with ``num_classes=vocab_size``.
    """
    one_hot_labels = to_categorical(labels, num_classes=vocab_size)
    return one_hot_labels


def textGeneratorModel(vocab_size,max_seq_length,xs,ys):
    """Build, train and save the next-word prediction model.

    The network stacks an embedding, dropout, an LSTM, a bidirectional LSTM,
    global max pooling and two dense layers; the last one is a softmax over
    ``vocab_size`` classes. It is trained for 100 epochs and written to
    ``modelWar.h5``.

    Args:
        vocab_size: size of the embedding input and of the softmax output.
        max_seq_length: length of the padded sequences; the model input is
            one element shorter because the last element is the label.
        xs: training inputs passed to ``model.fit``.
        ys: training targets passed to ``model.fit``.

    Returns:
        The object returned by ``model.fit``.
    """
    model = Sequential()
    model.add(Embedding(vocab_size, 124, input_length=max_seq_length-1))
    model.add(Dropout(0.2))
    model.add(LSTM(520, return_sequences=True))
    model.add(Bidirectional(LSTM(340, return_sequences=True)))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(1024, activation='relu'))
    model.add(Dense(vocab_size, activation='softmax'))

    # `learning_rate` replaces the deprecated `lr` keyword, which newer Keras
    # releases no longer accept.
    model.compile(optimizer=Adam(learning_rate=0.001),loss = 'categorical_crossentropy',metrics=['accuracy'])

    r = model.fit(xs,ys,epochs=100)
    model.save('modelWar.h5')
    return r

def DisplayAccuracy(r):
    """Plot the ``'accuracy'`` series stored in ``r.history``."""
    # matplotlib is imported lazily, only when a plot is requested
    from matplotlib import pyplot
    accuracy_per_epoch = r.history['accuracy']
    pyplot.plot(accuracy_per_epoch)
    
def predict_words(seed, no_words,model,tokenizer,maxLenInputSeq):
    """Extend ``seed`` with ``no_words`` predicted words and print the result.

    At every step the whole current text is tokenized, padded at the front to
    ``maxLenInputSeq - 1`` tokens and fed to the model; the word whose index
    has the highest predicted score is appended after a space.

    Args:
        seed: the starting text.
        no_words: how many words to generate.
        model: object whose ``predict`` returns one row of scores per input.
        tokenizer: fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index``.
        maxLenInputSeq: padded sequence length used at training time.

    Returns:
        The generated text (which is also printed).
    """
    # Build the index -> word lookup once instead of scanning word_index for
    # every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(no_words):
        token_list = tokenizer.texts_to_sequences([seed])[0]
        token_list = pad_sequences([token_list], maxlen=maxLenInputSeq-1, padding='pre')
        # The batch holds a single row, so take its arg-max as a plain int;
        # verbose=0 avoids one progress line per generated word.
        predicted = int(np.argmax(model.predict(token_list, verbose=0), axis=1)[0])

        # An index with no associated word contributes an empty string.
        seed += " " + index_to_word.get(predicted, '')
    print(seed)
    return seed
    
def main():
    """Run the whole pipeline: load the texts, get a model, generate text.

    The corpus is read, cleaned and tokenized every time because the
    tokenizer and the maximum sequence length are needed for generation.
    The model is trained (and saved by ``textGeneratorModel``) only when
    ``modelWar.h5`` does not exist yet; otherwise the saved model is reused.
    """
    model_path = 'modelWar.h5'

    LigneTexte = ObtainTexts()
    LigneTexte = dataCleaning(LigneTexte)
    vocab_size,tokenizer = tokenize(LigneTexte)
    input_seq = sentToSeq(LigneTexte,tokenizer)
    maxLenInputSeq = maxLenSequence(input_seq)

    if not os.path.exists(model_path):
        # The padded inputs and one-hot labels are only needed for training,
        # so they are built only on this path.
        padded_seq = padAndArray(input_seq,maxLenInputSeq)
        xs,labels = XAndLabel(padded_seq)
        ys = categorical(labels,vocab_size)
        r = textGeneratorModel(vocab_size,maxLenInputSeq,xs,ys)
        DisplayAccuracy(r)

    model=load_model(model_path)
    seed_text = 'Today we want'  # Start of the sentence; can be changed, but English only
    next_words = 500  # Number of words to generate
    predict_words(seed_text, next_words,model,tokenizer,maxLenInputSeq)

# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()

Today we want war exchequer what than any faster than a powerful vehicle not a face – rent me what than do all rent and blackbrowd clue – than do after what and what than see claimed was what sea she intellectual and as not intellectual a cause and had what than not intellectual a saw to dight not got and sea have not got and the almondblossom of them not us and when them our orders me not got and the almondblossom which is what is heart was sea sea have not perceiving to sea what is not a amends is not us and when them all life and sea it as a cause and as a cause and as a cause and as a cause and she a or is like a meal – not a clue – is the sea me as claimed not a meal part was what in our meal me life and what is a heart was rent it as a us and when them them all claimed not a face – and see claimed not a fairer part more minds more me not a fairer part more minds white part livid skin me was a repulsive part livid part more bodies like few small hospital part when aroused like cad