In [92]:
import json
import logging
import os
import random
import sys
import time
from glob import glob

from tqdm import tqdm

import ngram
import vocabulary

# Configure logging for the notebook: INFO level and above, each record
# rendered as "<LEVEL> <timestamp> - <message>".
logging.basicConfig(format='%(levelname)s %(asctime)s - %(message)s', level=logging.INFO)

def load_config():
    """
    Load the configuration file 'config.json' and return its parsed content.

    The file is opened relative to the current working directory.

    Returns:
        The object produced by json.load (a dict when the file holds a JSON
        object).

    Raises:
        SystemExit: with exit code 1 when the file cannot be opened or parsed.
            The underlying error is logged before exiting.
    """
    try:
        logging.info("Loading configuration...")
        # JSON is UTF-8 by specification; do not depend on the platform's
        # default encoding (e.g. cp1252 on Windows) for accented characters.
        with open('config.json', encoding='utf-8') as json_config_file:
            config = json.load(json_config_file)
        logging.info("Loaded.")
        return config
    except Exception as e:
        logging.error("Error while loading configuration file 'config.json'.")
        logging.error(e)
        logging.error("Exiting.")
        # sys.exit raises SystemExit, so nothing after this line would run.
        sys.exit(1)

def load_models(generate_name):
    """
    Load the n-gram models stored under data/ngram/<generate_name>/.

    Every sub-directory found there must be named after an integer n and
    contain at least one '*.ngram' file. One ngram.Ngram instance is created
    per sub-directory and loaded from the first matching file (in sorted
    order).

    Args:
        generate_name: name of the sub-directory of data/ngram to scan.

    Returns:
        dict mapping n (int, parsed from the sub-directory name) to the
        loaded ngram.Ngram object. Empty when nothing matches.

    Raises:
        ValueError: if a sub-directory name is not an integer.
        FileNotFoundError: if a sub-directory contains no '*.ngram' file.
    """
    logging.info("Loading models...")
    sub_folders = glob(f"data/ngram/{generate_name}/*")
    models = {}
    for folder in sub_folders:
        # Only directories hold models; ignore stray files next to them.
        if not os.path.isdir(folder):
            continue

        # Take the last path component in a separator-agnostic way: splitting
        # on "\\" only works for Windows-style paths.
        n = int(os.path.basename(os.path.normpath(folder)))
        logging.info(f"\tLoading Ngram model, n={n}...")

        # Sorted so the chosen file is deterministic when several exist.
        path_model = sorted(glob(f"{folder}/*.ngram"))
        if not path_model:
            raise FileNotFoundError(f"No '.ngram' file found in '{folder}'.")

        temp_ngram = ngram.Ngram()
        temp_ngram.load(path_model[0])

        models[n] = temp_ngram

    return models

def generate_sentence(models, vocab, starts_with, min_words, end_char):
    """
    Generate a sentence by repeatedly sampling the next word from models[1].

    Starting from `starts_with`, the last token id of the current sentence is
    looked up in `models[1].chain_frequency`; one of its recorded successors
    is drawn at random, weighted by the stored values, and appended. The loop
    goes on until the sentence has more than `min_words` words and its last
    character is in `end_char`.

    Generation stops early, returning what has been built so far, when the
    sentence tokenizes to nothing or when the last token has no recorded
    successor.

    Args:
        models: mapping from n to a model; only models[1] is used. It must
            expose `chain_frequency`, a mapping
            token id (int) -> {next token id: weight}.
        vocab: object providing `chain_to_ids(text)` (returns whitespace-
            separated ids as a string) and `ids_to_chain(ids)` (the reverse).
        starts_with: text the sentence starts with.
        min_words: minimum number of words to reach before the sentence may end.
        end_char: collection of characters allowed as the final character.

    Returns:
        The generated sentence as a string.
    """
    sentence = starts_with
    word_count = 1
    model = models[1]

    while word_count <= min_words:
        # Tokenize the sentence
        sentence_tokenized = vocab.chain_to_ids(sentence).split()
        if not sentence_tokenized:
            # Nothing to condition on: indexing [-1] would raise IndexError.
            logging.warning("Sentence could not be tokenized; stopping generation.")
            break

        # Select probas associated to the actual word/sequence
        probas = model.chain_frequency.get(int(sentence_tokenized[-1]))
        if not probas:
            # Dead end: the last token was never followed by anything.
            logging.warning("No known successor for the last token; stopping generation.")
            break

        # Select the next word
        next_word = random.choices(list(probas), weights=list(probas.values()), k=1)[0]

        # Add the new word to the sentence
        sentence_tokenized.append(str(next_word))

        # Un-Tokenize
        sentence = vocab.ids_to_chain(" ".join(sentence_tokenized))
        word_count = len(sentence.split())

        # Long enough but not ending on an accepted character: raise the
        # target by one so the loop draws another word.
        if word_count > min_words and sentence[-1] not in end_char:
            min_words += 1

    return sentence


SyntaxError: invalid syntax (1623289980.py, line 52)

In [91]:
# NOTE(review): this cell reads models, vocab, nb_sentences_to_generate,
# starts_with, min_words, end_char and delay, none of which are assigned here;
# the cells that assign them must have been run first.
logging.info(f"Generating {nb_sentences_to_generate} sentences...")
# Generate every sentence up front, before any of them is logged.
sentences = [generate_sentence(models, vocab, starts_with, min_words, end_char) for _ in range(nb_sentences_to_generate)]

for sentence in sentences:
    # Wait a random duration between 0 and delay + 1 seconds before logging
    # each sentence.
    time.sleep(random.uniform(0, delay+1))
    logging.info(sentence)

INFO 2023-04-19 00:23:17,806 - Generating 10 sentences...
INFO 2023-04-19 00:23:18,069 - Le Pen : « Il faut d'abord celle des débouchés évidentes.
INFO 2023-04-19 00:23:18,527 - Le Pen… EMMANUEL MACRON Complètement. Résolument. Dordogne, il y a donc vous féliciter.
INFO 2023-04-19 00:23:18,687 - Le deuxième chose, on a néanmoins un travail aussi n'y aura des ports français.
INFO 2023-04-19 00:23:18,722 - Le jour à limiter les entreprises. Nous partageons le nouveau système commercial qui la recherche fondamentale et nous faudra la République, et technologique  il y ait une réunion du Parlement qui seront jamais perdu avec vous avez envoyé à ce qui favorise la Culture.
INFO 2023-04-19 00:23:18,953 - Le mot est de la semaine où c'était le ministre des choix familiaux qui vivent dans le faire évoluer les réformes que nous disent les travailleurs détachés parce que nous mériterons leur travail est à soutenir l'Ukraine après un cadre législatif au corps à cette route que c'était à la lumiè

In [73]:
# Read the settings used by the rest of the notebook from config.json.
config = load_config()
# Selects the vocabulary file and the n-gram model folder to load.
generate_name = config["generate_name"]
# Number of sentences produced by the generation cell.
nb_sentences_to_generate = config["nb_sentences_to_generate"]
# Text every generated sentence starts with.
starts_with = config["starts_with"]
# Used as the base of the random pause between two logged sentences.
delay = config["delay"]
# Minimum number of words before a sentence is allowed to end.
min_words = config["min_words"]
# Characters on which a sentence is allowed to end.
end_char = config["end_char"]

INFO 2023-04-19 00:20:18,743 - Loading configuration...
INFO 2023-04-19 00:20:18,745 - Loaded.


In [3]:
# Load the vocabulary matching the configured generate_name.
# NOTE(review): generate_name is not assigned in this cell; the configuration
# cell must have been run first.
logging.info("Loading vocabulary...")
vocab = vocabulary.Vocabulary()
vocab.load(f"data/vocabs/{generate_name}.vocab")
logging.info("Loaded.")

# Load every n-gram model available for the same generate_name.
models = load_models(generate_name)
logging.info(f"{len(models)} models loaded.")

print(models)

INFO 2023-04-19 00:09:44,810 - Loading configuration...
INFO 2023-04-19 00:09:44,829 - Loaded.
INFO 2023-04-19 00:09:44,829 - Loading vocabulary...
INFO 2023-04-19 00:09:44,891 - Loaded.
INFO 2023-04-19 00:09:44,891 - Loading models...
INFO 2023-04-19 00:09:44,891 - 	Loading Ngram model, n=1...
INFO 2023-04-19 00:09:45,013 - 	Loading Ngram model, n=2...
INFO 2023-04-19 00:09:45,616 - 	Loading Ngram model, n=3...
INFO 2023-04-19 00:09:46,874 - 	Loading Ngram model, n=4...
INFO 2023-04-19 00:09:48,718 - 	Loading Ngram model, n=5...
INFO 2023-04-19 00:09:50,796 - 5 models loaded.


{1: <ngram.ngram.Ngram object at 0x000001FAFC32D290>, 2: <ngram.ngram.Ngram object at 0x000001FAFD0468D0>, 3: <ngram.ngram.Ngram object at 0x000001FAFB8610D0>, 4: <ngram.ngram.Ngram object at 0x000001FA99F24E90>, 5: <ngram.ngram.Ngram object at 0x000001FA99F25450>}
