<h4> Pentru a adresa limitările de creativitate în poezia generată înlocuiți aleator cuvinte cu sinonime. Se cere ca sinonimele să fie obținute folosind embedding-uri. (i.e. Cuvântul ales e transformat în forma sa embedded și se alege embedding-ul cel mai apropiat care este convertit la string) </h4>

<h1>Importuri</h1>

In [1]:
import numpy as np
import pandas as pd
import os
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import random

<h1>Citirea datelor</h1>

In [2]:
from datasets import load_dataset

dataset = load_dataset("biglam/gutenberg-poetry-corpus")
train_data = dataset["train"]

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
print("Numărul de linii =", len(train_data))
print(train_data[:4])

Numărul de linii = 3085117
{'line': ['The Song of Hiawatha is based on the legends and stories of', 'many North American Indian tribes, but especially those of the', 'Ojibway Indians of northern Michigan, Wisconsin, and Minnesota.', 'They were collected by Henry Rowe Schoolcraft, the reknowned'], 'gutenberg_id': [19, 19, 19, 19]}


<h1>Cleaning DataSet</h1>

In [4]:
train_data.features

{'line': Value(dtype='string', id=None),
 'gutenberg_id': Value(dtype='int64', id=None)}

In [13]:
def clean_text(text):
    cleaned_text = []
    for example in text:
        poem_text = example["line"]  # Accesează textul poeziei din exemplu
        poem_text = poem_text.lower()
        poem_text = re.sub(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=-\\]", "", poem_text)
        tokens = word_tokenize(poem_text)
        words = [word for word in tokens if word.isalpha()]
        cleaned_text.extend(words)
    return cleaned_text

cleaned_poetry = clean_text(train_data)
print("Numărul de propoziții =", len(cleaned_poetry))
print("Primele 5 propoziții:")
print(cleaned_poetry[:5])

Numărul de propoziții = 21740946
Primele 5 propoziții:
['the', 'song', 'of', 'hiawatha', 'is']


<h1>Word2Vec</h1>

In [70]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

model = Word2Vec(sentences=[cleaned_poetry], vector_size=100, window=5, min_count=1, workers=4)
model.save("word2vec.model")
model = Word2Vec.load("word2vec.model")

<h1>Antrenam modelul</h1>

In [74]:
model.train(cleaned_poetry, total_examples=len(cleaned_poetry), epochs=100)

(8311201551, 9182398800)

<h2>Random testing</h2>

In [75]:
vector = model.wv['grandpa']  # get numpy vector of a word
sims = model.wv.most_similar('grandpa', topn=10)  # get other similar words
print(sims)

[('dichten', 0.45711588859558105), ('smirching', 0.4185357987880707), ('unlovable', 0.41729921102523804), ('matricks', 0.4032917022705078), ('dämmernde', 0.4003211557865143), ('primordial', 0.39884865283966064), ('whiaesaes', 0.3972812592983246), ('ualere', 0.3962159752845764), ('anteferantur', 0.39552658796310425), ('neceslitie', 0.3954140543937683)]


<h1>Crearea modelului Markov</h1>

In [31]:
def make_markov_model(cleaned_stories, n_gram):
    markov_model = {}
    for i in range(len(cleaned_stories)-n_gram):
        curr_state, next_state = "", ""
        for j in range(n_gram):
            curr_state += cleaned_stories[i+j] + " "
            next_state += cleaned_stories[i+j+n_gram] + " " if i + j + n_gram < len(cleaned_stories) else ""
        curr_state = curr_state[:-1]
        next_state = next_state[:-1]
        if curr_state not in markov_model:
            markov_model[curr_state] = {}
            markov_model[curr_state][next_state] = 1
        else:
            if next_state in markov_model[curr_state]:
                markov_model[curr_state][next_state] += 1
            else:
                markov_model[curr_state][next_state] = 1
    
    # calculating transition probabilities
    for curr_state, transition in markov_model.items():
        total = sum(transition.values())
        for state, count in transition.items():
            markov_model[curr_state][state] = count/total
        
    return markov_model

In [32]:
n = 1
markov_model = make_markov_model(cleaned_poetry, n)

In [33]:
print("number of states = ", len(markov_model.keys()))

number of states =  242615


<h1>Generare text</h1>

In [34]:
def generate_story(markov_model, limit, start):
    n = 0
    curr_state = start
    next_state = None
    story = ""
    story+=curr_state+" "
    while n<limit:
        next_state = random.choices(list(markov_model[curr_state].keys()),
                                    list(markov_model[curr_state].values()))
        
        curr_state = next_state[0]
        story+=curr_state+" "
        n+=1
    return story

<h1>Utilizare</h1>

In [76]:
import random
from termcolor import colored  

replace_prob = 0.3  # Probabilitatea 

def replace_with_synonyms(word):
    try:
        synonyms = model.wv.most_similar(word, topn=10)  
        synonym = synonyms[0][0]
        return synonym, word 
    except:
        return word, word  # Dacă nu găsim sinonim, returnăm cuvântul original

poetry = generate_story(markov_model, 189, "with")
words = poetry.split()  
words_printed = 0 
for word in words:
    if words_printed % 19 == 0 and words_printed > 0:
        print()  
    if replace_prob == 1 or (replace_prob > 0 and random.random() < replace_prob):
        synonym, original_word = replace_with_synonyms(word)
        if original_word and original_word != synonym:  # Verificăm dacă cuvântul original și sinonimul sunt diferite
            print(colored(f"{synonym} ({original_word})", 'red'), end=" ")  
        else:
            print(synonym, end=" ")  # Dacă nu sunt diferite, printăm doar sinonimul
    else:
        print(word, end=" ")
    words_printed += 1


with mind she [31mholdere (strayed)[0m [31mthe (from)[0m [31mof (their)[0m heels [31mthe (and)[0m [31mhis (made)[0m hem with natures [31mit (only)[0m have proved [31mstrike (nought)[0m me [31mjubileedom (yours)[0m my 
precontract and [31mthe (she)[0m [31myour (looked)[0m [31mthe (in)[0m [31mthe (his)[0m [31mdevoveat (name)[0m but nature to none learn [31mhis (the)[0m ledges the [31msongin (fruit)[0m and [31msalons (tempest)[0m [31mwater (found)[0m 
the frame while the dead or some melodious lay o [31mboldaeurotm (philomela)[0m [31mchauffeurs (talkd)[0m with numbers long swords and [31mjessie (mild)[0m his 
shoulder [31mroofes (pressing)[0m forward through the cleft shield he amid the kind [31mhiawatha (heart)[0m [31minicere (doth)[0m linger sweet so [31mpensacola (blissfully)[0m in flowing 
crystal carefully kept her speed [31mthen (fire)[0m and [31mcene (agin)[0m the mountains head the weight [31mhanc (oppressd)[0m of these he said