<h4> Folosiți biblioteca markovify (sau implementarea voastră de la problema 1) pentru a genera o strofă de poezie în limba engleză folosind unul din următoarele corpus-uri (sau orice altă sursă găsiți voi):</h4>

<h1>Importuri</h1>

In [38]:
import numpy as np
import pandas as pd
import os
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import random

<h1>Citirea datelor</h1>

In [75]:
from datasets import load_dataset

dataset = load_dataset("biglam/gutenberg-poetry-corpus")
train_data = dataset["train"]

In [76]:
print("Numărul de linii =", len(train_data))
print(train_data[:4])

Numărul de linii = 3085117
{'line': ['The Song of Hiawatha is based on the legends and stories of', 'many North American Indian tribes, but especially those of the', 'Ojibway Indians of northern Michigan, Wisconsin, and Minnesota.', 'They were collected by Henry Rowe Schoolcraft, the reknowned'], 'gutenberg_id': [19, 19, 19, 19]}


<h1>Cleaning DataSet</h1>

In [79]:
train_data.features

{'line': Value(dtype='string', id=None),
 'gutenberg_id': Value(dtype='int64', id=None)}

In [80]:
def clean_text(text):
    cleaned_text = []
    for example in text:
        poem_text = example["line"]  # Accesează textul poeziei din exemplu
        poem_text = poem_text.lower()
        poem_text = re.sub(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=-\\]", "", poem_text)
        tokens = word_tokenize(poem_text)
        words = [word for word in tokens if word.isalpha()]
        cleaned_text.extend(words)
    return cleaned_text

cleaned_poetry = clean_text(train_data)
print("Numărul de cuvinte =", len(cleaned_poetry))
print(cleaned_poetry[:50])

Numărul de cuvinte = 21740946
['the', 'song', 'of', 'hiawatha', 'is', 'based', 'on', 'the', 'legends', 'and', 'stories', 'of', 'many', 'north', 'american', 'indian', 'tribes', 'but', 'especially', 'those', 'of', 'the', 'ojibway', 'indians', 'of', 'northern', 'michigan', 'wisconsin', 'and', 'minnesota', 'they', 'were', 'collected', 'by', 'henry', 'rowe', 'schoolcraft', 'the', 'reknowned', 'schoolcraft', 'married', 'jane', 'the', 'fur', 'trader', 'and', 'the', 'woman', 'of', 'the']


<h1>Crearea modelului Markov</h1>

In [84]:
def make_markov_model(cleaned_stories, n_gram):
    markov_model = {}
    for i in range(len(cleaned_stories)-n_gram):
        curr_state, next_state = "", ""
        for j in range(n_gram):
            curr_state += cleaned_stories[i+j] + " "
            next_state += cleaned_stories[i+j+n_gram] + " " if i + j + n_gram < len(cleaned_stories) else ""
        curr_state = curr_state[:-1]
        next_state = next_state[:-1]
        if curr_state not in markov_model:
            markov_model[curr_state] = {}
            markov_model[curr_state][next_state] = 1
        else:
            if next_state in markov_model[curr_state]:
                markov_model[curr_state][next_state] += 1
            else:
                markov_model[curr_state][next_state] = 1
    
    # calculating transition probabilities
    for curr_state, transition in markov_model.items():
        total = sum(transition.values())
        for state, count in transition.items():
            markov_model[curr_state][state] = count/total
        
    return markov_model

In [85]:
n = 1
markov_model = make_markov_model(cleaned_poetry, n)

In [86]:
print("number of states = ", len(markov_model.keys()))

number of states =  242615


In [87]:
print("All possible transitions from 'a face' state: \n")
print(markov_model['with'])

All possible transitions from 'a face' state: 



<h1>Generarea de proverbe</h1>

In [88]:
def generate_story(markov_model, limit, start):
    n = 0
    curr_state = start
    next_state = None
    story = ""
    story+=curr_state+" "
    while n<limit:
        next_state = random.choices(list(markov_model[curr_state].keys()),
                                    list(markov_model[curr_state].values()))
        
        curr_state = next_state[0]
        story+=curr_state+" "
        n+=1
    return story

<h1>Utilizare</h1>

In [93]:
poetry = generate_story(markov_model, 189 ,"with")

words = poetry.split()  # Splituiește poezia în cuvinte
for i, word in enumerate(words):
    print(word, end=" ")
    if (i + 1) % 19 == 0:  # Adaugă un rând nou la fiecare 19 cuvinte
        print()  # Afiseaza un rand nou

with the worlds shall die land breeding women new drama cares to the great hector and sun that drives 
huge projection overbrow a mal deulew ein bunter schmetterling doch hoellenverdammt wer clene out to downward upward fates decree 
us daub for not electra there rose amid the birds are the life for penelopes of kinde he found 
slow in miserie such such haste the tablet here is crushed white past now at lament unknown his elbow 
and over get a look on anothers life began every good which vices of the bells let this was 
cold earth thy smiles behold a prodigy his stalwart youth was in this lady clare her wings stoop where 
i never had curbd the tales i am mistaken maid of sure you tend aspiring nor thinks of their 
tale odysseus had at the only starry skies his hand the quarry and as his cleared from their mittens 
on records which side deep spirits of faith zeal should dote not this shows of an eddying wind doth 
wash d upon myn hertes vnkynde for love they in revelry as thick rugs and