<h4> Lanturile Markov avand 1 stare </h4>

<h1>Importuri</h1>

In [1]:
import numpy as np
import pandas as pd
import os
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import random

<h1>Citirea datelor</h1>

In [2]:
def read_all(path):
    txt = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line == '----------':
                break
            if line != '':
                txt.append(line)
    return txt

path = "D:/FACULTATE/SEM4/AI/Lab8/data/proverbe.txt"
stories = read_all(path)
print("Numărul de linii =", len(stories))

Numărul de linii = 330


<h1>Cleaning DataSet</h1>

In [3]:
def clean_txt(txt):
    cleaned_txt = []
    for line in txt:
        line = line.lower() 
        
        line = re.sub(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=-\\]", "", line)
        
        tokens = word_tokenize(line)  # Tokenizam 
        words = [word for word in tokens if word.isalpha()]  # Alegem doar cuvintele
        cleaned_txt.extend(words)  
    return cleaned_txt

cleaned_stories = clean_txt(stories)
print("Numărul de cuvinte =", len(cleaned_stories))
print(cleaned_stories[:50])

Numărul de cuvinte = 2211
['a', 'aduce', 'apa', 'dupa', 'ce', 'stins', 'focul', 'a', 'aduna', 'nuiele', 'pentru', 'spinarea', 'sa', 'a', 'ajunge', 'cutitul', 'la', 'os', 'a', 'auzit', 'clopotul', 'dar', 'nu', 'stie', 'de', 'la', 'ce', 'biserica', 'a', 'avea', 'ac', 'de', 'cojocul', 'cuiva', 'a', 'avea', 'mai', 'mult', 'noroc', 'decat', 'minte', 'a', 'bate', 'apa', 'in', 'piua', 'a', 'bate', 'calul', 'care']


<h1>Crearea modelului Markov</h1>

In [26]:
def make_markov_model(cleaned_stories, n_gram):
    markov_model = {}
    for i in range(len(cleaned_stories)-n_gram):
        curr_state, next_state = "", ""
        for j in range(n_gram):
            curr_state += cleaned_stories[i+j] + " "
            next_state += cleaned_stories[i+j+n_gram] + " " if i + j + n_gram < len(cleaned_stories) else ""
        curr_state = curr_state[:-1]
        next_state = next_state[:-1]
        if curr_state not in markov_model:
            markov_model[curr_state] = {}
            markov_model[curr_state][next_state] = 1
        else:
            if next_state in markov_model[curr_state]:
                markov_model[curr_state][next_state] += 1
            else:
                markov_model[curr_state][next_state] = 1
    
    # calculating transition probabilities
    for curr_state, transition in markov_model.items():
        total = sum(transition.values())
        for state, count in transition.items():
            markov_model[curr_state][state] = count/total
        
    return markov_model

In [27]:
markov_model = make_markov_model(cleaned_stories, 1)

In [28]:
print("number of states = ", len(markov_model.keys()))

number of states =  824


In [29]:
print("All possible transitions from 'the game' state: \n")
print(markov_model['face'])

All possible transitions from 'the game' state: 

{'borta': 0.029411764705882353, 'cruce': 0.029411764705882353, 'cuiva': 0.029411764705882353, 'cum': 0.029411764705882353, 'din': 0.058823529411764705, 'ochi': 0.029411764705882353, 'pe': 0.11764705882352941, 'treaba': 0.029411764705882353, 'umbra': 0.029411764705882353, 'un': 0.029411764705882353, 'vara': 0.029411764705882353, 'ce': 0.058823529411764705, 'omul': 0.029411764705882353, 'carte': 0.029411764705882353, 'cine': 0.029411764705882353, 'primavara': 0.029411764705882353, 'rai': 0.029411764705882353, 'doage': 0.029411764705882353, 'obada': 0.029411764705882353, 'calul': 0.029411764705882353, 'ciorba': 0.029411764705882353, 'otetul': 0.029411764705882353, 'gatul': 0.029411764705882353, 'popa': 0.029411764705882353, 'azi': 0.029411764705882353, 'dreapta': 0.029411764705882353, 'roade': 0.029411764705882353, 'haina': 0.029411764705882353, 'sarguinta': 0.029411764705882353}


<h1>Generarea de proverbe</h1>

In [34]:
def generate_story(markov_model, limit, start):
    n = 0
    curr_state = start
    next_state = None
    story = ""
    story+=curr_state+" "
    while n<limit:
        next_state = random.choices(list(markov_model[curr_state].keys()),
                                    list(markov_model[curr_state].values()))
        
        curr_state = next_state[0]
        story+=curr_state+" "
        n+=1
    return story

<h1>Utilizare</h1>

In [35]:
for i in range(40):
    print(str(i)+". ", generate_story(markov_model, 8 ,"face"))

0.  face cuiva in capusa ce face doage nici nu 
1.  face pe dumnezeu ii mult pagubeste lacomia pierde omenia 
2.  face ce zice hop pana ai ales pana ce 
3.  face ce naste din cal magar si sapa groapa 
4.  face ochi nu piere in pravalie cu varza a 
5.  face rai dati cezarului ce tie nu ai alta 
6.  face doage nici nu ai carte ai sarit parleazul 
7.  face sarguinta e mai bun bucatar frate frate dar 
8.  face roade cine se cunoaste de la roma tot 
9.  face vara sanie si baba rada baba ce ii 
10.  face treaba haina nu crede tot raul doamne de 
11.  face obada de prieteni ca se poate oase roade 
12.  face un bou cine rade de neghiob rusinosul roade 
13.  face haina si putin ca musca in gura de 
14.  face cuiva in tavan a avea ac de multe 
15.  face din talpa casei nu e un bou cine 
16.  face ce am avut si ar manca de la 
17.  face haina pe gard nu vrei sa nu duce 
18.  face gatul gros lupul la os a face haina 
19.  face omul sarac nici boii batrani sa se uita 
20.  face calul mai mult noroc