In [1]:
import numpy as np

def run_markov_chain(transition_matrix, n=10, print_transitions=False):
    """
    Takes the transition matrix and runs through each state of the Markov chain for n time steps. When the chain reaches a steady state, returns the transition probabilities and the time step of the convergence.    
    @params:
    - transition matrix: transition probabilities
    - n: number of time steps to run. default is 10 steps
    - print_transitions: tells if we want to print the transition matrix at each time step
    """ 
    step = transition_matrix    
    for time_step in range(1, n):
       
        if print_transitions:
            print('Transition Matrix at step:' + str(time_step))
            print(step)
            print('-------------------------')      
            
        next_step = np.matmul(step, transition_matrix).round(2)
      
        if np.array_equal(step, next_step):
            print('Markov chain reached steady-state at time-step = ' + str(time_step))            
            if not print_transitions:
                print(step)            
            return step
        else:
            step = next_step

        
        
    return step

In [2]:

transition_matrix = np.array([[0.1, 0.4, 0.3, 0.2],
                              [0.35, 0.1, 0.25, 0.3],
                              [0.4, 0.3, 0.05, 0.25],
                              [0.42, 0.42, 0.08, 0.08]])

power_transition_matrix = run_markov_chain(transition_matrix, print_transitions=True)

Transition Matrix at step:1
[[0.1  0.4  0.3  0.2 ]
 [0.35 0.1  0.25 0.3 ]
 [0.4  0.3  0.05 0.25]
 [0.42 0.42 0.08 0.08]]
-------------------------
Transition Matrix at step:2
[[0.35 0.25 0.16 0.23]
 [0.3  0.35 0.17 0.19]
 [0.27 0.31 0.22 0.2 ]
 [0.25 0.27 0.24 0.24]]
-------------------------
Transition Matrix at step:3
[[0.28 0.31 0.19 0.2 ]
 [0.3  0.29 0.2  0.22]
 [0.31 0.29 0.19 0.22]
 [0.32 0.3  0.17 0.21]]
-------------------------
Transition Matrix at step:4
[[0.3  0.28 0.19 0.21]
 [0.3  0.3  0.19 0.21]
 [0.3  0.3  0.19 0.21]
 [0.29 0.3  0.2  0.21]]
-------------------------
Transition Matrix at step:5
[[0.29 0.29 0.19 0.21]
 [0.3  0.3  0.19 0.21]
 [0.3  0.3  0.19 0.21]
 [0.3  0.29 0.19 0.21]]
-------------------------
Markov chain reached steady-state at time-step = 5


In [46]:
corpus_words = ["Ola", "Bom dia", "tudo bem", "sou o Leo"]

distinct_words = list(set(corpus_words))
word_idx_dict = {word: i for i, word in enumerate(distinct_words)}
distinct_words_count = len(list(set(corpus_words)))
distinct_words_count # 32663

k = 2 # adjustable

sets_of_k_words = [ ' '.join(corpus_words[i:i+k]) for i, _ in enumerate(corpus_words[:-k]) ]

from scipy.sparse import dok_matrix

sets_count = len(list(set(sets_of_k_words)))
next_after_k_words_matrix = dok_matrix((sets_count, len(distinct_words)))


distinct_sets_of_k_words = list(set(sets_of_k_words))
k_words_idx_dict = {word: i for i, word in enumerate(distinct_sets_of_k_words)}

for i, word in enumerate(sets_of_k_words[:-k]):
   
    word_sequence_idx = k_words_idx_dict[word]
    
    next_word_idx = word_idx_dict[corpus_words[i+k]]
    next_after_k_words_matrix[word_sequence_idx, next_word_idx] +=1



In [49]:
def sample_next_word_after_sequence(word_sequence, alpha = 0):
 
    next_word_vector = next_after_k_words_matrix[k_words_idx_dict[word_sequence]] + alpha

    likelihoods = next_word_vector/next_word_vector.sum()
    print(likelihoods)
    return weighted_choice(distinct_words, likelihoods.toarray())
    
def stochastic_chain(seed, chain_length=15, seed_length=4):
    current_words = seed.split(' ')
    if len(current_words) != seed_length:
        raise ValueError(f'wrong number of words, expected {seed_length}')
    sentence = seed

    for _ in range(chain_length):
        sentence+=' '
        next_word = sample_next_word_after_sequence(' '.join(current_words))
        sentence+=next_word
        current_words = current_words[1:]+[next_word]
    return sentence


In [50]:
# example use    
# print()
# print(dok_matrix((sets_count, len(distinct_words))))
# print(dok_matrix((50, 50), dtype=np.float32))
stochastic_chain(seed='Bom dia tudo bem')
# mtx = dok_matrix((sets_count, len(distinct_words)))




NameError: name 'weighted_choice' is not defined

In [51]:
# mtx.todense()