# Markov Chains

In [None]:
%matplotlib notebook

In [None]:
import ipywidgets as widgets
import random
import matplotlib.pyplot as plt
import networkx as nx
import time
import asyncio
import threading
import spacy
from tqdm import tqdm
from pathlib import Path
from collections import Counter, defaultdict
import re
import os

In [None]:
class MarkovChain:
    """Discrete-time Markov chain that records how often each state is visited."""

    def __init__(self, states, P, phi):
        """Create the chain and choose its initial state.

        Args:
            states: sequence of states.
            P: transition weights, indexed as ``P[current][next]``.
            phi: if it is a list, ``phi[i]`` is the probability that the chain
                starts in ``states[i]``; otherwise it is the start state itself.
        """
        self.n = len(states)  # number of states in the chain
        self.P = P  # transition matrix
        self.states = states

        # Absolute visit counts per state.
        self.freqs = defaultdict(int)
        self.state = (
            random.choices(states, weights=phi, k=1)[0]
            if isinstance(phi, list)
            else phi
        )
        # The initial state counts as the first visit / first step.
        self.freqs[self.state] += 1

        self.steps_num = 1

    def get_state_idx(self):
        """Return the position of the current state inside ``self.states``."""
        current = self.state
        return self.states.index(current)

    def step(self):
        """Perform one transition and return the new current state."""
        row = self.P[self.state]
        weights = [row[candidate] for candidate in self.states]
        (chosen,) = random.choices(self.states, weights=weights, k=1)
        self.state = chosen
        self.freqs[chosen] += 1
        self.steps_num += 1
        return chosen

    def get_absolute_freqs(self):
        """Return the mapping from state to absolute number of visits."""
        return self.freqs

# Simulatore

In [None]:
class McApp:
    """Interactive Markov chain simulator for a Jupyter notebook.

    Shows the chain as a directed graph (current state in red, the others in
    blue), Play / Pause / Step buttons with a speed slider, and a table with
    the absolute and percentage visit counts of every state.
    """

    def __init__(self, states, P, phi):
        """Build the chain, its graph and the widgets, then display them.

        Args:
            states: sequence of states.
            P: transition weights, indexed as ``P[current][next]``.
            phi: initial distribution (list) or start state, forwarded to
                ``MarkovChain``.
        """
        self.n = len(states)  # number of states
        self.P = P
        self.states = states
        self.mc = MarkovChain(states, P, phi)
        self.G = self.build_graph()
        self.pos = nx.circular_layout(self.G)  # fixed node layout

        # Playback state is initialised BEFORE the widgets are displayed so
        # that no callback can ever observe a half-built object.
        self.pause = True
        self.speed = 1
        self.play_thread = None
        self._stop_event = None  # stop signal of the most recent play thread

        self.show_interface()

    def play(self, stop_event=None):
        """Run transitions until paused, waiting ``1 / self.speed`` s between them.

        Args:
            stop_event: optional ``threading.Event``; when set, the loop ends.
                Each play thread gets its own event so that a stale thread
                can never be revived by a later click on Play.
        """
        if stop_event is None:
            # Direct call without an event: only ``self.pause`` stops the loop
            # and ``wait`` below behaves like a plain sleep.
            stop_event = threading.Event()
        while not (self.pause or stop_event.is_set()):
            self.step()
            # Returns immediately once the event is set, so Pause is responsive.
            stop_event.wait(1 / self.speed)

    def on_play_button_clicked(self, button):
        """Start automatic playback in a background thread (no-op if running)."""
        if not self.pause:
            return
        # Make sure a previous thread that is still waiting terminates instead
        # of resuming alongside the new one.
        if self._stop_event is not None:
            self._stop_event.set()
        self.pause = False
        self._stop_event = threading.Event()
        self.play_thread = threading.Thread(target=self.play,
                                            args=(self._stop_event,),
                                            daemon=True)
        self.play_thread.start()

    def on_pause_button_clicked(self, button):
        """Stop automatic playback."""
        self.pause = True
        if self._stop_event is not None:
            self._stop_event.set()

    def on_step_button_clicked(self, button):
        """Perform a single transition."""
        self.step()

    def step(self):
        """Advance the chain by one transition and refresh the interface."""
        self.mc.step()
        self.refresh_interface()

    def refresh_interface(self):
        """Update the statistics and, unless playing fast, redraw the graph."""
        self._update_stats()

        # Redrawing is skipped while playing at speed >= 5.
        if self.pause or self.speed < 5:
            self.show_graph()

    def _update_stats(self):
        """Copy the step count and visit frequencies of the chain into the widgets."""
        steps = self.mc.steps_num
        freqs = self.mc.get_absolute_freqs()
        # Mirror the chain's own counter instead of incrementing a second one,
        # so the two can never drift apart.
        self.steps_counter.value = steps

        for state, abs_out, perc_out in zip(self.states,
                                            self.abs_freq_outs,
                                            self.perc_freq_outs):
            visits = freqs.get(state, 0)
            abs_out.value = str(visits)
            perc_out.value = f'{visits / steps * 100:.2f}'

    def on_speed_change(self, value):
        """Store the new slider value as the playback speed (steps per second)."""
        self.speed = value['new']

    def show_interface(self):
        """Create the figure and the widget grids and display everything."""
        self.fig, self.ax = plt.subplots(1, 1)
        self.fig.show()

        cmds_grid = self.build_cmd_grid()
        outs_grid = self.build_output_grid()
        # Show the visit of the initial state right away, consistently with
        # the step counter starting at 1.
        self._update_stats()
        display(cmds_grid, outs_grid)
        self.show_graph()

    def build_output_grid(self):
        """Build and return the statistics grid (one column per state)."""
        outs_grid = widgets.GridspecLayout(5, 1 + self.n)

        # step counter
        self.steps_counter = widgets.IntText(value=1,
                                             description='Step',
                                             disabled=True)

        # column headers: the states themselves
        for col, state in enumerate(self.states, start=1):
            outs_grid[2, col] = widgets.Label(value=str(state))

        # absolute and percentage frequency labels
        self.abs_freq_outs = []
        self.perc_freq_outs = []
        for col in range(1, self.n + 1):
            abs_freq_label = widgets.Label(value=str(0))
            self.abs_freq_outs.append(abs_freq_label)
            outs_grid[3, col] = abs_freq_label

            perc_freq_label = widgets.Label(value=str(0))
            self.perc_freq_outs.append(perc_freq_label)
            outs_grid[4, col] = perc_freq_label

        # first column: title, counter and row labels
        outs_grid[0, 0] = widgets.Label(value='Statistiche')
        outs_grid[1, 0] = self.steps_counter
        outs_grid[3, 0] = widgets.Label(value='Visite')
        outs_grid[4, 0] = widgets.Label(value='Visite (%)')

        return outs_grid

    def build_cmd_grid(self):
        """Build and return the grid with the playback controls."""
        play_button = widgets.Button(description='Play')
        play_button.on_click(self.on_play_button_clicked)
        pause_button = widgets.Button(description='Pause')
        pause_button.on_click(self.on_pause_button_clicked)
        step_button = widgets.Button(description='Step')
        step_button.on_click(self.on_step_button_clicked)
        speed_field = widgets.IntSlider(value=1, min=1, max=1000)
        speed_field.observe(self.on_speed_change, names='value')

        # layout
        cmds_grid = widgets.GridspecLayout(1, 5)
        cmds_grid[0, 0] = play_button
        cmds_grid[0, 1] = pause_button
        cmds_grid[0, 2] = step_button
        cmds_grid[0, 3] = speed_field

        return cmds_grid

    def build_graph(self):
        """Return a directed graph with an edge for every positive transition weight."""
        G = nx.DiGraph()
        G.add_nodes_from(self.states)
        for state1 in self.states:
            for state2 in self.states:
                if self.P[state1][state2] > 0:
                    G.add_edge(state1, state2)

        return G

    def show_graph(self):
        """Redraw the graph, highlighting the current state in red."""
        # Clear the previous drawing first: otherwise every step stacks a new
        # set of artists on the axes and redraws get slower and slower.
        self.ax.clear()
        node_colors = ['b'] * self.n
        node_colors[self.mc.get_state_idx()] = 'r'
        nx.draw_networkx(self.G,
                         node_color=node_colors,
                         ax=self.ax,
                         pos=self.pos,
                         with_labels=True)
        # Explicitly schedule a repaint of this figure.
        self.fig.canvas.draw_idle()


<img src="img/markov_chain.png" alt="markov chain" style="width: 600px;"/>

In [None]:
# State space of the example chain
S = [0, 1, 2]

# Transition matrix: P[i][j] is the probability of moving from state i to
# state j (each row sums to 1)
P = [
    [0.7, 0.3, 0],
    [0.4, 0.5, 0.1],
    [0.7, 0.2, 0.1]
]

# Initial distribution: phi[i] is the probability that the chain starts in state i
phi = [0.3, 0.3, 0.4]

In [None]:
# Create the simulator for the example chain; the constructor also displays the UI
app = McApp(S, P, phi)

# Generatore di testo

In [None]:
# Folder whose text files are read by load_corpus to build the word-level chain
CORPUS_FOLDER = r'testi_italiano'

In [None]:
def load_corpus(folder):
    """Build word-level bigram statistics from the text files in *folder*.

    Every regular file directly inside *folder* is read as UTF-8, lower-cased
    and tokenized with the spaCy ``it_core_news_sm`` pipeline.

    Args:
        folder: path of the directory holding the corpus files.

    Returns:
        A ``(words, counters)`` tuple: ``words`` is the list of distinct
        tokens (in arbitrary order) and ``counters[w][v]`` is the number of
        times token ``v`` immediately follows token ``w`` in the corpus.
    """
    nlp = spacy.load("it_core_news_sm")
    nlp.max_length = 20000000  # raise spaCy's text-length limit for a large corpus

    # Sorted so the concatenation order is reproducible; sub-directories are
    # skipped because they cannot be opened as text files.
    paths = sorted(p for p in Path(folder).iterdir() if p.is_file())
    texts = []
    for path in paths:
        with open(path, encoding='utf8') as f:
            texts.append(f.read().lower())

    # Join with a space so the last word of one file is not fused with the
    # first word of the next; then collapse whitespace runs and trim the ends
    # so no leading/trailing blank reaches the tokenizer.
    big_string = re.sub(r'\s+', ' ', ' '.join(texts)).strip()
    corpus = nlp(big_string, disable=['parser', 'tagger', 'ner'])

    # Extract the token texts once and reuse them for both results.
    tokens = [token.text.lower() for token in corpus]

    counters = defaultdict(Counter)
    for word, next_word in zip(tokens, tokens[1:]):
        counters[word][next_word] += 1

    return list(set(tokens)), counters

In [None]:
# words: distinct lower-cased tokens; counters[w][v]: how many times v follows w
words, counters = load_corpus(CORPUS_FOLDER)

In [None]:
# Word-level chain: the bigram counts act as (unnormalised) transition weights
# and, since the third argument is not a list, 'la' is the starting word
mc = MarkovChain(words, counters, 'la')

In [None]:
# Generate a text: the current word followed by 100 sampled transitions.
generated = [mc.state]
generated.extend(mc.step() for _ in range(100))
string = ' '.join(generated)
print(string)