## Shahnameh Characters

In [1]:
import pandas as pd
import numpy as np
import io

import hazm

normalizer = hazm.Normalizer(token_based=True)

poems = list(normalizer.normalize(line.strip()) for line in
             io.open('../datasets/shahnameh.txt', mode="r", encoding="utf-8").readlines())

poems = np.array(poems)
poems = np.apply_along_axis(' / '.join, 1, poems.reshape(-1, 2))

chars = set(normalizer.normalize(regex) for regex in 
             pd.read_csv('../datasets/shahnameh_characters.csv')['regex'])

In [2]:
characters = np.array(list(chars))

In [3]:
from tqdm.notebook import tqdm
import re

character_placement = np.zeros((len(characters), len(poems)), dtype=bool)
character_regex = [re.compile(character) for character in characters]

for row, rgx in enumerate(tqdm(character_regex)):
    for col, poem in enumerate(poems):
        character_placement[row, col] = bool(rgx.search(poem))

  0%|          | 0/530 [00:00<?, ?it/s]

In [4]:
threshold = 1

counts = np.sum(character_placement, axis=1)

characters, character_placement = \
    characters[counts >= 1], character_placement[counts >= 1]

id2characters = dict(enumerate(characters))

In [5]:
window_size = 5

In [6]:
padding = window_size - character_placement.shape[1] % window_size

char_sh = character_placement.shape[0]
poem_sh = character_placement.shape[1]

character_pp_1 = np.pad(
    character_placement, pad_width=((0, 0), (0, padding))).reshape(char_sh, -1, window_size)

character_pp_1 = np.any(character_pp_1, axis=2).astype(int)

In [7]:
# character_pp_1.shape

In [8]:
shift = window_size // 2

In [9]:
padding = window_size - (character_placement.shape[1] + shift) % window_size

char_sh = character_placement.shape[0]
poem_sh = character_placement.shape[1] + shift

character_pp_2 = np.pad(
    character_placement, pad_width=((0, 0), (shift, padding))).reshape(char_sh, -1, window_size)

character_pp_2 = np.any(character_pp_2, axis=2).astype(int)

In [10]:
# character_pp_2.shape

In [11]:
adjacency = (
    np.dot(character_pp_1, character_pp_1.T) + np.dot(character_pp_2, character_pp_2.T)
).astype(bool).astype(int)

<hr>

In [12]:
import networkx as nx

graph = nx.from_numpy_matrix(adjacency)

pagerank = nx.pagerank(graph, alpha=0.9)

pranks = pd.DataFrame([
    {'pagerank-name': id2characters[k].split('|')[0], 'prob': v} for k, v in pagerank.items()
])

hubs, authorities = nx.hits(graph, max_iter=1e3)

hranks = pd.DataFrame([
    {'hits-name': id2characters[kh].split('|')[0], 'hubs-prob': vh, 'authorities-prob': va} 
    for (kh, vh), (ka, va) in zip(hubs.items(), authorities.items())
])

In [13]:
import pandas as pd
import numpy as np
from IPython.display import display, HTML

CSS = """
.output {
    flex-direction: row;
}
"""

HTML('<style>{}</style>'.format(CSS))

In [14]:
display(pranks.sort_values('prob', ascending=False).head(10))
display(hranks.sort_values('hubs-prob', ascending=False).head(10))

Unnamed: 0,pagerank-name,prob
36,گشتاسپ,0.018381
215,جمشید,0.01613
255,بید,0.012487
148,شهریار,0.012338
138,سرو,0.012223
135,پیروز,0.011456
370,بهمن,0.011388
359,خسرو,0.011137
152,کیخسرو,0.011137
371,فریدون,0.010925


Unnamed: 0,hits-name,hubs-prob,authorities-prob
36,گشتاسپ,0.00994,0.00994
215,جمشید,0.00977,0.00977
138,سرو,0.009135,0.009135
255,بید,0.009044,0.009044
148,شهریار,0.008889,0.008889
359,خسرو,0.008834,0.008834
152,کیخسرو,0.008834,0.008834
370,بهمن,0.008812,0.008812
135,پیروز,0.008716,0.008716
72,تور,0.008609,0.008609


## Shahnameh Cities

In [15]:
city = set(normalizer.normalize(regex) for regex in 
             pd.read_csv('../datasets/shahnameh_cities.csv')['city'])

cities = np.array(list(city))

In [16]:
city_placement = np.zeros((len(cities), len(poems)), dtype=bool)
city_regex = [re.compile(cit) for cit in cities]

for row, rgx in enumerate(tqdm(city_regex)):
    for col, poem in enumerate(poems):
        city_placement[row, col] = bool(rgx.search(poem))

  0%|          | 0/100 [00:00<?, ?it/s]

In [17]:
threshold = 1

counts = np.sum(city_placement, axis=1)

cities, city_placement = \
    cities[counts >= 1], city_placement[counts >= 1]

id2cities = dict(enumerate(cities))

In [18]:
window_size = 5

In [19]:
padding = window_size - city_placement.shape[1] % window_size

city_sh = city_placement.shape[0]
poem_sh = city_placement.shape[1]

city_pp_1 = np.pad(
    city_placement, pad_width=((0, 0), (0, padding))).reshape(city_sh, -1, window_size)

city_pp_1 = np.any(city_pp_1, axis=2).astype(int)

In [21]:
# city_pp_1.shape

In [22]:
shift = window_size // 2

In [23]:
padding = window_size - (city_placement.shape[1] + shift) % window_size

city_sh = city_placement.shape[0]
poem_sh = city_placement.shape[1] + shift

city_pp_2 = np.pad(
    city_placement, pad_width=((0, 0), (shift, padding))).reshape(city_sh, -1, window_size)

city_pp_2 = np.any(city_pp_2, axis=2).astype(int)

In [25]:
# city_pp_2.shape

In [26]:
adjacency = (
    np.dot(city_pp_1, city_pp_1.T) + np.dot(city_pp_2, city_pp_2.T)
).astype(bool).astype(int)

<hr>

In [27]:
graph = nx.from_numpy_matrix(adjacency)

pagerank = nx.pagerank(graph, alpha=0.9)

pranks = pd.DataFrame([
    {'pagerank-name': id2cities[k].split('|')[0], 'prob': v} for k, v in pagerank.items()
])

hubs, authorities = nx.hits(graph, max_iter=1e3)

hranks = pd.DataFrame([
    {'hits-name': id2cities[kh].split('|')[0], 'hubs-prob': vh, 'authorities-prob': va} 
    for (kh, vh), (ka, va) in zip(hubs.items(), authorities.items())
])

In [28]:
CSS = """
.output {
    flex-direction: row;
}
"""

HTML('<style>{}</style>'.format(CSS))

In [29]:
display(pranks.sort_values('prob', ascending=False).head(10))
display(hranks.sort_values('hubs-prob', ascending=False).head(10))

Unnamed: 0,pagerank-name,prob
3,ری,0.061869
19,بست,0.04115
25,شیر,0.039684
38,مای,0.036031
5,هند,0.032152
46,روم,0.031713
53,چین,0.031653
61,ختن,0.030392
35,پارس,0.028373
4,مرو,0.024689


Unnamed: 0,hits-name,hubs-prob,authorities-prob
3,ری,0.039299,0.039299
25,شیر,0.035416,0.035416
19,بست,0.035069,0.035069
38,مای,0.034109,0.034109
53,چین,0.03201,0.03201
46,روم,0.031956,0.031956
61,ختن,0.030943,0.030943
5,هند,0.030837,0.030837
35,پارس,0.02816,0.02816
4,مرو,0.027064,0.027064
