## Shahnameh Characters

In [1]:
import pandas as pd
import numpy as np
import io

import hazm

normalizer = hazm.Normalizer(token_based=True)

poems = list(normalizer.normalize(line.strip()) for line in
             io.open('../datasets/shahnameh.txt', mode="r", encoding="utf-8").readlines())

poems = np.array(poems)
poems = np.apply_along_axis(' / '.join, 1, poems.reshape(-1, 2))

chars = set(normalizer.normalize(regex) for regex in 
             pd.read_csv('../datasets/shahnameh_characters.csv')['regex'])

In [2]:
characters = np.array(list(chars))

In [3]:
from tqdm.notebook import tqdm
import re

character_placement = np.zeros((len(characters), len(poems)), dtype=bool)
character_regex = [re.compile(character) for character in characters]

for row, rgx in enumerate(tqdm(character_regex)):
    for col, poem in enumerate(poems):
        character_placement[row, col] = bool(rgx.search(poem))

  0%|          | 0/530 [00:00<?, ?it/s]

In [4]:
threshold = 1

counts = np.sum(character_placement, axis=1)

characters, character_placement = \
    characters[counts >= 1], character_placement[counts >= 1]

id2characters = dict(enumerate(characters))

In [5]:
window_size = 5

In [6]:
padding = window_size - character_placement.shape[1] % window_size

char_sh = character_placement.shape[0]
poem_sh = character_placement.shape[1]

character_placement = np.pad(
    character_placement, pad_width=((0, 0), (0, padding))).reshape(char_sh, -1, window_size)

character_placement = np.any(character_placement, axis=2).astype(int)

In [7]:
adjacency = np.dot(character_placement, character_placement.T).astype(bool).astype(int)

<hr>

In [8]:
import networkx as nx

graph = nx.from_numpy_matrix(adjacency)

pagerank = nx.pagerank(graph, alpha=0.9)

pranks = pd.DataFrame([
    {'pagerank-name': id2characters[k].split('|')[0], 'prob': v} for k, v in pagerank.items()
])

hubs, authorities = nx.hits(graph, max_iter=1e3)

hranks = pd.DataFrame([
    {'hits-name': id2characters[kh].split('|')[0], 'hubs-prob': vh, 'authorities-prob': va} 
    for (kh, vh), (ka, va) in zip(hubs.items(), authorities.items())
])

In [9]:
import pandas as pd
import numpy as np
from IPython.display import display, HTML

CSS = """
.output {
    flex-direction: row;
}
"""

HTML('<style>{}</style>'.format(CSS))

In [10]:
display(pranks.sort_values('prob', ascending=False).head(10))
display(hranks.sort_values('hubs-prob', ascending=False).head(10))

Unnamed: 0,pagerank-name,prob
226,گشتاسپ,0.021102
192,جمشید,0.017096
161,سرو,0.01346
86,شهریار,0.01262
167,بید,0.012401
156,بهمن,0.012149
278,خسرو,0.01197
344,کیخسرو,0.01197
233,پیروز,0.011481
255,فریدون,0.0114


Unnamed: 0,hits-name,hubs-prob,authorities-prob
226,گشتاسپ,0.010921,0.010921
192,جمشید,0.01066,0.01066
161,سرو,0.009982,0.009982
344,کیخسرو,0.009501,0.009501
278,خسرو,0.009501,0.009501
86,شهریار,0.009468,0.009468
156,بهمن,0.009457,0.009457
167,بید,0.009436,0.009436
233,پیروز,0.009278,0.009278
204,دارا,0.009176,0.009176


## Shahnameh Cities

In [11]:
city = set(normalizer.normalize(regex) for regex in 
             pd.read_csv('../datasets/shahnameh_cities.csv')['city'])

cities = np.array(list(city))

In [12]:
city_placement = np.zeros((len(cities), len(poems)), dtype=bool)
city_regex = [re.compile(cit) for cit in cities]

for row, rgx in enumerate(tqdm(city_regex)):
    for col, poem in enumerate(poems):
        city_placement[row, col] = bool(rgx.search(poem))

  0%|          | 0/100 [00:00<?, ?it/s]

In [13]:
threshold = 1

counts = np.sum(city_placement, axis=1)

cities, city_placement = \
    cities[counts >= 1], city_placement[counts >= 1]

id2cities = dict(enumerate(cities))

In [14]:
window_size = 5

In [15]:
padding = window_size - city_placement.shape[1] % window_size

city_sh = city_placement.shape[0]
poem_sh = city_placement.shape[1]

city_placement = np.pad(
    city_placement, pad_width=((0, 0), (0, padding))).reshape(city_sh, -1, window_size)

city_placement = np.any(city_placement, axis=2).astype(int)

In [16]:
adjacency = np.dot(city_placement, city_placement.T).astype(bool).astype(int)

<hr>

In [17]:
graph = nx.from_numpy_matrix(adjacency)

pagerank = nx.pagerank(graph, alpha=0.9)

pranks = pd.DataFrame([
    {'pagerank-name': id2cities[k].split('|')[0], 'prob': v} for k, v in pagerank.items()
])

hubs, authorities = nx.hits(graph, max_iter=1e3)

hranks = pd.DataFrame([
    {'hits-name': id2cities[kh].split('|')[0], 'hubs-prob': vh, 'authorities-prob': va} 
    for (kh, vh), (ka, va) in zip(hubs.items(), authorities.items())
])

In [18]:
CSS = """
.output {
    flex-direction: row;
}
"""

HTML('<style>{}</style>'.format(CSS))

In [19]:
display(pranks.sort_values('prob', ascending=False).head(10))
display(hranks.sort_values('hubs-prob', ascending=False).head(10))

Unnamed: 0,pagerank-name,prob
8,ری,0.069718
39,بست,0.042241
58,شیر,0.037177
56,مای,0.037044
61,روم,0.033543
24,هند,0.032881
35,ختن,0.031578
14,چین,0.031182
10,پارس,0.028642
43,کابل,0.0237


Unnamed: 0,hits-name,hubs-prob,authorities-prob
8,ری,0.043346,0.043346
39,بست,0.036755,0.036755
56,مای,0.036535,0.036535
58,شیر,0.036496,0.036496
14,چین,0.034113,0.034113
61,روم,0.034089,0.034089
35,ختن,0.033619,0.033619
24,هند,0.032741,0.032741
10,پارس,0.029653,0.029653
22,مرغ,0.027771,0.027771
