# Visualization

In [57]:
import collections
import json
import os
import pickle
import string
import warnings
from multiprocessing.pool import ThreadPool
from operator import itemgetter

import igviz as ig
import networkx as nx
import nltk
import pandas as pd
import plotapi
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import pyecharts.options as opts
import regex
import requests
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from plotapi import SplitChord, Terminus
from plotly.subplots import make_subplots
from pyecharts.charts import WordCloud
from tqdm import tqdm

warnings.filterwarnings('ignore')
nltk.download('stopwords')
plotapi.api_key("d494c31b-ce51-4470-aa8c-7749ac52ac0b")

Your PlotAPI API key has been saved in your local environment. You will not need to set it again.


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/crinstaniev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
pio.templates.default = 'ggplot2'


In [3]:
# switch working directory to root
while True:
    if 'ic-gov-visualization' != os.getcwd().split('/')[-1]:
        os.chdir('..')
    else:
        print(f'working dir: {os.getcwd()}')
        break


working dir: /Users/crinstaniev/Dev/ic-gov-visualization


## Fetch Proposals

In [4]:
base_url = 'https://ic-api.internetcomputer.org'

In [5]:
# get total proposal counts
res = requests.get(f'{base_url}/api/nns/proposals-count')
proposals_count = json.loads(res.text)['proposals_count']

print(f'Total Proposals: {proposals_count}')


Total Proposals: 92513


In [6]:
# fetch proposals
def get_proposal(id):
    url = base_url + '/api/v3/proposals/' + str(id)
    res = requests.get(url)
    res_dict = json.loads(res.text)
    return res_dict


def get_neuron(id):
    url = f'https://ic-api.internetcomputer.org/api/v3/neurons/{id}'
    res = requests.get(url)
    res_dict = json.loads(res.text)
    return res_dict


In [11]:
proposals = []
proposal_ids = [i for i in range(proposals_count)]


def dummy(id):
    res = get_proposal(id)
    proposals.append(dict(
        id=id,
        data=res
    ))
    print(f'fetched: proposal {id}')


# enable multithreading for faster download
pool = ThreadPool(50)

pool.imap_unordered(dummy, proposal_ids)


<multiprocessing.pool.IMapUnorderedIterator at 0x1475ccbb0>

fetched: proposal 0fetched: proposal 10

fetched: proposal 48
fetched: proposal 35
fetched: proposal 32
fetched: proposal 47
fetched: proposal 14
fetched: proposal 28
fetched: proposal 41
fetched: proposal 7
fetched: proposal 8
fetched: proposal 2
fetched: proposal 12
fetched: proposal 1
fetched: proposal 11
fetched: proposal 5
fetched: proposal 6
fetched: proposal 3
fetched: proposal 18
fetched: proposal 15
fetched: proposal 37
fetched: proposal 13
fetched: proposal 9
fetched: proposal 16
fetched: proposal 44
fetched: proposal 29
fetched: proposal 45
fetched: proposal 36
fetched: proposal 24
fetched: proposal 23fetched: proposal 4

fetched: proposal 43
fetched: proposal 21
fetched: proposal 40
fetched: proposal 30
fetched: proposal 17
fetched: proposal 38
fetched: proposal 33
fetched: proposal 42
fetched: proposal 31
fetched: proposal 49
fetched: proposal 19
fetched: proposal 46
fetched: proposal 39
fetched: proposal 26
fetched: proposal 22
fetched: proposal 34fetched: proposal 20

fe

In [13]:
pickle.dump(proposals, open('data/nns_data_raw.pkl', 'wb'))

In [7]:
data = pickle.load(open('data/nns_data_raw.pkl', 'rb'))
data.sort(key=itemgetter('id'))
json.dump(data, open('data/proposals.json', 'w'))

In [8]:
# filter empty data
proposals_no_empty = []

for item in data:
    if (item['data'].get('code') != 404):
        proposals_no_empty.append(
            item['data']
        )
json.dump(proposals_no_empty, open('data/proposals_no_empty.json', 'w'))


In [9]:
df = pd.read_json('data/proposals_no_empty.json')


## Wordcloud - Proposal Summary

In [10]:
with open('data/proposal_summary_text.txt', 'w') as f:
    f.write(' '.join(list(df['summary'].values)))


In [11]:
# cleaning data for wordcloud
df_wordcloud = df[['topic', 'summary']]
df_wordcloud = df_wordcloud.groupby('topic').aggregate(' '.join).reset_index()
df_wordcloud.to_csv('data/topic_summary_joined.csv')

# remove non-utf8 characters


def remove_non_utf8(x: str):
    x = x.encode('utf-8', errors='ignore').decode('utf-8')
    x = regex.sub(r'[^\x00-\x7f]', u'', x)
    x = regex.sub(r"http\S+", "", x)
    x = x.translate(str.maketrans('', '', string.punctuation))
    x = word_tokenize(x)
    x = [word for word in x if not word in stopwords.words('english')]

    return (" ").join(x)


df_wordcloud['summary'] = df_wordcloud['summary'].apply(remove_non_utf8)
df_wordcloud.to_csv('data/topic_summary_joined.csv')


In [12]:
df_wordcloud.head()


Unnamed: 0,topic,summary
0,TOPIC_EXCHANGE_RATE,The ICPXDR conversion rate set 404904 The ICPX...
1,TOPIC_GOVERNANCE,SetAuthorizedSubnetworks SetAuthorizedSubnetwo...
2,TOPIC_KYC,The proposal unlocks neurons associated listed...
3,TOPIC_NETWORK_CANISTER_MANAGEMENT,Upgrade ledger canister git commit 8a560f9510b...
4,TOPIC_NETWORK_ECONOMICS,ClearProvisionalWhitelist Initialize rewards t...


In [13]:
def generate_wordcloud(df_wordcloud, topics):

    words = []
    for topic in topics:
        words.append(
            df_wordcloud.query(f"topic == '{topic}'")['summary'].values[0]
        )

    all_words = (' ').join(words)

    with open('data/all_words.txt', 'w') as f:
        f.write(all_words)

    wc = WordCloud(
        background_color='black',
        repeat=True,
        width=1024 * 2,
        height=768 * 2,
        max_words=200,
        colormap='Set2',
        font_path='fonts/impact.ttf'
    )
    wc.generate(all_words)

    return wc


In [14]:
def generate_word_freq(df_wordcloud, topics):

    words = []
    for topic in topics:
        words.append(
            df_wordcloud.query(f"topic == '{topic}'")['summary'].values[0]
        )

    all_words = [ele for ele in words if len(ele) > 20]
    all_words = (' ').join(all_words).upper()
    all_words = regex.sub(r'\b[0-9]+\b\W*', '', all_words)
    all_words = regex.sub(r'(?:^| )\w(?:$| )', ' ', all_words)

    def replace_digit(string):
        string = regex.sub(r'\d', '', string).strip()
        return string

    all_words = all_words.upper()

    tokens = nltk.word_tokenize(all_words)
    text = nltk.Text(tokens)
    freq = nltk.Counter(text).items()
    res = []
    for item in freq:
        res.append(item)

    res.sort(key=lambda x: x[1], reverse=True)
    return res


data = generate_word_freq(
    df_wordcloud=df_wordcloud,
    topics=[
        'TOPIC_EXCHANGE_RATE',
        'TOPIC_GOVERNANCE',
        'TOPIC_KYC',
        'TOPIC_NETWORK_CANISTER_MANAGEMENT',
        'TOPIC_NETWORK_ECONOMICS',
        'TOPIC_NODE_ADMIN',
        'TOPIC_NODE_PROVIDER_REWARDS',
        'TOPIC_PARTICIPANT_MANAGEMENT',
        'TOPIC_SUBNET_MANAGEMENT',
    ]
)

(
    WordCloud(init_opts=opts.InitOpts())
    .add(
        series_name="Keywords",
        data_pair=data,
        word_size_range=[20, 80],
        textstyle_opts=opts.TextStyleOpts(
            font_family="impact",
            font_weight='bold',
            color='red'
        ),
        rotate_step=90
    )
    .set_global_opts(
        # title_opts=opts.TitleOpts("Theme-dark"),
        title_opts=opts.TitleOpts(
            title="Keywords", title_textstyle_opts=opts.TextStyleOpts(font_size=23),
        ),
        tooltip_opts=opts.TooltipOpts(is_show=True),
    )
    .render('figures/wordcloud.html')
)


'/Users/crinstaniev/Dev/ic-gov-visualization/figures/wordcloud.html'

## Area Chart - Topics River

In [15]:
topic_time_df = df[['proposal_id', 'topic',
                    'updated_at']].set_index('proposal_id')

topic_time_df['updated_at'] = topic_time_df['updated_at'].dt.floor('d')

topic_time_stats = topic_time_df.groupby(
    'updated_at').value_counts().reset_index()
topic_time_stats.columns = ['date', 'topic', 'count']
topic_time_stats


Unnamed: 0,date,topic,count
0,2021-08-05,TOPIC_EXCHANGE_RATE,265
1,2021-08-05,TOPIC_SUBNET_MANAGEMENT,100
2,2021-08-05,TOPIC_NODE_PROVIDER_REWARDS,100
3,2021-08-05,TOPIC_GOVERNANCE,86
4,2021-08-05,TOPIC_NETWORK_CANISTER_MANAGEMENT,59
...,...,...,...
1253,2022-11-18,TOPIC_NETWORK_CANISTER_MANAGEMENT,7
1254,2022-11-18,TOPIC_SUBNET_MANAGEMENT,5
1255,2022-11-18,TOPIC_REPLICA_VERSION_MANAGEMENT,3
1256,2022-11-18,TOPIC_PARTICIPANT_MANAGEMENT,1


In [16]:
fig = px.area(topic_time_stats, x='date', y='count',
              color='topic')
fig.update_yaxes(type='log', range=[0, 4])

with open('figures/timeriver.html', 'w') as f:
    f.write(fig.to_html())

fig.show()


In [19]:
topic_time_df = df[['proposal_id', 'topic',
                    'updated_at']].set_index('proposal_id')

topic_time_df['updated_at'] = topic_time_df['updated_at'].dt.floor('d')

topic_time_stats = topic_time_df.groupby(
    'updated_at').value_counts().unstack(fill_value=0).reset_index()
topics = list(topic_time_stats.columns[1:])

topic_time_stats['sum'] = 0

for topic in topics:
    topic_time_stats['sum'] += topic_time_stats[topic]


In [21]:
fig = make_subplots(rows=2, cols=1, shared_xaxes=True)

for topic in topics:
    fig.add_trace(go.Scatter(
        x=topic_time_stats['updated_at'],
        y=topic_time_stats[topic],
        mode='lines',
        line=dict(width=0.5),
        stackgroup='one',
        groupnorm='percent',
        name=topic,
    ), row=2, col=1)

fig.add_traces(
    go.Line(x=topic_time_stats['updated_at'], y=topic_time_stats['sum'], name='Sum of all Topics'))

fig.update_layout(
    dict(title='Proposal Topic Changes Over Time', width=1200, height=800))

# fig.update_layout(yaxis_range=(0, 100))
fig.update_xaxes(title='Time')

fig['layout']['yaxis2'].update(title='Percentage', range=[60, 100])
fig['layout']['yaxis1'].update(title='Count')

fig.show()
with open('figures/topic_area_chart.html', 'w') as f:
    f.write(fig.to_html())


## Chord

In [28]:
proposer_network = df[['proposal_id', 'proposer',
                       'known_neurons_ballots', 'updated_at']]

# drop proposals with no votes
proposer_network = proposer_network[proposer_network['known_neurons_ballots'].apply(
    len) != 0]
proposer_network['proposer'] = proposer_network['proposer'].astype(int)

# build proposer roster
voters = proposer_network['known_neurons_ballots'].values.flatten()
voters_temp = set()
voters_id = set()
for voter_group in tqdm(voters):
    for voter in voter_group:
        voters_id.add(int(voter.get('id')))
        voter = (voter.get('id'), voter.get('name'))
        voters_temp.add(voter)

voters = voters_temp

proposer_set = set()
for proposal in proposer_network.itertuples():
    proposer_id = proposal.proposer
    proposer_set.add(int(proposer_id))


100%|██████████| 76471/76471 [00:00<00:00, 514620.07it/s]


In [31]:
chord_df = pd.DataFrame(0, columns=list(voters_id), index=list(proposer_set))

for proposal in tqdm(proposer_network.itertuples(), total=len(proposer_network)):
    voters = proposal.known_neurons_ballots
    proposer_id = int(proposal.proposer)
    for voter in voters:
        voter_id = int(voter.get('id'))
        vote = voter.get('vote')
        chord_df.loc[proposer_id, voter_id] += 1


100%|██████████| 76471/76471 [00:15<00:00, 4979.02it/s] 


In [35]:
links = []
nodes = []

for item in chord_df.itertuples():
    proposer = f'Proposer {item.Index}'
    nodes.append(dict(
        name=proposer,
        group='left'
    ))
    for voter in chord_df.columns:
        amount = int(chord_df.loc[item.Index, voter])
        voter = f'Voter {voter}'
        links.append(dict(
                source=voter,
                target=proposer,
                value=amount
        ))

for voter in chord_df.columns:
    voter = f'Voter {voter}'
    nodes.append(dict(
        name=voter,
        group='right'
    ))


In [38]:
SplitChord(
    links,
    nodes,
    directed=True,
    conjunction='to',
    verb='give',
    noun='votes',
    color='purple_red'
).to_html('figures/chord.html')


## Terminus

In [39]:
few_links = list(filter(lambda x: x['value'] > 1000, links))


In [40]:
Terminus(
    few_links,
    show_stats=False
).to_html('terminus.html')

## Network

In [42]:
df_network = df[df['topic'] != 'TOPIC_EXCHANGE_RATE']['summary']

In [43]:
network_all_words = ('. '.join(list(df_network.values)))
network_all_words = regex.sub(r'\b[0-9]+\b\W*', '', network_all_words)
network_all_words = regex.sub(r'(?:^| )\w(?:$| )', ' ', network_all_words)


In [48]:
def remove_url(txt):
    url_pattern = regex.compile(r'https?://\S+|www\.\S+')
    no_url = url_pattern.sub(r'', txt)

    return no_url


def stem_tokens(tokens):
    stemmer = nltk.stem.PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    return tokens


def lemmatize_tokens(tokens):
    lemmatizer = nltk.stem.WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return tokens


In [49]:
text_no_urls = remove_url(network_all_words)
text_no_urls = text_no_urls.encode('utf-8', errors='ignore').decode('utf-8')
words_in_text = text_no_urls.split()
stop_words = set(stopwords.words('english'))
words_nsw = [word for word in words_in_text if not word in stop_words]
words_nsw = [ele for ele in words_nsw if len(ele) < 20]


In [52]:
lemmatizer = WordNetLemmatizer()
words_nsw_nc = [lemmatizer.lemmatize(w) for w in words_nsw]
words_nsw_nc = [
    word for word in words_nsw_nc if word not in string.punctuation]


In [53]:
bigram = list(nltk.bigrams(words_nsw_nc))
bigram_counts = collections.Counter(bigram).most_common(100)

In [55]:
G = nx.Graph()
for edge in bigram_counts:
    link = edge[0]
    count = edge[1]
    
    G.add_edge(link[0], link[1], weight=count)

In [59]:
fig = ig.plot(
    G,
    node_opacity=0.8,
)
fig.update_layout(
    dict(
        width=800,
        height=600,
    )
)

with open('figures/network.html', 'w') as f:
    f.write(fig.to_html())

fig.show()
