In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import networkx as nx
from numba import jit, prange

In [2]:
# Label interpolated into the titles of the degree-distribution plots.
# NOTE(review): 'goop' does not match the bnu-2204 files below — confirm this label is intended.
DATASET_ID = 'goop'
# CSV from which the 'url' and 'connected_to' columns are read to build the structure graph.
FILENAME_STRUCTURE = 'prep-data/bnu-2204.csv'
# .npy file loaded as the content matrix.
FILENAME_CONTENT = 'similarity-graphs/filter_50/bnu-2204-adj.npy'

In [3]:
def remove_protocol(url):
    """Strip a leading ``http://`` or ``https://`` scheme from a URL.

    Parameters
    ----------
    url : str
        URL that must begin with one of the two supported schemes.

    Returns
    -------
    str
        The URL without its scheme prefix.

    Raises
    ------
    AssertionError
        If ``url`` starts with neither ``http://`` nor ``https://``.
    """
    # Match the full scheme separator: checking only for "http" would let
    # strings such as "httpx.example.com" through and chop 8 arbitrary characters.
    for prefix in ('http://', 'https://'):
        if url.startswith(prefix):
            return url[len(prefix):]

    raise AssertionError(f'Formatting error: URL "{url}" not valid.')

## Load data

In [4]:
# Read only the two columns used below: the page URL and its comma-separated link targets.
# (The former `structure_df['url'] = structure_df['url']` self-assignment was a no-op and is dropped.)
structure_df = pd.read_csv(FILENAME_STRUCTURE, usecols=['url', 'connected_to'])
structure_df.head()

Unnamed: 0,url,connected_to
0,english.bnu.edu.cn/,"english.bnu.edu.cn/about/index.htm,english.bnu..."
1,english.bnu.edu.cn/lifeatbnu/artsculture/index...,"english.bnu.edu.cn/about/index.htm,english.bnu..."
2,english.bnu.edu.cn/newsevents/index.htm,"english.bnu.edu.cn/about/index.htm,english.bnu..."
3,english.bnu.edu.cn/newsevents/events/index.htm,"english.bnu.edu.cn/about/index.htm,english.bnu..."
4,english.bnu.edu.cn/lifeatbnu/sportswellbeing/i...,"english.bnu.edu.cn/about/index.htm,english.bnu..."


### Create the graph of the structure

### Check if there are duplicates

In [5]:
# Tally how often each URL occurs; a tally above one means the URL appears in several rows.
unique_urls, count_duplicates = np.unique(
    structure_df['url'].values,
    return_counts=True,
)
# Not used by the assertion itself; holds the offending URLs for inspection.
duplicate_urls = unique_urls[count_duplicates > 1]
assert not np.any(count_duplicates > 1), 'There sould not be duplicates in data'

### Create the graph itself

In [6]:
# Undirected graph with one node per URL in structure_df; no edges yet.
structure_graph = nx.Graph()
structure_graph.add_nodes_from(structure_df['url'].to_numpy())

### Add the adjacency (edges) to the graph

In [7]:
# Walk the two columns in lockstep and connect each page to the targets it lists.
for from_url, connected_to in zip(structure_df['url'], structure_df['connected_to']):
    # Skip rows whose 'connected_to' cell is null
    if pd.isnull(connected_to):
        continue

    for to_url in connected_to.split(','):
        # Only link to targets that are themselves nodes of the graph
        if to_url in structure_graph:
            structure_graph.add_edge(from_url, to_url)

### Transform it to an adjacency matrix

In [8]:
# Boolean adjacency matrix with rows/columns ordered like structure_df['url'].
# nx.to_numpy_matrix was removed in NetworkX 3.0 and the np.bool alias in NumPy 1.24;
# to_numpy_array + np.asmatrix with the builtin bool yields the same np.matrix result
# on both old and current library versions.
structure = np.asmatrix(
    nx.to_numpy_array(structure_graph, nodelist=structure_df['url'].values, dtype=bool)
)

### Load the content adjacency matrix

In [9]:
# Load the pre-computed content array from disk.
# NOTE(review): presumably a square adjacency matrix whose row order matches structure_df —
# confirm against whatever produced the .npy file.
content = np.load(FILENAME_CONTENT)

## Get data sizes

In [10]:
# Number of rows (nodes) in each matrix, printed side by side for a visual comparison.
print('Structure len:', len(structure))
print('Content len:', len(content))

Structure len: 990
Content len: 990


## Compute phase

### Degree cumulative distribution

In [25]:
@jit(nopython=True, nogil=True, parallel=True, fastmath=True)
def node_degrees_m(adj, out):
    """Write the number of non-zero entries of each row of ``adj`` into ``out``.

    ``out`` is filled in place (one slot per row of ``adj``); nothing is returned.
    """
    n_rows = adj.shape[0]
    # Each iteration writes only its own slot of `out`, so rows are handled independently
    for i in prange(n_rows):
        out[i] = np.count_nonzero(adj[i])
        
def degrees_cum_distrib(adj, out):
    """Return the distinct node degrees of ``adj`` and their cumulative share.

    ``out`` is overwritten in place with the per-node degrees. The result is a
    pair ``(values, cdf)``: the sorted distinct degrees, and for each one the
    fraction of nodes whose degree is less than or equal to it.
    """
    node_degrees_m(adj, out)

    degree_values, occurrences = np.unique(out, return_counts=True)
    running_total = occurrences.cumsum()
    # Normalise by the largest running total so the curve ends at 1
    return degree_values, running_total / running_total.max()

def plot_degrees_cum_distrib(adj, title, xlim_right=None):
    """Plot the empirical CDF of the node degrees of an adjacency matrix.

    Parameters
    ----------
    adj : 2-D array
        Adjacency matrix; a node's degree is the number of non-zero entries in its row.
    title : str
        Suffix appended to the figure title after ``DATASET_ID``.
    xlim_right : number, optional
        Upper bound for the x-axis. With the default ``None`` the axis range is
        left to Plotly's automatic choice.
    """
    # One degree slot per node, filled in place by degrees_cum_distrib
    degrees = np.zeros(adj.shape[0], dtype=np.int32)
    vals, cdf = degrees_cum_distrib(adj, degrees)

    fig = go.Figure(data=go.Scatter(x=vals, y=cdf,
                                    mode='lines+markers'))
    fig.update_layout(title=f'CDF for the degree distribution of {DATASET_ID} {title}',
                      xaxis_title='Node degree', yaxis_title='CDF')

    if xlim_right is not None:
        # The degree values come back sorted, so the first one is the smallest observed
        # degree; use it as the left bound and the caller's value as the right bound.
        fig.update_xaxes(range=[vals[0].item(), xlim_right])

    fig.show()

In [26]:
# Same degree-CDF plot for the link structure and for the content matrix.
plot_degrees_cum_distrib(adj=structure, title='(structure)')
plot_degrees_cum_distrib(adj=content, title='(content)')