# Data exploration

In [None]:
# Fill in with your data
# Label for this dataset; it is prepended to every plot title below.
DATASET_ID = 'watt'
# CSV read with pandas; only its 'url' and 'connected_to' columns are used.
FILENAME_STRUCTURE = 'prep-data/watt-2604.csv'
# NumPy .npy file loaded as the content matrix.
FILENAME_CONTENT = 'similarity-graph/full/watt-2604-adj.npy'
# Cut-off used to binarise the content matrix: entries of (1 - matrix) that
# are strictly greater than this value become edges.
THRESHOLD = 0.9332889

In [None]:
import math
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import networkx as nx
from numba import jit, prange

## Load data

### Load structure

In [None]:
# Read only the two columns needed to build the link graph: the page URL and
# its comma-separated list of linked URLs.
structure_df = pd.read_csv(FILENAME_STRUCTURE, usecols=['url', 'connected_to'])
structure_df.head()

### Check if there are duplicates

In [None]:
# URLs are used as graph node identifiers, so each one must appear exactly once
unique_urls, count_duplicates = np.unique(structure_df['url'].values, return_counts=True)
duplicate_urls = unique_urls[count_duplicates > 1]
# Report a few offending URLs so a failure is actionable
assert duplicate_urls.size == 0, (
    f'There should not be duplicates in data, found: {list(duplicate_urls[:5])}'
)

### Build the graph

In [None]:
# Start from an undirected graph holding one node per page URL; the edges are
# added in the next cell.
structure_graph = nx.Graph()
structure_graph.add_nodes_from(structure_df['url'].values)

In [None]:
# Walk the two columns in lockstep: each page and its comma-separated links
for from_url, connected_to in zip(structure_df['url'], structure_df['connected_to']):
    # Pages without any outgoing link have a null value here
    if pd.isnull(connected_to):
        continue

    for to_url in connected_to.split(','):
        # Only link to targets that are pages of the dataset themselves
        if to_url in structure_graph:
            structure_graph.add_edge(from_url, to_url)

### Transform it into an adjacency matrix

In [None]:
# Dense adjacency matrix whose rows/columns follow the URL order of
# structure_df. nx.to_numpy_array is used because nx.to_numpy_matrix was
# removed in NetworkX 3.0; it already returns a plain ndarray, so no extra
# np.array(...) conversion is needed.
structure = nx.to_numpy_array(structure_graph, nodelist=structure_df['url'].values, dtype=np.int32)
# Ignore self-loops
np.fill_diagonal(structure, 0)

### Load the content adjacency matrix

In [None]:
# Load the stored matrix and replace every entry x with 1 - x
content = 1 - np.load(FILENAME_CONTENT)
# Zero the diagonal before thresholding
np.fill_diagonal(content, 0)

# Binarise: 1 where the entry is strictly above THRESHOLD, 0 elsewhere
content = (content > THRESHOLD).astype(np.int32)

## Get data sizes

In [None]:
# Number of rows (one per page) in each adjacency matrix
print('Structure len:', len(structure))
print('Content len:', len(content))

# Both matrices must describe the same number of pages
assert len(structure) == len(content), 'Structure and content does not have the same number of pages'

## Compute phase

### Degree cumulative distribution

In [None]:
@jit(nopython=True, nogil=True, parallel=True, fastmath=True)
def node_degrees_m(adj, out):
    """Write the number of non-zero entries of each row of ``adj`` into ``out``.

    ``out[i]`` receives the count for row ``i``; the array is filled in place
    and nothing is returned.
    """
    n_rows = adj.shape[0]
    for i in prange(n_rows):
        out[i] = np.count_nonzero(adj[i])

def plot_degrees_cum_distrib(adj, title):
    """Show the cumulative distribution of node degrees as a Plotly histogram.

    Parameters
    ----------
    adj : adjacency matrix; the degree of a node is the number of non-zero
        entries in its row.
    title : text appended to ``DATASET_ID`` in the figure title.
    """
    node_count = adj.shape[0]
    degrees = np.zeros(node_count, dtype=np.int32)
    node_degrees_m(adj, degrees)

    histogram = go.Histogram(
        x=degrees,
        histnorm='probability',
        cumulative_enabled=True,
        autobinx=True,
    )
    fig = go.Figure(data=histogram)
    fig.update_layout(
        title=f'{DATASET_ID} {title}',
        xaxis_title='Node degree',
        yaxis_title='CDF',
    )
    fig.update_traces(opacity=0.75)
    fig.show()

In [None]:
# One figure per graph; the second argument is the suffix shown in the title
plot_degrees_cum_distrib(structure, '(structure)')
plot_degrees_cum_distrib(content, '(content)')

### Clustering coefficient

In [None]:
@jit(nopython=True, nogil=True, parallel=True, fastmath=True)
def local_cc(adj, out):
    """Compute a weighted local clustering coefficient for every node.

    Weights are first divided by the largest entry of ``adj``. For node ``i``
    the numerator sums, over every pair ``(j, k)``, the cube root of
    ``w[i, j] * w[k, j] * w[k, i]``; the denominator is ``d * (d - 1)`` where
    ``d`` is the number of non-zero entries in row ``i``. Nodes with
    ``d < 2`` get a coefficient of 0.

    Parameters
    ----------
    adj : 2-D array
        Adjacency matrix. It is indexed with node indices on both axes, so it
        must be square.
    out : 1-D array with one slot per node
        Filled in place with the coefficients; nothing is returned.
    """
    n_nodes = adj.shape[0]

    # Nothing to compute for an empty matrix (np.max would fail on it)
    if adj.size == 0:
        return

    # When the largest entry is 0 the normalisation would divide by zero and
    # turn every coefficient into NaN; report 0 for all nodes instead.
    max_weight = np.max(adj)
    if max_weight == 0:
        out[:] = 0
        return

    # True division already allocates a new array, so no explicit copy needed
    adj_norm = adj / max_weight

    # Only the outer loop is parallel: each iteration has its own accumulator
    # and writes a distinct slot of `out`.
    for node_insp_idx in prange(n_nodes):
        # Compute numerator
        num = 0.0
        for neigh_1_idx in range(n_nodes):
            insp_to_1 = adj_norm[node_insp_idx, neigh_1_idx]

            # Skip nodes if there is no connection at first
            if insp_to_1 != 0:
                for neigh_2_idx in range(n_nodes):
                    gmean = (insp_to_1
                             * adj_norm[neigh_2_idx, neigh_1_idx]
                             * adj_norm[neigh_2_idx, node_insp_idx])
                    num += np.power(gmean, 1/3)

        # Compute denominator
        degree = np.count_nonzero(adj_norm[node_insp_idx])
        denom = degree * (degree - 1)

        # Store the result for each node (0 when fewer than two neighbours)
        if denom > 0:
            out[node_insp_idx] = num / denom
        else:
            out[node_insp_idx] = 0.0
    
        
def plot_cc_cum_distrib(adj, title):
    """Show the cumulative distribution of local clustering coefficients.

    Draws a Plotly histogram of the per-node coefficients computed by
    ``local_cc`` and prints their average.

    Parameters
    ----------
    adj : adjacency matrix passed to ``local_cc``.
    title : text appended to ``DATASET_ID`` in the figure title.
    """
    node_count = adj.shape[0]
    ccs = np.zeros(node_count, dtype=np.float32)
    local_cc(adj, ccs)

    histogram = go.Histogram(
        x=ccs,
        histnorm='probability',
        cumulative_enabled=True,
        autobinx=True,
    )
    fig = go.Figure(data=histogram)
    fig.update_layout(
        title=f'{DATASET_ID} {title}',
        xaxis_title='Clustering coefficient',
        yaxis_title='CDF',
    )
    fig.update_traces(opacity=0.75)
    fig.show()

    print('Average:', np.average(ccs))

In [None]:
# One figure (plus printed average) per graph; the second argument is the
# suffix shown in the title
plot_cc_cum_distrib(structure, '(structure)')
plot_cc_cum_distrib(content, '(content)')

## License
<small>Copyright (C) 2020 MaLGa ML4DS 

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see &lt;https://www.gnu.org/licenses/&gt;.</small>