In [1]:
import pandas as pd
import numpy as np
import networkx as nx

In [60]:
# Input files; both paths are relative, so they resolve against the working directory.
# CSV read below with pandas for the page link structure ('url', 'connected_to' columns).
FILENAME_STRUCTURE = 'prep-data/bnu-2204.csv'
# .npy file read below with np.load and converted into the content graph.
FILENAME_SIM = 'watt-2204-adj.npy'

## Load data

In [61]:
# Read only the two columns needed to build the link graph: each page's URL and
# the comma-separated list of URLs it is connected to.
structure = pd.read_csv(FILENAME_STRUCTURE, usecols=['url', 'connected_to'])
# Preview the first rows.
structure.head()

Unnamed: 0,url,connected_to
0,english.bnu.edu.cn/,"english.bnu.edu.cn/about/index.htm,english.bnu..."
1,english.bnu.edu.cn/lifeatbnu/artsculture/index...,"english.bnu.edu.cn/about/index.htm,english.bnu..."
2,english.bnu.edu.cn/newsevents/index.htm,"english.bnu.edu.cn/about/index.htm,english.bnu..."
3,english.bnu.edu.cn/newsevents/events/index.htm,"english.bnu.edu.cn/about/index.htm,english.bnu..."
4,english.bnu.edu.cn/lifeatbnu/sportswellbeing/i...,"english.bnu.edu.cn/about/index.htm,english.bnu..."


## Create the graph of the structure

### Check if there are duplicates

In [64]:
# Every URL becomes a graph node, so each URL must appear in exactly one row.
unique_urls, count_duplicates = np.unique(structure['url'].values, return_counts=True)
# URLs that occur more than once in the data.
duplicate_urls = unique_urls[count_duplicates > 1]
# Name the offending URLs so a failing check is immediately actionable.
assert duplicate_urls.size == 0, (
    'There should not be duplicates in data; duplicated urls: {}'.format(list(duplicate_urls))
)

### Create the graph

In [65]:
# One node per page URL; the graph starts out without any edges.
page_urls = structure['url'].values
structure_graph = nx.Graph()
structure_graph.add_nodes_from(page_urls)

## Load the content adjacency matrix

In [66]:
# Connect each page to the pages listed in its 'connected_to' field.
# Rows whose 'connected_to' is null have nothing to link and are dropped up front.
rows_with_links = structure.dropna(subset=['connected_to'])
for from_url, connected_to in zip(rows_with_links['url'], rows_with_links['connected_to']):
    for to_url in connected_to.split(','):
        # Skip targets that are not nodes of the graph, i.e. connections
        # which are not pages themselves.
        if to_url not in structure_graph:
            continue
        structure_graph.add_edge(from_url, to_url)

In [67]:
# Load the adjacency matrix stored as a .npy file and build a graph from it.
# NOTE(review): the recorded run of this cell failed with FileNotFoundError for
# FILENAME_SIM -- confirm the path is correct relative to the working directory.
content = np.load(FILENAME_SIM)
# `nx.from_numpy_matrix` was removed in NetworkX 3.0; `nx.from_numpy_array`
# is the supported way to build a graph from a 2D NumPy adjacency array.
content_graph = nx.from_numpy_array(content)

FileNotFoundError: [Errno 2] No such file or directory: 'watt-2204-adj.npy'

## Get data sizes

In [68]:
# Report how many nodes each graph holds (len() of a graph is its node count).
print('Structure len:', len(structure_graph))
print('Content len:', len(content_graph))

Structure len: 990
Content len: 380
