In [1]:
import pandas as pd
import numpy as np
import networkx as nx

In [11]:
# CSV read with pandas below; the `url` and `connected_to` columns are used.
FILENAME_STRUCTURE = 'scraper-data/watt-2204.csv'
# .npy file loaded with np.load below and turned into the content graph.
FILENAME_SIM = 'watt-2204-adj.npy'

In [12]:
def remove_protocol(url):
    """Strip the leading ``http://`` or ``https://`` scheme from a URL.

    Parameters
    ----------
    url : str
        Absolute URL beginning with ``http://`` or ``https://``.

    Returns
    -------
    str
        ``url`` with the scheme prefix removed.

    Raises
    ------
    AssertionError
        If ``url`` does not start with one of the two supported schemes.
    """
    # Match the full scheme including '://'. Checking only for 'http' would
    # accept strings such as 'httpfoo' and silently chop 8 arbitrary characters.
    for scheme in ('http://', 'https://'):
        if url.startswith(scheme):
            return url[len(scheme):]

    # Raised explicitly (instead of via `assert`) so the validation still runs
    # when Python is started with -O; exception type and message are unchanged.
    raise AssertionError(f'Formatting error: URL "{url}" not valid.')

## Load data

In [20]:
# Read only the page URL and its comma-separated outgoing links, then store
# every page URL without its http(s):// prefix.
structure = (
    pd.read_csv(FILENAME_STRUCTURE, usecols=['url', 'connected_to'])
    .assign(url=lambda frame: frame['url'].apply(remove_protocol))
)
structure.head()

Unnamed: 0,url,connected_to
0,www.wattpadwriters.com,"https://www.wattpadwriters.com/,https://www.wa..."
1,www.wattpadwriters.com/guidelines,"https://www.wattpadwriters.com/,https://www.wa..."
2,www.wattpadwriters.com/c/story-services,"https://www.wattpadwriters.com/,https://www.wa..."
3,www.wattpadwriters.com/c/genre-clubs,"https://www.wattpadwriters.com/,https://www.wa..."
4,www.wattpadwriters.com/c/the-pub,"https://www.wattpadwriters.com/,https://www.wa..."


## Create the graph of the structure

### Check if there are duplicates

In [21]:
# Tally how many times each distinct URL appears in the `url` column.
url_values = structure['url'].values
unique_urls, count_duplicates = np.unique(url_values, return_counts=True)

# A URL is a duplicate when it occurs more than once.
is_repeated = count_duplicates > 1
duplicate_urls = unique_urls[is_repeated]
print('Duplicate URLs:', np.sum(is_repeated))

Duplicate URLs: 0


### Build the graph

In [48]:
# Undirected graph with one node per page URL; edges are added in a later cell.
structure_graph = nx.Graph()
structure_graph.add_nodes_from(structure['url'].tolist())

In [49]:
# Add one undirected edge per hyperlink whose target is itself a known page.
for _, row in structure.iterrows():
    from_url = row['url']
    connected_to = row['connected_to']

    # Don't consider null values
    if pd.isnull(connected_to):
        continue

    for raw_url in connected_to.split(','):
        to_url = remove_protocol(raw_url)

        # Link targets may carry a trailing slash (".../") while the node keys
        # do not; without this fallback such links never match a node and the
        # edge is silently dropped. The exact form is still tried first.
        if to_url not in structure_graph:
            to_url = to_url.rstrip('/')

        # Don't consider connections which are not pages themselves
        if to_url in structure_graph:
            structure_graph.add_edge(from_url, to_url)

## Load the content adjacency matrix

In [17]:
# Load the saved matrix and build a graph from it.
# NOTE(review): presumably a square similarity/adjacency matrix whose row order
# matches the rows of the structure CSV — confirm against the producer.
content = np.load(FILENAME_SIM)
# `nx.from_numpy_matrix` was deprecated and removed in NetworkX 3.0;
# `nx.from_numpy_array` is the replacement for plain ndarrays.
content_graph = nx.from_numpy_array(content)

## Get data sizes

In [50]:
# len() of a NetworkX graph is its node count.
print('Structure len:', len(structure_graph))
print('Content len:', len(content_graph))

Structure len: 380
Content len: 380
