In [1]:
import pandas as pd
import numpy as np
import networkx as nx

In [2]:
# Input files, given as relative paths.
# CSV file from which the 'url' and 'connected_to' columns are read.
FILENAME_STRUCTURE = 'prep-data/goop-2104.csv'
# NumPy .npy file that is loaded as the content adjacency matrix.
FILENAME_SIM = 'similarity-graphs/goop-2104-adj.npy'

In [3]:
def remove_protocol(url):
    """Strip the leading ``http://`` or ``https://`` scheme from a URL.

    Parameters
    ----------
    url : str
        URL that must begin with ``http://`` or ``https://``.

    Returns
    -------
    str
        The URL without its scheme prefix, e.g. ``'goop.com/'`` for
        ``'https://goop.com/'``.

    Raises
    ------
    AssertionError
        If ``url`` starts with neither supported scheme. Strings that merely
        start with ``'http'`` (such as ``'http:/x'``) are rejected instead of
        having an arbitrary 8-character prefix removed.
    """
    for scheme in ('http://', 'https://'):
        if url.startswith(scheme):
            return url[len(scheme):]

    # Raised explicitly (rather than via ``assert``) so the validation still
    # runs under ``python -O``; the exception type callers see is unchanged.
    raise AssertionError(f'Formatting error: URL "{url}" not valid.')

## Load data

In [4]:
# Read only the two columns used to build the link graph: the page URL and the
# comma-separated string of URLs it is connected to.
# NOTE(review): the previous `structure_df['url'] = structure_df['url']` was a
# no-op and has been dropped. The displayed rows carry no http(s):// prefix, so
# no protocol stripping appears to be needed here — confirm against the data.
structure_df = pd.read_csv(FILENAME_STRUCTURE, usecols=['url', 'connected_to'])
structure_df.head()

Unnamed: 0,url,connected_to
0,goop.com/,"goop.com/#main-content,shop.goop.com/shop/coll..."
1,shop.goop.com/shop/collection/accessories,shop.goop.com/shop/collection/accessories#page
2,shop.goop.com/shop/collection/books,shop.goop.com/shop/collection/books#page
3,shop.goop.com/shop/collection/home,shop.goop.com/shop/collection/home#page
4,goop.com/wellness/spirituality/faith-in-the-ti...,goop.com/wellness/spirituality/faith-in-the-ti...


## Create the graph of the structure

### Check if there are duplicates

In [5]:
# Count how often each URL occurs. URLs are used as graph node identifiers,
# so every URL must appear in exactly one row.
unique_urls, count_duplicates = np.unique(structure_df['url'].values, return_counts=True)
duplicate_urls = unique_urls[count_duplicates > 1]
# Include the number of offenders and a small sample so a failure is actionable.
assert duplicate_urls.size == 0, (
    f'There should not be duplicates in data; found {duplicate_urls.size}, '
    f'e.g. {duplicate_urls[:5].tolist()}'
)

### Create the graph and add the nodes

In [6]:
# Undirected graph with one node per URL found in the 'url' column.
structure_graph = nx.Graph()
structure_graph.add_nodes_from(structure_df['url'].tolist())

### Add the edges (adjacency) to the graph

In [7]:
# Walk every (page URL, outgoing links) pair and connect the page to each
# linked URL that is itself a node of the graph.
for from_url, connected_to in zip(structure_df['url'], structure_df['connected_to']):
    # Rows without any connection hold a null value instead of a string
    if pd.isnull(connected_to):
        continue

    # Connections are stored as a single comma-separated string
    for to_url in connected_to.split(','):
        # Skip targets which are not pages themselves
        if to_url not in structure_graph:
            continue
        structure_graph.add_edge(from_url, to_url)

## Load the content adjacency matrix

In [8]:
# Load the precomputed array stored at FILENAME_SIM.
# NOTE(review): presumably a square content-similarity adjacency matrix with one
# row per page, ordered like structure_df — confirm against the code that wrote it.
content = np.load(FILENAME_SIM)

## Get data sizes

In [9]:
# Show the number of graph nodes next to the first dimension of the content
# array so the two sizes can be compared at a glance.
structure_len = structure_graph.number_of_nodes()
print('Structure len:', structure_len)
content_len = content.shape[0]
print('Content len:', content_len)

Structure len: 26219
Content len: 26219
