In [1]:
import networkx as nx
import matplotlib.colors as clr
from utils.graph import Graph_Viz_Engine
from utils.io import read_in
from utils.data_models import Endpoint_Record

In [2]:
page_relations = './data/outbound-requests-by-domain.json'
relations = read_in(page_relations)

G = nx.Graph(relations)
pos = nx.spring_layout(G)

In [7]:
def generate_node_color_map(nodes: list) -> dict:

    c_map = clr.CSS4_COLORS
    colors = list(c_map.keys())

    return {node: c_map[colors[idx+10]] for idx, node in enumerate(nodes)}

cmap = generate_node_color_map(nodes=list(relations.keys()))

In [21]:
engine = Graph_Viz_Engine(
    graph=G,
    pos=pos,
    node_cmap=cmap
)
engine.fig.show()

In [49]:
def get_subgraph(G: nx.Graph, sub_nodes: list) -> tuple:

    if len(sub_nodes) == 1:
        node = sub_nodes[0]
        graph = nx.ego_graph(G, node)

    else:
        graph = nx.edge_subgraph(G, nx.edges(G, sub_nodes))

    pos = nx.spring_layout(graph)

    return graph, pos

In [56]:
outbound_request = 'pixel.quantserve.com'
list(G.neighbors(outbound_request))


['www.politico.com',
 'www.thedailybeast.com',
 'news.vice.com/en_us',
 'apnews.com/',
 'www.motherjones.com',
 'theweek.com',
 'www.buzzfeednews.com',
 'www.nationalreview.com']

In [57]:
g, p = get_subgraph(G=G, sub_nodes=[outbound_request])
e = Graph_Viz_Engine(
    graph=g,
    pos=p,
    node_cmap=cmap
)
e.fig.show()

In [51]:
newsite = 'nytimes.com'
list(G.neighbors(newsite))

['g1.nyt.com',
 'www.nytimes.com',
 'static01.nyt.com',
 'www.googletagmanager.com',
 'a.et.nytimes.com',
 'samizdat-graphql.nytimes.com',
 'als-svc.nytimes.com',
 'rumcdn.geoedge.be',
 'c.amazon-adsystem.com',
 'securepubads.g.doubleclick.net',
 'nytimes-d.openx.net',
 'fastlane.rubiconproject.com',
 'prebid.media.net',
 'ib.adnxs.com',
 'tlx.3lift.com',
 's.amazon-adsystem.com',
 'sb.scorecardresearch.com',
 'dd.nytimes.com',
 'meter-svc.nytimes.com',
 'content.api.nytimes.com',
 'purr.nytimes.com',
 'a.nytimes.com',
 'adservice.google.com',
 'mwcm.nytimes.com',
 'www.google-analytics.com',
 'tags.bluekai.com',
 '5290727.fls.doubleclick.net',
 'static.chartbeat.com',
 'tags.bkrtx.com',
 'a1.nyt.com',
 'cdn.brandmetrics.com',
 'insight.adsrvr.org',
 '52699c9a8373e35393f2c392f653289f.safeframe.googlesyndication.com',
 'stags.bluekai.com',
 'pnytimes.chartbeat.net',
 'eb2.3lift.com',
 'cs.media.net',
 'dis.criteo.com',
 'medianet-match.dotomi.com',
 'pm.w55c.net',
 'u.openx.net',
 'x.bi

In [54]:
g, p = get_subgraph(G=G, sub_nodes=[newsite])
e = Graph_Viz_Engine(
    graph=g,
    pos=p,
    node_cmap=cmap
)
e.fig.show()

In [50]:
sub_nodes = [
    'nytimes.com',
    'thehill.com',
    'www.latimes.com'
]
sub_g, sub_pos = get_subgraph(G=G, sub_nodes=sub_nodes)
sub_engine = Graph_Viz_Engine(
    graph=sub_g,
    pos=sub_pos,
    node_cmap=cmap
)
sub_engine.fig.show()

In [93]:

def _calculate_differences(
    
    G: nx.Graph, 
    node_1: str, 
    node_2: str, 
    diffs: dict
    
    ) -> dict:

    common = list(nx.common_neighbors(G, node_1, node_2))
    n1 = list(G.neighbors(node_1))
    n2 = list(G.neighbors(node_2))

    diffs[(node_1, node_2)] = {}
    diffs[(node_1, node_2)]['common'] = common
    diffs[(node_1, node_2)][node_1] = n1
    diffs[(node_1, node_2)][node_2] = n2

    diffs[(node_1, node_2)][f"{node_1} stats"] = {

        'total': len(n1),
        'shared': len(common),
        'shared_percent': round(len(common)/len(n1), 2)*100


    }
    diffs[(node_1, node_2)][f"{node_2} stats"] = {

        'total': len(n2),
        'shared': len(common),
        'shared_percent': round(len(common)/len(n2), 2)*100
    }

    return diffs    


 
def calculate_differences(
    
    G: nx.Graph, 
    sub_nodes: list
    
    ) -> dict:

    diffs = {}

    if len(sub_nodes) > 2:

        for idx, node in enumerate(sub_nodes):

            if idx + 1 == len(sub_nodes):
                return diffs

            node_1 = sub_nodes[idx]
            node_2 = sub_nodes[idx + 1]

            diff = calculate_differences(
                G=G, 
                sub_nodes=[
                    node_1, 
                    node_2
                ]
            )
            diffs.update(diff)

    else:

        return _calculate_differences(
            
            G=G, 
            node_1=sub_nodes[0], 
            node_2=sub_nodes[1], 
            diffs=diffs
            
        )



differences = calculate_differences(G, sub_nodes=['nytimes.com', 'thehill.com', 'www.latimes.com'])

In [98]:
print(differences[('nytimes.com', 'thehill.com')]['thehill.com stats'])
print(differences[('nytimes.com', 'thehill.com')]['nytimes.com stats'])

{'total': 60, 'shared': 13, 'shared_percent': 22.0}
{'total': 47, 'shared': 13, 'shared_percent': 28.000000000000004}


In [38]:
def most_central(G: nx.Graph, top: int = 10) -> list:
    return [site for site, degree in sorted(list(G.degree), key=lambda x: x[1], reverse=True)[:top]]

In [39]:
most_central(G=G, top=10)

['www.nationalreview.com',
 'time.com/',
 'www.nydailynews.com',
 'apnews.com/',
 'www.cnn.com/us',
 'nymag.com',
 'www.bloomberg.com',
 'www.latimes.com',
 'news.vice.com/en_us',
 'nypost.com']

In [65]:
list(nx.common_neighbors(G, 'www.nationalreview.com', 'time.com/'))

['b-code.liadm.com',
 'sb.scorecardresearch.com',
 's.ntv.io',
 'www.googletagmanager.com',
 'tag.bounceexchange.com',
 'connect.facebook.net',
 'fastlane.rubiconproject.com',
 'tlx.3lift.com',
 'ib.adnxs.com',
 'hbopenbid.pubmatic.com',
 'cdn.parsely.com',
 'securepubads.g.doubleclick.net',
 'rp.liadm.com',
 'www.google-analytics.com',
 'www.google.com',
 'jadserve.postrelease.com',
 'ads.pubmatic.com',
 'p1.parsely.com',
 'rtb-use.mfadsrvr.com',
 'stats.g.doubleclick.net',
 'ad-cdn.technoratimedia.com',
 'acdn.adnxs.com',
 'eb2.3lift.com',
 'eus.rubiconproject.com',
 'adservice.google.com',
 'i.liadm.com',
 'www.facebook.com',
 'fonts.googleapis.com',
 'gum.criteo.com',
 'api.rlcdn.com',
 'fonts.gstatic.com',
 'pagead2.googlesyndication.com',
 'tpc.googlesyndication.com']

In [60]:
list(nx.common_neighbors(G, 'www.googletagmanager.com', 'www.google.com'))

['nytimes.com',
 'www.politico.com',
 'www.reuters.com',
 'www.latimes.com',
 'www.newsweek.com',
 'www.thedailybeast.com',
 'news.vice.com/en_us',
 'www.npr.org',
 'nypost.com',
 'www.vox.com',
 'nymag.com',
 'www.motherjones.com',
 'time.com/',
 'www.bloomberg.com',
 'www.nydailynews.com',
 'www.axios.com',
 'breitbart.com',
 'www.theblaze.com',
 'www.thenation.com',
 'thefederalist.com',
 'www.redstate.com',
 'www.nationalreview.com',
 'www.dailystar.co.uk']

In [None]:
#function that has common and non-common neighbors with venn diagram calcualted out

In [62]:
non_edgelist = list(nx.non_edges(G))

In [64]:
# non_G = nx.Graph()
# non_G.add_edges_from(non_edgelist)
# non_pos = nx.spring_layout(non_G)

# n = Graph_Viz_Engine(
#     graph=non_G,
#     pos=non_pos,
#     node_cmap=cmap
# )
# n.fig.show()