In [2]:
from itertools import combinations
import pandas as pd
import networkx as nx
import numpy as np
from tqdm import tqdm

import cleantools as clnt

# Generation of Network #

In [4]:
def getEdges(column):
    """Build the edge list of a fully connected group of nodes.

    Parameters
    ----------
    column : iterable
        Node labels that belong to one group. In this notebook the caller
        passes the index of a single contract's sub-table.

    Returns
    -------
    list of tuple
        Every pair of entries from ``column``, in the order
        ``itertools.combinations`` yields them, ready for
        ``networkx.Graph.add_edges_from``. Fewer than two entries give ``[]``.
        Entries are paired by position, so a label that occurs twice also
        produces a pair of that label with itself.
    """
    members = tuple(column)
    return [pair for pair in combinations(members, 2)]

In [5]:
# Read the pickled inputs: the sub-tables frame, and the mentions frame trimmed
# to the two columns kept here.
sub_tables_ungrouped = clnt.loadPickle("../data/pickles/sub_tables.pkl")
mentions = clnt.loadPickle("../data/pickles/clean_mentions.pkl")
mentions = mentions[["Person ID", "Tag"]]

# One group of rows per contract
sub_table_groups = sub_tables_ungrouped.groupby("Contract ID")

# Everyone listed in the same contract is assumed to be connected to everyone
# else in it, so each contract contributes all pairs of its row labels.
# NOTE(review): the row labels (index) are used as node ids - presumably person
# identifiers; confirm against how sub_tables.pkl was built.
edges = []
for name, table in tqdm(sub_table_groups):
    edges.extend(getEdges(table.index))

100%|██████████| 54088/54088 [00:07<00:00, 6793.76it/s]


In [6]:
# Build the undirected graph: register every row label of the sub-tables frame
# as a node first, then connect the nodes with the per-contract edges.
G = nx.Graph()
G.add_nodes_from(sub_tables_ungrouped.index)
G.add_edges_from(edges)

# Drop the isolated (degree-0) nodes; roughly 21 of them are present in this
# graph. The isolates are collected into a list first because nodes cannot be
# removed while the graph is being iterated.
G.remove_nodes_from(list(nx.isolates(G)))

# Save graph (left disabled, exactly as before)
# NOTE(review): the next section loads this same path, so while the save stays
# commented out that load returns whatever pickle is already on disk.
# clnt.savePickle(G, "../data/pickles/social_network_graph_dc.pkl")

# Initial Check #

In [10]:
# Load graph
# Rebinds G to whatever is stored at the path below, replacing any G that is
# already present in the kernel.
# NOTE(review): the savePickle call for this path earlier in the notebook is
# commented out, so the file must already exist on disk - confirm it matches
# the graph built above.
G = clnt.loadPickle("../data/pickles/social_network_graph_dc.pkl")

In [11]:
# Headline statistics of the graph currently bound to G.
# Iterating G.degree yields (node, degree) pairs; only the degrees are needed.
node_degrees = [degree for _, degree in G.degree]

print("Basics of the Network")
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")
print(f"Number of connected components: {nx.number_connected_components(G)}")
print(f"Average degree: {np.mean(node_degrees)}")

Basics of the Graph
Number of nodes: 156059
Number of edges: 157985
Number of connected components: 54059
Average degree: 2.0246829724655417


In [12]:
# Node count of every connected component, followed by summary statistics
# over those counts.
sizes = list(map(len, nx.connected_components(G)))
mean_size, largest_size, smallest_size = np.mean(sizes), np.max(sizes), np.min(sizes)
print(f"Average Component Size: {mean_size}")
print(f"Largest Component Size: {largest_size}")
print(f"Smallest Component Size: {smallest_size}")

Average Component Size: 2.8868273552969903
Largest Component Size: 9
Smallest Component Size: 2


# Extracting Categories From Network #

In [8]:
# Reload both frames from disk so the in-place column edits below always start
# from the stored data when this cell is re-run.
sub_tables_ungrouped = clnt.loadPickle("../data/pickles/sub_tables.pkl")
df = clnt.loadPickle("../data/pickles/table_of_all.pkl").reset_index()
notFamily = "grz:hasBusinessWith grz:isColleagueOf grz:genericReference".split()

# Recode Gender as a boolean flag: True only where the value is "grz:Female"
sub_tables_ungrouped["Gender"] = sub_tables_ungrouped["Gender"].apply(
    lambda gender: gender == "grz:Female"
)

# Blank out every relationship tag listed in notFamily, so that only the
# remaining tags count as a relationship in the per-contract check below.
sub_tables_ungrouped["Relationship From Apprentice"] = (
    sub_tables_ungrouped["Relationship From Apprentice"].replace(notFamily, np.nan)
)


def _contract_has_female(contract):
    """True when at least one row of the contract has the Gender flag set."""
    return contract["Gender"].sum() > 0


def _guarantor_has_relation(contract):
    """True when some guarantor row of the contract still carries a relationship tag."""
    is_guarantor = contract["Is Guarantor"] == 1
    has_tag = contract["Relationship From Apprentice"].notna()
    return len(contract[is_guarantor & has_tag]) > 0


# Group by contract and reduce each group to one boolean per flag.
# Both results are Series indexed by "Contract ID".
sub_table_groups = sub_tables_ungrouped.groupby("Contract ID")
has_female = sub_table_groups.apply(_contract_has_female).copy()
has_relations = sub_table_groups.apply(_guarantor_has_relation).copy()

In [9]:
# Prepare the encoding.
# Each per-contract boolean Series is named explicitly and turned into a
# one-column frame. Going through pd.DataFrame(series, columns=[...]) only fills
# the column while the Series is unnamed: a Series that already carries a
# different name is matched against the requested label and the column is not
# populated. to_frame(name=...) gives the same table without that dependency.
category_table = has_relations.to_frame(name="Has Apprentice Relation").join(
    has_female.to_frame(name="Has Female in Contract")
)

# Province indicators from the project helper; only the "Venezia" column is kept
# and relabelled.
# NOTE(review): presumably one row per contract, indexed so that it aligns with
# category_table's "Contract ID" index - confirm against clnt.hotEncode.
venezia_encoding = clnt.hotEncode(df, df["Apprentice Province"], operation="max")[["Venezia"]]
venezia_encoding = venezia_encoding.rename(columns={"Venezia": "From Venezia"})

# Left join: every contract in category_table is kept. A contract missing from
# venezia_encoding would get NaN in "From Venezia" and therefore in "Category".
category_table = category_table.join(venezia_encoding, how="left").copy()

# Create categories: pack the three flags into one code
#   +1 -> Has Apprentice Relation, +2 -> Has Female in Contract, +4 -> From Venezia
category_table["Category"] = (category_table["Has Apprentice Relation"] +
                              category_table["Has Female in Contract"] * 2 +
                              category_table["From Venezia"] * 4)
category_table

Unnamed: 0_level_0,Has Apprentice Relation,Has Female in Contract,From Venezia,Category
Contract ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0000bab9-4e20-4ac4-9542-3c1d09d512e3,False,False,0,0
0002595f-fea7-4630-8086-8b8269ceae4f,True,False,0,1
000263cc-9017-46e6-b16d-20b4b3ed5d17,False,False,1,4
0002b49d-cac7-454d-83a8-65ad09a3a427,True,False,1,5
0002f41e-07d3-476b-b25d-7d6127d7bc5b,False,False,0,0
...,...,...,...,...
fff9669b-6a1c-4e34-bd8c-c039938b2dc1,False,False,1,4
fff9fc1a-2dc8-4bd0-98f9-098dd131c11e,False,False,1,4
fffc5a65-d354-4a24-a110-81e8d426005b,True,False,1,5
fffd998d-f24f-467a-87b4-a79cf5ac76fb,True,False,1,5


In [10]:
# Save
# Hands category_table (the three flag columns plus the combined "Category"
# code) to the project's savePickle helper, together with the target path.
clnt.savePickle(category_table, "../data/pickles/generated_categories_appRel_femCon_fromVen.pkl")