# 4. Create Networks

Once we have our clean dataframe we can build our unweighted network.

## 4.1 Load in dataframe

In [1]:
import pandas as pd
import numpy as np

In [2]:
Df_connectivity = pd.read_csv('files/DirectMails_clean.csv')

In [3]:
print(Df_connectivity.shape)
Df_connectivity.head()

(745210, 6)


Unnamed: 0,From,To,Date,Subject,Message_ID,Interaction
0,From_hotmail.com,pallen@enron.com,2001-12-30 10:19:42,Fwd: Bishops Corner,215433951075855374340,From_hotmail.com|pallen@enron.com
1,From_mailman.enron.com,k..allen@enron.com,2001-12-27 17:16:46,Your Approval is Overdue: Access Request for m...,253634511075855374674,From_mailman.enron.com|k..allen@enron.com
2,no.address@enron.com,To_External,2001-10-18 15:10:12,UPDATE - Supported Internet Email Addresses,89140651075858632242,no.address@enron.com|To_External
3,ray.alvarez@enron.com,j..kean@enron.com,2001-10-18 14:51:19,Conference Call Today with FERC Staff,317060761075858632278,ray.alvarez@enron.com|j..kean@enron.com
4,ray.alvarez@enron.com,richard.shapiro@enron.com,2001-10-18 14:51:19,Conference Call Today with FERC Staff,317060761075858632278,ray.alvarez@enron.com|richard.shapiro@enron.com


## 4.2 Determine frequency of every interaction

In [10]:
# Create new dataframe only storing the connections
Connections = pd.DataFrame()
# Store new variables
Connections[['Address1', 'Address2']] = Df_connectivity.Interaction.str.split('|', expand=True)

In [11]:
Connections.head()

Unnamed: 0,Address1,Address2
0,From_hotmail.com,pallen@enron.com
1,From_mailman.enron.com,k..allen@enron.com
2,no.address@enron.com,To_External
3,ray.alvarez@enron.com,j..kean@enron.com
4,ray.alvarez@enron.com,richard.shapiro@enron.com


In [12]:
# We are interested in how often an interactions occurs
Connections_count = Connections.value_counts().reset_index().rename(columns={0: 'Count'})

In [13]:
Connections_count.head()

Unnamed: 0,Address1,Address2,Count
0,pete.davis@enron.com,pete.davis@enron.com,2413
1,vince.kaminski@enron.com,To_aol.com,2206
2,kay.mann@enron.com,suzanne.adams@enron.com,870
3,jeff.dasovich@enron.com,To_External,865
4,kay.mann@enron.com,To_kslaw.com,773


In [14]:
Connections_count.shape

(57505, 3)

## 4.3 Create unweighted network

In [15]:
import networkx as nx

In [16]:
# Empty graph
G = nx.Graph()

In [17]:
# Create lists which stores the connections
e_list = [(Connections_count.Address1[i], Connections_count.Address2.iloc[i]) for i in range(len(Connections_count))]

In [18]:
# Check result
e_list[:10]

[('pete.davis@enron.com', 'pete.davis@enron.com'),
 ('vince.kaminski@enron.com', 'To_aol.com'),
 ('kay.mann@enron.com', 'suzanne.adams@enron.com'),
 ('jeff.dasovich@enron.com', 'To_External'),
 ('kay.mann@enron.com', 'To_kslaw.com'),
 ('jeff.dasovich@enron.com', 'james.steffes@enron.com'),
 ('jeff.dasovich@enron.com', 'paul.kaufman@enron.com'),
 ('From_External', 'To_External'),
 ('jeff.dasovich@enron.com', 'susan.mara@enron.com'),
 ('jeff.dasovich@enron.com', 'richard.shapiro@enron.com')]

In [19]:
# Add edges to the graph
G.add_edges_from(e_list)

In [20]:
# Check if network is fully connected
nx.is_connected(G)

False

In [21]:
# Get all nodes belonging to different components
nodes_largest_netwerk = sorted(nx.connected_components(G), key=len, reverse=True)[0]
# Extract largest fully connected component
G = G.subgraph(nodes_largest_netwerk)

In [22]:
# Save graph
nx.write_weighted_edgelist(G, path = 'Files/Intensity_graph')

In [23]:
# Create a nodelist containing all addresses in the dataset
nodelist = list(G.nodes)

In [24]:
len(nodelist)

6533

In [21]:
# Save graph as adjacency list
nx.write_adjlist(G, path='files/Connectivity_graph')

In [32]:
# write edgelist
nx.write_edgelist(G, path='files/Connectivity_edge_list.csv')

In [25]:
# Create adjacency matrix
Connect_adj_matrix = nx.to_pandas_adjacency(G, nodelist, dtype=int)

In [26]:
print(Connect_adj_matrix.shape)
Connect_adj_matrix

(6533, 6533)


Unnamed: 0,pete.davis@enron.com,vince.kaminski@enron.com,To_aol.com,kay.mann@enron.com,suzanne.adams@enron.com,jeff.dasovich@enron.com,To_External,To_kslaw.com,james.steffes@enron.com,paul.kaufman@enron.com,...,sasha.divelbiss@enron.com,louis.allen@enron.com,danny.foster@enron.com,grace.taylor@enron.com,rice@enron.com,jaisinghani@enron.com,roy.hartstein@enron.com,roy.steinhagen@enron.com,s..presas@enron.com,didrik.thrane-nielsen@enron.com
pete.davis@enron.com,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
vince.kaminski@enron.com,0,1,1,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
To_aol.com,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
kay.mann@enron.com,0,0,0,1,1,0,1,1,1,0,...,0,0,0,0,0,0,0,0,1,0
suzanne.adams@enron.com,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
jaisinghani@enron.com,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
roy.hartstein@enron.com,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
roy.steinhagen@enron.com,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
s..presas@enron.com,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
# Save graph as adjacency matrix
Connect_adj_matrix.to_csv('files/Conn_Adj.csv')

## 4.4 Create weighted network

In [27]:
import networkx as nx

In [28]:
# Create empty graph
G = nx.Graph()

In [30]:
# Create an edge list
e_list = [(Connections_count.Address1.iloc[i], Connections_count.Address2.iloc[i], Connections_count.Count.iloc[i]) for i in range(len(Connections_count))]

In [31]:
e_list[:10]

[('pete.davis@enron.com', 'pete.davis@enron.com', 2413),
 ('vince.kaminski@enron.com', 'To_aol.com', 2206),
 ('kay.mann@enron.com', 'suzanne.adams@enron.com', 870),
 ('jeff.dasovich@enron.com', 'To_External', 865),
 ('kay.mann@enron.com', 'To_kslaw.com', 773),
 ('jeff.dasovich@enron.com', 'james.steffes@enron.com', 719),
 ('jeff.dasovich@enron.com', 'paul.kaufman@enron.com', 707),
 ('From_External', 'To_External', 706),
 ('jeff.dasovich@enron.com', 'susan.mara@enron.com', 690),
 ('jeff.dasovich@enron.com', 'richard.shapiro@enron.com', 678)]

In [32]:
# Add edges
G.add_weighted_edges_from(e_list)

In [33]:
# Check if network is fully connected
nx.is_connected(G)

False

In [34]:
# Get all nodes belonging to different components
nodes_largest_netwerk = sorted(nx.connected_components(G), key=len, reverse=True)[0]
# Extract largest fully connected component
G = G.subgraph(nodes_largest_netwerk)

In [38]:
# Save graph
nx.write_weighted_edgelist(G, path = 'Files/Intensity_graph')

In [35]:
# Create a nodelist containing all addresses in the dataset
nodelist = list(G.nodes)

In [36]:
len(nodelist)

6533

In [37]:
# Create adjacency matrix
Intensity_adj_matrix = nx.to_pandas_adjacency(G, nodelist, dtype=float)

In [38]:
print(Intensity_adj_matrix.shape)
Intensity_adj_matrix

(6533, 6533)


Unnamed: 0,pete.davis@enron.com,vince.kaminski@enron.com,To_aol.com,kay.mann@enron.com,suzanne.adams@enron.com,jeff.dasovich@enron.com,To_External,To_kslaw.com,james.steffes@enron.com,paul.kaufman@enron.com,...,sasha.divelbiss@enron.com,louis.allen@enron.com,danny.foster@enron.com,grace.taylor@enron.com,rice@enron.com,jaisinghani@enron.com,roy.hartstein@enron.com,roy.steinhagen@enron.com,s..presas@enron.com,didrik.thrane-nielsen@enron.com
pete.davis@enron.com,2413.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
vince.kaminski@enron.com,0.0,61.0,2206.0,0.0,0.0,0.0,371.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
To_aol.com,0.0,2206.0,0.0,0.0,0.0,31.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
kay.mann@enron.com,0.0,0.0,0.0,30.0,25.0,0.0,537.0,773.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
suzanne.adams@enron.com,0.0,0.0,0.0,25.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
jaisinghani@enron.com,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
roy.hartstein@enron.com,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
roy.steinhagen@enron.com,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
s..presas@enron.com,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
Intensity_adj_matrix.to_csv('files/Intensity_Adj.csv')