# In this notebook we use NetworkX to analyze a company's email network. Each employee is represented by a node, and an edge between two nodes indicates that at least one email has been sent between those employees. Nodes also have the attributes "Department" and "ManagementSalary" (indicating whether the employee receives a management salary). The dataset for this notebook was downloaded from Coursera.

# Importing necessary libraries/packages and reading graph

In [1]:
import warnings
warnings.filterwarnings('ignore')
!pip uninstall networkx -y
!pip install networkx==1.11
import networkx as nx
import pandas as pd
import numpy as np
import pickle
import math
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
# Load the email network graph from disk.
# NOTE(review): read_gpickle unpickles the file, and unpickling can execute arbitrary
# code -- only load 'email_prediction.txt' from a source you trust.
G = nx.read_gpickle('email_prediction.txt')

Found existing installation: networkx 1.11
Uninstalling networkx-1.11:
  Successfully uninstalled networkx-1.11


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
scikit-image 0.18.1 requires networkx>=2.0, but you have networkx 1.11 which is incompatible.


Collecting networkx==1.11
  Using cached networkx-1.11-py2.py3-none-any.whl (1.3 MB)
Installing collected packages: networkx
Successfully installed networkx-1.11


# Part I - Training a classifier to predict the probability that an employee receives a management salary. First, find the nodes for which information about management salary is available.

In [2]:
# Ids of the nodes whose 'ManagementSalary' attribute is not NaN, i.e. the
# employees for which a label is available.
train = [
    node_id
    for node_id, attrs in G.nodes(data=True)
    if not math.isnan(attrs['ManagementSalary'])
]

# Obtaining features for training a classifier

In [3]:
# Attribute dicts keyed by node id. Looking attributes up by id avoids indexing
# list(G.nodes(data=True)) by position, which is only correct when every node id
# happens to equal its position in the node list (and rebuilt that list per node).
node_attrs = dict(G.nodes(data=True))
# Degree centrality is computed for the whole graph in one call, so compute it once
# here instead of once per training node.
degree_centrality = nx.degree_centrality(G)

# List containing information about dept of employees
train_dept_list = [node_attrs[x]['Department'] for x in train]
# List of values which tell whether an employee receives management salary or not
train_management_list = [node_attrs[x]['ManagementSalary'] for x in train]
# List containing clustering coefficients of nodes
train_clustering_list = [nx.clustering(G, x) for x in train]
# List containing degree centrality of nodes
train_degree_list = [degree_centrality[x] for x in train]

# Dept of employees, their clustering coefficients and degree centrality values are used as features
X_train = pd.DataFrame([train_dept_list, train_clustering_list, train_degree_list]).T
X_train.columns = ['dept', 'clustering', 'degree']

# Obtaining one hot vectors for the column 'dept'. Also, the list of true labels is being obtained.

In [4]:
# One-hot encode the 'dept' column; categories unseen at fit time are ignored on transform.
encoder = OneHotEncoder(handle_unknown='ignore')
dept_one_hot = encoder.fit_transform(X_train[['dept']]).toarray()
encoder_df = pd.DataFrame(dept_one_hot)

# Feature matrix: the numeric features plus the one-hot columns, without the raw 'dept' column.
df_data = X_train.join(encoder_df).drop('dept', axis=1)

# True labels as a single-column frame named 'Management'.
y_data = pd.DataFrame([train_management_list]).T.rename(columns={0: 'Management'})

# Training a classifier using grid search with area under ROC curve as metric.

In [5]:
# Positional hold-out split: the first N_TRAIN_ROWS labelled employees are used for the
# grid search, the remaining rows are used only for the final evaluation.
N_TRAIN_ROWS = 650
features_fit = df_data.iloc[:N_TRAIN_ROWS]
features_eval = df_data.iloc[N_TRAIN_ROWS:]
# scikit-learn expects a 1-D target; passing the single-column frame directly
# triggers a DataConversionWarning (hidden by the global warnings filter).
labels = y_data.iloc[:, 0]
labels_fit = labels.iloc[:N_TRAIN_ROWS]
labels_eval = labels.iloc[N_TRAIN_ROWS:]

# Training a random forest classifier along with hyper-parameter optimization using grid search
clf_prior = RandomForestClassifier(random_state=0)
parameters = {'n_estimators': [6, 8, 10], 'max_depth': [5, 7]}
clf = GridSearchCV(clf_prior, parameters, scoring='roc_auc')
clf.fit(features_fit, labels_fit)

# Area under ROC curve on the held-out rows is used as metric
print('Area under ROC curve is', roc_auc_score(labels_eval, clf.predict_proba(features_eval)[:, 1]))

Area under ROC curve is 0.9226804123711341


# Part II - Predicting future connections. Given the state of the graph and the attributes of its nodes at a certain stage, we predict the probability that an edge forms between pairs of nodes that were not initially connected. First, we read the provided dataset.

In [6]:
# Read the candidate node pairs; column 0 becomes the index and each of its values is
# passed through the converter, so index entries are Python objects rather than strings.
# NOTE(review): `eval` executes every value in that column as Python code -- only use
# this with a trusted CSV; ast.literal_eval would be a safer converter for plain literals.
future_connections = pd.read_csv('Future_Connections.csv', index_col=0, converters={0: eval})

# Classifying nodes into communities based on employee's department and finding out pairs of nodes for which information about future connection is available.

In [7]:
# Use each employee's department as its community label; the community-aware
# link-prediction scores read the 'community' node attribute.
for _, attrs in G.nodes(data=True):
    attrs['community'] = attrs['Department']

# Flag the pairs whose 'Future Connection' label is missing (vectorised NaN check).
future_connections['check_nan'] = future_connections['Future Connection'].isna()

# Keep only the labelled pairs. `drop` returns a new frame, so data_df is an independent
# object and later column assignments on it do not go through a filtered slice of
# future_connections (which is what caused SettingWithCopy warnings).
data_df = future_connections.loc[~future_connections['check_nan']].drop(columns='check_nan')

# Obtaining features for training classifier

In [8]:
# Node pairs (u, v) to score, in the row order of data_df.
node_pairs = list(data_df.index)


def _pair_scores(link_predictor):
    """Score every pair in node_pairs with one networkx link-prediction function.

    link_predictor is called once as link_predictor(G, node_pairs) and is expected to
    yield (u, v, score) triples in the order of node_pairs; only the scores are returned.
    """
    return [score for _, _, score in link_predictor(G, node_pairs)]


# Find no. of common neighbors for pairs of nodes
data_df['common_neighbors'] = [
    len(list(nx.common_neighbors(G, u, v))) for u, v in node_pairs
]
# Find list of Jaccard coefficients of pairs of nodes
jaccard_coefficient_list = _pair_scores(nx.jaccard_coefficient)
# Find list of resource allocation index of pairs of nodes
resource_allocation_list = _pair_scores(nx.resource_allocation_index)
# Find list of adamic adar index of pairs of nodes
adamic_adar_list = _pair_scores(nx.adamic_adar_index)
# Find list of preferential attachment scores of pairs of nodes
preferential_attachment_list = _pair_scores(nx.preferential_attachment)
# Finds no. of common neighbors for pairs of nodes using community information
cn_soundarajan_list = _pair_scores(nx.cn_soundarajan_hopcroft)

data_df['jaccard_coefficient'] = jaccard_coefficient_list
data_df['resource_allocation'] = resource_allocation_list
data_df['adamic_adar'] = adamic_adar_list
data_df['preferential_attachment'] = preferential_attachment_list
data_df['cn_soundarajan'] = cn_soundarajan_list

# Training a classifier using grid search with area under ROC curve as metric.

In [9]:
# Link-prediction scores used as model features.
FEATURE_COLUMNS = ['common_neighbors', 'jaccard_coefficient', 'resource_allocation', 'adamic_adar',
                   'preferential_attachment', 'cn_soundarajan']
# Positional hold-out split: the first N_TRAIN_PAIRS labelled pairs are used for the
# grid search, the remaining pairs only for the final evaluation.
N_TRAIN_PAIRS = 329700

# Fixed seed so the reported score is reproducible across runs (matches the Part I
# classifier); the exact figure can differ from earlier unseeded runs.
clf_prior = RandomForestClassifier(random_state=0)
parameters = {'n_estimators': [6, 8, 10], 'max_depth': [3, 4, 5]}
# Train a random forest classifier with hyper-parameter optimization using grid search
clf = GridSearchCV(clf_prior, parameters, scoring='roc_auc')

pair_features = data_df[FEATURE_COLUMNS]
pair_labels = data_df['Future Connection']
train_X = pair_features.iloc[:N_TRAIN_PAIRS]
train_y = pair_labels.iloc[:N_TRAIN_PAIRS]
test_X = pair_features.iloc[N_TRAIN_PAIRS:]
test_y = pair_labels.iloc[N_TRAIN_PAIRS:]

clf.fit(train_X, train_y)
# Area under ROC curve on the held-out pairs is used as metric
print('Area under ROC curve is', roc_auc_score(test_y, clf.predict_proba(test_X)[:, 1]))

Area under ROC curve is 0.9197720856341671
