**Project Network Analysis**

Step 3.3: Bipartite Graph (Node Embedding + Role + ISF)

In [None]:
# Colab only: mount Google Drive at /content/drive so the project folders are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Required packages
!pip install fuzzywuzzy  &> /dev/null
!pip install Node2Vec &> /dev/null
!pip install graphrole &> /dev/null

In [None]:
from networkx.algorithms.isolate import isolates
import pandas as pd
import networkx as nx
from networkx.algorithms import bipartite
import os
import matplotlib.pyplot as plt
from fuzzywuzzy import fuzz
import numpy as np
import matplotlib as mlt
from node2vec import Node2Vec
from graphrole import RecursiveFeatureExtractor, RoleExtractor
from sklearn.ensemble import IsolationForest



In [None]:
# Project folders on the mounted drive: `path` holds the inputs and
# `output_path` holds generated files.
path = '/content/drive/MyDrive/3. IU Courses/Courses/5. Network Analysis/Project/Final/data'
output_path = '/content/drive/MyDrive/3. IU Courses/Courses/5. Network Analysis/Project/Final/output'

# Reading Procurement Data
# WB_table.xlsx is read from the output folder, the sanctioned list from the data folder.
wb_table_file = os.path.join(output_path, "WB_table.xlsx")
sanctioned_file = os.path.join(path, "Sanctioned_List.xlsx")
wb_data = pd.read_excel(wb_table_file)
sanctioned_data = pd.read_excel(sanctioned_file)

In [None]:
# Preview the first rows of the procurement table
wb_data.head()

Unnamed: 0,Fiscal Year,Borrower Country,Supplier,Total Contract Amount (USD)
0,2022,Serbia,ASSECO SEE D.O.O.,3333598
1,2022,Ethiopia,"EPTISA SERVICIOS DE INGENIERIA, S.L. IN ASSOCI...",1851919
2,2022,China,"SHENZHEN CITY DONGSHEN ENGINEERING CO., LTD",14986361
3,2022,Turkey,KOLTEK MUSAVIRLIK A.S.,777600
4,2022,Chad,UNICEF,19326986


Finding Roles

In [None]:
# Keep every contract from fiscal year `year` onwards (inclusive)
year = 2016
wb_data_filter = wb_data[wb_data['Fiscal Year']>=year]
# Grouping repeated transactions between country and suppliers (per fiscal year)
wb_data_group = wb_data_filter.groupby(['Fiscal Year','Borrower Country','Supplier'])['Total Contract Amount (USD)'].sum()
wb_data_group = wb_data_group.reset_index()
# A country/supplier pair can appear in several fiscal years. nx.Graph holds a
# single edge per pair and a repeated edge overwrites its attributes, so the
# amounts are summed across years first; otherwise only the last row's amount
# would survive as the edge weight.
edge_totals = wb_data_group.groupby(['Borrower Country','Supplier'], as_index=False)['Total Contract Amount (USD)'].sum()
# Setting up the network
# nx.Graph is already undirected, so no conversion is required afterwards.
G = nx.Graph()
G.add_nodes_from(wb_data_group['Borrower Country'], bipartite='Country')
G.add_nodes_from(wb_data_group['Supplier'], bipartite='Supplier')
# Edge weight 'contract_amt' is the total contract amount in millions of USD
G.add_weighted_edges_from(
    zip(edge_totals['Borrower Country'],
        edge_totals['Supplier'],
        edge_totals['Total Contract Amount (USD)']/1000000),
    weight = 'contract_amt')

<networkx.classes.graph.Graph at 0x7f4dd1e6c610>

In [None]:
# extract features
feature_extractor = RecursiveFeatureExtractor(G)
features = feature_extractor.extract_features()

In [None]:
# assign node roles
# NOTE(review): n_roles=None presumably lets graphrole choose the number of
# roles itself — confirm against the graphrole documentation.
role_extractor = RoleExtractor(n_roles=None)
role_extractor.extract_role_factors(features)
node_roles = role_extractor.roles
# Store each node's role label on the graph under the 'role' attribute
nx.set_node_attributes(G, node_roles, 'role' )
#print('\nNode role assignments:')
#print(node_roles)

#print('\nNode role membership by percentage:')
#print(role_extractor.role_percentage.round(2))

In [None]:
# Display every node with its attribute dict ('bipartite' side and 'role').
# NOTE(review): this prints the whole node view; consider showing only a slice.
G.nodes(data=True)

NodeDataView({'Afghanistan': {'bipartite': 'Country', 'role': 'role_2'}, 'Africa': {'bipartite': 'Country', 'role': 'role_0'}, 'Albania': {'bipartite': 'Country', 'role': 'role_0'}, 'Angola': {'bipartite': 'Country', 'role': 'role_2'}, 'Argentina': {'bipartite': 'Country', 'role': 'role_4'}, 'Armenia': {'bipartite': 'Country', 'role': 'role_0'}, 'Azerbaijan': {'bipartite': 'Country', 'role': 'role_0'}, 'Bangladesh': {'bipartite': 'Country', 'role': 'role_5'}, 'Belarus': {'bipartite': 'Country', 'role': 'role_0'}, 'Belize': {'bipartite': 'Country', 'role': 'role_0'}, 'Benin': {'bipartite': 'Country', 'role': 'role_0'}, 'Bolivia': {'bipartite': 'Country', 'role': 'role_0'}, 'Bosnia and Herzegovina': {'bipartite': 'Country', 'role': 'role_0'}, 'Brazil': {'bipartite': 'Country', 'role': 'role_4'}, 'Burkina Faso': {'bipartite': 'Country', 'role': 'role_0'}, 'Burundi': {'bipartite': 'Country', 'role': 'role_2'}, 'Cabo Verde': {'bipartite': 'Country', 'role': 'role_0'}, 'Cambodia': {'bipartit

In [None]:
# One row per graph node: the node name (countries as well as suppliers land
# in the 'Supplier' column) and the attribute dict attached to that node.
df = pd.DataFrame(G.nodes(data=True), columns= ['Supplier', 'Node_Info'])
# Lift the role label out of each attribute dict into its own column
df['role'] = [node_info['role'] for node_info in df['Node_Info']]

In [None]:
# Prepare node2vec on G: 20-dimensional embeddings, 10 walks of length 8 per node.
# NOTE(review): no seed is passed, so the walks may differ between runs — confirm
# whether the installed node2vec version accepts a seed argument.
node2vec = Node2Vec(G, dimensions=20, walk_length=8, num_walks=10)

Computing transition probabilities:   0%|          | 0/12636 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 10/10 [01:02<00:00,  6.21s/it]


In [None]:
# Learn embeddings
# min_count=1 keeps every node that appears in the walks; window=10 is the
# context window passed through to the underlying embedding model.
model = node2vec.fit(window=10, min_count=1)

In [None]:
# Node names in the same order as the rows of model.wv.vectors.
# gensim 4 renamed `index2word` to `index_to_key`; the old attribute is only
# used as a fallback so the cell runs on either major version.
embedded_nodes = getattr(model.wv, 'index_to_key', None)
if embedded_nodes is None:
    embedded_nodes = model.wv.index2word

# One column per embedding dimension, named E0, E1, ... to match whatever
# `dimensions` value was given to Node2Vec.
embedding = pd.DataFrame(model.wv.vectors)
embedding.columns = [f'E{dim}' for dim in embedding.columns]

df1 = pd.concat([pd.DataFrame(list(embedded_nodes), columns=['Supplier']), embedding], axis=1)

In [None]:
# Attach each embedded node's role (left join on the node name), then
# one-hot encode the role label into `assigned_<role>` indicator columns.
df_all = pd.get_dummies(
    df1.merge(df, how ="left", on = "Supplier"),
    columns=["role"],
    prefix="assigned",
)
df_all.head()

Unnamed: 0,Supplier,E0,E1,E2,E3,E4,E5,E6,E7,E8,...,E17,E18,E19,Node_Info,assigned_role_0,assigned_role_1,assigned_role_2,assigned_role_4,assigned_role_5,assigned_role_7
0,Somalia,2.184223,-1.124275,-0.887776,0.093663,0.408051,0.74754,0.396471,-1.84197,0.298149,...,0.200889,-1.360997,2.588898,"{'bipartite': 'Country', 'role': 'role_5'}",0,0,0,0,1,0
1,India,0.808455,-0.866214,-0.172879,0.738768,0.217236,-1.136243,-0.194693,1.271202,-0.830168,...,1.390983,-1.315552,1.699069,"{'bipartite': 'Country', 'role': 'role_5'}",0,0,0,0,1,0
2,China,-0.314283,-2.282622,-0.440662,-0.379399,-0.31082,-0.591606,-0.737665,-0.423816,0.978213,...,0.215258,-0.359469,2.981017,"{'bipartite': 'Country', 'role': 'role_4'}",0,0,0,1,0,0
3,Vietnam,1.376632,-1.119873,-0.426727,0.651708,-0.755409,1.485422,0.448182,0.729062,-2.780666,...,0.819311,-2.358459,0.14172,"{'bipartite': 'Country', 'role': 'role_4'}",0,0,0,1,0,0
4,"Congo, Democratic Republic of",-0.227551,-1.762249,-0.719522,-0.915918,0.420978,-0.068551,-0.854482,-0.539541,0.324941,...,0.711557,-0.32312,1.768193,"{'bipartite': 'Country', 'role': 'role_5'}",0,0,0,0,1,0


**Isolation Forest**

In [None]:
# Fixed-seed NumPy random state, passed to the IsolationForest as `random_state`
random_state = np.random.RandomState(42)

In [None]:
# Isolation forest with 100 trees that flags 5% of the nodes as outliers
isf_model = IsolationForest(
    n_estimators=100,
    max_samples='auto',
    contamination=0.05,
    random_state=random_state,
)

# Training features: role indicators followed by the 20 embedding dimensions.
# Update Assigned roles — the role columns depend on which roles were extracted.
isf_training_columns = [
    'assigned_role_0', 'assigned_role_2', 'assigned_role_4',
    'assigned_role_5', 'assigned_role_1', 'assigned_role_7',
    'E0', 'E1', 'E2', 'E3', 'E4', 'E5', 'E6', 'E7', 'E8', 'E9',
    'E10', 'E11', 'E12', 'E13', 'E14', 'E15', 'E16', 'E17', 'E18', 'E19',
]
isf_model.fit(df_all[isf_training_columns])

print(isf_model.get_params())

{'bootstrap': False, 'contamination': 0.05, 'max_features': 1.0, 'max_samples': 'auto', 'n_estimators': 100, 'n_jobs': None, 'random_state': RandomState(MT19937) at 0x7F4DD086F380, 'verbose': 0, 'warm_start': False}


In [None]:
# Feature columns in exactly the order isf_model was fitted with. The previous
# version swapped 'assigned_role_5' and 'assigned_role_1' here; depending on
# the scikit-learn version that raises, warns, or silently scores the wrong
# features.
# Update Assigned roles — keep this list identical to the one used for fitting.
isf_scoring_columns = [
    'assigned_role_0', 'assigned_role_2', 'assigned_role_4',
    'assigned_role_5', 'assigned_role_1', 'assigned_role_7',
] + [f'E{dim}' for dim in range(20)]
isf_inputs = df_all[isf_scoring_columns]

# Continuous decision-function value; negative values fall on the outlier side
df_all['scores'] = isf_model.decision_function(isf_inputs)

# Hard label from predict(): -1 = outlier, 1 = inlier
df_all['anomaly_score'] = isf_model.predict(isf_inputs)

df_all[df_all['anomaly_score']==-1].head()

Unnamed: 0,Supplier,E0,E1,E2,E3,E4,E5,E6,E7,E8,...,E19,Node_Info,assigned_role_0,assigned_role_1,assigned_role_2,assigned_role_4,assigned_role_5,assigned_role_7,scores,anomaly_score
0,Somalia,2.184223,-1.124275,-0.887776,0.093663,0.408051,0.74754,0.396471,-1.84197,0.298149,...,2.588898,"{'bipartite': 'Country', 'role': 'role_5'}",0,0,0,0,1,0,-0.02202,-1
1,India,0.808455,-0.866214,-0.172879,0.738768,0.217236,-1.136243,-0.194693,1.271202,-0.830168,...,1.699069,"{'bipartite': 'Country', 'role': 'role_5'}",0,0,0,0,1,0,-0.022391,-1
2,China,-0.314283,-2.282622,-0.440662,-0.379399,-0.31082,-0.591606,-0.737665,-0.423816,0.978213,...,2.981017,"{'bipartite': 'Country', 'role': 'role_4'}",0,0,0,1,0,0,-0.100657,-1
3,Vietnam,1.376632,-1.119873,-0.426727,0.651708,-0.755409,1.485422,0.448182,0.729062,-2.780666,...,0.14172,"{'bipartite': 'Country', 'role': 'role_4'}",0,0,0,1,0,0,-0.055395,-1
4,"Congo, Democratic Republic of",-0.227551,-1.762249,-0.719522,-0.915918,0.420978,-0.068551,-0.854482,-0.539541,0.324941,...,1.768193,"{'bipartite': 'Country', 'role': 'role_5'}",0,0,0,0,1,0,-0.00406,-1


In [None]:
# Copy each node's Isolation Forest label (-1 / 1) onto the graph as the
# 'anomaly' node attribute.
node_names = df_all['Supplier'].tolist()
anomaly_labels = df_all['anomaly_score'].tolist()
for node_name, anomaly_label in zip(node_names, anomaly_labels):
    G.nodes[node_name]['anomaly'] = anomaly_label

G.nodes(data=True)

NodeDataView({'Afghanistan': {'bipartite': 'Country', 'role': 'role_2', 'anomaly': -1}, 'Africa': {'bipartite': 'Country', 'role': 'role_0', 'anomaly': -1}, 'Albania': {'bipartite': 'Country', 'role': 'role_0', 'anomaly': -1}, 'Angola': {'bipartite': 'Country', 'role': 'role_2', 'anomaly': -1}, 'Argentina': {'bipartite': 'Country', 'role': 'role_4', 'anomaly': -1}, 'Armenia': {'bipartite': 'Country', 'role': 'role_0', 'anomaly': -1}, 'Azerbaijan': {'bipartite': 'Country', 'role': 'role_0', 'anomaly': -1}, 'Bangladesh': {'bipartite': 'Country', 'role': 'role_5', 'anomaly': 1}, 'Belarus': {'bipartite': 'Country', 'role': 'role_0', 'anomaly': -1}, 'Belize': {'bipartite': 'Country', 'role': 'role_0', 'anomaly': -1}, 'Benin': {'bipartite': 'Country', 'role': 'role_0', 'anomaly': -1}, 'Bolivia': {'bipartite': 'Country', 'role': 'role_0', 'anomaly': -1}, 'Bosnia and Herzegovina': {'bipartite': 'Country', 'role': 'role_0', 'anomaly': -1}, 'Brazil': {'bipartite': 'Country', 'role': 'role_4', 'a

In [None]:
# Save the graph, including the 'role' and 'anomaly' node attributes, as GML.
# NOTE(review): the file name says 2021 while G was built from fiscal years >= 2016 — confirm the intended name.
nx.write_gml(G, os.path.join(output_path, "0422_Bi_Role_Node2Vec_ISF_2021.gml"))

**Node2Vec To Find Most Similar Firms Which Are Sanctioned**

<font color='grey'>***Node embedding for each year***</font>

In [None]:
def clean(text):
  """Return ``text`` with spaces and a fixed set of punctuation marks removed.

  Every other character is kept in its original order. The result is used
  to build file names from supplier names.
  """
  # Characters that are dropped from the input
  dropped = ('!', '#', '$', '%', '&', '@', '[', ']', ' ', '_', '/',
             '(', ')', "'", ',', '-', '.')
  kept = [char for char in text if char not in dropped]
  return ''.join(kept)

In [None]:
# Names to look up in each yearly graph, read from the 'Common_list' column.
# NOTE(review): presumably the suppliers found in both the procurement data and
# the sanctioned list — confirm how Common_List.xlsx was produced.
Common_list = pd.read_excel(os.path.join(path, 'Common_List.xlsx'))
node_list = Common_list['Common_list'].to_list()

In [None]:
# Doing the analysis by year (bipartite)
# For every fiscal year: build that year's country-supplier graph, embed it
# with node2vec, and for each listed firm present in the graph plot the
# 2-hop neighbourhood of its most similar node and save the figure.
# node_list = ['TRACTEBEL ENGINEERING S.A.']

history = [2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]
#history = [2021]

i = 0  # number of figures written so far
for year in history:
  wb_data_filter = wb_data[wb_data['Fiscal Year']==year]
  # Grouping repeated transactions between country and suppliers
  wb_data_group = wb_data_filter.groupby(['Fiscal Year','Borrower Country','Supplier'])['Total Contract Amount (USD)'].sum()
  wb_data_group = wb_data_group.reset_index()
  # Setting up the network (nx.Graph is already undirected, no conversion needed)
  G = nx.Graph()
  G.add_nodes_from(wb_data_group['Borrower Country'], bipartite='Country')
  G.add_nodes_from(wb_data_group['Supplier'], bipartite='Supplier')
  G.add_weighted_edges_from(zip(wb_data_group['Borrower Country'], wb_data_group['Supplier'], wb_data_group['Total Contract Amount (USD)']/1000000), weight = 'contract_amt')

  # Listed firms that had a contract in this year. Nothing is plotted for a
  # year without any, so the (slow) embedding step is skipped as well; this
  # also covers years with no contracts at all.
  year_nodes = set(G.nodes())
  sanctioned_in_year = [name for name in node_list if name in year_nodes]
  if not sanctioned_in_year:
    continue

  # Learn embeddings
  node2vec = Node2Vec(G, dimensions=20, walk_length=16, num_walks=1)
  model = node2vec.fit(window=10, min_count=1)

  # Finding Node Embedding if sanctioned firm had a contract in given year
  for sanc_node in sanctioned_in_year:
    print('yes')
    # Node2Vec Embedding
    # most_similar is called on the KeyedVectors (model.wv); calling it on the
    # model object is deprecated in gensim 3 and removed in gensim 4.
    similar_node = [node for node, _ in model.wv.most_similar(sanc_node)]
    if not similar_node:
      # No other node in the vocabulary to compare against
      continue

    # Building graph of most similar supplier in each year
    top_node = similar_node[0] # Top most Similar node
    G_node_neighborhood = []
    for n1 in G.neighbors(top_node):
      G_node_neighborhood.append(n1)
      G_node_neighborhood.extend(G.neighbors(n1))

    # Subgraph of selected node and its 1st and 2nd neighbors
    closest_ngh_graph = G.subgraph(G_node_neighborhood + [top_node])
    largest_component = max(nx.connected_components(closest_ngh_graph), key=len)
    Gcc = closest_ngh_graph.subgraph(largest_component)

    # Specify colors: the selected node is red, countries orange, suppliers blue.
    # Only the selected node and the countries get a text label.
    node_clr = []
    labels = {}
    for node_name, attrb in Gcc.nodes(data=True):
      if node_name == top_node:
        node_clr.append('red')
        labels[node_name] = node_name
      elif attrb['bipartite'] =='Country':
        node_clr.append('darkorange')
        labels[node_name] = node_name
      else:
        node_clr.append('skyblue')

    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(12,6))
    # Fixed layout seed so the same graph is always drawn the same way
    pos = nx.spring_layout(Gcc, seed=10396953)
    # node_color holds explicit colour names, so no colormap is passed
    nx.draw_networkx_nodes(Gcc, pos, node_size=32, label=True, ax=ax, node_color = node_clr)
    nx.draw_networkx_edges(Gcc, pos, alpha=0.4, ax=ax)
    nx.draw_networkx_labels(Gcc ,pos, labels, font_size=10,
                            font_color='black', ax=ax, alpha =.9,
                            horizontalalignment ='center')
    ax.set_title(f"Graph of {top_node} in {year}: Similar to {sanc_node} ")
    ax.set_axis_off()
    i +=1
    # File name: cleaned firm name + "_" + year (matplotlib adds the default extension)
    fig.savefig(os.path.join(output_path, (clean(sanc_node) + "_" + str(year))))
       
