**Project Network Analysis**

Step 3.2: Bipartite Graph (Subgraph Anomaly using RiWalk and Isolation Forest)


> <font color = 'purple'>*Installing dependencies and cloning the RiWalk repository*</font>








In [None]:
# Required Packages (Restart Runtime)
!pip install futures
!pip install fastdtw
!pip install gensim

Collecting futures
  Downloading futures-3.0.5.tar.gz (25 kB)
  Downloading futures-3.0.4.tar.gz (25 kB)
  Downloading futures-3.0.3.tar.gz (24 kB)
  Downloading futures-3.0.2.tar.gz (24 kB)
  Downloading futures-3.0.1.tar.gz (24 kB)
  Downloading futures-3.0.0.tar.gz (24 kB)
  Downloading futures-2.2.0-py2.py3-none-any.whl (16 kB)
Installing collected packages: futures
Successfully installed futures-2.2.0




In [None]:
# Required packages
!pip install fuzzywuzzy  &> /dev/null

In [None]:
!git clone https://github.com/maxuewei2/RiWalk.git

Cloning into 'RiWalk'...
remote: Enumerating objects: 145, done.[K
remote: Total 145 (delta 0), reused 0 (delta 0), pack-reused 145[K
Receiving objects: 100% (145/145), 2.69 MiB | 19.12 MiB/s, done.
Resolving deltas: 100% (61/61), done.


> <font color = 'purple'>*Loading the dataset*</font>

In [None]:
from google.colab import drive
# Attach Google Drive at the location the data/output paths below are rooted in
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
import os
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib as mlt
from fuzzywuzzy import fuzz
from networkx.algorithms import bipartite
from networkx.algorithms.isolate import isolates
import gensim
from sklearn.ensemble import IsolationForest



In [None]:
# Google Drive folders: `path` holds the input data, `output_path` the generated files
path = '/content/drive/MyDrive/3. IU Courses/Courses/5. Network Analysis/Project/Final/data'
output_path = '/content/drive/MyDrive/3. IU Courses/Courses/5. Network Analysis/Project/Final/output'

# Procurement table is read from the output folder, the sanctioned list from the data folder
wb_table_file = os.path.join(output_path, "WB_table.xlsx")
sanctioned_file = os.path.join(path, "Sanctioned_List.xlsx")

wb_data = pd.read_excel(wb_table_file)
sanctioned_data = pd.read_excel(sanctioned_file)

In [None]:
# Quick look at the first five procurement records
wb_data.head(n=5)

Unnamed: 0,Fiscal Year,Borrower Country,Supplier,Total Contract Amount (USD)
0,2022,Serbia,ASSECO SEE D.O.O.,3333598
1,2022,Ethiopia,"EPTISA SERVICIOS DE INGENIERIA, S.L. IN ASSOCI...",1851919
2,2022,China,"SHENZHEN CITY DONGSHEN ENGINEERING CO., LTD",14986361
3,2022,Turkey,KOLTEK MUSAVIRLIK A.S.,777600
4,2022,Chad,UNICEF,19326986


In [None]:
# Fiscal year to analyse
year = 2010

# Keep only that year's contracts
wb_data_filter = wb_data.loc[wb_data['Fiscal Year'] == year]

# Collapse repeated country-supplier transactions into one row holding the total contract amount
wb_data_group = (
    wb_data_filter
    .groupby(['Fiscal Year', 'Borrower Country', 'Supplier'])['Total Contract Amount (USD)']
    .sum()
    .reset_index()
)

In [None]:
# Setting up the network
G = nx.Graph()

countries = wb_data_group['Borrower Country']
suppliers = wb_data_group['Supplier']
amounts = wb_data_group['Total Contract Amount (USD)']

# Tag every node with the side of the bipartite graph it belongs to
G.add_nodes_from(countries, bipartite='Country')
G.add_nodes_from(suppliers, bipartite='Supplier')

# One edge per (country, supplier) row; the contract amount is stored as 'contract_amt'
G.add_weighted_edges_from(zip(countries, suppliers, amounts), weight='contract_amt')

In [None]:
# One-line summary of the graph. Built from the node/edge counts directly
# because nx.info() is no longer available in NetworkX 3.x.
print(f"{type(G).__name__} with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")
# No to_undirected() call is needed: G is an nx.Graph, which is already
# undirected, and to_undirected() only returns a copy without changing G.

Graph with 43477 nodes and 46386 edges


<networkx.classes.graph.Graph at 0x7fc0a9aa93d0>

> <font color = 'purple'>*Implementing RiWalk*</font>

In [None]:
# RiWalk needs integer node IDs: relabel the text-labelled nodes with integers
# and keep each original label in the node attribute 'old_label'
H = nx.convert_node_labels_to_integers(
    G,
    first_label=0,
    ordering='default',
    label_attribute='old_label',
)

In [None]:
# Preview only a few nodes with their attributes; printing every node of the
# graph exceeds the notebook's IOPub data-rate limit and shows nothing useful.
list(H.nodes(data=True))[:5]

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
# Edge list required to run RiWalk
# generate_edgelist returns a lazy generator of edge-list lines
edge_list = nx.edgelist.generate_edgelist(H)

# Write the integer-labelled graph to the file RiWalk reads as --input
edgelist_file = os.path.join('/content', 'pjct.edgelist')
nx.write_edgelist(H, edgelist_file)

In [None]:
cd /content/RiWalk/

/content/RiWalk


In [None]:
# Run the RiWalk script on the exported edge list (/content/pjct.edgelist).
# Both the script path and --output (embs/pjct_out.emb) are relative, so this
# must be executed from inside /content/RiWalk.
# --dimensions 128 yields the 128 embedding columns used downstream.
# NOTE(review): the meaning of --until-k 4 and --flag sp is defined by RiWalk
# itself — confirm against the RiWalk README before changing them.
!python3 src/RiWalk/RiWalk.py --input /content/pjct.edgelist --output embs/pjct_out.emb --dimensions 128 --num-walks 40 --walk-length 10 --window-size 10 --until-k 4 --workers 10 --iter 2 --flag sp


walk_time 31.92720353603363
bfs_time 799.0682764053345
ri_time 403.8826901912689
walks_writing_time 5.964252376556397
learning_time 95.67336297035217


In [None]:
# Output Embedding from RiWalk: absolute path of the file passed as --output
# (embs/pjct_out.emb) when RiWalk was run from /content/RiWalk
pjct_out = '/content/RiWalk/embs/pjct_out.emb'

In [None]:
# Load the RiWalk embedding file (text word2vec format) as gensim keyed vectors
model = gensim.models.KeyedVectors.load_word2vec_format(pjct_out, binary=False)

In [None]:
# Joining the embedding output and Index
# gensim 4 renamed `index2word` to `index_to_key` and removed the `.wv` alias
# on KeyedVectors, so use whichever attribute the installed version provides.
node_ids = getattr(model, 'index_to_key', None)
if node_ids is None:
    node_ids = model.index2word  # gensim < 4

# NOTE: despite its name, the 'Supplier' column holds the keys of the embedding
# file, i.e. the integer node IDs of H stored as strings (not supplier names).
# The column name is kept because later cells look it up by this name.
df2 = pd.concat(
    [pd.DataFrame(node_ids, columns=['Supplier']), pd.DataFrame(model.vectors)],
    axis=1,
)
df2.head()

  


Unnamed: 0,Supplier,0,1,2,3,4,5,6,7,8,...,118,119,120,121,122,123,124,125,126,127
0,0,-0.115935,-0.599631,0.065554,-0.292713,-0.527262,-0.080629,0.106774,-0.059088,-0.610905,...,-0.1867,-0.274491,0.889622,-0.771237,0.576502,0.441349,-0.235596,0.700456,0.126798,-0.27035
1,161,-0.020025,-0.055575,-0.076714,0.056109,-0.171844,0.004161,-0.017183,0.016943,-0.195085,...,0.130894,-0.098597,0.179881,-0.248205,0.396926,0.085362,-0.098299,0.343607,0.022148,-0.103232
2,162,-0.040671,-0.042208,-0.092076,0.146237,-0.188388,0.047314,-0.109431,0.051363,-0.232758,...,0.105216,-0.063779,0.164716,-0.171181,0.377484,0.109648,-0.053075,0.322378,0.088742,0.011123
3,163,-0.034358,-0.060281,-0.124137,0.125237,-0.177961,-0.014776,-0.056447,0.046818,-0.250339,...,0.100692,-0.087862,0.155698,-0.168778,0.447591,0.058213,-0.122163,0.383072,0.051506,-0.067698
4,164,-0.098371,-0.045624,-0.10962,0.132393,-0.163838,-0.001146,-0.052619,0.052788,-0.250663,...,0.119847,-0.111216,0.156143,-0.160043,0.449055,0.063282,-0.104948,0.348099,0.06213,-0.083764


> <font color = 'purple'>*Executing Isolation Forest*</font>

In [None]:
random_state = np.random.RandomState(42)

# Feature matrix = embedding dimensions only. Besides the ID column, also drop
# the 'scores' / 'anomaly_score' columns that the scoring cell adds to df2;
# otherwise re-running this cell after scoring would train on its own output.
# errors='ignore' keeps the first run (where those columns do not exist) working.
df3 = df2.drop(columns=['Supplier', 'scores', 'anomaly_score'], errors='ignore')

# Isolation Forest (Anomaly Detection based on roles)
# contamination=0.05: expected share of anomalous nodes
isf_model = IsolationForest(
    n_estimators=100,
    max_samples='auto',
    contamination=0.05,
    random_state=random_state,
)
isf_model.fit(df3)
print(isf_model.get_params())

{'bootstrap': False, 'contamination': 0.05, 'max_features': 1.0, 'max_samples': 'auto', 'n_estimators': 100, 'n_jobs': None, 'random_state': RandomState(MT19937) at 0x7FC0A9CBBD10, 'verbose': 0, 'warm_start': False}


In [None]:
# Continuous anomaly score per node (negative values in the rows flagged below)
df2['scores'] = isf_model.decision_function(df3)
# Predicted label per node; -1 marks the nodes flagged as anomalies
df2['anomaly_score'] = isf_model.predict(df3)

# Preview of the flagged nodes
flagged_nodes = df2.loc[df2['anomaly_score'].eq(-1)]
flagged_nodes.head()

Unnamed: 0,Supplier,0,1,2,3,4,5,6,7,8,...,120,121,122,123,124,125,126,127,scores,anomaly_score
0,0,-0.115935,-0.599631,0.065554,-0.292713,-0.527262,-0.080629,0.106774,-0.059088,-0.610905,...,0.889622,-0.771237,0.576502,0.441349,-0.235596,0.700456,0.126798,-0.27035,-0.134313,-1
44,204,-0.002403,-0.214783,0.24441,-0.306983,0.063784,0.039307,0.043075,0.089089,-0.568538,...,0.425536,-0.324239,0.240503,0.043616,-0.407737,-0.075742,-0.187678,-0.041822,-0.042311,-1
100,260,0.05774,-0.477953,-0.258887,0.085014,-0.597948,-0.281634,-0.197564,-0.06641,-0.13874,...,0.359787,-0.394876,0.262523,0.068405,0.078271,0.25081,0.05948,-0.122695,-0.035171,-1
101,261,0.136401,-0.424671,-0.266083,0.058159,-0.470961,-0.194409,-0.110636,-0.039439,-0.271642,...,0.389487,-0.276579,0.10064,-0.084034,0.097428,0.144207,-0.032286,-0.045672,-0.01399,-1
110,270,0.179376,-0.365919,-0.204699,0.107201,-0.616982,-0.215362,-0.302209,-0.013999,-0.049692,...,0.288792,-0.422649,0.31153,0.190421,0.163118,0.397793,-0.079107,0.01821,-0.018291,-1


> <font color = 'purple'>*Adding Anomaly Information in Network*</font>

In [None]:
# Suppliers that were text-matched against the sanctioned list
# ('Common_list' column of Common_List.xlsx), as a plain Python list
common_list_file = os.path.join(path, 'Common_List.xlsx')
sanction_list = pd.read_excel(common_list_file)['Common_list'].tolist()

In [None]:
# Annotate every embedded node of H with:
#   anomaly         - Isolation Forest label (-1 = flagged as anomaly)
#   sanctioned      - 1 if the node's original label is in the sanctioned list, else 0
#   sanc_identified - 1 if the node is both sanctioned and flagged, else 0

# Set lookup instead of rescanning the whole list for every node
sanctioned_names = set(sanction_list)

# Iterate only the two columns that are needed rather than materialising a
# full row (ID + all embedding columns) per node with iterrows()
for node_key, label in zip(df2['Supplier'], df2['anomaly_score']):
    node_attrs = H.nodes[int(node_key)]  # attribute dict of this node, looked up once

    # Store a built-in int so the attribute stays a plain Python value
    node_attrs['anomaly'] = int(label)
    firm_name = node_attrs['old_label']

    # Check if in sanctioned list of suppliers
    if firm_name in sanctioned_names:
        node_attrs['sanctioned'] = 1
        print("found sanctioned firm")
    else:
        node_attrs['sanctioned'] = 0

    # Check if anomaly detection identified a sanctioned supplier
    if node_attrs['sanctioned'] == 1 and node_attrs['anomaly'] == -1:
        print("sanctioned and anomalous")
        node_attrs['sanc_identified'] = 1
    else:
        node_attrs['sanc_identified'] = 0

found sanctioned firm
sanctioned and anomalous
found sanctioned firm
sanctioned and anomalous
found sanctioned firm
found sanctioned firm
found sanctioned firm
found sanctioned firm
found sanctioned firm
sanctioned and anomalous
found sanctioned firm
found sanctioned firm
found sanctioned firm
found sanctioned firm
sanctioned and anomalous
found sanctioned firm
sanctioned and anomalous
found sanctioned firm
found sanctioned firm
found sanctioned firm
sanctioned and anomalous
found sanctioned firm
found sanctioned firm
found sanctioned firm
found sanctioned firm
found sanctioned firm
found sanctioned firm
found sanctioned firm
found sanctioned firm
found sanctioned firm
found sanctioned firm
found sanctioned firm
found sanctioned firm
found sanctioned firm
found sanctioned firm
found sanctioned firm
found sanctioned firm
found sanctioned firm
found sanctioned firm
found sanctioned firm
found sanctioned firm
found sanctioned firm
found sanctioned firm


In [None]:
# Save the annotated graph (anomaly / sanctioned attributes included) as GML in the output folder
gml_file = os.path.join(output_path, "0422_Riwalk_Anomaly_BiPartite_2010_2022.gml")
nx.write_gml(H, gml_file)

**Note**: The graph visualization was built in Cytoscape. The Cytoscape file is provided in the output folder.