**Project Network Analysis**

Step 1: Data Preparation

In [None]:
# Mount Google Drive at /content/drive so the data and output folders used
# below are reachable from this runtime (this import only exists on Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Required packages
!pip install fuzzywuzzy  &> /dev/null
!pip install textacy==0.11.0 &> /dev/null

In [None]:
from networkx.algorithms.isolate import isolates
import pandas as pd
import networkx as nx
from networkx.algorithms import bipartite
import os
import matplotlib.pyplot as plt
import re
import html
import textacy
import spacy
from fuzzywuzzy import fuzz
import numpy as np
from collections import defaultdict
import matplotlib as mlt



In [None]:
'Declaring Text Cleaning Functions'
# Characters counted as "suspicious": & # < > { } [ ] and the backslash.
RE_SUSPICIOUS = re.compile(r'[&#<>{}\[\]\\]')


def impurity(text, min_len=10):
    """Return the share of suspicious characters in ``text``.

    Args:
        text: String to score; ``None`` is accepted.
        min_len: Texts with fewer than ``min_len`` characters are scored 0.

    Returns:
        0 if ``text`` is ``None`` or shorter than ``min_len``; otherwise the
        number of ``RE_SUSPICIOUS`` matches divided by ``len(text)``.
    """
    # Identity check on purpose: ``== None`` goes through ``__eq__``, which
    # array-like or otherwise custom objects may override.
    if text is None or len(text) < min_len:
        return 0
    return len(RE_SUSPICIOUS.findall(text)) / len(text)

def clean(text):
    """Strip markup residue from ``text`` and return it whitespace-normalised.

    HTML entities are decoded first, then a fixed list of regex substitutions
    is applied in order, single and double quote characters are deleted, and
    finally every run of whitespace is reduced to one space with no leading
    or trailing whitespace.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Decode escapes like &amp; before anything else so that encoded tags
    # (&lt;b&gt;) are visible to the tag rule below.
    text = html.unescape(text)

    # (pattern, replacement) pairs; the order matters and is applied as listed.
    substitutions = (
        # tags like <tab>
        (r'<[^<>]*>', ' '),
        # markdown URLs like [Some text](https://....) -> keep only the label
        (r'\[([^\[\]]*)\]\([^\(\)]*\)', r'\1'),
        # text or code in brackets like [0]
        (r'\[[^\[\]]*\]', ' '),
        # standalone sequences of specials, matches &# but not #cool
        (r'(?:^|\s)[&#<>{}\[\]+|\\:-]{1,}(?:\s|$)', ' '),
        # standalone sequences of hyphens like --- or ==
        (r'(?:^|\s)[\-=\+]{2,}(?:\s|$)', ' '),
        # collapse every whitespace run (spaces, tabs, newlines) to one space
        (r'\s+', ' '),
        # drop a '>' together with the non-space characters that follow it
        (r">\S+", ' '),
        # drop a '#' together with the non-space characters that follow it
        (r"#\S+", ' '),
        # drop a '<' together with the non-space characters that follow it
        (r"<\S+", ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    # remove single and double quotes
    for quote_char in ("'", '"'):
        text = text.replace(quote_char, "")

    # split()/join leaves single spaces only and nothing at either end
    return " ".join(text.split())

In [None]:
'Additional Text cleaning Using Textacy'
import textacy.preprocessing as tprep
def normalize(text):
    """Remove additional text impurities using built-in textacy functions.

    The steps below are applied one after another, each receiving the output
    of the previous one. Every step is called with the text as its only
    argument, so textacy's default settings (including the default
    replacement values of the ``replace.*`` functions) are used.

    Args:
        text: The string to normalise.

    Returns:
        The string after all steps have been applied.
    """
    steps = (
        tprep.normalize.hyphenated_words,
        tprep.normalize.quotation_marks,
        tprep.normalize.unicode,
        tprep.remove.accents,
        tprep.replace.urls,
        tprep.replace.emails,
        tprep.replace.hashtags,
        tprep.replace.numbers,
        tprep.replace.phone_numbers,
        tprep.replace.user_handles,
        tprep.replace.emojis,
    )
    for step in steps:
        text = step(text)
    return text

Data Source: https://finances.worldbank.org/Procurement/Major-Contract-Awards-EDS20/4s2h-ds7h/data

Sanctioned List: https://www.worldbank.org/en/projects-operations/procurement/debarred-firms

In [None]:
# Folder with the raw input files; it lives under /content/drive, i.e. on the
# Google Drive mounted at the top of the notebook.
path = '/content/drive/MyDrive/3. IU Courses/Courses/5. Network Analysis/Project/Final/data'
# Folder that receives the tables written by this notebook.
output_path = '/content/drive/MyDrive/3. IU Courses/Courses/5. Network Analysis/Project/Final/output'

# Reading Procurement Data
# NOTE(review): df_all_years is not used in the cells visible here — confirm a
# later cell still needs it.
df_all_years = pd.DataFrame()
# Contract awards table (CSV) — see the "Data Source" link above this cell.
wb_data = pd.read_csv(os.path.join(path, "Major_Contract_Awards.csv")) 
# Sanctioned firms table (Excel) — see the "Sanctioned List" link above this cell.
sanctioned_data = pd.read_excel(os.path.join(path, "Sanctioned_List.xlsx"))

In [None]:
# Cleaning the text columns
wb_data['Supplier'] = wb_data['Supplier'].replace(r'\s+|\\n', ' ', regex=True) 
wb_data['Supplier'] = wb_data['Supplier'].replace(r'\s+', ' ', regex=True) 
wb_data['Supplier'] = wb_data['Supplier'].map(clean)
wb_data['Supplier'] = wb_data['Supplier'].map(normalize) 

wb_data['Borrower Country'] = wb_data['Borrower Country'].replace(r'\s+|\\n', ' ', regex=True) 
wb_data['Borrower Country'] = wb_data['Borrower Country'].replace(r'\s+', ' ', regex=True) 
wb_data['Borrower Country'] = wb_data['Borrower Country'].map(clean)
wb_data['Borrower Country'] = wb_data['Borrower Country'].map(normalize) 

# Only considering contracts which were awarded based on bidding process

wb_data = wb_data[wb_data['Supplier']!='INDIVIDUAL CONSULTANT']
wb_data = wb_data[wb_data['Supplier Country'] !='World']
wb_data = wb_data[wb_data['Total Contract Amount (USD)']!= np.inf]
wb_data = wb_data[wb_data['Total Contract Amount (USD)']!= np.nan]

wb_data = wb_data.loc[:,['Fiscal Year','Borrower Country','Supplier','Total Contract Amount (USD)']]

In [None]:
# Preview the first rows of the cleaned, column-reduced table.
wb_data.head()

Unnamed: 0,Fiscal Year,Borrower Country,Supplier,Total Contract Amount (USD)
0,2022,Serbia,ASSECO SEE D.O.O.,3333598
2,2022,Ethiopia,"EPTISA SERVICIOS DE INGENIERIA, S.L. IN ASSOCI...",1851919
3,2022,China,"SHENZHEN CITY DONGSHEN ENGINEERING CO., LTD",14986361
4,2022,Turkey,KOLTEK MUSAVIRLIK A.S.,777600
5,2022,Chad,UNICEF,19326986


In [None]:
# Persist the cleaned table without the index column.
# This file is provided in output folder for future models.
wb_table_file = os.path.join(output_path, 'WB_table.xlsx')
wb_data.to_excel(wb_table_file, index=False)