# Download & Processing

The purpose of this notebook is to download, process, and (optionally) export the data.

## Todo list

* The World Bank data should have infrastructure investments by country by year. Import that data. (Ben)
* BRI field is either 1.0 or 0.0, should be true or false (though we can still work with it as is if we want)
* We want to (potentially) do some network analysis. There's a list [here](https://govt.chinadaily.com.cn/topics/state-ownedenterprises/directorofcentralsoes) but it may not be comprehensive.
* What data do we need/like to have to do a network analysis?
* What data do we still want?
* What visualizations do we want to create? We could easily do some visualizations comparing each country's national investment in infrastructure with the investment/construction done there by Chinese companies.
* The sky is really the limit here. Jupyter can do basically any and all visualizations imaginable.

In [1]:
# Imports here. Some of these are holdovers from a previous project, ignore them.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests, os, io, zipfile
from sklearn.preprocessing import MinMaxScaler

# Graph visualization stuff
import networkx as nx
import plotly.graph_objects as go

# etc
from geopy.exc import GeocoderTimedOut
from geopy.geocoders import Nominatim

# Shared min-max scaler instance; the normalization cells near the end of the
# notebook call scaler.fit_transform() on the metrics frames.
scaler = MinMaxScaler()

In [2]:
# Download the AEI dataset on Chinese investment and construction projects.
# The raw HTTP response is kept in `aeidata`; a later cell parses
# `aeidata.content` as an Excel workbook.
# - timeout: keeps a stalled connection from hanging the notebook indefinitely.
# - raise_for_status(): stops here on an HTTP error instead of handing an error
#   page to the Excel parser further down.
aeidata = requests.get(
    'https://www.aei.org/wp-content/uploads/2022/02/China-Global-Investment-Tracker-2021-Fall-FINAL-2022.2.21-update.xlsx',
    timeout=60,
)
aeidata.raise_for_status()

In [3]:
# Download the AidData dataset on Chinese development financing (a zip archive).
# The raw HTTP response is kept in `findata`; a later cell unzips
# `findata.content` in memory.
# - timeout: keeps a stalled connection from hanging the notebook indefinitely.
# - raise_for_status(): stops here on an HTTP error instead of handing an error
#   page to zipfile further down.
findata = requests.get(
    'https://docs.aiddata.org/ad4/datasets/AidDatas_Global_Chinese_Development_Finance_Dataset_Version_2_0.zip',
    timeout=60,
)
findata.raise_for_status()

In [4]:
# Read the AidData workbook straight out of the downloaded archive, in memory:
# bytes -> zip archive -> workbook member -> ExcelFile.
with io.BytesIO(findata.content) as fd, \
        zipfile.ZipFile(fd, mode='r') as zip_findata, \
        zip_findata.open('AidDatas_Global_Chinese_Development_Finance_Dataset_Version_2_0/AidDatasGlobalChineseDevelopmentFinanceDataset_v2.0.xlsx') as fd_excel, \
        pd.ExcelFile(fd_excel) as fd_all:
    # Basically all the data lives in one sheet; only a subset of its columns is read.
    fd_data = pd.read_excel(
        fd_all,
        "Global_CDF2.0",
        usecols="E,F,J:P,T:V,AD:AE,AL,AQ,BK",
    )
    # Keep rows with at least 7 non-null cells ...
    fd_data = fd_data.dropna(thresh=7)
    # ... and drop any row that has no constant-USD amount.
    fd_data = fd_data.dropna(subset=['Amount (Constant USD2017)'])

In [5]:
# Parse the AEI workbook held in `aeidata` (the AEI download above) into `aei_df`.
# Sheet "Dataset 1+2" is read with the first 5 rows skipped, so the header is the
# first row after them; only columns C-G, I and K are loaded.
with io.BytesIO(aeidata.content) as fh, pd.ExcelFile(fh) as aei_all:
    aei_df = pd.read_excel(
        aei_all,
        "Dataset 1+2",
        usecols="C:G, I, K",
        header=0,
        skiprows=5,
        index_col=None,
    )
    # Rows without a BRI flag are discarded.
    aei_df = aei_df.dropna(subset=['BRI'], axis=0)

In [6]:
# Handle AEI data: parse the workbook held in `aeidata` into `aei_df`.
# NOTE(review): this cell repeats the previous cell's load (same sheet, same
# arguments, same target `aei_df`), so running it only re-parses the workbook.
# It looks like an accidental duplicate; confirm and delete one of the two.
with io.BytesIO(aeidata.content) as fh:
    with pd.ExcelFile(fh) as aei_all:
        aei_df = pd.read_excel(aei_all, "Dataset 1+2", usecols="C:G, I, K", header=0, skiprows=5, index_col=None).dropna(subset=['BRI'],axis=0)

In [7]:
# Prune the AEI frame and derive the BRI-only views used by the graph cells.
aei_df = aei_df.reset_index(drop=True)

# Keep only rows whose quantity parses as a number, and store the parsed value.
# Filtering alone would leave numeric-looking strings in the column, which the
# 'sum' aggregation below cannot add reliably to real numbers.
quantity = pd.to_numeric(aei_df['Quantity in Millions'], errors='coerce')
aei_df = aei_df[quantity.notnull()].assign(**{'Quantity in Millions': quantity})

# BRI projects only; the flag column is redundant after the filter.
aei_bri_df = aei_df[aei_df.BRI == 1.0].drop(columns='BRI')

# Aggregate like rows: one row per country/investor/sector with the summed quantity.
aei_brief_bri_df = aei_bri_df.groupby(['Country', 'Investor', 'Sector']).aggregate({'Quantity in Millions': 'sum'}).reset_index()

aei_bri_df

Unnamed: 0,Investor,Quantity in Millions,Share Size,Transaction Party,Sector,Country
1,China National Petroleum Corp. (CNPC),620,0.49,Rosneft,Energy,Russian Federation
5,Zongshen Industrial,100,1,,Metals,Myanmar
13,Aluminum Corporation of China (Chinalco),450,,,Metals,Guinea
14,China Power Investment,400,,Tanzania Electric Supply,Energy,Tanzania
15,Power Construction Corp. (PowerChina),140,,,Energy,Tanzania
...,...,...,...,...,...,...
2554,Sichuan consortium,230,0.8,,Energy,Nepal
2555,Sinosteel,1100,,Tosyali,Metals,Algeria
2556,China Communications Construction,270,,,Transport,Philippines
2557,Shanghai Tunnel Engineering,400,0.7,LT Sambo,Transport,Singapore


In [8]:
# Build an undirected investor <-> country graph from the BRI deals.
# `aei_brief_bri_df` has one row per (Country, Investor, Sector), but nx.Graph
# keeps a single edge per node pair: feeding those rows in directly lets each
# later sector row overwrite the pair's "Quantity in Millions". Sum across
# sectors first so the edge attribute is the pair's total. Grouping by
# Country, then Investor keeps the pairs in the same order as the input rows.
edge_totals = (
    aei_brief_bri_df
    .groupby(['Country', 'Investor'])
    .aggregate({'Quantity in Millions': 'sum'})
    .reset_index()
)
testgraph = nx.from_pandas_edgelist(edge_totals, source="Investor", target="Country", edge_attr="Quantity in Millions")
testgraph.name = "Test Graph"
# NOTE(review): nx.info() is deprecated in recent NetworkX releases (removed in
# 3.0, as far as I know); switch to print(testgraph) when upgrading.
print(nx.info(testgraph))

Name: Test Graph
Type: Graph
Number of nodes: 497
Number of edges: 1018
Average degree:   4.0966


In [9]:
# Geocode every country (longitude/latitude via Nominatim) so the graph could be
# drawn on an interactive world map. The code is disabled by wrapping it in a
# string literal, so this cell only echoes that string. It runs very, very slowly
# (though it does appear to work?); if it is ever needed, run it once (an hour or
# so) and export the result instead of recomputing it. Maybe not today.
# NOTE(review): if re-enabled, findGeocode retries by calling itself with no
# limit on GeocoderTimedOut, and the loop walks the 'Country' column row by row,
# so a country that appears on several rows is looked up several times.
"""
def findGeocode(country):
    try:
        geolocator = Nominatim(user_agent="china_class_final_bmueller")
        return geolocator.geocode(country)
    except GeocoderTimedOut:
        return findGeocode(country)

pos_dict = {}
for index, country in enumerate(aei_brief_bri_df['Country']):
    loc = findGeocode(country)
    if loc != None:
        pos_dict[country] = (loc.longitude, loc.latitude)
    else:
        pos_dict[country] = (np.nan, np.nan)
pos_dict
"""

'\ndef findGeocode(country):\n    try:\n        geolocator = Nominatim(user_agent="china_class_final_bmueller")\n        return geolocator.geocode(country)\n    except GeocoderTimedOut:\n        return findGeocode(country)\n\npos_dict = {}\nfor index, country in enumerate(aei_brief_bri_df[\'Country\']):\n    loc = findGeocode(country)\n    if loc != None:\n        pos_dict[country] = (loc.longitude, loc.latitude)\n    else:\n        pos_dict[country] = (np.nan, np.nan)\npos_dict\n'

In [16]:
# Visualize the AEI investor/country graph as an interactive Plotly figure.
# The lazy way would be:
#   nx.draw(testgraph, with_labels=False, font_weight='bold', node_color='orange')
# The nice way below is adapted from the Plotly network-graph tutorial.

LAYOUT_SEED = 42       # fixes the random part of the spring layout between runs
TOP_CONNECTIONS = 5    # how many of a node's largest connections the hover text lists

# --- Layout ------------------------------------------------------------------
# Countries are pinned on a circle (shell layout); the spring layout then places
# the remaining nodes around those fixed positions.
country_nodes = list(aei_brief_bri_df.Country.unique())
pos = nx.shell_layout(nx.subgraph(testgraph, country_nodes))
pos = nx.spring_layout(testgraph, fixed=country_nodes, pos=pos, seed=LAYOUT_SEED)

# Store each node's position and its own name as node attributes. (`names` must
# be a real mapping: a non-dict value is applied to every node as one constant.)
names = {node: node for node in testgraph.nodes}
nx.set_node_attributes(testgraph, pos, 'pos')
nx.set_node_attributes(testgraph, names, 'names')

# --- Edge trace --------------------------------------------------------------
edge_x = []
edge_y = []
for source, target in testgraph.edges():
    x0, y0 = testgraph.nodes[source]['pos']
    x1, y1 = testgraph.nodes[target]['pos']
    # The trailing None breaks the line so every edge is drawn as its own segment.
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])

edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=0.5, color='#888'),
    hoverinfo='none',
    mode='lines')

# --- Node positions, degree and hover text -----------------------------------
# Everything is collected in a single pass over the adjacency so coordinates,
# colours and hover texts are guaranteed to line up node by node.
node_x = []
node_y = []
node_adjacencies = []
node_text = []
sorted_names = []
for node, neighbors in testgraph.adjacency():
    x, y = testgraph.nodes[node]['pos']
    node_x.append(x)
    node_y.append(y)
    node_adjacencies.append(len(neighbors))

    # Rank this node's actual neighbours by the amount on the connecting edge.
    # `neighbors` maps neighbour -> edge attribute dict, so no subgraph is needed.
    sorted_names = sorted(
        ([neighbor, attrs.get("Quantity in Millions", 0)] for neighbor, attrs in neighbors.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    hover_msg = str(node) + ' has ' + str(len(neighbors)) + " connections:<br>"
    for neighbor, amount in sorted_names[:TOP_CONNECTIONS]:
        hover_msg += str(neighbor) + ", $" + str(amount) + '<br>'
    node_text.append(hover_msg)

# --- Node trace --------------------------------------------------------------
node_trace = go.Scatter(
    x=node_x, y=node_y,
    mode='markers',
    hoverinfo='text',
    marker=dict(
        showscale=True,
        # colorscale options
        #'Greys' | 'YlGnBu' | 'Greens' | 'YlOrRd' | 'Bluered' | 'RdBu' |
        #'Reds' | 'Blues' | 'Picnic' | 'Rainbow' | 'Portland' | 'Jet' |
        #'Hot' | 'Blackbody' | 'Earth' | 'Electric' | 'Viridis' |
        colorscale='YlGnBu',
        reversescale=True,
        color=[],
        size=10,
        colorbar=dict(
            thickness=15,
            # Nested title form; the flat `titleside` attribute is deprecated.
            title=dict(text='Node Connections', side='right'),
            xanchor='left'
        ),
        line_width=2))

# Colour each marker by its number of connections and attach the hover text.
node_trace.marker.color = node_adjacencies
node_trace.hovertext = node_text

# --- Figure ------------------------------------------------------------------
fig = go.Figure(data=[edge_trace, node_trace],
             layout=go.Layout(
                # Nested title form; the flat `titlefont_size` attribute is deprecated.
                title=dict(text='<br>Network graph made with Python', font=dict(size=16)),
                showlegend=False,
                hovermode='closest',
                margin=dict(b=20,l=5,r=5,t=40),
                annotations=[ dict(
                    text="name here",
                    showarrow=False,
                    xref="paper", yref="paper",
                    x=0.005, y=-0.002 ) ],
                xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                yaxis=dict(showgrid=False, zeroline=False, showticklabels=False))
                )
fig.show()

In [None]:
# Debug scratch cell: displays `sorted_names` as left behind by the last
# iteration of the hover-text loop in the visualization cell. The commented
# lines are alternative one-off inspections.
#nx.subgraph(testgraph, list(["Indonesia", "Tsingshan Holding"])).edges(data=True)
sorted_names
#testgraph.nodes(1)

In [None]:
# Debug scratch cell: displays the node -> position mapping produced by the
# layout calls in the visualization cell.
pos

In [None]:
# Do any pruning here
# (Nothing is pruned yet; the cell currently only displays the AidData frame.)
fd_data

In [None]:
# Fetch the World Bank WDI Excel archive; the HTTP response is kept in `wbdata`.
WDI_ARCHIVE_URL = 'https://databank.worldbank.org/data/download/WDI_excel.zip'
wbdata = requests.get(WDI_ARCHIVE_URL)

In [None]:
# Since the download seems to die sometimes, let's just use a local copy as a backup.
# NOTE: this assignment is unconditional, so once this cell runs `wbdata` is the
# local path string and whatever the download cell stored in it is discarded.
# The next cell passes `wbdata` straight to zipfile.ZipFile.
wbdata = "../WDI_excel.zip"

In [None]:
# Handle WB data (a bit harder, we need to unzip the file first).
# `wbdata` is either the local zip path (backup cell) or the requests.Response
# from the download cell. zipfile.ZipFile needs a path or a file-like object, so
# a Response is wrapped in an in-memory buffer; a bare Response would fail here.
wb_zip_source = io.BytesIO(wbdata.content) if isinstance(wbdata, requests.Response) else wbdata
with zipfile.ZipFile(wb_zip_source, mode='r') as zip_wbdata:
    with zip_wbdata.open('WDIEXCEL.xlsx') as wb_excel:
        with pd.ExcelFile(wb_excel) as wb_all:
            # Basically all the data here is in one excel sheet, that's all we need for now.
            # Columns A and C plus BD:BM are read; rows with fewer than 3
            # non-null cells are dropped.
            wb_data = pd.read_excel(wb_all, "Data", usecols="A,C,BD:BM").dropna(thresh=3)

## TODO: everything below here

Most of this is code imported from a previous project; ignore it for now.

In [None]:
# Collapse the per-year columns into a single "Value" column holding each row's
# most recent non-null figure, then drop the per-year columns.
id_cols = ["Country Name", "Indicator Name"]
# "Value" is excluded too, so re-running this cell after the collapse is a no-op
# instead of treating "Value" as a year column and then dropping it.
cols = wb_data.columns.difference(id_cols + ["Value"])
if len(cols) > 0:
    # `cols` comes back sorted, so forward-filling along each row carries the
    # latest available figure into the last column. This is positional
    # (iloc), unlike `series[-1]` on a label-indexed row.
    latest = wb_data[cols].ffill(axis=1).iloc[:, -1].astype(float)
    wb_data = wb_data.drop(cols, axis=1).assign(Value=latest)

In [None]:
# World Bank indicator name -> short column name used in the rest of the notebook.
# ("Urban land area (sq. km)" -> "Urban land area" is intentionally left out.)
WB_INDICATORS = {
    "Land area (sq. km)": "Land area",
    "Access to electricity (% of population)": "Electricity access",
    "Renewable electricity output (% of total electricity output)": "Renewable percentage",
    "Electricity production from nuclear sources (% of total)": "Nuclear percentage",
    "Energy use (kg of oil equivalent per capita)": "Energy use",
    "Gini index (World Bank estimate)": "Gini index",
}
wb_source_names = list(WB_INDICATORS.keys())
wb_names = list(WB_INDICATORS.values())

# One row per country, one column per indicator (under the "Value" level).
wb_pivoted = wb_data.pivot(index="Country Name", columns="Indicator Name")

# Take the "Value" level, flatten the column axis to plain labels, shorten the
# indicator names and keep only the indicators listed above, in that order.
wb_final = (
    wb_pivoted["Value"]
    .rename_axis(columns=None)
    .rename(columns=WB_INDICATORS)
)
wb_final = wb_final[wb_names]

In [None]:
# Merge the World Bank indicators with the UN frames, joining on the index.
# NOTE(review): un_hdi, un_sustainability, un_urbanization and un_dietary_adequacy
# are not defined anywhere in the visible notebook; presumably they come from the
# previous project this section was copied from. Confirm before running.
data = wb_final
for un_frame in (un_hdi, un_sustainability, un_urbanization, un_dietary_adequacy):
    data = data.merge(un_frame, left_index=True, right_index=True)

# Only rows with no missing values are kept.
data = data.dropna()

# A few outliers have unreliable/skewed data; remove them.
outliers = ['Malta', 'South Sudan', 'Luxembourg', 'Mauritius']
data = data.drop(outliers)

In [None]:
# do we need to normalize?
# (Open question; this cell only displays the merged frame.)
data

In [None]:
# Build the five factors of the index from the pared-down `data` frame, then
# min-max normalize them.

# Factor 1: urbanization. The original plan was to normalize it to sprawl
# [Urbanization % * ("Urban land area (sq. km)" / "Land area (sq. km)")], but there
# was not enough urban-land-area data from the last 10 years, so plain
# urbanization is used instead.
urbanization = data['Urbanization'].to_frame()

# Factor 2: low-carbon energy = (renewable % + nuclear %) * electricity access.
low_carbon_share = data['Renewable percentage'] + data['Nuclear percentage']
carbondiet = (low_carbon_share * data['Electricity access']).to_frame('Low carbon energy')

# Factor 4: HDI per unit of energy use (kg of oil equivalent per capita).
sus_liv = (data['HDI'] / data['Energy use']).to_frame('Sustainable livability')  # this is a bit sus

# Join the factors on the index, one at a time, in their final column order.
# Factor 3 (UN sustainability index) and factor 5 (Gini index) are taken as-is.
metrics = urbanization
for factor in (carbondiet, data['UN Sustainability Index'], sus_liv, data['Gini index']):
    metrics = metrics.merge(factor, left_index=True, right_index=True)

# Finally normalize every column with the shared scaler.
metrics = pd.DataFrame(
    scaler.fit_transform(metrics.values),
    index=metrics.index,
    columns=metrics.columns,
)
metrics

In [None]:
# Turn the normalized metrics into 0-100 factor scores plus a weighted total.
# Relative importance of each factor in the total.
FACTOR_WEIGHTS = {
    'Urbanization': 15,
    'Low carbon energy': 35,
    'UN Sustainability Index': 20,
    'Sustainable livability': 10,
    'Gini index': 10,
}

# Work on a copy: assigning `metrics` directly would alias it, so the edits below
# would also change `metrics` (exported later as 'Raw Data') and would compound
# every time this cell is re-run.
weighted = metrics.copy()

# A lower Gini index is better, so invert it. `metrics` was min-max normalized in
# the previous cell, so `1 - x` stays within [0, 1].
weighted['Gini index'] = 1 - weighted['Gini index']

# Express every factor as a 0-100 score.
weighted = weighted * 100

# Apply the weights in the total itself. Scaling the columns by their weights
# and then min-max normalizing each column again cancels the weights, because
# min-max scaling is unaffected by multiplying a column by a positive constant.
factor_weights = pd.Series(FACTOR_WEIGHTS)
weighted['Total'] = (
    weighted[factor_weights.index].mul(factor_weights, axis=1).sum(axis=1)
    / factor_weights.sum()
)
weighted

In [None]:
# Export the results to one workbook, one sheet per frame, in this order.
output_sheets = {
    'Ordered Results': weighted.sort_values(by=['Total'], ascending=False),
    'Results': weighted,
    'Raw Data': metrics,
    'Source Data': data,
}
with pd.ExcelWriter("final_output.xlsx") as writer:
    for sheet_name, frame in output_sheets.items():
        frame.to_excel(writer, sheet_name=sheet_name)