1. Project Background

The energy industry is highly sensitive to geopolitical instability, which often disrupts supply chains, inflates prices and threatens energy security. Oil prices, in particular, experience frequent spikes or crashes due to unpredictable geopolitical events. Additionally, companies and investors face challenges in systematically assessing how geopolitical factors influence energy markets.

To address these issues, this project aims to develop an index that ranks countries and regions based on their geopolitical risks. The index will quantify key risk exposures, such as political stability, the rule of law, and control of corruption, and evaluate their impact on energy markets. Ultimately, the project seeks to optimize sourcing, trading, and investment strategies by providing actionable insights into geopolitical risk.

2. Load Libraries & Data

In [1]:
#Import Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load Dataset 1: the World Bank governance indicators workbook.
# NOTE(review): the path below is absolute and machine-specific; the notebook
# only runs where this exact location exists.
# NOTE(review): the preview shows '..' placeholders in the measure columns;
# they are read in as ordinary strings here.
world_bank = pd.read_excel(
    io=r'C:\Users\Work\Desktop\Geopolitical_Risk_Index\Datasets\wgidataset.xlsx'
)
world_bank.head(n=10)

Unnamed: 0,codeindyr,code,countryname,year,indicator,estimate,stddev,nsource,pctrank,pctranklower,pctrankupper
0,AFGcc1996,AFG,Afghanistan,1996,cc,-1.291705,0.340507,2,4.301075,0,27.419355
1,ALBcc1996,ALB,Albania,1996,cc,-0.893903,0.315914,3,19.354839,2.688172,43.010754
2,DZAcc1996,DZA,Algeria,1996,cc,-0.566741,0.262077,4,33.333332,16.666666,52.688171
3,ASMcc1996,ASM,American Samoa,1996,cc,..,..,..,..,..,..
4,ADOcc1996,ADO,Andorra,1996,cc,1.318143,0.480889,1,87.096771,72.043015,96.774193
5,AGOcc1996,AGO,Angola,1996,cc,-1.167702,0.262077,4,9.67742,0.537634,27.419355
6,AIAcc1996,AIA,Anguilla,1996,cc,..,..,..,..,..,..
7,ATGcc1996,ATG,Antigua and Barbuda,1996,cc,0.869897,0.480889,1,80.107529,59.139786,91.39785
8,ARGcc1996,ARG,Argentina,1996,cc,-0.101317,0.210325,6,53.763439,38.709679,62.903225
9,ARMcc1996,ARM,Armenia,1996,cc,-0.473051,0.340507,2,38.172043,15.053763,59.139786


3. Exploratory Data Analysis

In [3]:
# Column labels of the raw frame as a plain Python list
list(world_bank.columns)

['codeindyr',
 'code',
 'countryname',
 'year',
 'indicator',
 'estimate',
 'stddev',
 'nsource',
 'pctrank',
 'pctranklower',
 'pctrankupper']

In [4]:
# (rows, columns) of the raw frame before any filtering
world_bank.shape

(32100, 11)

In [5]:
# Keep only the three indicators used by the index:
# Political Stability (pv), Rule of Law (rl) and Control of Corruption (cc).
filtered_world_bank = world_bank.loc[
    world_bank['indicator'].isin(['pv', 'rl', 'cc'])
]
filtered_world_bank.head(n=15)

Unnamed: 0,codeindyr,code,countryname,year,indicator,estimate,stddev,nsource,pctrank,pctranklower,pctrankupper
0,AFGcc1996,AFG,Afghanistan,1996,cc,-1.291705,0.340507,2,4.301075,0,27.419355
1,ALBcc1996,ALB,Albania,1996,cc,-0.893903,0.315914,3,19.354839,2.688172,43.010754
2,DZAcc1996,DZA,Algeria,1996,cc,-0.566741,0.262077,4,33.333332,16.666666,52.688171
3,ASMcc1996,ASM,American Samoa,1996,cc,..,..,..,..,..,..
4,ADOcc1996,ADO,Andorra,1996,cc,1.318143,0.480889,1,87.096771,72.043015,96.774193
5,AGOcc1996,AGO,Angola,1996,cc,-1.167702,0.262077,4,9.67742,0.537634,27.419355
6,AIAcc1996,AIA,Anguilla,1996,cc,..,..,..,..,..,..
7,ATGcc1996,ATG,Antigua and Barbuda,1996,cc,0.869897,0.480889,1,80.107529,59.139786,91.39785
8,ARGcc1996,ARG,Argentina,1996,cc,-0.101317,0.210325,6,53.763439,38.709679,62.903225
9,ARMcc1996,ARM,Armenia,1996,cc,-0.473051,0.340507,2,38.172043,15.053763,59.139786


In [6]:
# Summary of the filtered frame: index range, non-null counts and dtypes per column
filtered_world_bank.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16050 entries, 0 to 31671
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   codeindyr     16050 non-null  object
 1   code          16050 non-null  object
 2   countryname   16050 non-null  object
 3   year          16050 non-null  int64 
 4   indicator     16050 non-null  object
 5   estimate      16050 non-null  object
 6   stddev        16050 non-null  object
 7   nsource       16050 non-null  object
 8   pctrank       16050 non-null  object
 9   pctranklower  16050 non-null  object
 10  pctrankupper  16050 non-null  object
dtypes: int64(1), object(10)
memory usage: 1.5+ MB


In [7]:
# Count missing values per column.
# NOTE: the '..' placeholders visible in the previews are ordinary strings at
# this stage, so they are counted as present rather than missing.
filtered_world_bank.isna().sum()

codeindyr       0
code            0
countryname     0
year            0
indicator       0
estimate        0
stddev          0
nsource         0
pctrank         0
pctranklower    0
pctrankupper    0
dtype: int64

In [8]:
# Check for duplicates: tally fully identical rows (True) against unique rows (False)
filtered_world_bank.duplicated().value_counts()

False    16050
Name: count, dtype: int64

In [9]:
# Check the data type of every column in the filtered frame
filtered_world_bank.dtypes

codeindyr       object
code            object
countryname     object
year             int64
indicator       object
estimate        object
stddev          object
nsource         object
pctrank         object
pctranklower    object
pctrankupper    object
dtype: object

In [10]:
# Drop the columns that are not used in the rest of the analysis.
columns_to_drop = ['code', 'nsource', 'pctranklower', 'pctrankupper']
# errors='ignore' skips labels that are already absent, so the cell can be re-run.
filtered_world_bank = filtered_world_bank.drop(
    labels=columns_to_drop,
    axis='columns',
    errors='ignore',
)
filtered_world_bank.head()

Unnamed: 0,codeindyr,countryname,year,indicator,estimate,stddev,pctrank
0,AFGcc1996,Afghanistan,1996,cc,-1.291705,0.340507,4.301075
1,ALBcc1996,Albania,1996,cc,-0.893903,0.315914,19.354839
2,DZAcc1996,Algeria,1996,cc,-0.566741,0.262077,33.333332
3,ASMcc1996,American Samoa,1996,cc,..,..,..
4,ADOcc1996,Andorra,1996,cc,1.318143,0.480889,87.096771


In [11]:
# Cast the measure columns from object to numeric dtype.
numeric_cols = ['estimate', 'pctrank', 'stddev']
# errors='coerce' turns entries that cannot be parsed as numbers into NaN.
filtered_world_bank[numeric_cols] = filtered_world_bank[numeric_cols].apply(
    lambda column: pd.to_numeric(column, errors='coerce')
)
filtered_world_bank.dtypes

codeindyr       object
countryname     object
year             int64
indicator       object
estimate       float64
stddev         float64
pctrank        float64
dtype: object

In [12]:
# Reshape from long to wide: one row per (countryname, year), with one column
# per indicator/metric combination.
clean_wb_df = filtered_world_bank.pivot(
    values=['estimate', 'pctrank'],
    columns='indicator',
    index=['countryname', 'year'],
)

# Flatten the two-level (metric, indicator) column labels into '<indicator>_<metric>'.
clean_wb_df.columns = [
    '_'.join((indicator, metric)) for metric, indicator in clean_wb_df.columns
]
# Move countryname/year out of the index and back into regular columns.
clean_wb_df = clean_wb_df.reset_index()
clean_wb_df.head(n=10)

Unnamed: 0,countryname,year,cc_estimate,pv_estimate,rl_estimate,cc_pctrank,pv_pctrank,rl_pctrank
0,Afghanistan,1996,-1.291705,-2.41731,-1.788075,4.301075,2.12766,1.507538
1,Afghanistan,1998,-1.176012,-2.427355,-1.734887,8.02139,0.531915,2.0
2,Afghanistan,2000,-1.271724,-2.438969,-1.780661,4.787234,0.529101,1.492537
3,Afghanistan,2002,-1.251137,-2.035034,-1.673473,4.761905,1.587302,1.99005
4,Afghanistan,2003,-1.34418,-2.198372,-1.558294,4.761905,2.01005,2.985075
5,Afghanistan,2004,-1.350647,-2.295682,-1.693925,6.403941,1.941748,2.403846
6,Afghanistan,2005,-1.447252,-2.06751,-1.662966,1.463415,2.427185,2.392344
7,Afghanistan,2006,-1.446292,-2.219135,-1.879005,1.95122,1.449275,0.478469
8,Afghanistan,2007,-1.613251,-2.413373,-1.85256,0.970874,1.449275,0.478469
9,Afghanistan,2008,-1.672096,-2.691361,-1.903308,0.485437,0.480769,0.480769


4. Calculate Risk Scores

In [13]:
# Calculate Risk Scores
# Invert Political Stability: a higher pv estimate means a more stable country,
# but the index treats higher values as riskier, so the sign is flipped.
# (This explanation is a comment rather than a bare string literal so that the
# cell does not echo it as output.)
clean_wb_df['pv_risk'] = -1 * clean_wb_df['pv_estimate']

'\nThe Higher the pv = more stable, but we want higher = riskier\n'

In [14]:
# Normalize All Scores to 0–100
def normalize(series):
    """Min-max scale a numeric pandas Series onto the 0-100 range.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to rescale. NaN entries are ignored when finding the
        minimum and maximum and stay NaN in the result.

    Returns
    -------
    pandas.Series
        Series with the same index, where the smallest input maps to 0 and the
        largest to 100. If every non-missing value is identical (or there are
        no non-missing values) the range is zero/undefined; in that case the
        non-missing entries are returned as 0.0 instead of NaN from a 0/0
        division.
    """
    lowest = series.min()
    span = series.max() - lowest
    if pd.isna(span) or span == 0:
        # Degenerate range: multiplying by 0.0 yields 0.0 for real values and
        # keeps NaN for missing ones, without dividing by zero.
        return series * 0.0
    return (series - lowest) / span * 100

# pv_risk is already sign-flipped; rl and cc estimates are negated here so that
# a lower rule-of-law / corruption-control estimate gives a higher risk value.
clean_wb_df['pv_risk_norm'] = normalize(clean_wb_df['pv_risk'])
clean_wb_df['rl_risk_norm'] = normalize(-clean_wb_df['rl_estimate'])
clean_wb_df['cc_risk_norm'] = normalize(-clean_wb_df['cc_estimate'])

In [15]:
# Weighted Composite Risk Score
# Relative importance of each normalised component.
weights = {
    'pv': 0.40,  # Political stability (most critical for energy projects)
    'rl': 0.30,  # Rule of law (contract enforcement)
    'cc': 0.30   # Corruption control (bribes, permits)
}

# Weighted sum of the three normalised risk columns, added in pv, rl, cc order.
clean_wb_df['composite_risk'] = (
    clean_wb_df['pv_risk_norm'].mul(weights['pv'])
    .add(clean_wb_df['rl_risk_norm'].mul(weights['rl']))
    .add(clean_wb_df['cc_risk_norm'].mul(weights['cc']))
)

In [16]:
# Rank Countries by Risk: order rows from the highest composite score to the lowest.
clean_wb_df = clean_wb_df.sort_values(by='composite_risk', ascending=False)

In [17]:
# Categorize Risk Tiers
# Boolean masks spelling out the intended tier boundaries:
# Low <= 30 < Moderate <= 60 < High.
# NOTE(review): these masks are not consumed by the pd.cut call below; they are
# kept in case a later cell references `conditions`.
conditions = [
    clean_wb_df['composite_risk'] <= 30,
    clean_wb_df['composite_risk'] <= 60,
    clean_wb_df['composite_risk'] > 60
]
choices = ['Low', 'Moderate', 'High']
# Bins are right-closed: (0, 30], (30, 60], (60, 100]. include_lowest=True also
# closes the first bin on the left, so a score of exactly 0 is labelled 'Low'
# (matching the `<= 30` rule above) instead of becoming NaN.
clean_wb_df['risk_tier'] = pd.cut(
    clean_wb_df['composite_risk'],
    bins=[0, 30, 60, 100],
    labels=choices,
    include_lowest=True,
)

In [18]:
# Preview the 25 highest-risk country-year rows (the frame is sorted by composite_risk, descending)
clean_wb_df.head(25)

Unnamed: 0,countryname,year,cc_estimate,pv_estimate,rl_estimate,cc_pctrank,pv_pctrank,rl_pctrank,pv_risk,pv_risk_norm,rl_risk_norm,cc_risk_norm,composite_risk,risk_tier
4334,Somalia,2008,-1.848734,-3.280517,-2.590877,0.0,0.0,0.0,3.280517,99.385394,100.0,97.271847,98.935712,High
4335,Somalia,2009,-1.701694,-3.312951,-2.450297,0.0,0.0,0.0,3.312951,100.0,97.018847,93.95166,97.291152,High
4333,Somalia,2007,-1.739551,-3.228497,-2.41923,0.0,0.0,0.0,3.228497,98.399631,96.360046,94.806477,96.709809,High
4336,Somalia,2010,-1.726097,-3.130971,-2.406264,0.0,0.0,0.0,3.130971,96.551562,96.08508,94.502688,95.796955,High
4337,Somalia,2011,-1.715412,-3.083847,-2.338279,0.0,0.0,0.0,3.083847,95.658566,94.643387,94.261412,94.934866,High
4330,Somalia,2004,-1.803035,-2.87854,-2.288664,0.0,0.485437,0.0,2.87854,91.768084,93.591256,96.239952,93.656596,High
4338,Somalia,2012,-1.596589,-2.860755,-2.418204,0.0,0.0,0.0,2.860755,91.431082,96.338291,91.578375,92.947433,High
4332,Somalia,2006,-1.724921,-2.74635,-2.350463,0.0,0.483092,0.0,2.74635,89.263153,94.901769,94.476117,92.518627,High
4347,Somalia,2021,-1.795541,-2.727339,-2.282445,0.952381,0.0,0.47619,2.727339,88.902888,93.459388,96.070743,92.420195,High
4339,Somalia,2013,-1.583604,-2.758687,-2.406621,0.0,0.0,0.0,2.758687,89.496919,96.092664,91.285164,92.012116,High
