In [3]:
from pandas_datareader import wb
import pandas as pd
pd.options.display.width = 0

names = [
    "NE.EXP.GNFS.CD", # Exports of goods and services (current US$)
    "NE.IMP.GNFS.CD", # Imports of goods and services (current US$)
    "NV.AGR.TOTL.CD", # Agriculture, forestry, and fishing, value added (current US$)
    "NY.GDP.MKTP.CD", # GDP (current US$)
    "NE.RSB.GNFS.CD", # External balance on goods and services (current US$)
]
# unduh data dari World Bank
df = wb.download(country="all", indicator=names, start=2010, end=2010).reset_index()

In [4]:
# unduh data negara dari world bank
countries = wb.get_countries()
# dapatkan negara yang bukan merupakan negara agregat (misal agregat: Asia Timur, Oceania, dsb)
non_aggregates = countries[countries['region'] != 'Aggregates'].name
df_nonagg = df[df["country"].isin(non_aggregates)].dropna()
df_nonagg

Unnamed: 0,country,year,NE.EXP.GNFS.CD,NE.IMP.GNFS.CD,NV.AGR.TOTL.CD,NY.GDP.MKTP.CD,NE.RSB.GNFS.CD
50,Albania,2010,3.337086e+09,5.792185e+09,2.141582e+09,1.192692e+10,-2.455100e+09
51,Algeria,2010,6.197541e+10,5.065473e+10,1.364852e+10,1.612073e+11,1.132067e+10
54,Angola,2010,5.157282e+10,3.568226e+10,5.179055e+09,8.379950e+10,1.589056e+10
55,Antigua and Barbuda,2010,9.142222e+08,8.415185e+08,1.876296e+07,1.148700e+09,7.270370e+07
56,Argentina,2010,8.020887e+10,6.793793e+10,3.021382e+10,4.236274e+11,1.227093e+10
...,...,...,...,...,...,...,...
260,Vietnam,2010,8.347359e+10,9.299467e+10,2.130649e+10,1.159317e+11,-9.521076e+09
262,West Bank and Gaza,2010,1.367300e+09,5.264300e+09,8.716000e+08,9.681500e+09,-3.897000e+09
263,"Yemen, Rep.",2010,9.270503e+09,1.062900e+10,2.522665e+09,3.090675e+10,-1.358501e+09
264,Zambia,2010,7.503513e+09,6.256989e+09,1.909207e+09,2.026556e+10,1.246524e+09


In [10]:
# jadikan nilai numerik dari semua kolom sebagai vektor
vectors = {}
for rowid, row in df_nonagg.iterrows():
    vectors[row["country"]] = row[names].values
vectors

{'Albania': array([3337085649.24613, 5792185435.94322, 2141582328.47493,
        11926922828.9911, -2455099786.6971], dtype=object),
 'Algeria': array([61975405318.205, 50654732073.2396, 13648522571.4516,
        161207270185.25, 11320673244.9655], dtype=object),
 'Angola': array([51572818660.8665, 35682259098.1843, 5179054574.41704,
        83799496611.2004, 15890559562.6822], dtype=object),
 'Antigua and Barbuda': array([914222222.222222, 841518518.518518, 18762962.962963, 1148700000.0,
        72703703.7037037], dtype=object),
 'Argentina': array([80208867995.7171, 67937933972.3653, 30213817111.0998,
        423627422092.49, 12270934023.3519], dtype=object),
 'Australia': array([227416541883.203, 238004051792.478, 25318418039.2848,
        1147589183475.73, -10587509909.2751], dtype=object),
 'Austria': array([200892688044.939, 187158711051.024, 4966565222.5306,
        391892746544.69, 13733976993.9149], dtype=object),
 'Azerbaijan': array([28732245203.09, 10942312484.4256, 2921255

In [28]:
import numpy as np
euclid = {}
cosine = {}
target = "Indonesia"

# hitung jarak euclid dan cosine semua negara terhadap Indonesia
for country in vectors:
    vecA = vectors[target]
    vecB = vectors[country]
    dist = np.linalg.norm(vecA-vecB)
    # Implementasi rumus VSM
    cos = (vecA @ vecB) / (np.linalg.norm(vecA) * np.linalg.norm(vecB))
    euclid[country] = dist
    cosine[country] = 1-cos

In [29]:
df_distance = pd.DataFrame({'euclid': euclid, 'cos': cosine})
df_distance.sort_values(by="euclid")

Unnamed: 0,euclid,cos
Indonesia,0.000000e+00,2.220446e-16
Turkey,7.222593e+10,3.653198e-03
Saudi Arabia,2.670936e+11,3.863201e-02
Sweden,2.809543e+11,3.086098e-02
Poland,2.930389e+11,2.766845e-02
...,...,...
France,2.042600e+12,8.747504e-03
Germany,3.131712e+12,2.709413e-02
Japan,5.087010e+12,1.442053e-02
China,5.696383e+12,1.483741e-03


In [33]:
print("Closest by Euclidean distance:")
print(df_distance.sort_values(by="euclid").head(5))
print()
print("Closest by Cosine distance:")
print(df_distance.sort_values(by="cos").head(5))

Closest by Euclidean distance:
                    euclid           cos
Indonesia     0.000000e+00  2.220446e-16
Turkey        7.222593e+10  3.653198e-03
Saudi Arabia  2.670936e+11  3.863201e-02
Sweden        2.809543e+11  3.086098e-02
Poland        2.930389e+11  2.766845e-02

Closest by Cosine distance:
                 euclid           cos
Indonesia  0.000000e+00  2.220446e-16
China      5.696383e+12  1.483741e-03
Cameroon   7.729717e+11  1.757571e-03
Uruguay    7.594432e+11  2.591134e-03
Peru       6.453587e+11  3.040229e-03


Ekonomi indonesia lebih mendekati China jika dibandingkan dengan negara lainnya berdasarkan Cosine Similiarity.