# WDI - Preproesarea datelor

In [2]:
import json
import pandas as pd

### Incarcam datele wdi

In [3]:
import json

with open("../data/wdi_data.json", "r", encoding="utf-8") as f:
    wdi_data = json.load(f)


FileNotFoundError: [Errno 2] No such file or directory: '../data/wdi_data.json'

### Explorăm structura pentru un indicator

In [4]:
# un indicator (ex: GDP per capita)
gdp = wdi_data["gdp_per_capita"]

print(type(gdp))      # listă
print(len(gdp))       # 2 elemente: [metadata, rows]

metadata = gdp[0]
rows     = gdp[1]

print(metadata.keys())      # ce câmpuri există în metadata
print(rows[0].keys())       # ce câmpuri există într-un rând de date

<class 'list'>
2
dict_keys(['page', 'pages', 'per_page', 'total', 'sourceid', 'lastupdated'])
dict_keys(['indicator', 'country', 'countryiso3code', 'date', 'value', 'unit', 'obs_status', 'decimal'])


In [5]:
gdp[1][0]

{'indicator': {'id': 'NY.GDP.PCAP.CD',
  'value': 'GDP per capita (current US$)'},
 'country': {'id': 'ZH', 'value': 'Africa Eastern and Southern'},
 'countryiso3code': 'AFE',
 'date': '2024',
 'value': 1567.63583892828,
 'unit': '',
 'obs_status': '',
 'decimal': 1}

# TRANSFORMARE ÎN DATAFRAME

## Pentru un singur indicator

### Metoda 1. Folosind json_normalize pentru a crea un DataFrame

In [7]:
# 2. Lucrez pe un singur indicator, ex: GDP per capita
gdp = wdi_data["gdp_per_capita"]

# 3. Extragem lista de rânduri (partea importantă)
rows = gdp[1]

# 4. Transformare directă în DataFrame
df_gdp = pd.json_normalize(rows)

print(df_gdp.columns)
df_gdp.head(3)

Index(['countryiso3code', 'date', 'value', 'unit', 'obs_status', 'decimal',
       'indicator_name', 'indicator.id', 'indicator.value', 'country.id',
       'country.value'],
      dtype='object')


Unnamed: 0,countryiso3code,date,value,unit,obs_status,decimal,indicator_name,indicator.id,indicator.value,country.id,country.value
0,AFE,2024,1567.635839,,,1,gdp_per_capita,NY.GDP.PCAP.CD,GDP per capita (current US$),ZH,Africa Eastern and Southern
1,AFE,2023,1510.742951,,,1,gdp_per_capita,NY.GDP.PCAP.CD,GDP per capita (current US$),ZH,Africa Eastern and Southern
2,AFE,2022,1628.318944,,,1,gdp_per_capita,NY.GDP.PCAP.CD,GDP per capita (current US$),ZH,Africa Eastern and Southern


In [13]:
print(len(df_gdp))  # număr de rânduri
print(df_gdp.shape) # (număr rânduri, număr coloane)
print(df_gdp.info())  # informații despre DataFrame
print("\nStatistici descriptive:")
print(df_gdp.describe())  # statistici descriptive pentru coloanele numerice

17290
(17290, 11)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17290 entries, 0 to 17289
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   countryiso3code  17290 non-null  object 
 1   date             17290 non-null  object 
 2   value            14541 non-null  float64
 3   unit             17290 non-null  object 
 4   obs_status       17290 non-null  object 
 5   decimal          17290 non-null  int64  
 6   indicator_name   17290 non-null  object 
 7   indicator.id     17290 non-null  object 
 8   indicator.value  17290 non-null  object 
 9   country.id       17290 non-null  object 
 10  country.value    17290 non-null  object 
dtypes: float64(1), int64(1), object(9)
memory usage: 1.5+ MB
None

Statistici descriptive:
               value  decimal
count   14541.000000  17290.0
mean     8656.965489      1.0
std     17414.997378      0.0
min        11.801322      1.0
25%       582.138962      1.0
50%

### Metoda 2 - direct fara json_normalize

In [14]:

rows = wdi_data["gdp_per_capita"][1]

flat_rows = []

for r in rows:
    flat_rows.append({
        "countryiso3code": r["countryiso3code"],
        "country_id":      r["country"]["id"],
        "country_name":    r["country"]["value"],
        "indicator_id":    r["indicator"]["id"],
        "indicator_name":  r["indicator"]["value"],
        "date":            r["date"],
        "value":           r["value"],
        "unit":            r["unit"],
        "obs_status":      r["obs_status"],
        "decimal":         r["decimal"]
    })

df2 = pd.DataFrame(flat_rows)
print(df2.head())

  countryiso3code country_id                 country_name    indicator_id  \
0             AFE         ZH  Africa Eastern and Southern  NY.GDP.PCAP.CD   
1             AFE         ZH  Africa Eastern and Southern  NY.GDP.PCAP.CD   
2             AFE         ZH  Africa Eastern and Southern  NY.GDP.PCAP.CD   
3             AFE         ZH  Africa Eastern and Southern  NY.GDP.PCAP.CD   
4             AFE         ZH  Africa Eastern and Southern  NY.GDP.PCAP.CD   

                 indicator_name  date        value unit obs_status  decimal  
0  GDP per capita (current US$)  2024  1567.635839                        1  
1  GDP per capita (current US$)  2023  1510.742951                        1  
2  GDP per capita (current US$)  2022  1628.318944                        1  
3  GDP per capita (current US$)  2021  1522.393346                        1  
4  GDP per capita (current US$)  2020  1344.103210                        1  


### Metoda 3 - Conversie prin expansiune dict -> DataFrame + join

In [16]:
rows = wdi_data["gdp_per_capita"][1]

# 1. DataFrame direct
df_raw = pd.DataFrame(rows)
df_raw.head(2)

Unnamed: 0,indicator,country,countryiso3code,date,value,unit,obs_status,decimal,indicator_name
0,"{'id': 'NY.GDP.PCAP.CD', 'value': 'GDP per cap...","{'id': 'ZH', 'value': 'Africa Eastern and Sout...",AFE,2024,1567.635839,,,1,gdp_per_capita
1,"{'id': 'NY.GDP.PCAP.CD', 'value': 'GDP per cap...","{'id': 'ZH', 'value': 'Africa Eastern and Sout...",AFE,2023,1510.742951,,,1,gdp_per_capita


In [18]:
# 2. Expandăm coloanele care sunt dict
df_country   = pd.json_normalize(df_raw["country"])
df_indicator = pd.json_normalize(df_raw["indicator"])

# 3. Le alipim înapoi
df3 = pd.concat(
    [df_raw.drop(columns=["country", "indicator"]),
     df_country.add_prefix("country_"),
     df_indicator.add_prefix("indicator_")],
    axis=1
)

df3.head(2)

Unnamed: 0,countryiso3code,date,value,unit,obs_status,decimal,indicator_name,country_id,country_value,indicator_id,indicator_value
0,AFE,2024,1567.635839,,,1,gdp_per_capita,ZH,Africa Eastern and Southern,NY.GDP.PCAP.CD,GDP per capita (current US$)
1,AFE,2023,1510.742951,,,1,gdp_per_capita,ZH,Africa Eastern and Southern,NY.GDP.PCAP.CD,GDP per capita (current US$)


### Metoda 4. Folosim apply

In [None]:
df = pd.DataFrame(rows)

In [20]:
df["country_id"]    = df["country"].apply(lambda x: x["id"])
df["country_name"]  = df["country"].apply(lambda x: x["value"])
df["indicator_id"]  = df["indicator"].apply(lambda x: x["id"])
df["indicator_name"] = df["indicator"].apply(lambda x: x["value"])

df4 = df.drop(columns=["country", "indicator"])
df4.head(2)

Unnamed: 0,countryiso3code,date,value,unit,obs_status,decimal,indicator_name,country_id,country_name,indicator_id
0,AFE,2024,1567.635839,,,1,GDP per capita (current US$),ZH,Africa Eastern and Southern,NY.GDP.PCAP.CD
1,AFE,2023,1510.742951,,,1,GDP per capita (current US$),ZH,Africa Eastern and Southern,NY.GDP.PCAP.CD


In [23]:
df.head(2)

Unnamed: 0,indicator,country,countryiso3code,date,value,unit,obs_status,decimal,indicator_name,country_id,country_name,indicator_id
0,"{'id': 'NY.GDP.PCAP.CD', 'value': 'GDP per cap...","{'id': 'ZH', 'value': 'Africa Eastern and Sout...",AFE,2024,1567.635839,,,1,GDP per capita (current US$),ZH,Africa Eastern and Southern,NY.GDP.PCAP.CD
1,"{'id': 'NY.GDP.PCAP.CD', 'value': 'GDP per cap...","{'id': 'ZH', 'value': 'Africa Eastern and Sout...",AFE,2023,1510.742951,,,1,GDP per capita (current US$),ZH,Africa Eastern and Southern,NY.GDP.PCAP.CD


## Pentru toate datele

In [26]:
# 1. Încarc fișierul
#with open("wdi_data.json", "r", encoding="utf-8") as f:
#    wdi_data = json.load(f)

all_frames = []   # aici colectăm fiecare indicator normalizat

# 2. Iterăm prin fiecare indicator
for indicator_name, indicator_data in wdi_data.items():
    rows = indicator_data[1]   # partea cu valori

    # 3. Normalizare JSON → DataFrame
    df = pd.json_normalize(rows, sep=".")

    # 4. Adăugăm numele indicatorului (ajută enorm la analiză)
    df["indicator_group"] = indicator_name

    all_frames.append(df)

wdi_df = pd.concat(all_frames, ignore_index=True)
print(wdi_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 224770 entries, 0 to 224769
Data columns (total 12 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   countryiso3code  224770 non-null  object 
 1   date             224770 non-null  object 
 2   value            123660 non-null  float64
 3   unit             224770 non-null  object 
 4   obs_status       224770 non-null  object 
 5   decimal          224770 non-null  int64  
 6   indicator_name   224770 non-null  object 
 7   indicator.id     224770 non-null  object 
 8   indicator.value  224770 non-null  object 
 9   country.id       224770 non-null  object 
 10  country.value    224770 non-null  object 
 11  indicator_group  224770 non-null  object 
dtypes: float64(1), int64(1), object(10)
memory usage: 20.6+ MB
None


In [27]:
wdi_df.head(2)

Unnamed: 0,countryiso3code,date,value,unit,obs_status,decimal,indicator_name,indicator.id,indicator.value,country.id,country.value,indicator_group
0,AFE,2024,1567.635839,,,1,gdp_per_capita,NY.GDP.PCAP.CD,GDP per capita (current US$),ZH,Africa Eastern and Southern,gdp_per_capita
1,AFE,2023,1510.742951,,,1,gdp_per_capita,NY.GDP.PCAP.CD,GDP per capita (current US$),ZH,Africa Eastern and Southern,gdp_per_capita


#### Metoda 1 - As Dataframe + normalizare

In [6]:
all_rows = []   # aici stocăm toate țările & anii & indicatorii

for name, json_obj in wdi_data.items():
    rows = json_obj[1]              # partea cu date
    for r in rows:
        r["indicator_name"] = name  # punem numele indicatorului (ex: gdp_per_capita)
        all_rows.append(r)

df = pd.DataFrame(all_rows)
df.head(2)


Unnamed: 0,indicator,country,countryiso3code,date,value,unit,obs_status,decimal,indicator_name
0,"{'id': 'NY.GDP.PCAP.CD', 'value': 'GDP per cap...","{'id': 'ZH', 'value': 'Africa Eastern and Sout...",AFE,2024,1567.635839,,,1,gdp_per_capita
1,"{'id': 'NY.GDP.PCAP.CD', 'value': 'GDP per cap...","{'id': 'ZH', 'value': 'Africa Eastern and Sout...",AFE,2023,1510.742951,,,1,gdp_per_capita


In [11]:
df.dtypes


indicator           object
country             object
countryiso3code     object
date                object
value              float64
unit                object
obs_status          object
decimal              int64
indicator_name      object
dtype: object

In [13]:
indicator_df = pd.json_normalize(df["indicator"])
indicator_df.head(2)


Unnamed: 0,id,value
0,NY.GDP.PCAP.CD,GDP per capita (current US$)
1,NY.GDP.PCAP.CD,GDP per capita (current US$)


In [14]:
country_df = pd.json_normalize(df["country"])
country_df.head(2)

Unnamed: 0,id,value
0,ZH,Africa Eastern and Southern
1,ZH,Africa Eastern and Southern


In [18]:
# Le combinăm înapoi într-un singur DataFrame curat
df_clean = pd.concat(
    [
        df.drop(columns=["indicator","country"]),
        indicator_df.add_prefix("indicator_"),
        country_df.add_prefix("country_")
    ],
    axis=1)
df_clean.head(2)

Unnamed: 0,countryiso3code,date,value,unit,obs_status,decimal,indicator_name,indicator_id,indicator_value,country_id,country_value
0,AFE,2024,1567.635839,,,1,gdp_per_capita,NY.GDP.PCAP.CD,GDP per capita (current US$),ZH,Africa Eastern and Southern
1,AFE,2023,1510.742951,,,1,gdp_per_capita,NY.GDP.PCAP.CD,GDP per capita (current US$),ZH,Africa Eastern and Southern


## Json Normalization 2

mai multe metode toate cu acelasi rezultat

### 1

In [36]:
all_df = []

for name, obj in wdi_data.items():
    part = pd.json_normalize(obj[1])
    part["indicator_name"] = name
    all_df.append(part)

wdi_df1 = pd.concat(all_df, ignore_index=True)
wdi_df1.head(3)


Unnamed: 0,countryiso3code,date,value,unit,obs_status,decimal,indicator.id,indicator.value,country.id,country.value,indicator_name
0,AFE,2024,1567.635839,,,1,NY.GDP.PCAP.CD,GDP per capita (current US$),ZH,Africa Eastern and Southern,gdp_per_capita
1,AFE,2023,1510.742951,,,1,NY.GDP.PCAP.CD,GDP per capita (current US$),ZH,Africa Eastern and Southern,gdp_per_capita
2,AFE,2022,1628.318944,,,1,NY.GDP.PCAP.CD,GDP per capita (current US$),ZH,Africa Eastern and Southern,gdp_per_capita


### 2

In [37]:
dfs = []   # colectăm aici toate tabelele, câte unul pentru fiecare indicator

for name, data in wdi_data.items():
    rows = data[1]            # extragem lista de observații pentru indicator
    df = pd.json_normalize(   # transformăm structura JSON în tabel
        rows,
        sep="."               # separăm câmpurile nested cu punct (ex: country.id)
    )
    df["indicator_group"] = name   # păstrăm numele indicatorului din care provine
    dfs.append(df)                 # adăugăm tabela în lista finală

wdi_df2 = pd.concat(dfs, ignore_index=True)  # unim toate tabelele într-un singur DataFrame
wdi_df2.head(3)

Unnamed: 0,countryiso3code,date,value,unit,obs_status,decimal,indicator.id,indicator.value,country.id,country.value,indicator_group
0,AFE,2024,1567.635839,,,1,NY.GDP.PCAP.CD,GDP per capita (current US$),ZH,Africa Eastern and Southern,gdp_per_capita
1,AFE,2023,1510.742951,,,1,NY.GDP.PCAP.CD,GDP per capita (current US$),ZH,Africa Eastern and Southern,gdp_per_capita
2,AFE,2022,1628.318944,,,1,NY.GDP.PCAP.CD,GDP per capita (current US$),ZH,Africa Eastern and Southern,gdp_per_capita


### 3

In [38]:
def wdi_to_dataframe(wdi_data):
    all_df = []
    
    for name, obj in wdi_data.items():
        part = pd.json_normalize(obj[1])
        part["indicator_name"] = name
        all_df.append(part)
    
    return pd.concat(all_df, ignore_index=True)

wdi_df3 = wdi_to_dataframe(wdi_data)
wdi_df3.head(2)

Unnamed: 0,countryiso3code,date,value,unit,obs_status,decimal,indicator.id,indicator.value,country.id,country.value,indicator_name
0,AFE,2024,1567.635839,,,1,NY.GDP.PCAP.CD,GDP per capita (current US$),ZH,Africa Eastern and Southern,gdp_per_capita
1,AFE,2023,1510.742951,,,1,NY.GDP.PCAP.CD,GDP per capita (current US$),ZH,Africa Eastern and Southern,gdp_per_capita


# Salvam datele

In [41]:
wdi_df1.to_csv("data/wdi_data.csv", index=False)