---
## 04_Scoring_Composite
---

In [1]:
# Imports

import os
import numpy as np
import pandas as pd

# Visualization (optional, for score distributions & checks)
import matplotlib.pyplot as plt
import seaborn as sns

# Scaling / normalization
from sklearn.preprocessing import MinMaxScaler

# Display settings: show every column and use a wide console so wide panels are not truncated
for option_name, option_value in {
    "display.max_columns": None,
    "display.width": 1000,
}.items():
    pd.set_option(option_name, option_value)

In [2]:
# Directory setup & check

base_dir      = "../data"
norm_dir      = os.path.join(base_dir, "normalized")   # inputs read by this notebook
score_dir     = os.path.join(base_dir, "scored")       # outputs written by this notebook
composite_dir = os.path.join(base_dir, "composite")

# DataFrame.to_csv does not create missing parent directories, so make sure the
# output directory exists before any cell tries to write into it.
os.makedirs(score_dir, exist_ok=True)

# Quick check: list normalized files (inputs for scoring).
# os.listdir() returns entries in arbitrary order; sort for a stable listing.
print("Normalized datasets available:")
for f in sorted(os.listdir(norm_dir)):
    print(" -", f)

Normalized datasets available:
 - electricity_normalized.csv
 - gdp_ppp_normalized.csv
 - gov_effect_normalized.csv
 - internet_normalized.csv
 - literacy_normalized.csv
 - mobile_normalized.csv
 - researchers_normalized.csv
 - rnd_gdp_normalized.csv
 - tertiary_normalized.csv


In [3]:
# Load all normalized datasets into a dictionary keyed by indicator name
# (file name with the "_normalized.csv" suffix stripped).
# The directory listing is sorted because os.listdir() returns entries in
# arbitrary order, and the dict order determines both the column order of the
# merged panel and which dataset is previewed below.
norm_suffix = "_normalized.csv"
norm_dfs = {}
for f in sorted(os.listdir(norm_dir)):
    if f.endswith(norm_suffix):
        key = f[: -len(norm_suffix)]  # strip only the trailing suffix
        norm_dfs[key] = pd.read_csv(os.path.join(norm_dir, f))

# Quick check: preview the first dataset (skipped when nothing was loaded)
if norm_dfs:
    first_key, first_df = next(iter(norm_dfs.items()))
    print(first_key, first_df.shape)
    display(first_df.head(3))

electricity (9310, 6)


Unnamed: 0,Country Name,Country Code,Year,Indicator,Value,Normalized
0,Aruba,ABW,1990.0,electricity,100.0,100.0
1,Africa Eastern and Southern,AFE,1990.0,electricity,,
2,Afghanistan,AFG,1990.0,electricity,,


In [4]:
# Re-shape each dataset: keep the identifier columns plus the value column,
# and rename the value column to the indicator name so the frames can be merged side by side.
# NOTE(review): this takes the raw "Value" column, not "Normalized", even though the
# inputs are the normalized files; the scoring cell min-max scales later. Confirm this is intended.
id_cols = ["Country Name", "Country Code", "Year"]
reshaped_dfs = [
    frame[id_cols + ["Value"]].rename(columns={"Value": indicator})
    for indicator, frame in norm_dfs.items()
]

# Quick check on one reshaped dataset
reshaped_dfs[0].head()

Unnamed: 0,Country Name,Country Code,Year,electricity
0,Aruba,ABW,1990.0,100.0
1,Africa Eastern and Southern,AFE,1990.0,
2,Afghanistan,AFG,1990.0,
3,Africa Western and Central,AFW,1990.0,
4,Angola,AGO,1990.0,


In [5]:
from functools import reduce

# Merge all reshaped datasets into one country-year panel.
# An outer join keeps every country-year that appears in at least one indicator file.
def _merge_on_keys(left, right):
    """Outer-join two indicator frames on the country/year identifier columns."""
    return left.merge(right, on=["Country Name", "Country Code", "Year"], how="outer")

merged_df = reduce(_merge_on_keys, reshaped_dfs)

print("Merged panel dataset shape:", merged_df.shape)
display(merged_df.head())

Merged panel dataset shape: (9560, 12)


Unnamed: 0,Country Name,Country Code,Year,electricity,gdp_ppp,gov_effect,internet,literacy,mobile,researchers,rnd_gdp,tertiary
0,Afghanistan,AFG,1990.0,,,,0.0,,0.0,,,2.46528
1,Afghanistan,AFG,1991.0,,,,,,0.0,,,
2,Afghanistan,AFG,1992.0,,,,,,0.0,,,
3,Afghanistan,AFG,1993.0,,,,,,0.0,,,
4,Afghanistan,AFG,1994.0,,,,,,0.0,,,


In [12]:
from sklearn.preprocessing import MinMaxScaler

# Indicator columns = every column that is neither an identifier nor a score
# (excluding the score columns keeps this cell safe to re-run).
non_indicator_cols = {"Country Name", "Country Code", "Year", "Equal_Score", "Weighted_Score"}
indicator_cols = [c for c in merged_df.columns if c not in non_indicator_cols]

# --- Forward-fill missing values per country (rows ordered by year first) ---
merged_df = merged_df.sort_values(["Country Name", "Year"])
merged_df[indicator_cols] = merged_df.groupby("Country Name")[indicator_cols].ffill()

# --- Re-scale indicators to 0–100 (safeguard) ---
scaler = MinMaxScaler(feature_range=(0, 100))
merged_df[indicator_cols] = scaler.fit_transform(merged_df[indicator_cols])

# --- Equal-weight score: plain mean of whichever indicators are present in the row ---
merged_df["Equal_Score"] = merged_df[indicator_cols].mean(axis=1, skipna=True)

# --- Weighted score: relative importance of each indicator ---
raw_weights = {
    "gov_effect": 0.25,
    "gdp_ppp": 0.20,
    "literacy": 0.20,
    "internet": 0.10,
    "mobile": 0.10,
    "electricity": 0.05,
    "tertiary": 0.05,
    "researchers": 0.025,
    "rnd_gdp": 0.025
}

# Normalize so the weights sum to 1
total_weight = sum(raw_weights.values())
weights = {name: w / total_weight for name, w in raw_weights.items()}
# Compute weighted average row-wise
def weighted_average(row, cols, weights):
    """Return the weighted mean of the non-null values of ``row`` over ``cols``.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series or dict)
        Indexed by column name; must contain every name in ``cols``.
    cols : iterable of str
        Candidate columns. A column is used only if it has an entry in
        ``weights`` and its value in ``row`` is not null.
    weights : dict
        Maps column name to its weight. Weights need not sum to 1; they are
        re-normalized over the columns actually used.

    Returns
    -------
    float
        The weighted average, or NaN when no column is usable or when the
        usable weights sum to zero (np.average would raise ZeroDivisionError).
    """
    usable = [
        (row[col], weights[col])
        for col in cols
        if col in weights and pd.notnull(row[col])
    ]
    if not usable:
        return np.nan
    vals, wts = zip(*usable)
    if sum(wts) == 0:
        # Undefined average: report it as missing instead of crashing the whole apply
        return np.nan
    return np.average(vals, weights=wts)

# Row-wise weighted composite; the extra arguments are forwarded to weighted_average
merged_df["Weighted_Score"] = merged_df.apply(
    weighted_average, axis=1, args=(indicator_cols, weights)
)

print("Panel dataset with yearly scores:", merged_df.shape)
display(merged_df.head(10))

Panel dataset with yearly scores: (9560, 14)


Unnamed: 0,Country Name,Country Code,Year,electricity,gdp_ppp,gov_effect,internet,literacy,mobile,researchers,rnd_gdp,tertiary,Equal_Score,Weighted_Score
0,Afghanistan,AFG,1990.0,,,,0.0,,0.0,,,1.479177,0.493059,0.295835
1,Afghanistan,AFG,1991.0,,,,0.0,,0.0,,,1.479177,0.493059,0.295835
2,Afghanistan,AFG,1992.0,,,,0.0,,0.0,,,1.479177,0.493059,0.295835
3,Afghanistan,AFG,1993.0,,,,0.0,,0.0,,,1.479177,0.493059,0.295835
4,Afghanistan,AFG,1994.0,,,,0.0,,0.0,,,1.479177,0.493059,0.295835
5,Afghanistan,AFG,1995.0,,,,0.0,,0.0,,,1.479177,0.493059,0.295835
6,Afghanistan,AFG,1996.0,,,5.398536,0.0,,0.0,,,1.479177,1.719428,2.847186
7,Afghanistan,AFG,1997.0,,,5.398536,0.0,,0.0,,,1.479177,1.719428,2.847186
8,Afghanistan,AFG,1998.0,,,6.882781,0.0,,0.0,,,1.479177,2.09049,3.589308
9,Afghanistan,AFG,1999.0,,,6.882781,0.0,,0.0,,,1.479177,2.09049,3.589308


In [13]:
# Output locations, built once and reused for both writing and reporting
equal_path    = os.path.join(score_dir, "yearly_equal_scores.csv")
weighted_path = os.path.join(score_dir, "yearly_weighted_scores.csv")
key_cols = ["Country Name", "Country Code", "Year"]

# Save yearly equal-weight scores
equal_out = merged_df[key_cols + ["Equal_Score"]]
equal_out.to_csv(equal_path, index=False)

# Save yearly weighted scores
weighted_out = merged_df[key_cols + ["Weighted_Score"]]
weighted_out.to_csv(weighted_path, index=False)

print("Yearly scores saved:")
print(" -", equal_path)
print(" -", weighted_path)

Yearly scores saved:
 - ../data\scored\yearly_equal_scores.csv
 - ../data\scored\yearly_weighted_scores.csv


In [15]:
# === Save full panel dataset for Notebook 06 ===
# Writes identifiers, all indicator columns and both score columns to one CSV.
panel_out = os.path.join(score_dir, "panel_dataset.csv")
merged_df.to_csv(panel_out, index=False)

print(f"✔ Panel dataset saved to {panel_out} with shape {merged_df.shape}")
# Rich display instead of print(), consistent with the previews in the earlier cells
display(merged_df.head())

✔ Panel dataset saved to ../data\scored\panel_dataset.csv with shape (9560, 14)
  Country Name Country Code    Year  electricity  gdp_ppp  gov_effect  internet  literacy  mobile  researchers  rnd_gdp  tertiary  Equal_Score  Weighted_Score
0  Afghanistan          AFG  1990.0          NaN      NaN         NaN       0.0       NaN     0.0          NaN      NaN  1.479177     0.493059        0.295835
1  Afghanistan          AFG  1991.0          NaN      NaN         NaN       0.0       NaN     0.0          NaN      NaN  1.479177     0.493059        0.295835
2  Afghanistan          AFG  1992.0          NaN      NaN         NaN       0.0       NaN     0.0          NaN      NaN  1.479177     0.493059        0.295835
3  Afghanistan          AFG  1993.0          NaN      NaN         NaN       0.0       NaN     0.0          NaN      NaN  1.479177     0.493059        0.295835
4  Afghanistan          AFG  1994.0          NaN      NaN         NaN       0.0       NaN     0.0          NaN      NaN  1.47

---
## Summary
---
## Summary – 04b_Scoring_Yearly

In this notebook, we extended the composite scoring framework from **04_Scoring_Composite** (snapshot only) into a yearly, country–year panel dataset.

### What we did
1. **Loaded all normalized datasets**  
   - Electricity, GDP PPP, Governance Effectiveness, Internet, Literacy, Mobile, Researchers, R&D (% GDP), Tertiary Education.  

2. **Reshaped datasets**  
   - Converted each file into a common structure:  
     `Country Name | Country Code | Year | <indicator>`  

3. **Merged into a panel dataset**  
   - Combined all indicators by `Country Name + Country Code + Year`.  
   - Result: 9,560 country-year rows with 12 columns (3 identifier columns + 9 indicators); the two score columns added in step 4 bring the total to 14.  

4. **Calculated yearly composite scores**  
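   - Before scoring, each indicator is forward-filled within each country (ordered by year) and min–max rescaled to 0–100 across the whole panel.  
   - Equal-weight score: mean of the available indicators per row.  
   - Weighted score: governance (0.25), GDP PPP (0.20), literacy (0.20), internet (0.10), mobile (0.10), electricity (0.05), tertiary (0.05), researchers (0.025), R&D (0.025).  
   - Weights are normalized to sum = 1 and, for each row, re-normalized over the indicators that are actually available.  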

5. **Saved outputs**  
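   - `../data/scored/yearly_equal_scores.csv`  
   - `../data/scored/yearly_weighted_scores.csv`  
   - `../data/scored/panel_dataset.csv` (full panel with all indicators and both scores, for Notebook 06)  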

### Why this matters
- **04 (snapshot)** shows who is most ready now.  
- **04b (yearly)** shows how readiness has evolved over time.  
- This provides the foundation for:  
  - Growth trajectory visuals (Notebook 05)  
  - Clustering and trajectory modeling (Notebook 06)  