In [22]:
# Data Manipulation
import pandas as pd
import numpy as np
import requests
from io import StringIO

# Visualization (Static)
import matplotlib.pyplot as plt
import seaborn as sns

# Visualization (Interactive)
import plotly.express as px

# Machine Learning & Clustering
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.linear_model import LassoCV
from sklearn.impute import SimpleImputer


# Import Data 

In [23]:
# 1) All variables and countries from the master data
final_df_long= pd.read_csv("https://raw.githubusercontent.com/AyaanTigdikar/Capstone/refs/heads/main/workingdata/master_data_long.csv")
final_df_long = final_df_long.drop(columns=['Unnamed: 0'], errors='ignore')

# 2) Clusters list from Leo
clusters_list = pd.read_csv('https://raw.githubusercontent.com/AyaanTigdikar/Capstone/refs/heads/main/workingdata/clusters_list.csv')

# Cluster variations across time 

### Pre-processing 

In [24]:
# I will take 5-year averages (from 1991 to 1995, and 2020 to 2024)
# Only use years from 2015 to 2021
df_start_end = final_df_long[final_df_long['Year'].isin([1991, 1992, 1993, 1994, 1995, 2015, 2016, 2017, 2018, 2019])].copy()

# Pivot the data to wide format
data_pivot_var = df_start_end.pivot_table(
    index=['Country Code', 'Variable'], 
    columns='Year', 
    values='Value'
)

# Take 5-year averages
years_91_95 = [1991, 1992, 1993, 1994, 1995]
years_15_19 = [2015, 2016, 2017, 2018, 2019]

data_pivot_var['avg_1991_1995'] = data_pivot_var[years_91_95].mean(axis=1)
data_pivot_var['avg_2015_2019'] = data_pivot_var[years_15_19].mean(axis=1)

# Calculate absolute change between 5-year averages
df_change = (data_pivot_var['avg_2015_2019'] - data_pivot_var['avg_1991_1995']).reset_index()
df_change.columns = ['Country Code', 'Variable', 'Net_Change'] 

# To wide format
df_change_wide = df_change.pivot(
    index='Country Code', 
    columns='Variable', 
    values='Net_Change'
)

# Merge with clusters_list from first cluster 
df_change_clusters = df_change_wide.merge(
    clusters_list,
    how='right',
    on='Country Code'
)

# Keep only clusters 1, 2 and 4
#df_change_clusters = df_change_clusters[df_change_clusters['cluster_6'].isin([1, 2, 4])]


In [25]:
df_change_clusters

Unnamed: 0,Country Code,Accountability index,Adjusted savings: natural resources depletion (% of GNI),Adjusted savings: total (current US$),Agriculture,CPIA building human resources rating (1=low to 6=high),CPIA quality of budgetary and financial management rating (1=low to 6=high),"CPIA transparency, accountability and corruption in the public sector rating (1=low to 6=high)",Capital depreciation rate,Capital stock (national accounts prices),...,Total natural resources rents (% of GDP),"Use of IMF credit (DOD, current US$)",Welfare-relevant TFP,deliberative_dem,egalitarian_dem,electoral_dem,liberal_dem,participatory_dem,Country Name,cluster_6
0,AFG,1.3052,,,,,,,,,...,,,,0.2274,1.318000e-01,0.2470,0.1574,0.1102,Afghanistan,0
1,DZA,-0.0634,-0.836730,2.650331e+10,,,,,0.008840,1.220994e+06,...,-0.279462,6.806324e+08,,-0.0322,3.640000e-02,0.0896,0.0514,0.0166,Algeria,2
2,ARG,-0.1518,0.222437,6.571439e+09,-0.353786,,,,0.008769,2.355022e+06,...,0.326150,1.351598e+10,0.164446,-0.1800,-4.920000e-02,-0.0608,-0.0194,-0.0558,Argentina,0
3,ARM,0.0748,0.188177,4.919790e+08,,,,,-0.004891,5.497152e+04,...,0.588305,4.363836e+08,0.467682,-0.0456,-9.900000e-02,-0.1032,0.0054,0.0452,Armenia,0
4,AUS,-0.1074,0.647845,6.351324e+10,-0.728409,,,,0.006049,3.949761e+06,...,2.363719,,-0.050949,-0.0182,-4.860000e-02,-0.0308,-0.0308,-0.0520,Australia,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131,VEN,-1.6950,,,,,,,0.007416,-4.207898e+05,...,,,-0.485530,-0.5966,-3.540000e-01,-0.4958,-0.5244,-0.3008,Venezuela,0
132,VNM,0.1272,-2.554888,,-18.718827,,,,0.019577,2.703420e+06,...,-4.990406,2.442174e+08,,0.0244,2.775558e-17,0.0120,0.0388,0.0024,Vietnam,0
133,YEM,-0.7680,-14.003470,,-0.636242,,,,-0.001150,6.438121e+05,...,-23.536238,4.873248e+08,,-0.1810,-7.540000e-02,-0.1694,-0.1060,-0.0086,Yemen,0
134,ZMB,-0.1288,-3.667877,,-14.408141,,,,0.008955,2.128462e+05,...,5.258444,-1.380927e+08,0.024617,-0.0924,-3.200000e-02,-0.0390,-0.0244,-0.0218,Zambia,4


## LASSO

In [50]:
X_imputed

array([[ 1.30520000e+00, -2.21182432e-02,  6.75745606e+09, ...,
         2.47000000e-01,  1.57400000e-01,  1.10200000e-01],
       [-6.34000000e-02, -8.36729964e-01,  2.65033150e+10, ...,
         8.96000000e-02,  5.14000000e-02,  1.66000000e-02],
       [-1.51800000e-01,  2.22437157e-01,  6.57143946e+09, ...,
        -6.08000000e-02, -1.94000000e-02, -5.58000000e-02],
       ...,
       [-7.68000000e-01, -1.40034701e+01,  6.75745606e+09, ...,
        -1.69400000e-01, -1.06000000e-01, -8.60000000e-03],
       [-1.28800000e-01, -3.66787738e+00,  6.75745606e+09, ...,
        -3.90000000e-02, -2.44000000e-02, -2.18000000e-02],
       [ 2.40000000e-02, -6.76707683e-01, -2.34370198e+09, ...,
        -1.04000000e-02, -1.30000000e-02,  3.08000000e-02]],
      shape=(131, 34))

In [54]:
print(y.shape)
print(X.shape)
print(X_imputed.shape)

(131,)
(131, 40)
(131, 34)


In [None]:
# ----------------------------
# 1. Drop rows with missing outcome
# ----------------------------
df_clean = df_change_clusters.dropna(subset=['Economic Complexity Index']).copy()

# ----------------------------
# 2. Separate outcome and predictors
# ----------------------------
y = df_clean['Economic Complexity Index'].values

X = df_clean.drop(columns=['Country Code', 'Country Name', 'cluster_6', 'Economic Complexity Index']).values
feature_names = df_clean.drop(columns=['Country Code', 'Country Name', 'cluster_6', 'Economic Complexity Index']).columns

# ----------------------------
# 3. Impute missing predictor values
# ----------------------------
imputer = SimpleImputer(strategy='median')
X_imputed = imputer.fit_transform(X)

# ----------------------------
# 4. Standardize predictors
# ----------------------------
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# ----------------------------
# 5. LASSO with cross-validation
# ----------------------------
lasso_cv = LassoCV(cv=10, random_state=42, max_iter=10000)
lasso_cv.fit(X_scaled, y)

# ----------------------------
# 6. Extract coefficients (indexed by feature names, not rows!)
# ----------------------------
coef = pd.Series(lasso_cv.coef_, index=feature_names)
print("LASSO coefficients:\n", coef)

# ----------------------------
# 7. Selected predictors (non-zero coefficients)
# ----------------------------
selected = coef[coef != 0].sort_values(key=abs, ascending=False)
print("Selected predictors:\n", selected)

# ----------------------------
# 8. Optional: Plot
# ----------------------------
plt.figure(figsize=(8,6))
selected.plot(kind='barh')
plt.title("Selected predictors (LASSO)")
plt.xlabel("Standardized coefficient")
plt.gca().invert_yaxis()
plt.show()


 'CPIA quality of budgetary and financial management rating (1=low to 6=high)'
 'CPIA transparency, accountability and corruption in the public sector rating (1=low to 6=high)'
 'Civil war' 'High-tech exports' 'Political stability — estimate']. At least one non-missing value is needed for imputation with strategy='median'.


ValueError: Length of values (34) does not match length of index (40)

## Random Forest