In [56]:
# Data Manipulation
import pandas as pd
import numpy as np
import requests
from io import StringIO

# Visualization (Static)
import matplotlib.pyplot as plt
import seaborn as sns

# Visualization (Interactive)
import plotly.express as px

# Machine Learning & Clustering
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.linear_model import LassoCV
from sklearn.impute import SimpleImputer


# Import Data 

In [75]:
# 1) All variables and countries from the master data
final_df_long= pd.read_csv("https://raw.githubusercontent.com/AyaanTigdikar/Capstone/refs/heads/main/workingdata/master_data_long.csv")
final_df_long = final_df_long.drop(columns=['Unnamed: 0'], errors='ignore')

# 2) Clusters list from Leo
clusters_list = pd.read_csv('https://raw.githubusercontent.com/AyaanTigdikar/Capstone/refs/heads/main/workingdata/clusters_list.csv')

# Cluster variations across time 

### Pre-processing 

I am going to create 3 dfs: three, five and  seven-year average changes

In [None]:
windows = {
    3: ([1991, 1992, 1993], [2017, 2018, 2019]),
    5: ([1991, 1992, 1993, 1994, 1995], [2015, 2016, 2017, 2018, 2019]),
    7: ([1991, 1992, 1993, 1994, 1995, 1996, 1997], [2013, 2014, 2015, 2016, 2017, 2018, 2019])
}

results = {}

for k, (start_years, end_years) in windows.items():

    df_k = final_df_long[
        final_df_long['Year'].isin(start_years + end_years)
    ].copy()

    pivot = df_k.pivot_table(
        index=['Country Code', 'Variable'],
        columns='Year',
        values='Value'
    )

    pivot[f'avg_{start_years[0]}_{start_years[-1]}'] = pivot[start_years].mean(axis=1)
    pivot[f'avg_{end_years[0]}_{end_years[-1]}']     = pivot[end_years].mean(axis=1)

    df_change = (
        pivot[f'avg_{end_years[0]}_{end_years[-1]}'] - pivot[f'avg_{start_years[0]}_{start_years[-1]}']
    ).reset_index(name='Net_Change')

    df_change_wide = df_change.pivot(
        index='Country Code',
        columns='Variable',
        values='Net_Change'
    )

    results[k] = df_change_wide.merge(
        clusters_list,
        how='right',
        on='Country Code'
    )

df_3_change_clusters = results[3]
df_5_change_clusters = results[5]
df_7_change_clusters = results[7]

In [81]:
df_3_change_clusters

Unnamed: 0,Country Code,Accountability index,Adjusted savings: gross savings (% of GNI),Adjusted savings: natural resources depletion (% of GNI),Adjusted savings: total (current US$),Agriculture,CPIA building human resources rating (1=low to 6=high),CPIA quality of budgetary and financial management rating (1=low to 6=high),"CPIA transparency, accountability and corruption in the public sector rating (1=low to 6=high)",Capital depreciation rate,...,Total natural resources rents (% of GDP),"Use of IMF credit (DOD, current US$)",Welfare-relevant TFP,deliberative_dem,egalitarian_dem,electoral_dem,liberal_dem,participatory_dem,Country Name,cluster_6
0,AFG,1.228000,,,,,,,,,...,,,,0.213000,0.116333,0.243333,0.145667,0.107333,Afghanistan,0
1,DZA,-0.109667,5.523325,0.395408,2.655957e+10,,,,,0.008440,...,2.056406,9.228342e+08,,-0.053333,0.024667,0.062667,0.047667,0.005667,Algeria,2
2,ARG,-0.118000,-1.628856,0.484825,3.896770e+09,-0.918683,,,,0.009406,...,0.639994,2.413571e+10,0.207586,-0.171333,-0.046333,-0.055000,-0.009000,-0.050000,Argentina,0
3,ARM,0.313667,19.212740,,,,,,,-0.005659,...,1.003659,,,0.031000,-0.044333,-0.044333,0.073333,0.105333,Armenia,0
4,AUS,-0.156000,2.118881,0.908693,6.891202e+10,-0.703518,,,,0.006034,...,2.894485,,-0.059322,-0.026667,-0.058667,-0.040000,-0.041000,-0.053000,Australia,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131,VEN,-1.837667,,,,,,,,0.006071,...,,,-0.639869,-0.596667,-0.377000,-0.522000,-0.544667,-0.318333,Venezuela,0
132,VNM,0.142333,,-3.032031,,-22.425533,,,,0.018048,...,-5.666765,3.403912e+08,,0.027333,-0.001000,0.022000,0.040000,0.005667,Vietnam,0
133,YEM,-0.837000,,-8.202315,,1.425109,,,,-0.003667,...,-14.171913,4.725520e+08,,-0.187667,-0.079667,-0.175667,-0.119333,-0.012667,Yemen,0
134,ZMB,-0.100667,,-2.897587,,-19.111380,,,,0.007857,...,5.385885,-1.221881e+08,-0.048340,-0.053000,-0.025333,-0.026333,-0.019333,-0.013333,Zambia,4


## LASSO

In [63]:
X_imputed

array([[ 1.30520000e+00,  3.99751956e+00, -2.21182432e-02, ...,
         2.47000000e-01,  1.57400000e-01,  1.10200000e-01],
       [-6.34000000e-02,  4.69123181e+00, -8.36729964e-01, ...,
         8.96000000e-02,  5.14000000e-02,  1.66000000e-02],
       [-1.51800000e-01, -1.60969474e+00,  2.22437157e-01, ...,
        -6.08000000e-02, -1.94000000e-02, -5.58000000e-02],
       ...,
       [-7.68000000e-01,  3.99751956e+00, -1.40034701e+01, ...,
        -1.69400000e-01, -1.06000000e-01, -8.60000000e-03],
       [-1.28800000e-01,  3.99751956e+00, -3.66787738e+00, ...,
        -3.90000000e-02, -2.44000000e-02, -2.18000000e-02],
       [ 2.40000000e-02,  1.05938572e+01, -6.76707683e-01, ...,
        -1.04000000e-02, -1.30000000e-02,  3.08000000e-02]],
      shape=(131, 38))

In [64]:
print(y.shape)
print(X.shape)
print(X_imputed.shape)

(131,)
(131, 44)
(131, 38)


In [62]:
# ----------------------------
# 1. Drop rows with missing outcome
# ----------------------------
df_clean = df_change_clusters.dropna(subset=['Economic Complexity Index']).copy()

# ----------------------------
# 2. Separate outcome and predictors
# ----------------------------
y = df_clean['Economic Complexity Index'].values

X = df_clean.drop(columns=['Country Code', 'Country Name', 'cluster_6', 'Economic Complexity Index']).values
feature_names = df_clean.drop(columns=['Country Code', 'Country Name', 'cluster_6', 'Economic Complexity Index']).columns

# ----------------------------
# 3. Impute missing predictor values
# ----------------------------
imputer = SimpleImputer(strategy='median')
X_imputed = imputer.fit_transform(X)

# ----------------------------
# 4. Standardize predictors
# ----------------------------
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# ----------------------------
# 5. LASSO with cross-validation
# ----------------------------
lasso_cv = LassoCV(cv=10, random_state=42, max_iter=10000)
lasso_cv.fit(X_scaled, y)

# ----------------------------
# 6. Extract coefficients (indexed by feature names, not rows!)
# ----------------------------
coef = pd.Series(lasso_cv.coef_, index=feature_names)
print("LASSO coefficients:\n", coef)

# ----------------------------
# 7. Selected predictors (non-zero coefficients)
# ----------------------------
selected = coef[coef != 0].sort_values(key=abs, ascending=False)
print("Selected predictors:\n", selected)

# ----------------------------
# 8. Optional: Plot
# ----------------------------
plt.figure(figsize=(8,6))
selected.plot(kind='barh')
plt.title("Selected predictors (LASSO)")
plt.xlabel("Standardized coefficient")
plt.gca().invert_yaxis()
plt.show()




ValueError: Length of values (38) does not match length of index (44)

## Random Forest