## PCA instead of Linear Regression or PLS Reression to extract weights (2020 only and validate on 2022)

In [None]:
# load necessary libraries
import pandas as pd 
import glob
import os
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_score, LeaveOneOut
from scipy.stats import pearsonr, spearmanr
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
tg_2020 = pd.read_csv('clean_data/tg_2020.csv')
tg_2022 = pd.read_csv('clean_data/tg_2022.csv')
gt_2020 = pd.read_csv('clean_data/gt_2020.csv')
gt_2022 = pd.read_csv('clean_data/gt_2022.csv')
yt_2020 = pd.read_csv('clean_data/yt_2020.csv')
yt_2022 = pd.read_csv('clean_data/yt_2022.csv')
news_2020 = pd.read_csv('clean_data/news_2020.csv')
news_2022 = pd.read_csv('clean_data/news_2022.csv')
off_2020 = pd.read_csv('clean_data/off_2020.csv')
off_2022 = pd.read_csv('clean_data/off_2022.csv')

In [3]:
# there were inconsistencies in the state names, so this mapping standardizes the state names across all datasets
state_name_map = {
    "México": "Estado de México",
    "Mexico": "Estado de México",
    "Estados Unidos Mexicanos": "Estado de México",
    "Michoacán de Ocampo": "Michoacán",
    "Veracruz de Ignacio de la Llave": "Veracruz",
    "Coahuila de Zaragoza": "Coahuila",
    "Yucatan": "Yucatán",
    "Queretaro": "Querétaro",
    "San Luis Potosi": "San Luis Potosí",
    "Nuevo Leon": "Nuevo León",
    "Michoacan": "Michoacán",
    "Michoacán de Ocampo": "Michoacán"}

for df in [off_2020, off_2022, gt_2020, gt_2022, yt_2020, yt_2022, tg_2020, tg_2022, news_2020, news_2022]:
    df['state'] = df['state'].astype(str).str.strip()
    df['state'] = df['state'].replace(state_name_map)
    df['state'] = df['state'].replace("nan", None)  

In [4]:
# 'state' columns as strings in all dataframes
for df in [off_2020, off_2022, gt_2020, gt_2022, yt_2020, yt_2022, tg_2020, tg_2022, news_2020, news_2022]:
    df['state'] = df['state'].astype(str)

# create dataset for 2020
data_2020 = off_2020.copy()
data_2020['year'] = 2020

# merge Google Trends
data_2020 = data_2020.merge(gt_2020, on='state', how='inner')

# merge YouTube
data_2020 = data_2020.merge(yt_2020, on='state', how='inner')

# merge Telegram
data_2020 = data_2020.merge(tg_2020, on='state', how='inner')

# merge News (=LDA topics)
data_2020 = data_2020.merge(news_2020, on='state', how='inner')


# create dataset for 2022
data_2022 = off_2022.copy()
data_2022['year'] = 2022

# merge Google Trends
data_2022 = data_2022.merge(gt_2022, on='state', how='inner')

# merge YouTube
data_2022 = data_2022.merge(yt_2022, on='state', how='inner')

# merge Telegram
data_2022 = data_2022.merge(tg_2022, on='state', how='inner')

# merge News (=LDA topics)
data_2022 = data_2022.merge(news_2022, on='state', how='inner')

In [5]:
# =============================================
# REGRESSION 1: EDUCATIONAL LAG
# =============================================

# Features for educational lag
educ_features = []

# Google Trends
if 'educational_lag' in data_2020.columns:
    educ_features.append('educational_lag')

# YouTube
if 'educational_lag_avg_sentiment' in data_2020.columns:
    educ_features.append('educational_lag_avg_sentiment')
if 'educational_lag_pct_yt' in data_2020.columns:
    educ_features.append('educational_lag_pct_yt')

# Telegram
if 'educational_lag_pct_tg' in data_2020.columns:
    educ_features.append('educational_lag_pct_tg')

# News topics
if 'women_rights' in data_2020.columns:
    educ_features.append('women_rights')

print(f"Features for educational lag: {educ_features}")

if len(educ_features) > 0:
    # Prepare data
    X_education = data_2020[educ_features].fillna(0)
    
    # Standardize
    scaler_education = StandardScaler()
    X_education_scaled = scaler_education.fit_transform(X_education)
    
    # Apply PCA
    pca_education = PCA(n_components=1)
    pca_result_education = pca_education.fit_transform(X_education_scaled)
    
    pca_scores_education = pca_result_education.flatten()
    
    # create an artificial intercept to have only positive weights - need to justify the arbitrary value chosen 
    education_intercept = abs(pca_scores_education.min()) + 10  
    pca_scores_positive_education = pca_scores_education + education_intercept
    
    # use loadings as weights and save scaling parameters
    educ_weights = dict(zip(educ_features, pca_education.components_[0]))
    educ_scaler_mean = dict(zip(educ_features, scaler_education.mean_))
    educ_scaler_scale = dict(zip(educ_features, scaler_education.scale_))
    
    print(f"Education - Explained variance: {pca_education.explained_variance_ratio_[0]:.3f}")
    print("PCA Weights:")
    for feature, weight in educ_weights.items():
        print(f"  {feature}: {weight:.4f}")
    print(f"Artificial Intercept: {education_intercept:.4f}")

Features for educational lag: ['educational_lag', 'educational_lag_avg_sentiment', 'educational_lag_pct_yt', 'educational_lag_pct_tg', 'women_rights']
Education - Explained variance: 0.326
PCA Weights:
  educational_lag: 0.6539
  educational_lag_avg_sentiment: -0.4569
  educational_lag_pct_yt: -0.4116
  educational_lag_pct_tg: -0.3028
  women_rights: -0.3202
Artificial Intercept: 12.3767


In [6]:
# =============================================
# REGRESSION 2: HEALTH
# =============================================

# Features for health
health_features = []

# Google Trends
if 'health_gt' in data_2020.columns:
    health_features.append('health_gt')

# YouTube
if 'access_to_health_services_avg_sentiment' in data_2020.columns:
    health_features.append('access_to_health_services_avg_sentiment')
if 'access_to_health_services_pct_yt' in data_2020.columns:
    health_features.append('access_to_health_services_pct_yt')

# Telegram
if 'access_to_health_services_pct_tg' in data_2020.columns:
    health_features.append('access_to_health_services_pct_tg')

# News topics
if 'health' in data_2020.columns:
    health_features.append('health')

print(f"Features for health: {health_features}")

if len(health_features) > 0:
    # Prepare data
    X_health = data_2020[health_features].fillna(0)
    
    # Standardize
    scaler_health = StandardScaler()
    X_health_scaled = scaler_health.fit_transform(X_health)
    
    # Apply PCA
    pca_health = PCA(n_components=1)
    pca_result_health = pca_health.fit_transform(X_health_scaled)
    
    pca_scores_health = pca_result_health.flatten()
    
    # create an artificial intercept to have only positive weights - need to justify the arbitrary value chosen 
    health_intercept = abs(pca_scores_health.min()) + 10  
    pca_scores_positive_health = pca_scores_health + health_intercept
    
    # use loadings as weights and save scaling parameters
    health_weights = dict(zip(health_features, pca_health.components_[0]))
    health_scaler_mean = dict(zip(health_features, scaler_health.mean_))
    health_scaler_scale = dict(zip(health_features, scaler_health.scale_))
    
    print(f"Health - Explained variance: {pca_health.explained_variance_ratio_[0]:.3f}")
    print("PCA Weights:")
    for feature, weight in health_weights.items():
        print(f"  {feature}: {weight:.4f}")
    print(f"Artificial Intercept: {health_intercept:.4f}")

Features for health: ['health_gt', 'access_to_health_services_avg_sentiment', 'access_to_health_services_pct_yt', 'access_to_health_services_pct_tg', 'health']
Health - Explained variance: 0.391
PCA Weights:
  health_gt: 0.4521
  access_to_health_services_avg_sentiment: -0.5435
  access_to_health_services_pct_yt: 0.6253
  access_to_health_services_pct_tg: 0.2229
  health: 0.2439
Artificial Intercept: 12.3376


In [7]:
# =============================================
# REGRESSION 3: SOCIAL SECURITY
# =============================================

# Features for social security
social_features = []

# Google Trends
if 'social_gt' in data_2020.columns:
    social_features.append('social_gt')

# YouTube
if 'access_to_social_security_avg_sentiment' in data_2020.columns:
    social_features.append('access_to_social_security_avg_sentiment')
if 'access_to_social_security_pct_yt' in data_2020.columns:
    social_features.append('access_to_social_security_pct_yt')

# Telegram
if 'access_to_social_security_pct_tg' in data_2020.columns:
    social_features.append('access_to_social_security_pct_tg')

# News topics
if 'public_services' in data_2020.columns:
    social_features.append('public_services')
if 'institutions' in data_2020.columns:
    social_features.append('institutions')

print(f"Features for social security: {social_features}")

if len(social_features) > 0:
    # Prepare data
    X_social = data_2020[social_features].fillna(0)
    
    # Standardize
    scaler_social = StandardScaler()
    X_social_scaled = scaler_social.fit_transform(X_social)
    
    # Apply PCA
    pca_social = PCA(n_components=1)
    pca_result_social = pca_social.fit_transform(X_social_scaled)
    
    pca_scores_social = pca_result_social.flatten()
    
    # create an artificial intercept to have only positive weights - need to justify the arbitrary value chosen 
    social_intercept = abs(pca_scores_social.min()) + 10  
    pca_scores_positive_social = pca_scores_social + social_intercept
    
    # use loadings as weights and save scaling parameters
    social_weights = dict(zip(social_features, pca_social.components_[0]))
    social_scaler_mean = dict(zip(social_features, scaler_social.mean_))
    social_scaler_scale = dict(zip(social_features, scaler_social.scale_))
    
    print(f"Social Security - Explained variance: {pca_social.explained_variance_ratio_[0]:.3f}")
    print("PCA Weights:")
    for feature, weight in social_weights.items():
        print(f"  {feature}: {weight:.4f}")
    print(f"Artificial Intercept: {social_intercept:.4f}")

Features for social security: ['social_gt', 'access_to_social_security_avg_sentiment', 'access_to_social_security_pct_yt', 'access_to_social_security_pct_tg', 'public_services', 'institutions']
Social Security - Explained variance: 0.268
PCA Weights:
  social_gt: 0.0718
  access_to_social_security_avg_sentiment: 0.2506
  access_to_social_security_pct_yt: 0.6952
  access_to_social_security_pct_tg: 0.3628
  public_services: 0.2436
  institutions: -0.5078
Artificial Intercept: 12.5542


In [8]:
# =============================================
# REGRESSION 4: INCOME
# =============================================

# Features for income
income_features = []

# Google Trends
if 'income_gt' in data_2020.columns:
    income_features.append('income_gt')

# YouTube
if 'income_avg_sentiment' in data_2020.columns:
    income_features.append('income_avg_sentiment')
if 'income_pct_yt' in data_2020.columns:
    income_features.append('income_pct_yt')

# Telegram
if 'income_pct_tg' in data_2020.columns:
    income_features.append('income_pct_tg')

# News topics
if 'economy' in data_2020.columns:
    income_features.append('economy')
if 'work' in data_2020.columns:
    income_features.append('work')

print(f"Features for income: {income_features}")

if len(income_features) > 0:
    # Prepare data
    X_income = data_2020[income_features].fillna(0)
    
    # Standardize
    scaler_income = StandardScaler()
    X_income_scaled = scaler_income.fit_transform(X_income)
    
    # Apply PCA
    pca_income = PCA(n_components=1)
    pca_result_income = pca_income.fit_transform(X_income_scaled)
    
    pca_scores_income = pca_result_income.flatten()
    
    # create an artificial intercept to have only positive weights - need to justify the arbitrary value chosen 
    income_intercept = abs(pca_scores_income.min()) + 10  
    pca_scores_positive_income = pca_scores_income + income_intercept
    
    # use loadings as weights and save scaling parameters
    income_weights = dict(zip(income_features, pca_income.components_[0]))
    income_scaler_mean = dict(zip(income_features, scaler_income.mean_))
    income_scaler_scale = dict(zip(income_features, scaler_income.scale_))
    
    print(f"Income - Explained variance: {pca_income.explained_variance_ratio_[0]:.3f}")
    print("PCA Weights:")
    for feature, weight in income_weights.items():
        print(f"  {feature}: {weight:.4f}")
    print(f"Artificial Intercept: {income_intercept:.4f}")

Features for income: ['income_gt', 'income_avg_sentiment', 'income_pct_yt', 'income_pct_tg', 'economy', 'work']
Income - Explained variance: 0.273
PCA Weights:
  income_gt: -0.3186
  income_avg_sentiment: -0.3716
  income_pct_yt: 0.5657
  income_pct_tg: -0.5089
  economy: -0.3996
  work: 0.1476
Artificial Intercept: 13.1896


In [9]:
# =============================================
# REGRESSION 5: FOOD
# =============================================

# Features for food
food_features = []

# Google Trends
if 'food_gt' in data_2020.columns:
    food_features.append('food_gt')

# YouTube
if 'access_to_food_avg_sentiment' in data_2020.columns:
    food_features.append('access_to_food_avg_sentiment')
if 'access_to_food_pct_yt' in data_2020.columns:
    food_features.append('access_to_food_pct_yt')

# Telegram
if 'access_to_food_pct_tg' in data_2020.columns:
    food_features.append('access_to_food_pct_tg')

# News topics
if 'women_rights' in data_2020.columns:
    food_features.append('women_rights')

print(f"Features for food: {food_features}")

if len(food_features) > 0:
    # Prepare data
    X_food = data_2020[food_features].fillna(0)
    
    # Standardize
    scaler_food = StandardScaler()
    X_food_scaled = scaler_food.fit_transform(X_food)
    
    # Apply PCA
    pca_food = PCA(n_components=1)
    pca_result_food = pca_food.fit_transform(X_food_scaled)
    
    pca_scores_food = pca_result_food.flatten()
    
    # create an artificial intercept to have only positive weights - need to justify the arbitrary value chosen 
    food_intercept = abs(pca_scores_food.min()) + 10  
    pca_scores_positive_food = pca_scores_food + food_intercept
    
    # use loadings as weights and save scaling parameters
    food_weights = dict(zip(food_features, pca_food.components_[0]))
    food_scaler_mean = dict(zip(food_features, scaler_food.mean_))
    food_scaler_scale = dict(zip(food_features, scaler_food.scale_))
    
    print(f"Food - Explained variance: {pca_food.explained_variance_ratio_[0]:.3f}")
    print("PCA Weights:")
    for feature, weight in food_weights.items():
        print(f"  {feature}: {weight:.4f}")
    print(f"Artificial Intercept: {food_intercept:.4f}")

Features for food: ['food_gt', 'access_to_food_avg_sentiment', 'access_to_food_pct_yt', 'access_to_food_pct_tg', 'women_rights']
Food - Explained variance: 0.291
PCA Weights:
  food_gt: -0.1261
  access_to_food_avg_sentiment: -0.5725
  access_to_food_pct_yt: 0.6697
  access_to_food_pct_tg: -0.3031
  women_rights: -0.3405
Artificial Intercept: 12.2616


In [10]:
# =============================================
# REGRESSION 6: HOUSING
# =============================================

# Features for housing
housing_features = []

# Google Trends
if 'housing_gt' in data_2020.columns:
    housing_features.append('housing_gt')

# YouTube
if 'housing_avg_sentiment' in data_2020.columns:
    housing_features.append('housing_avg_sentiment')
if 'housing_pct_yt' in data_2020.columns:
    housing_features.append('housing_pct_yt')

# Telegram
if 'housing_pct_tg' in data_2020.columns:
    housing_features.append('housing_pct_tg')

# News topics
if 'public_services' in data_2020.columns:
    housing_features.append('public_services')

print(f"Features for housing: {housing_features}")

if len(housing_features) > 0:
    # Prepare data
    X_housing = data_2020[housing_features].fillna(0)
    
    # Standardize
    scaler_housing = StandardScaler()
    X_housing_scaled = scaler_housing.fit_transform(X_housing)
    
    # Apply PCA
    pca_housing = PCA(n_components=1)
    pca_result_housing = pca_housing.fit_transform(X_housing_scaled)
    
    pca_scores_housing = pca_result_housing.flatten()
    
    # create an artifical intercept to have only positive weights - need to justify the arbitrary value chosen 
    housing_intercept = abs(pca_scores_housing.min()) + 10  
    pca_scores_positive_housing = pca_scores_housing + housing_intercept
    
    # use loadings as weights and save scaling parameters
    housing_weights = dict(zip(housing_features, pca_housing.components_[0]))
    housing_scaler_mean = dict(zip(housing_features, scaler_housing.mean_))
    housing_scaler_scale = dict(zip(housing_features, scaler_housing.scale_))
    
    print(f"Social Cohesion - Explained variance: {pca_housing.explained_variance_ratio_[0]:.3f}")
    print("PCA Weights:")
    for feature, weight in housing_weights.items():
        print(f"  {feature}: {weight:.4f}")
    print(f"Artificial Intercept: {housing_intercept:.4f}")

Features for housing: ['housing_gt', 'housing_avg_sentiment', 'housing_pct_yt', 'housing_pct_tg', 'public_services']
Social Cohesion - Explained variance: 0.369
PCA Weights:
  housing_gt: 0.5485
  housing_avg_sentiment: -0.4288
  housing_pct_yt: 0.0696
  housing_pct_tg: 0.5107
  public_services: 0.4995
Artificial Intercept: 12.5463


In [11]:
# =============================================
# PCA FOR SOCIAL COHESION (PCA loadings as weights as we don't have a target variable) - artificial intercept
# =============================================

# Features for social cohesion
cohesion_features = []

# Google Trends
if 'cohesion_gt' in data_2020.columns:
    cohesion_features.append('cohesion_gt')

# YouTube
if 'social_cohesion_avg_sentiment' in data_2020.columns:
    cohesion_features.append('social_cohesion_avg_sentiment')
if 'social_cohesion_pct_yt' in data_2020.columns:
    cohesion_features.append('social_cohesion_pct_yt')

# Telegram
if 'social_cohesion_pct_tg' in data_2020.columns:
    cohesion_features.append('social_cohesion_pct_tg')

# News topics
if 'safety' in data_2020.columns:
    cohesion_features.append('safety')

print(f"Features for social cohesion: {cohesion_features}")

if len(cohesion_features) > 0:
    # Prepare data
    X_cohesion = data_2020[cohesion_features].fillna(0)
    
    # Standardize
    scaler_cohesion = StandardScaler()
    X_cohesion_scaled = scaler_cohesion.fit_transform(X_cohesion)
    
    # Apply PCA
    pca_cohesion = PCA(n_components=1)
    pca_result_cohesion = pca_cohesion.fit_transform(X_cohesion_scaled)
    
    pca_scores_cohesion = pca_result_cohesion.flatten()
    
    # create an artifical intercept to have only positive weights - need to justify the arbitrary value chosen 
    cohesion_intercept = abs(pca_scores_cohesion.min()) + 10  
    pca_scores_positive_cohesion = pca_scores_cohesion + cohesion_intercept
    
    # use loadings as weights and save scaling parameters
    cohesion_weights = dict(zip(cohesion_features, pca_cohesion.components_[0]))
    cohesion_scaler_mean = dict(zip(cohesion_features, scaler_cohesion.mean_))
    cohesion_scaler_scale = dict(zip(cohesion_features, scaler_cohesion.scale_))
    
    print(f"Social Cohesion - Explained variance: {pca_cohesion.explained_variance_ratio_[0]:.3f}")
    print("PCA Weights:")
    for feature, weight in cohesion_weights.items():
        print(f"  {feature}: {weight:.4f}")
    print(f"Artificial Intercept: {cohesion_intercept:.4f}")

Features for social cohesion: ['cohesion_gt', 'social_cohesion_avg_sentiment', 'social_cohesion_pct_yt', 'social_cohesion_pct_tg', 'safety']
Social Cohesion - Explained variance: 0.335
PCA Weights:
  cohesion_gt: 0.6497
  social_cohesion_avg_sentiment: -0.5884
  social_cohesion_pct_yt: -0.2718
  social_cohesion_pct_tg: 0.2196
  safety: 0.3311
Artificial Intercept: 13.0030


In [12]:
# save weights, intercepts, and scaling parameters

# Collect all weights with intercepts and scaling parameters
all_weights = []
all_scaling_params = []

# Add regression weights for each dimension
dimensions_data = [
    ('educational_lag', educ_weights if 'educ_weights' in locals() else {}, education_intercept if 'educ_intercept' in locals() else 0, 
     educ_scaler_mean if 'educ_scaler_mean' in locals() else {}, educ_scaler_scale if 'educ_scaler_scale' in locals() else {}),
    ('health', health_weights if 'health_weights' in locals() else {}, health_intercept if 'health_intercept' in locals() else 0,
     health_scaler_mean if 'health_scaler_mean' in locals() else {}, health_scaler_scale if 'health_scaler_scale' in locals() else {}),
    ('social_security', social_weights if 'social_weights' in locals() else {}, social_intercept if 'social_intercept' in locals() else 0,
     social_scaler_mean if 'social_scaler_mean' in locals() else {}, social_scaler_scale if 'social_scaler_scale' in locals() else {}),
    ('income', income_weights if 'income_weights' in locals() else {}, income_intercept if 'income_intercept' in locals() else 0,
     income_scaler_mean if 'income_scaler_mean' in locals() else {}, income_scaler_scale if 'income_scaler_scale' in locals() else {}),
    ('food', food_weights if 'food_weights' in locals() else {}, food_intercept if 'food_intercept' in locals() else 0,
     food_scaler_mean if 'food_scaler_mean' in locals() else {}, food_scaler_scale if 'food_scaler_scale' in locals() else {}),
    ('housing', housing_weights if 'housing_weights' in locals() else {}, housing_intercept if 'housing_intercept' in locals() else 0,
     housing_scaler_mean if 'housing_scaler_mean' in locals() else {}, housing_scaler_scale if 'housing_scaler_scale' in locals() else {}),
    ('social_cohesion', cohesion_weights if 'cohesion_weights' in locals() else {}, cohesion_intercept if 'cohesion_intercept' in locals() else 0,
     cohesion_scaler_mean if 'cohesion_scaler_mean' in locals() else {}, cohesion_scaler_scale if 'cohesion_scaler_scale' in locals() else {})
]

for dimension, weights_dict, intercept, scaler_mean_dict, scaler_scale_dict in dimensions_data:
    for feature, weight in weights_dict.items():
        all_weights.append({
            'dimension': dimension,
            'feature': feature,
            'weight': weight,
            'method': 'linear_regression' if dimension != 'social_cohesion' else 'pca_with_artificial_intercept',
            'intercept': intercept
        })
        
        # Save scaling parameters for each feature
        if feature in scaler_mean_dict and feature in scaler_scale_dict:
            all_scaling_params.append({
                'dimension': dimension,
                'feature': feature,
                'scaler_mean': scaler_mean_dict[feature],
                'scaler_scale': scaler_scale_dict[feature]
            })

# Create dataframes
weights_df = pd.DataFrame(all_weights)
scaling_df = pd.DataFrame(all_scaling_params)

# Create pivot table for weights
if len(weights_df) > 0:
    weights_pivot = weights_df.pivot(index='feature', columns='dimension', values='weight').fillna(0)

# Save intercepts separately
intercepts_data = []
for dimension, weights_dict, intercept, scaler_mean_dict, scaler_scale_dict in dimensions_data:
    if weights_dict:  # Only if we have weights for this dimension
        intercepts_data.append({
            'dimension': dimension,
            'intercept': intercept
        })

intercepts_df = pd.DataFrame(intercepts_data)

# prepare data 

# Create dictionaries for intercepts 
intercepts_dict = dict(zip(intercepts_df['dimension'], intercepts_df['intercept']))

# Create scaling dictionaries by dimension
scaling_means = {}
scaling_scales = {}

for dimension in scaling_df['dimension'].unique():
    dim_scaling = scaling_df[scaling_df['dimension'] == dimension]
    scaling_means[dimension] = dict(zip(dim_scaling['feature'], dim_scaling['scaler_mean']))
    scaling_scales[dimension] = dict(zip(dim_scaling['feature'], dim_scaling['scaler_scale']))

In [13]:
def predict_dimension(data, dimension, weights_pivot, intercepts_dict, scaling_means, scaling_scales):
    
    # Get weights and intercept (PCA loadings + shift)
    dimension_weights = weights_pivot[dimension]
    dimension_weights = dimension_weights[dimension_weights != 0]
    intercept = intercepts_dict.get(dimension, 0)
    
    # Prepare containers
    feature_values = []
    feature_names = []
    weights_used = []
    
    for feature, weight in dimension_weights.items():
        if feature in data.columns:
            raw_values = data[feature].fillna(0).values
            print(f"  Raw values range: {raw_values.min():.4f} to {raw_values.max():.4f}")
            
            if dimension in scaling_means and feature in scaling_means[dimension]:
                mean = scaling_means[dimension][feature]
                scale = scaling_scales[dimension][feature]
                scaled = (raw_values - mean) / scale
                
                feature_values.append(scaled)
                feature_names.append(feature)
                weights_used.append(weight)
                print(f"   Feature added successfully")
            else:
                print(f"  No scaling params for {feature} in {dimension}")
        else:
            print(f"  Feature {feature} not found in data")
    
    # Compute nowcast
    if len(feature_values) > 0:
        X_scaled = np.column_stack(feature_values)
        weights_array = np.array(weights_used)
        
        print(f"  X_scaled shape: {X_scaled.shape}")
        print(f"  Weights shape: {weights_array.shape}")
        
        projections = np.dot(X_scaled, weights_array) + intercept
        return projections, feature_names
    else:
        return None, None


In [14]:
results_df = pd.DataFrame()
results_df['state'] = data_2022['state']

dimension_mapping = {
    'educational_lag': 'educ_target',
    'health': 'health_target', 
    'social_security': 'social_target',
    'income': 'income_target',
    'food': 'food_target',
    'housing': 'housing_target'}



prediction_results = {}

for dimension, target_col in dimension_mapping.items():
    if target_col in data_2022.columns:
        print(f"\n=== NOWCASTING: {dimension.upper()} ===")
        
        predictions, features_used = predict_dimension(
            data_2022, dimension, weights_pivot, intercepts_dict, scaling_means, scaling_scales)
        
        if predictions is not None:
            results_df[f'{dimension}_actual'] = data_2022[target_col]
            results_df[f'{dimension}_predicted'] = predictions
            
            actual = data_2022[target_col].values
            r2 = r2_score(actual, predictions)
            rmse = np.sqrt(mean_squared_error(actual, predictions))
            mae = mean_absolute_error(actual, predictions)
            
            prediction_results[dimension] = {
                'r2': r2,
                'rmse': rmse,
                'mae': mae,
                'features_used': features_used,
                'n_features': len(features_used)}
            
            print(f"   R² = {r2:.3f}, RMSE = {rmse:.3f}, MAE = {mae:.3f}")
        else:
            print(f"   ⚠️ Failed to predict {dimension}")
    else:
        print(f"   ⚠️ Target {target_col} not found in data")



=== NOWCASTING: EDUCATIONAL_LAG ===
  Raw values range: 3.5400 to 23.4200
   Feature added successfully
  Raw values range: -0.2623 to 0.0998
   Feature added successfully
  Raw values range: 1.4368 to 15.7447
   Feature added successfully
  Raw values range: 13.2446 to 23.0723
   Feature added successfully
  Raw values range: 0.0063 to 0.4924
   Feature added successfully
  X_scaled shape: (32, 5)
  Weights shape: (5,)
   R² = -14.690, RMSE = 19.692, MAE = 18.843

=== NOWCASTING: HEALTH ===
  Raw values range: -0.5596 to -0.1941
   Feature added successfully
  Raw values range: 9.6154 to 37.6623
   Feature added successfully
  Raw values range: 4.0375 to 9.7040
   Feature added successfully
  Raw values range: 0.0055 to 0.3892
   Feature added successfully
  Raw values range: 3.1200 to 23.3400
   Feature added successfully
  X_scaled shape: (32, 5)
  Weights shape: (5,)
   R² = -4.295, RMSE = 28.622, MAE = 25.585

=== NOWCASTING: SOCIAL_SECURITY ===
  Raw values range: -0.5338 to 0.0