In [45]:
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
from torch import optim
from sklearn.neighbors import KernelDensity
from statsmodels.nonparametric.kernel_regression import KernelReg
from sklearn.model_selection import GridSearchCV


In [11]:
# ------------------------------------------------------------------
# Make sure Python can see the project root (so imports work)
# ------------------------------------------------------------------
PROJECT_ROOT = Path("/Users/apple/Documents/MRes+MPhil:PhD_Economics/MPhil_Project/deepset_project").resolve()
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

# Now we can import our own modules
from utils.seed import set_seed
from data.feature_engineering import compute_mu0_ate_features
from data.cluster_ate_dataset import ClusterATEDataset
from data.collate_ate import collate_fn
from models.deepset_ate_set2set import DeepSetSiteConstant
from train.loops_ate import train_one_epoch, eval_one_epoch

set_seed(42)

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
device

device(type='mps')

In [46]:
from pathlib import Path

#  DATA_RAW = PROJECT_ROOT / "data" / "raw"
#  
#  # ------------------------------------------------------------------
#  # 1. Load your input dataframes
#  #    Replace filenames with your actual ones
#  # ------------------------------------------------------------------
#  df = pd.read_csv(DATA_RAW / "main_panel.csv")            # must contain: site, T, size, exp, ...
#  ate_weighted_df = pd.read_csv(DATA_RAW / "ate_weighted.csv")
#  ate_y0_weighted = pd.read_csv(DATA_RAW / "ate_y0_weighted.csv")
#  ate_y1_weighted = pd.read_csv(DATA_RAW / "ate_y1_weighted.csv")
#  
#  # Example: best bandwidth from previous search
#  best_bw = 0.5  # replace with your chosen value
#  
#  # ------------------------------------------------------------------
#  # 2. Run feature engineering to build per-site objects
#  #    Output: list of dicts, one per (site, target):
#  #        {
#  #          "target": t,
#  #          "site":   s,
#  #          "x":      x_grid [n_i, 2],
#  #          "mu0":    mu0_hat [n_i],
#  #          "ate":    ate_val scalar
#  #        }
#  # ------------------------------------------------------------------
#  clusters = compute_mu0_ate_features(
#      df=df,
#      ate_weighted_df=ate_weighted_df,
#      ate_y0_weighted=ate_y0_weighted,
#      ate_y1_weighted=ate_y1_weighted,
#      best_bw=best_bw,
#  )
#  
#  len(clusters), clusters[0].keys()

df = pd.read_csv("/Users/apple/Documents/MRes+MPhil:PhD_Economics/MPhil_Project/Notebooks/weighted_ate_by_site.csv")
targets = df['target'].unique()
df_raw = pd.read_csv("/Users/apple/Documents/MRes+MPhil:PhD_Economics/MPhil_Project/Notebooks/raw_combined.csv")


best_bw = 0.2782559402207124
target_clusters = []
for t in targets:
    data = df_raw.loc[df_raw['site']==t,:]
    x0 = data.loc[data['T']==0, ['size', 'exp']].to_numpy(dtype=float)
    y1 = data.loc[data['T']==1,'y'].to_numpy(dtype=float)
    y0 = data.loc[data['T']==0,'y'].to_numpy(dtype=float)
    kr0 = KernelReg(endog=y0, exog=x0, var_type="cc", reg_type="ll", bw=[best_bw, best_bw])
    x_grid = np.sort(data.loc[:, ['size', 'exp']].to_numpy(dtype=float))
    mu0_hat, _ = kr0.fit(x_grid)
    ate = np.mean(y1) - np.mean(y0)
    target_clusters.append({'target':t, 'site':t, 'x':x_grid, 'mu0':mu0_hat, 'ate':ate})

In [90]:
model = DeepSetSiteConstant(hid=64).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=0.0)

model

DeepSetSiteConstant(
  (phi): Sequential(
    (0): Linear(in_features=3, out_features=64, bias=True)
    (1): ReLU()
    (2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
    (3): Linear(in_features=64, out_features=64, bias=True)
    (4): ReLU()
  )
  (rho): Sequential(
    (0): Linear(in_features=64, out_features=64, bias=True)
    (1): ReLU()
    (2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
    (3): Linear(in_features=64, out_features=1, bias=True)
  )
)

In [None]:
import pickle
with open(PROJECT_ROOT/"data/processed/mu0_ate.pkl", "rb") as f:
    mu0_ate = pickle.load(f)

len(mu0_ate), mu0_ate[0].keys()

(1056, dict_keys(['target', 'site', 'x', 'mu0', 'ate']))

In [None]:
from torch.utils.data import Subset
from data.collate_ate import collate_fn

loss_all_t = []

for t in targets:
    
    # ------------------------------------------------------------------
    # 1. Create Dataset and train/val split
    # ------------------------------------------------------------------
    source_clusters = []
    for i in range(len(mu0_ate)):
        if mu0_ate[i]['target'] == t:
            source_clusters.append(mu0_ate[i])
            
    target_cluster = []
    for j in range(len(target_clusters)):
        if target_clusters[j]['site'] == t:
            target_cluster.append(target_clusters[j])
    
    clusters = []
    for k1 in range(len(source_clusters)):
        clusters.append(source_clusters[k1])
    clusters.append(target_cluster[0])
    
    dataset = ClusterATEDataset(clusters)
    n = len(dataset)
    indices = list(range(len(dataset) - 1))   # all except last
    trimmed_train = Subset(dataset, indices)
    test_idx = [n - 1]
    trimmed_val = Subset(dataset, test_idx)
    train_ds, val_ds = trimmed_train, trimmed_val
    
    
    # ------------------------------------------------------------------
    # 2. DataLoaders using our collate_fn
    # ------------------------------------------------------------------
    batch_size = 16

    train_loader = DataLoader(
        train_ds,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=collate_fn,
    )

    val_loader = DataLoader(
        val_ds,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=collate_fn,
    )
    
    
    # ------------------------------------------------------------------
    # 3. Sanity check one batch
    # ------------------------------------------------------------------
    X, MU0, MASK, Y = next(iter(train_loader))
    X.shape, MU0.shape, MASK.shape, Y.shape

    
    # ----------
    # 4. Run training with pre-defined model
    # ---
    from train.loops_ate import train_one_epoch, eval_one_epoch

    loss_list = []
    # corr_list = []
    for epoch in range(1, 51):
        train_loss = train_one_epoch(model, train_loader, optimizer, device)
        val_loss, val_corr = eval_one_epoch(model, val_loader, device)

        print(f"Epoch {epoch:03d} | "
              f"train RMSE={np.sqrt(train_loss):.4f} | "
              f"val RMSE={np.sqrt(val_loss):.4f}, corr={val_corr:.3f}")
        
        loss_list.append(val_loss)
        # corr_list.append(val_corr)
    
    loss_all_t.append(loss_list[-1])
    # corr_list = corr_list[-1]
    
print(f"Average RMSE is {np.mean(loss_all_t):.4f}")

Epoch 001 | train RMSE=0.2234 | val RMSE=0.0478, corr=nan
Epoch 002 | train RMSE=0.1091 | val RMSE=0.0231, corr=nan
Epoch 003 | train RMSE=0.1186 | val RMSE=0.0975, corr=nan
Epoch 004 | train RMSE=0.0955 | val RMSE=0.1106, corr=nan
Epoch 005 | train RMSE=0.0923 | val RMSE=0.0414, corr=nan
Epoch 006 | train RMSE=0.0877 | val RMSE=0.0370, corr=nan
Epoch 007 | train RMSE=0.0860 | val RMSE=0.0783, corr=nan
Epoch 008 | train RMSE=0.0834 | val RMSE=0.0857, corr=nan
Epoch 009 | train RMSE=0.0877 | val RMSE=0.0727, corr=nan
Epoch 010 | train RMSE=0.0846 | val RMSE=0.0339, corr=nan
Epoch 011 | train RMSE=0.0882 | val RMSE=0.0536, corr=nan
Epoch 012 | train RMSE=0.0835 | val RMSE=0.0983, corr=nan
Epoch 013 | train RMSE=0.0864 | val RMSE=0.0812, corr=nan
Epoch 014 | train RMSE=0.0855 | val RMSE=0.0428, corr=nan
Epoch 015 | train RMSE=0.0837 | val RMSE=0.0570, corr=nan
Epoch 016 | train RMSE=0.0800 | val RMSE=0.0808, corr=nan
Epoch 017 | train RMSE=0.0818 | val RMSE=0.0827, corr=nan
Epoch 018 | tr

In [92]:
loss_pooled = []

for t in targets:
    
    # ------------------------------------------------------------------
    # 1. Create Dataset and train/val split
    # ------------------------------------------------------------------
    source_clusters = []
    source_y = []
    # source_x = []
    for i in range(len(mu0_ate)):
        if mu0_ate[i]['target'] == t:
            source_clusters.append(mu0_ate[i])
            # source_x.append(mu0_ate[i]['x'])
            source_y.append(mu0_ate[i]['ate'])
            
    target_cluster = []
    for j in range(len(target_clusters)):
        if target_clusters[j]['site'] == t:
            target_cluster.append(target_clusters[j])
            target_y = target_cluster[0]['ate']
    
    rmse_pooled = np.sqrt((np.mean(source_y) - target_y)**2)
    loss_pooled.append(rmse_pooled)
    
print(f"Average RMSE is {np.mean(loss_all_t):.4f}")
print(f"Average RMSE from pooled prediction is {np.mean(loss_pooled):.4f}")


Average RMSE is 0.0023
Average RMSE from pooled prediction is 0.0261
