In [None]:
import torch

import sys
sys.path.append('./src')

from data import load_data, get_data_sl
from visualize import plot_outcome_distribution
from model import get_model

## Load Data

In [None]:
# Load both data environments up front; later cells refer to them by the
# names `train` and `test`.
train, test = (load_data(environment=split) for split in ("train", "test"))

### Sanity Check

In [None]:
# Pick the backbone by name and obtain its input processor and the model
# itself from the project's `get_model` helper. Both objects are used by the
# sanity-check cells that follow.
# NOTE(review): the supervised-learning section requests features with
# model_name="dino" rather than this value — confirm the mismatch is intended.
model_name = "vit"
processor, model = get_model(model_name)

In [None]:
# Sanity check: classify the first training image and list the labels the
# model scores highest, together with their softmax probabilities.
inputs = processor(images=train[0]['image'], return_tensors="pt")

# Pure inference: no gradients are needed, so skip autograd bookkeeping.
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits

# Never ask for more entries than the classification head provides.
k = min(5, logits.shape[-1])
top_5 = torch.topk(logits, k)

# Softmax over the class axis; index 0 is the one image passed above, and the
# top-k indices of that same row select its highest-scoring classes.
probs = logits.softmax(-1)[0][top_5.indices[0]]

print("Top 5 predicted labels with associated probabilities:")
for i, (idx, prob) in enumerate(zip(top_5.indices[0], probs), 1):
    print(f"    {i}. {model.config.id2label[idx.item()]}: {prob.item():.2%}")

In [None]:
# Sanity check: run the model once more, this time requesting the hidden
# states, and display the shape of the embedding that would be extracted.
inputs = processor(images=train[0]['image'], return_tensors="pt")

# Pure inference: no gradients are needed, so skip autograd bookkeeping.
with torch.no_grad():
    outputs = model(**inputs, output_hidden_states=True)

# Take the last hidden state and keep position 0 along its second axis.
# NOTE(review): presumably this is the [CLS] token embedding of a ViT-style
# model — confirm. A backbone whose last hidden state is a 4-D feature map
# would need spatial pooling instead, e.g. `.mean(dim=[2, 3])`.
outputs.hidden_states[-1][:, 0].shape

In [None]:
# Sum over the test split's "outcome" column, shown as the cell output.
# NOTE(review): presumably a quick count of positive labels — confirm that
# "outcome" holds 0/1 indicators.
test["outcome"].sum()

## Supervised Learning

In [None]:
# Fetch the supervised-learning inputs `X` and targets `y` for the train
# environment. `y` is indexed by column further down (`y[:, outcome_idx]`),
# so it carries one column per outcome.
# NOTE(review): presumably `X` holds embeddings produced by the "dino"
# backbone — confirm in the `data` module. The sanity check above uses "vit".
X, y = get_data_sl(environment="train", model_name="dino")

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Fit a logistic-regression classifier on the features `X` for one outcome
# column and print its classification report.
outcome_idx = 0
y_true = y[:, outcome_idx]  # slice the target column once and reuse it

# Bound to `clf` rather than `model` so the network loaded earlier in the
# notebook under the name `model` is not overwritten; otherwise re-running
# the sanity-check cells after this one would fail.
# NOTE(review): max_iter=10 is a very tight iteration cap — confirm it is
# intentional.
clf = LogisticRegression(solver='liblinear', max_iter=10)
clf.fit(X, y_true)

# Predictions are made on the same data the classifier was fitted on, so the
# report below describes the training fit, not held-out performance.
y_pred = clf.predict(X)
print(classification_report(y_true, y_pred))

In [None]:
import matplotlib.pyplot as plt

# Display the first training image that the classifier labelled as class 1.
# `torch.as_tensor` converts the prediction array without the legacy
# `torch.Tensor(...)` constructor.
id_1s = (torch.as_tensor(y_pred) == 1).nonzero(as_tuple=True)[0]

if id_1s.numel() == 0:
    # Without any positive prediction there is no index to look up; report it
    # instead of failing with an IndexError.
    print("No sample was predicted as class 1; nothing to display.")
else:
    id_1 = id_1s[0].item()

    # `train` is the split loaded at the top of the notebook, so it is not
    # reloaded here.
    # NOTE(review): assumes the rows of `X`/`y` follow the same order as
    # `train` — confirm in the `data` module.
    example = train[id_1]["image"]

    # Move the first axis to the end so the array is laid out as matplotlib
    # expects (presumably channels-first -> channels-last; confirm).
    img = example.numpy().transpose(1, 2, 0)
    plt.imshow(img)
    plt.show()

## Causal Inference

In [None]:
# Plot the outcome distribution of the train split with the project's
# plotting helper.
# NOTE(review): `save=True` presumably also writes the figure to disk —
# confirm the output location in the `visualize` module.
plot_outcome_distribution(train, save=True)

In [None]:
# Treatment-effect estimates for the two grooming outcomes.
#
# The analysis runs on the train split only. To pool train and test instead,
# `concatenate_datasets` must be imported first (presumably from the Hugging
# Face `datasets` package — confirm); it is not imported anywhere in this
# notebook, so calling it here raises NameError on a fresh kernel.
rct = train
Y_y = rct["outcome"][:, 0]  # outcome column 0, reported as "Yellow to Focal"
Y_b = rct["outcome"][:, 1]  # outcome column 1, reported as "Blue to Focal"
T = rct["treatment"]


def associational_differences(outcome, treatment, baseline=0, arms=(1, 2)):
    """Return E[Y | T = arm] - E[Y | T = baseline] for every arm in `arms`.

    Args:
        outcome: 1-D array/tensor of outcomes, one entry per sample.
        treatment: array/tensor of treatment codes aligned with `outcome`.
        baseline: treatment code used as the reference arm.
        arms: treatment codes to compare against the baseline.

    Returns:
        Dict mapping each arm code to its difference in mean outcome
        relative to the baseline arm.
    """
    baseline_mean = outcome[treatment == baseline].mean()
    return {arm: outcome[treatment == arm].mean() - baseline_mean for arm in arms}


# Keep the estimates for both outcomes in one dict so that neither set is
# overwritten by the other. Each value maps arm code (1, 2) to the difference
# in mean outcome versus arm 0, printed as ATE_B and ATE_inf respectively.
ate = {}
for title, key, Y_k in (
    ("Grooming: Yellow to Focal", "yellow", Y_y),
    ("Grooming: Blue to Focal", "blue", Y_b),
):
    # Associational difference E[Y|T=t] - E[Y|T=0]; it is reported as an ATE
    # because the data are treated as a randomized trial (`rct`).
    print(title)
    ate[key] = associational_differences(Y_k, T)
    print(f"ATE_B: {ate[key][1]*100:.2f}%")
    print(f"ATE_inf: {ate[key][2]*100:.2f}%")
