The paper this is based attempting to replicate: https://arxiv.org/pdf/2402.01785

Useful DoubleML docs: https://docs.doubleml.org/stable/guide/guide.html

In [7]:
#package imports
import numpy as np
import torch
import torch.nn as nn

import transformers
import doubleml
import pandas as pd


import warnings
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.base import clone

from doubleml import DoubleMLData
from doubleml import DoubleMLPLR
from doubleml.datasets import make_plr_CCDDHNR2018

face_colors = sns.color_palette('pastel')
edge_colors = sns.color_palette('dark')

warnings.filterwarnings("ignore")
print("init complete")

## Models 
#### Baseline Model
Uses `LightGBM` package _only_ to estimate nuisance elements on _only_ the tabular data 

#### Deep Model
Implemented exactly as in _Figure 2_ in paper
- For text, they use a `RoBERTa` Model pretrained on a `Twitter` Dataset
- For images, they use a `VIT` Model pretrained on the `ImageNet-21k` Dataset
- For tabular data, they use a `SAINT` model implemented in `pytorch-widedeep`

#### Embedding Model 
Builds on the `Deep Model`, but instead of passing embeddings directly through fusion head/predictive workflow, passes general embedding $H_e$ and data $X_{tab}$ to a boosting algorithm.

# Datasets (change labelling to be \textbf)
### Tabular
`DIAMONDS` dataset, downsampled to create dataset with $N=50,000$ 
- $\tilde{X}_{tab}$ is the logarithm of the price
- $X_{tab}$ is everything else
### Image
`CIFAR-10` dataset, specifically the training set ($N=50,000$), which is 32x32 colour images in 10 different classes
 - $\tilde{X}_{img}$ is a numerical representation of the label
 - $X_{img}$ is the image itself
### Text
`IMDB` dataset, both the training and test samples
- $\tilde{X}_{txt}$ is the binary (positive/negative) sentiment label
- $X_{txt}$ is the review itself

In [110]:
#process real data
import pandas as pd
import numpy as np
import PIL
from PIL import Image
import requests
from transformers import pipeline
import torch
from tqdm import tqdm



tab_df = pd.read_csv("diamonds.csv")

cut_di = {'Ideal':5, 'Premium':4, 'Good':2, 'Very Good':3, 'Fair':1}
tab_df['cut'].replace(cut_di,inplace=True)
col_di={'E': 2, 'I': 6, 'J': 7, 'H': 5, 'F': 3, 'G': 4, 'D': 1}
tab_df['color'].replace(col_di,inplace=True)
clar_di={'SI2': 1, 'SI1': 2, 'VS1': 4, 'VS2': 3, 'VVS2': 5, 'VVS1': 6, 'I1': 0, 'IF': 7}
tab_df['clarity'].replace(clar_di,inplace=True)

tab_tild_df=np.log(tab_df['price']).to_numpy()
tab_df=tab_df.drop(columns=['price','Unnamed: 0'])


txt_df=pd.read_csv("IMDB Dataset.csv")

sent_di={'positive':1,'negative':0}
txt_df['sentiment'].replace(sent_di,inplace=True)

txt_tild_df = txt_df['sentiment'].to_numpy()
txt_df=txt_df['review']


def unpickle(file):
    import pickle
    with open(file,'rb') as fo:
        dict=pickle.load(fo,encoding='bytes')
    return dict


img_dict=unpickle(r'cifar-10-batches-py\data_batch_1')
labels=img_dict[b'labels']
names=img_dict[b'filenames']
imgs=img_dict[b'data']

img_tild_df=np.array(labels)

img_df=imgs

def CIFAR2img(img):
    red = np.split(img, 32*3)[0:32]
    green = np.split(img, 32*3)[32:64]
    blue = np.split(img, 32*3)[64:96]
    
    return Image.fromarray(np.dstack((red,green,blue)), "RGB")

imgs=list(map(CIFAR2img, imgs[0:3]))
img_df = pd.DataFrame(imgs, columns=['imgs'])



# DEVICE = torch.device('cpu')
# pipe = pipeline(task="image-feature-extraction", model_name="google/vit-base-patch16-384", device=DEVICE, pool=True)
# output = pipe(imgs, return_tensors=True)
# img_embeddings=torch.stack(output).squeeze()
#NOTE: above is important for image pipeline

X_mod_tild = np.array([txt_tild_df, tab_tild_df, img_tild_df])
N=min([np.shape(X)[0] for X in X_mod_tild])
X_mod_tild = np.array([txt_tild_df[0:N], tab_tild_df[0:N], img_tild_df[0:N]])


#contruct synthetic data
theta0=0.5

g0_tild=np.sum(np.array(list(map(lambda X: (X - np.full(N,np.mean(X)))/np.std(X) ,X_mod_tild))),axis=0)

m0_tild= -g0_tild

np.random.seed(20)
D=m0_tild + np.random.normal(0,1,size=N)
Y=theta0*D + g0_tild + np.random.normal(0,1,size=N)
D_df=pd.DataFrame(D, columns=['D'])
Y_df=pd.DataFrame(Y, columns=['Y'])

X_mod=tab_df[:N].join(other=[txt_df[:N], img_df[:N]])
df=X_mod.join(other=[D_df,Y_df])
df.head()
#NOTE: need to double check with full 10000/50000 batches 


  X_mod_tild = np.array([txt_tild_df, tab_tild_df, img_tild_df])


Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,review,imgs,D,Y
0,0.23,5,2,1,61.5,55.0,3.95,3.98,2.43,One of the other reviewers has mentioned that ...,<PIL.Image.Image image mode=RGB size=32x32 at ...,3.094251,-0.949851
1,0.21,4,2,2,59.8,61.0,3.89,3.84,2.31,A wonderful little production. <br /><br />The...,<PIL.Image.Image image mode=RGB size=32x32 at ...,1.361744,-0.560637
2,0.23,2,2,4,56.9,65.0,4.05,4.07,2.31,I thought this was a wonderful way to spend ti...,<PIL.Image.Image image mode=RGB size=32x32 at ...,1.518297,-0.234724
3,0.29,4,6,3,62.4,58.0,4.2,4.23,2.63,Basically there's a family where a little boy ...,,2.522928,-4.178922
4,0.31,2,7,1,63.3,58.0,4.34,4.35,2.75,"Petter Mattei's ""Love in the Time of Money"" is...",,2.820808,-1.549256


In [None]:
#set up architecture/model: Baseline

In [None]:
#deep/embedding common classes


# embedding_extractor = pipeline(model="google-bert/bert-base-cased", task="feature-extraction", device=0)
# result = embedding_extractor("This is a simple test.", return_tensors=True)

txt_model = "FacebookAI/roberta-base"
txt_embedding_extractor = pipeline(model=txt_model, task="feature-extraction", device=0 if torch.cuda.is_available() else -1)
txt_embeddings= txt_embedding_extractor(all_text_tensor, return_tensors=True)

img_model="VIT"

tab_model = "SAINT"





class MMEmbeddingNetwork(nn.Module):
    def __init__(self, img_embed_size, txt_embed_size,HE_size,hiddenSize=100):
        super(self).__init__()
        # self.inp = nn.Linear(imgEmbSize+txtEmbSize, hiddenSize)
        # self.fc = nn.Linear(hiddenSize,hiddenSize)
        # self.out = nn.Linear(hiddenSize,H_ESize)
        self.fc=nn.Linear(img_embed_size+txt_embed_size, HE_size)
    def forward(self,comb_embed):
        # x=combinedEmbed
        # x=self.inp(x)
        # x=self.fc(x)
        # return self.out(x)
        x=self.fc(comb_embed)
        return activF(x)


In [None]:
#set up architecture/model: Deep
    

class PredictiveUnit(nn.Module):
    def __init__(self,gen_embed_size):
        super(self).__init__()
        self.fhead_outcome = nn.Linear(gen_embed_size,1)
        self.fhead_treat = nn.Linear(gen_embed_size,1)
    
    def forward(self, gen_embed):
        l_hat=self.fhead_outcome(gen_embed)
        m_hat=self.fhead_treat(gen_embed)
        return m_hat,l_hat
    

class DeepModel(nn.Module):
    def __init__(self, txt_pipeline, img_pipeline, tab_pipeline, img_embed_size, txt_embed_size,HE_size,gen_embed_size):
        super(self).__init__()
        self.txt_in,self.img_in,self.tab_in=txt_pipeline, img_pipeline, tab_pipeline
        self.multimod = MMEmbeddingNetwork(img_embed_size, txt_embed_size,HE_size)
        self.pred=PredictiveUnit(gen_embed_size)

    def forward(self, txt, img, tab):
        txt_embed=self.txt_in(txt)
        img_embed = self.img_in(img)
        tab_embed = self.tab_in(tab)
        comb_embed = txt_embed + img_embed 
        H_E = self.multimod(comb_embed)
        G_E = H_E + tab_embed
        return self.pred(G_E)


In [None]:
#set up architecture/model: Embedding

class EmbeddingModel(nn.Module):
    def __init__(self, boost_alg,txt_pipeline, img_pipeline, img_embed_size, txt_embed_size,HE_size):
        self.txt_in,self.img_in=txt_pipeline, img_pipeline
        self.multimod = MMEmbeddingNetwork(img_embed_size, txt_embed_size,HE_size)
        self.boosting_alg= boost_alg
    def forward(self, txt, img, tab):
        txt_embed=self.txt_in(txt)
        img_embed = self.img_in(img)
        comb_embed = txt_embed + img_embed 
        H_E = self.multimod(comb_embed)
        return self.boosting_alg(H_E, tab)

In [None]:
#ML training loop: Deep
def DeepLoss(D,Y,m_hat,l_hat):
    """can handle 1-D vectors"""
    D_rms_err=torch.sqrt(torch.sum(torch.square(D-m_hat)))
    Y_rms_err=torch.sqrt(torch.sum(torch.square(Y-l_hat)))
    return D_rms_err*Y_rms_err

deepnet=DeepModel(txt_pipeline, img_pipeline, tab_pipeline, img_embed_size, txt_embed_size,HE_size,gen_embed_size)

def train(deepnet,n_epochs=1000, batch_size=100,loss_fn=DeepLoss):
    """Training params need work
    form depends on pd.DataFrame/torch.tensor implementation details"""
    optimizer=torch.optim.Adam(deepnet.parameters(),lr=0.001)
    for i_epoch in tqdm(range(n_epochs)):
        
        #NOTE: it's about to get spicy here!
        
        m_hat,l_hat = deepnet(txt, img, tab)
        loss = loss_fn(D,Y,m_hat,l_hat)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print(f"Finished {i_epoch+1}/n_epoch, loss = {loss}", end = '\r')

train(deepnet)


In [None]:
#ML training loop: Embedding

In [None]:
#Common DoubleML pass-through implementation

In [None]:
#save and export model

In [None]:
#import model (if needed (how to structure selective cell runs?))

In [None]:
#plotting/performance analysis