The paper this notebook attempts to replicate: https://arxiv.org/pdf/2402.01785

Useful DoubleML docs: https://docs.doubleml.org/stable/guide/guide.html

In [7]:
#package imports
import numpy as np
import torch
import torch.nn as nn

import transformers
import doubleml
import pandas as pd


import warnings
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.base import clone

from doubleml import DoubleMLData
from doubleml import DoubleMLPLR
from doubleml.datasets import make_plr_CCDDHNR2018

# Two seaborn palettes; presumably pastel shades are meant for fills and dark
# shades for outlines in later plots (neither is used in the visible cells).
face_colors = sns.color_palette('pastel')
edge_colors = sns.color_palette('dark')

# NOTE(review): this suppresses *every* warning for the rest of the session,
# including deprecation notices from pandas/torch — consider a narrower filter.
warnings.filterwarnings("ignore")
print("init complete")

## Models 
#### Baseline Model
Uses `LightGBM` package _only_ to estimate nuisance elements on _only_ the tabular data 

#### Deep Model
Implemented exactly as in _Figure 2_ of the paper
- For text, they use a `RoBERTa` Model pretrained on a `Twitter` Dataset
- For images, they use a `VIT` Model pretrained on the `ImageNet-21k` Dataset
- For tabular data, they use a `SAINT` model implemented in `pytorch-widedeep`

#### Embedding Model 
Builds on the `Deep Model`, but instead of passing the embeddings directly through the fusion head/predictive workflow, it passes the general embedding $H_E$ and the tabular data $X_{tab}$ to a boosting algorithm.

In [None]:
#import the transformers

# Datasets (change labelling to be \textbf)
### Tabular
`DIAMONDS` dataset, downsampled to create dataset with $N=50,000$ 
- $\tilde{X}_{tab}$ is the logarithm of the price
- $X_{tab}$ is everything else
### Image
`CIFAR-10` dataset, specifically the training set ($N=50,000$), which is 32x32 colour images in 10 different classes
 - $\tilde{X}_{img}$ is a numerical representation of the label
 - $X_{img}$ is the image itself
### Text
`IMDB` dataset, both the training and test samples
- $\tilde{X}_{txt}$ is the binary (positive/negative) sentiment label
- $X_{txt}$ is the review itself

In [None]:
#process real data

# --- Tabular modality: DIAMONDS ------------------------------------------
tab_df = pd.read_csv("diamonds.csv")

# Integer codes for the three categorical columns.
cut_di = {'Ideal':5, 'Premium':4, 'Good':2, 'Very Good':3, 'Fair':1}
col_di={'E': 2, 'I': 6, 'J': 7, 'H': 5, 'F': 3, 'G': 4, 'D': 1}
clar_di={'SI2': 1, 'SI1': 2, 'VS1': 4, 'VS2': 3, 'VVS2': 5, 'VVS1': 6, 'I1': 0, 'IF': 7}

# Assign the encoded columns back explicitly. The previous
# `tab_df[col].replace(..., inplace=True)` pattern mutates a temporary Series
# under pandas Copy-on-Write and leaves `tab_df` unchanged.
tab_df = tab_df.assign(
    cut=tab_df['cut'].map(cut_di),
    color=tab_df['color'].map(col_di),
    clarity=tab_df['clarity'].map(clar_di),
)
# `.map` yields NaN for any category missing from the dictionaries above.
assert not tab_df[['cut', 'color', 'clarity']].isna().any().any(), "unmapped category in diamonds.csv"

# X~_tab is log(price); X_tab is every remaining column
# ('Unnamed: 0' is the row-index column stored in the CSV).
tab_tild_df=np.log(tab_df['price'])
tab_df=tab_df.drop(columns=['price','Unnamed: 0'])
# TODO: the dataset description says DIAMONDS is downsampled to N=50,000;
# that step is not implemented here yet.


# --- Text modality: IMDB -------------------------------------------------
txt_df=pd.read_csv("IMDB Dataset.csv")

# X~_txt is the binary sentiment label; X_txt is the raw review text.
sent_di={'positive':1,'negative':0}
txt_tild_df = txt_df['sentiment'].map(sent_di)
txt_df=txt_df['review']


# --- Image modality: CIFAR-10 --------------------------------------------
# Placeholder: the image data is not assembled in this cell yet.
img_df=False

#need to extract from pd to np
# NOTE(review): `img_tild_df` is not defined anywhere above this line, so this
# statement raises NameError on a fresh kernel. The CIFAR labels are only read
# in a later cell; build `img_tild_df` before this point. All three label
# vectors must also have the same length for a rectangular array.
X_mod_tild = np.array([txt_tild_df, tab_tild_df, img_tild_df])

#output DataFrame X with cols [carat,cut,color,clarity,depth,table,x,y,z  ,  review_text   ,   img]

In [None]:
def unpickle(file):
    """Read one pickled object from the path ``file`` and return it.

    The file is opened in binary mode and decoded with ``encoding='bytes'``.
    Only use this on trusted files: unpickling can execute arbitrary code.
    """
    import pickle
    with open(file, mode='rb') as handle:
        return pickle.load(handle, encoding='bytes')

# Load the first CIFAR-10 training batch. Forward slashes keep the relative
# path valid on Windows as well as POSIX systems.
img_dict=unpickle('cifar-10-batches-py/data_batch_1')
labels=img_dict[b'labels']
# A data batch has no b'names' entry; the class names are stored once in the
# separate `batches.meta` file under b'label_names'.
names=unpickle('cifar-10-batches-py/batches.meta')[b'label_names']
# NOTE(review): b'filenames' holds the image file names; the pixel arrays are
# under b'data' — confirm which one `imgs` is meant to be.
imgs=img_dict[b'filenames']
print(img_dict.keys())

In [None]:
#construct synthetic data
N=50000
theta0=0.5
# X_mod_tild=[[log(price)],[img label],[txt sentiment]] np_arr
# Standardise each modality's label vector (zero mean, unit std) and sum them
# element-wise to obtain the nuisance function g0. A list comprehension is
# required here: `np.array(map(...))` wraps the lazy map object in a 0-d object
# array, on which `np.sum(..., axis=0)` fails.
g0_tild=np.sum(
    [(X_arr - np.mean(X_arr))/np.std(X_arr)
     for X_arr in (np.asarray(X, dtype=float) for X in X_mod_tild)],
    axis=0,
)
m0_tild= -g0_tild

# Partially linear model: D = m0 + noise, Y = theta0*D + g0 + noise.
# The noise draws have length N, so each label vector must have N entries.
np.random.seed(20)
D=m0_tild + np.random.normal(0,1,size=N)
Y=theta0*D + g0_tild + np.random.normal(0,1,size=N)

#append D,Y onto df X

In [None]:
#set up architecture/model: Baseline

In [None]:
#deep/embedding common classes

# NOTE(review): `transformers` is already imported in the top import cell;
# this import could move there so all dependencies are declared in one place.
from transformers import pipeline

# Earlier single-sentence smoke test of the feature-extraction pipeline:
# embedding_extractor = pipeline(model="google-bert/bert-base-cased", task="feature-extraction", device=0)
# result = embedding_extractor("This is a simple test.", return_tensors=True)

# NOTE(review): the label says "ROBERTA" (and the model description calls for a
# RoBERTa model pretrained on Twitter data), but the checkpoint loaded below is
# `google-bert/bert-base-cased`. The three *_model strings are labels only;
# nothing in the visible notebook reads them.
txt_model = "ROBERTA"
# Runs on GPU 0 when CUDA is available, otherwise on CPU (device=-1).
txt_embedding_extractor = pipeline(model="google-bert/bert-base-cased", task="feature-extraction", device=0 if torch.cuda.is_available() else -1)
# NOTE(review): `all_text_tensor` is not defined anywhere in the visible
# notebook, so this line raises NameError on a fresh kernel — TODO confirm the
# intended input (presumably the review texts).
txt_embeddings= txt_embedding_extractor(all_text_tensor, return_tensors=True)

img_model="VIT"

tab_model = "SAINT"





class MMEmbeddingNetwork(nn.Module):
    """Fusion layer mapping the combined image+text embedding to H_E.

    A single linear layer followed by an activation.

    Args:
        img_embed_size: width of the image embedding.
        txt_embed_size: width of the text embedding.
        HE_size: width of the output embedding H_E.
        hiddenSize: currently unused; kept so existing calls keep working
            (it belonged to an earlier three-layer design).
        activation: module/callable applied after the linear layer.
            Defaults to ``nn.ReLU()``. The previous code called an undefined
            global ``activF`` — confirm the activation the paper intends.
    """
    def __init__(self, img_embed_size, txt_embed_size,HE_size,hiddenSize=100,activation=None):
        # Zero-argument super(): `super(self)` raises TypeError because the
        # first argument must be a class, not an instance.
        super().__init__()
        # Input width is the sum of both sizes, i.e. the layer expects the two
        # embeddings concatenated along the feature axis.
        self.fc=nn.Linear(img_embed_size+txt_embed_size, HE_size)
        self.activation = nn.ReLU() if activation is None else activation
    def forward(self,comb_embed):
        """Return ``activation(fc(comb_embed))``.

        ``comb_embed`` must have ``img_embed_size + txt_embed_size`` features
        in its last dimension.
        """
        return self.activation(self.fc(comb_embed))


In [None]:
#set up architecture/model: Deep
    

class PredictiveUnit(nn.Module):
    """Two independent linear heads on top of the general embedding.

    One head predicts the outcome (``l_hat``), the other the treatment
    (``m_hat``); each maps ``gen_embed_size`` features to a single value.
    """
    def __init__(self,gen_embed_size):
        # Zero-argument super(): `super(self)` raises TypeError because the
        # first argument must be a class, not an instance.
        super().__init__()
        self.fhead_outcome = nn.Linear(gen_embed_size,1)
        self.fhead_treat = nn.Linear(gen_embed_size,1)

    def forward(self, gen_embed):
        """Return ``(m_hat, l_hat)`` — treatment prediction first, outcome second."""
        l_hat=self.fhead_outcome(gen_embed)
        m_hat=self.fhead_treat(gen_embed)
        return m_hat,l_hat
    

class DeepModel(nn.Module):
    """End-to-end model: modality encoders -> fusion -> treatment/outcome heads.

    Args:
        txt_pipeline, img_pipeline, tab_pipeline: callables that turn the raw
            text / image / tabular input into embeddings.
        img_embed_size, txt_embed_size: widths of the image and text embeddings.
        HE_size: width of the fused embedding H_E.
        gen_embed_size: width of the general embedding fed to the heads.
    """
    def __init__(self, txt_pipeline, img_pipeline, tab_pipeline, img_embed_size, txt_embed_size,HE_size,gen_embed_size):
        # Zero-argument super(): `super(self)` raises TypeError because the
        # first argument must be a class, not an instance.
        super().__init__()
        self.txt_in,self.img_in,self.tab_in=txt_pipeline, img_pipeline, tab_pipeline
        self.multimod = MMEmbeddingNetwork(img_embed_size, txt_embed_size,HE_size)
        self.pred=PredictiveUnit(gen_embed_size)

    def forward(self, txt, img, tab):
        """Return ``(m_hat, l_hat)`` for one batch of text, image and tabular input."""
        txt_embed=self.txt_in(txt)
        img_embed = self.img_in(img)
        tab_embed = self.tab_in(tab)
        # The fusion layer's input width is img_embed_size + txt_embed_size, so
        # the embeddings must be concatenated on the feature axis; an
        # element-wise sum yields the wrong width.
        comb_embed = torch.cat([txt_embed, img_embed], dim=-1)
        H_E = self.multimod(comb_embed)
        # NOTE(review): element-wise addition requires HE_size, the tabular
        # embedding width and gen_embed_size to all be equal — confirm against
        # Figure 2 whether this should be a concatenation as well.
        G_E = H_E + tab_embed
        return self.pred(G_E)


In [None]:
#set up architecture/model: Embedding

class EmbeddingModel(nn.Module):
    """Fuses text and image embeddings into H_E and hands H_E plus the raw
    tabular data to a boosting algorithm.

    Args:
        boost_alg: callable invoked as ``boost_alg(H_E, tab)``.
        txt_pipeline, img_pipeline: callables producing the text / image embeddings.
        img_embed_size, txt_embed_size: widths of the image and text embeddings.
        HE_size: width of the fused embedding H_E.
    """
    def __init__(self, boost_alg,txt_pipeline, img_pipeline, img_embed_size, txt_embed_size,HE_size):
        # nn.Module must be initialised before any submodule is assigned,
        # otherwise setting `self.multimod` raises AttributeError.
        super().__init__()
        self.txt_in,self.img_in=txt_pipeline, img_pipeline
        self.multimod = MMEmbeddingNetwork(img_embed_size, txt_embed_size,HE_size)
        self.boosting_alg= boost_alg
    def forward(self, txt, img, tab):
        """Return ``boosting_alg(H_E, tab)`` for one batch."""
        txt_embed=self.txt_in(txt)
        img_embed = self.img_in(img)
        # The fusion layer's input width is img_embed_size + txt_embed_size, so
        # the embeddings must be concatenated on the feature axis; an
        # element-wise sum yields the wrong width.
        comb_embed = torch.cat([txt_embed, img_embed], dim=-1)
        H_E = self.multimod(comb_embed)
        return self.boosting_alg(H_E, tab)

In [None]:
#ML training loop: Deep
def DeepLoss(D,Y,m_hat,l_hat):
    """Product of the treatment RMSE and the outcome RMSE.

    Args:
        D: observed treatment tensor.
        Y: observed outcome tensor.
        m_hat: predicted treatment, same number of elements as ``D``.
        l_hat: predicted outcome, same number of elements as ``Y``.

    Returns:
        Scalar tensor ``RMSE(D, m_hat) * RMSE(Y, l_hat)``.

    All inputs are flattened first. Without this, a 1-D target of shape (N,)
    minus a prediction of shape (N, 1) broadcasts to an (N, N) matrix and
    silently produces a wrong loss.
    """
    D_resid = D.reshape(-1) - m_hat.reshape(-1)
    Y_resid = Y.reshape(-1) - l_hat.reshape(-1)
    # Mean rather than sum, so the value is a true root-*mean*-square error and
    # does not grow with the batch size.
    D_rms_err=torch.sqrt(torch.mean(torch.square(D_resid)))
    Y_rms_err=torch.sqrt(torch.mean(torch.square(Y_resid)))
    return D_rms_err*Y_rms_err

# NOTE(review): none of the arguments below (txt_pipeline, img_pipeline,
# tab_pipeline, img_embed_size, txt_embed_size, HE_size, gen_embed_size) is
# defined in the visible notebook — they must be set before this line runs.
deepnet=DeepModel(txt_pipeline, img_pipeline, tab_pipeline, img_embed_size, txt_embed_size,HE_size,gen_embed_size)

def train(deepnet,n_epochs=1000, batch_size=100,loss_fn=DeepLoss):
    """Optimise ``deepnet`` with Adam (lr=0.001) for ``n_epochs`` steps.

    Training params need work; the form depends on pd.DataFrame/torch.tensor
    implementation details.

    Args:
        deepnet: model whose call returns ``(m_hat, l_hat)``.
        n_epochs: number of optimisation steps.
        batch_size: not used yet — every step runs on the full data.
        loss_fn: callable ``loss_fn(D, Y, m_hat, l_hat)`` returning a scalar tensor.

    NOTE(review): ``txt``, ``img``, ``tab``, ``D`` and ``Y`` are read from
    notebook globals; they must exist (as tensors) before calling this.
    """
    optimizer=torch.optim.Adam(deepnet.parameters(),lr=0.001)
    for i_epoch in tqdm(range(n_epochs)):

        #NOTE: it's about to get spicy here!

        m_hat,l_hat = deepnet(txt, img, tab)
        loss = loss_fn(D,Y,m_hat,l_hat)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # The total is interpolated via {n_epochs}; the old message printed the
        # literal text "n_epoch". `.item()` prints the number, not the tensor repr.
        print(f"Finished {i_epoch+1}/{n_epochs}, loss = {loss.item()}", end = '\r')

# Run training with the defaults declared on `train` (n_epochs=1000, batch_size=100).
train(deepnet)


In [None]:
#ML training loop: Embedding

In [None]:
#Common DoubleML pass-through implementation

In [None]:
#save and export model

In [None]:
#import model (if needed (how to structure selective cell runs?))

In [None]:
#plotting/performance analysis