In [None]:
!export CUDA_VISIBLE_DEVICES='0,1,2,3'

In [None]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3'
import json
import pandas as pd
import warnings
import pickle
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader
from datasets import Dataset

from torch.nn import DataParallel

warnings.filterwarnings(action='ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [None]:
# Input locations. Each can be overridden through an environment variable so
# the notebook is not tied to one user's home directory; the defaults are the
# original hard-coded paths.
DATA_FILE = os.environ.get('RECIPE1M_DATA_FILE', 'recipe1M_layers/layer1.json')
MODEL_CKPT = os.environ.get(
    'BERT_MODEL_CKPT',
    '/common/home/dm1487/loaded_models/bert-base-uncased',
)
# Shown as the cell output: the files inside the checkpoint directory.
# Raises immediately if the directory does not exist.
os.listdir(MODEL_CKPT)

In [None]:
# WordPiece tokenizer stored alongside the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CKPT)
# BERT encoder, asked to return hidden states in its output, then moved to the GPU.
model = AutoModel.from_pretrained(MODEL_CKPT, output_hidden_states=True)
model = model.to('cuda')

In [None]:
## Batched tokenization helper, applied to one text column at a time
## (title, ingredients, instructions or full).
def tokenize(id_, col):
    """Tokenize a batch of texts, padded/truncated to exactly 512 tokens.

    Args:
        id_: values of the id column; accepted so the function matches the
            column list it is mapped over, but not used.
        col: the texts to encode.

    Returns:
        Whatever the module-level ``tokenizer`` returns for ``col``.
    """
    encoded = tokenizer(
        col,
        truncation=True,
        max_length=512,
        padding='max_length',
    )
    return encoded

# Two on-disk caches are used: the pickled tokenized dataset (`layer`) and the
# flattened recipe table (`layer_csv`). The most processed one available wins.
dataset_encoded = None
data_df = None
layer = 'dataset_tokenized_instructions.pkl'
layer_csv = 'layer3_preprocessed.csv'

if os.path.exists(layer):
    # NOTE(security): pickle.load can execute arbitrary code. Only load a
    # cache file that this notebook produced itself.
    with open(layer, 'rb') as f:
        dataset_encoded = pickle.load(f)
else:
    if os.path.exists(layer_csv):
        print('here')
        # Read every column as plain text. With the defaults pandas turns
        # empty fields (and literals such as "NA" or "null") into NaN and may
        # parse digit-like ids as numbers, so the tokenizer would receive
        # values that the freshly built frame below never contains.
        data_df = pd.read_csv(layer_csv, dtype=str, keep_default_na=False)
        print('done')
    else:
        # Flatten the raw JSON into one row per recipe.
        data_dict = []
        data = None
        with open(DATA_FILE, 'r', encoding='utf-8') as f:
            data = json.load(f)
        for i in tqdm(data):
            ingredients = ' '.join(ingred['text'] for ingred in i['ingredients'])
            instructions = ' '.join(str(instruct['text']) for instruct in i['instructions'])
            title = i['title']
            id_ = i['id']
            data_dict.append({
                "id": id_,
                "title": title,
                "ingredients": ingredients,
                "instructions": instructions,
                "full": ' '.join([title, ingredients, instructions])
            })
        data_df = pd.DataFrame(data_dict)
        data_df.to_csv(layer_csv, index=False)

    # Tokenize the instructions column in batches of 1000 rows and cache the
    # result so later runs can skip straight to the pickle branch above.
    dataset = Dataset.from_pandas(data_df)
    dataset_encoded = dataset.map(tokenize, input_columns=['id', 'instructions'], batched=True, batch_size=1000, keep_in_memory=True)
    with open(layer, 'wb') as f:
        pickle.dump(dataset_encoded, f)

In [None]:
### Wrap the encoder so each forward pass is split across the visible GPUs
parallel_model = DataParallel(module=model)

In [None]:
# Dataloader for fast data extraction
class TextDataset(torch.utils.data.Dataset):
    """Map-style dataset that exposes tokenized rows as tensors.

    Wraps any sized, integer-indexable collection whose rows are mappings
    containing ``'input_ids'`` and ``'attention_mask'`` entries.
    """

    def __init__(self, dataset):
        """Store the backing collection of tokenized rows."""
        self.dataset = dataset

    def __len__(self):
        """Return the number of rows in the backing collection."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return row ``idx`` as a dict of tensors.

        ``input_ids`` is converted to int64 and ``attention_mask`` to float32.
        """
        # Fetch the row once; indexing the backing dataset a second time
        # would materialise the same row again for no benefit.
        row = self.dataset[idx]
        return {
            'input_ids': torch.tensor(row['input_ids']).long(),
            'attention_mask': torch.tensor(row['attention_mask']).float(),
        }

# Serve the tokenized rows in their stored order, 220 examples per batch.
dataset = TextDataset(dataset_encoded)
dataloader = DataLoader(dataset=dataset, shuffle=False, batch_size=220)

In [None]:
# Bert embedding extraction
# Runs every batch through the multi-GPU model, mean-pools one hidden layer
# per example and writes the pooled vectors to disk in chunks named after the
# batch index at which they were flushed.
features_folder =  'features_text_instructions'
os.makedirs(features_folder, exist_ok=True)
features = torch.tensor([])
layers = [-2]  # not read below; the loop indexes hidden state -2 directly
num_batches = len(dataloader)
# Pooled vectors of the batches seen since the last flush. Kept on the CPU as
# a list so the chunk is assembled once per flush instead of being re-copied
# on every batch, and so it does not accumulate in GPU memory.
pending = []
with torch.no_grad():
    for idx, batch in tqdm(enumerate(dataloader), total=num_batches):
        output =  parallel_model(**batch)
        # output[2] is read as the hidden-state collection (the model is
        # loaded with output_hidden_states=True); -2 picks the second-to-last.
        states = output[2][-2]
        # NOTE(review): the mean is taken over all of dim 1. Inputs are padded
        # to a fixed length, so padded positions presumably contribute to the
        # average -- confirm this is intended before relying on the vectors.
        pending.append(torch.mean(states, dim=1).cpu())

        # Compare against the number of batches: idx counts batches, not samples.
        is_last_batch = idx == num_batches - 1
        if (idx % 500 == 0  and idx != 0) or is_last_batch:
            print('saving here at', idx)
            features = torch.vstack(pending)
            torch.save(features, os.path.join(features_folder, f'{idx}.pkl'))
            pending = []
            if not is_last_batch:
                features = torch.tensor([])
            # After the final flush `features` is deliberately left populated:
            # a later save of `features` under the same idx then rewrites
            # identical content instead of replacing the chunk with an empty
            # tensor.

In [None]:
# Flush whatever is still buffered once the loop has finished. Skip the write
# when the buffer is empty: if the loop's last action was a save that emptied
# `features`, writing here would replace that `{idx}.pkl` chunk with an empty
# tensor.
if features.shape[0] > 0:
    torch.save(features.cpu(), os.path.join(features_folder, f'{idx}.pkl'))