In [None]:
## Importing all required libraries
import torch
from transformers import BertTokenizer, BertModel
import os
import time
import numpy as np
import json
import nltk
from nltk import sent_tokenize
from tqdm import tqdm
nltk.download('punkt')

## installing Transformers
!pip install transformers

## Choosing the compute device: the GPU when CUDA is available, otherwise the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Loading the pretrained tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

## Loading the pretrained model, moving it to the chosen device and switching
## it to evaluation mode (both .to() and .eval() return the module itself,
## so the calls can be chained)
model = BertModel.from_pretrained("bert-base-uncased").to(device).eval()

## Dataset locations
## Change the paths below so that they match where the JSON files are stored
## on your machine (the defaults point at a Kaggle input directory).
IT_train_path = '/kaggle/input/rhetorical-dataset/IT_train.json'
IT_test_path = '/kaggle/input/rhetorical-dataset/IT_test.json'
CL_train_path = '/kaggle/input/rhetorical-dataset/CL_train.json'
CL_test_path = '/kaggle/input/rhetorical-dataset/CL_test.json'

## Loading the json data
def _load_json(path):
    """Parse and return the JSON document stored at *path*."""
    # The with-block closes the file on exit, so no explicit close() is needed.
    # JSON text is UTF-8, so decode it explicitly instead of relying on the
    # platform's default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


train_it = _load_json(IT_train_path)
test_it = _load_json(IT_test_path)
train_cl = _load_json(CL_train_path)
test_cl = _load_json(CL_test_path)


## Top-level keys of each loaded JSON object, as plain lists
## (iterating a dict yields its keys, so list(d) equals list(d.keys()))
it_train_files, it_test_files = list(train_it), list(test_it)
cl_train_files, cl_test_files = list(train_cl), list(test_cl)

out = 'train_it_bert_emb/' ## Give the path where you want to save the embeddings

## Create the output directory from Python rather than through a shell
## command: this works on every platform, follows `out` if it is changed,
## and does not fail when the directory already exists (e.g. on a re-run).
os.makedirs(out, exist_ok=True)

## Getting the embeddings for case in train_it, similarly follow for other files
## Here we use the embedding corresponding to the [CLS] token as the sentences representation


def _cls_embedding(text):
    """Return the last-layer hidden state of the [CLS] token for *text*.

    The sentence is wrapped in [CLS] ... [SEP], tokenized, truncated when it
    is too long, and run through the model without gradient tracking.
    The result is a plain list of floats.
    """
    tokens = tokenizer.tokenize("[CLS] " + text + " [SEP]")
    if len(tokens) > 510:
        # Over-long input: keep the first 510 tokens and re-append the [SEP]
        # marker that the cut removed.
        tokens = tokens[:510] + ['[SEP]']

    input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)]).to(device)

    # Arguments are passed by keyword on purpose: the second positional
    # parameter of BertModel.forward in `transformers` is `attention_mask`,
    # not `token_type_ids`. An all-ones mask attends to every token;
    # `token_type_ids` is left at its default (all zeros), which is the
    # convention for single-sentence input.
    attention_mask = torch.ones_like(input_ids)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    # outputs[0] is the last hidden state, indexed (batch, token, hidden);
    # batch item 0, token 0 is the [CLS] position.
    return outputs[0][0, 0].tolist()


for case, record in tqdm(train_it.items()):
    # One output line per sentence: space-separated embedding values rounded
    # to 5 decimals, a tab, then the matching entry of record['complete'].
    lines = []
    for idx, sentence in enumerate(record['sentences']):
        values = " ".join(str(round(v, 5)) for v in _cls_embedding(sentence))
        lines.append(values + "\t" + record['complete'][idx] + "\n")

    # Collect the lines in a list and write them in one go instead of growing
    # one large string with repeated `+=`.
    with open(os.path.join(out, case), "w", encoding="utf-8") as f:
        f.writelines(lines)