#### Let's look at textbook semantics

#### Libraries

In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, LongformerModel
from pandas import DataFrame
import torch

### EDA

More about the data [here](https://huggingface.co/datasets/nampdn-ai/tiny-textbooks?row=43).

More about huggingface [datasets](https://huggingface.co/docs/datasets/index).

In [2]:
# Load the tiny-textbooks dataset from the Hugging Face Hub by its repository id.
dataset_id = "nampdn-ai/tiny-textbooks"
tiny_textbooks = load_dataset(dataset_id)

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

In [3]:
# Display the DatasetDict summary: a single `train` split with 420000 rows
# and the columns text, source, s, len, idx and textbook.
tiny_textbooks

DatasetDict({
    train: Dataset({
        features: ['text', 'source', 's', 'len', 'idx', 'textbook'],
        num_rows: 420000
    })
})

In [4]:
# Peek at one raw record (row index 4 of the train split) to see what the fields contain.
tiny_textbooks["train"][4]

{'text': 'An emerging clinical approach to treat substance abuse disorders involves a form of cognitive-behavioral therapy whereby addicts learn to reduce their reactivity to drug-paired stimuli through cue-exposure or extinction training. It is, however, unlikely that extinction training would be consistently effective as a stand- alone treatment in populations that have abused drugs long-term because the key memory systems that are recruited during extinction training are impaired by long-term drug use. There is a critical need to understand mechanisms underlying extinction learning and to establish viable strategies to increase the efficacy of extinction therapies for substance abuse disorders. Key elements of the proposed research plan build on recent advances made in the treatment of conditioned fear and anxiety and other cognitive disorders by pharmacological modulation of glycine, an obligatory co-transmitter at the NMDA glutamate receptor complex. The specific aims of the propo

In [5]:
# Tokenizer for the same Longformer checkpoint that is loaded as the model further down.
tokenizer = AutoTokenizer.from_pretrained(
    "allenai/longformer-base-4096",
)

In [6]:
# Tokenize the first training example to inspect the fields the tokenizer returns.
first_text = tiny_textbooks["train"][0]["text"]
tokenizer(first_text)

{'input_ids': [0, 495, 17334, 329, 29603, 368, 7897, 4, 11209, 1678, 6, 497, 1168, 6508, 4, 28489, 329, 111, 10, 32656, 6, 1044, 6, 2470, 19, 25330, 1270, 6, 208, 17201, 21115, 835, 9, 14904, 5318, 8, 9370, 647, 1044, 4, 91, 34, 823, 291, 107, 9, 647, 676, 23, 1337, 31944, 5033, 1389, 8, 258, 111, 739, 12, 8056, 758, 7260, 25, 157, 25, 400, 12784, 4, 5337, 9, 464, 36, 7110, 16018, 5206, 6, 210, 2835, 1258, 238, 2375, 12475, 8, 5731, 254, 9, 5, 3919, 4964, 4, 91, 6264, 2752, 708, 11, 5, 19261, 6522, 6, 61, 423, 4173, 566, 5, 275, 11, 5, 138, 4, 83, 704, 8, 11989, 6, 4296, 12095, 1800, 647, 893, 6, 21496, 1626, 1321, 7, 3042, 239, 819, 4, 91, 32829, 327, 39, 2655, 8, 676, 11, 5, 173, 23567, 21640, 19, 451, 25, 41, 2171, 32656, 8, 8298, 4, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

input_ids: the numbers representing the tokens in the text.

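token_type_ids: indicates which sequence a token belongs to if there is more than one sequence. (The Longformer tokenizer does not return this field, which is why it is absent from the output above.)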

attention_mask: indicates which positions the model should attend to: 1 for real tokens, 0 for padding tokens that should be ignored.

In [7]:
# Pretrained Longformer encoder used to turn tokenized text into hidden states.
model = LongformerModel.from_pretrained(
    'allenai/longformer-base-4096',
)

In [9]:
def batch_tokenize(batch):
    """Tokenize the ``'text'`` column of a batch with the notebook's ``tokenizer``.

    Args:
        batch: Mapping with a ``'text'`` entry holding the texts of the batch
            (this is what ``datasets`` passes when mapping with ``batched=True``).

    Returns:
        Whatever ``tokenizer`` returns for those texts when asked to truncate
        to at most 4096 tokens and to pad the texts of this batch.
    """
    texts = batch['text']
    tokenizer_options = {
        'max_length': 4096,   # upper bound on tokens kept per text
        'padding': True,      # pad within this batch
        'truncation': True,   # cut texts that exceed max_length
    }
    return tokenizer(texts, **tokenizer_options)

# Tokenize every split of the dataset, handing rows to batch_tokenize in batches.
tokenized_data = tiny_textbooks.map(
    batch_tokenize,
    batched=True,
)

Map:   0%|          | 0/420000 [00:00<?, ? examples/s]

In [10]:
# Function to get embeddings for a batch
def batch_to_embeddings(batch):
    """Embed a tokenized batch by mean-pooling the model's last hidden state.

    Only positions whose attention mask is 1 contribute to the average, so the
    padding added during tokenization does not dilute the embedding.

    Args:
        batch: Mapping with ``'input_ids'`` and ``'attention_mask'`` entries,
            each a list of equally long integer lists (one per row). Rows must
            share one length so they can be stacked into a rectangular tensor.

    Returns:
        ``{'embeddings': [...]}`` with one list of floats per row of the batch.
    """
    input_ids = torch.tensor(batch['input_ids'])
    attention_mask = torch.tensor(batch['attention_mask'])

    # Run through the model (inference only, so no gradients are tracked)
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    hidden_states = outputs.last_hidden_state

    # Masked mean over the sequence axis: zero out padded positions, then
    # divide by the number of real tokens instead of the padded length.
    token_mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
    summed = (hidden_states * token_mask).sum(dim=1)
    # clamp guards against division by zero for a row whose mask is all zeros
    token_counts = token_mask.sum(dim=1).clamp(min=1)

    embeddings = (summed / token_counts).cpu().numpy().tolist()
    return {'embeddings': embeddings}

# Embed every row in small batches. Without an explicit batch_size, `map`
# uses its default of 1000 rows per batch, i.e. 1000 padded sequences in a
# single forward pass. 8 divides 1000, so no batch straddles two of the
# separately padded groups created by the batched tokenization step, and
# every batch stays rectangular.
embeddings_data = tokenized_data.map(batch_to_embeddings, batched=True, batch_size=8)

Map:   0%|          | 0/420000 [00:00<?, ? examples/s]

: 