In [1]:
from transformers import AutoModel, AutoTokenizer
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Choose a pre-trained model (e.g., 'aubmindlab/bert-base-arabert')
model_name = 'aubmindlab/bert-base-arabert'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.eval()  # make inference mode explicit so dropout cannot perturb the embeddings

# Seed PyTorch's RNG: the projection layer created below is randomly
# initialised, so without a seed the printed embedding changes on every run.
torch.manual_seed(42)

# Example Arabic sentence
sentence = "التضمين السياقي يلتقط غنى اللغة."

# Tokenize and obtain contextual embeddings.
# The encoder is used purely as a feature extractor here, so its forward pass
# runs under no_grad to avoid building an autograd graph for the whole model.
tokens = tokenizer(sentence, return_tensors='pt')
with torch.no_grad():
    outputs = model(**tokens)

# The last layer hidden states often contain contextual embeddings
contextual_embeddings = outputs.last_hidden_state

# Apply global average pooling (GAP) along the sequence dimension (dim=1).
# The mean is weighted by the attention mask so that padding positions are
# ignored; for a single unpadded sentence the mask is all ones and this is
# the same as a plain torch.mean over dim=1.
attention_mask = tokens['attention_mask'].unsqueeze(-1).to(contextual_embeddings.dtype)
token_count = attention_mask.sum(dim=1).clamp(min=1)  # guard against division by zero
global_avg_pooled_embedding = (contextual_embeddings * attention_mask).sum(dim=1) / token_count

# If necessary, apply linear transformation to get embeddings of size 100.
# NOTE: this layer is freshly initialised and untrained, so at this point it is
# only a (seeded) random projection of the pooled encoder output. It is kept
# outside no_grad so it stays trainable.
desired_embedding_size = 100
linear_layer = torch.nn.Linear(global_avg_pooled_embedding.size(-1), desired_embedding_size)
transformed_embedding = linear_layer(global_avg_pooled_embedding)

# Apply activation function (e.g., ReLU); negative components become 0
transformed_embedding = torch.relu(transformed_embedding)

# For demonstration, let's print the transformed embedding
print("Transformed Embedding:")
print(transformed_embedding)

Downloading tokenizer_config.json: 100%|██████████| 637/637 [00:00<00:00, 160kB/s]
Downloading config.json: 100%|██████████| 578/578 [00:00<00:00, 196kB/s]
Downloading vocab.txt: 100%|██████████| 717k/717k [00:00<00:00, 1.30MB/s]
Downloading tokenizer.json: 100%|██████████| 2.26M/2.26M [00:01<00:00, 1.48MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 56.0kB/s]
Downloading model.safetensors: 100%|██████████| 543M/543M [05:37<00:00, 1.61MB/s] 


Transformed Embedding:
tensor([[0.2089, 0.1642, 0.4452, 0.1068, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.3230, 0.0000, 0.1015, 0.0000, 0.0115, 0.0068, 0.2136, 0.3448,
         0.0000, 0.0049, 0.0000, 0.0000, 0.0000, 0.4373, 0.0000, 0.0000, 0.0000,
         0.2356, 0.0718, 0.2735, 0.1882, 0.3006, 0.2955, 0.0000, 0.2480, 0.1132,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.3874, 0.4874, 0.1143, 0.1500,
         0.0000, 0.0000, 0.0000, 0.4013, 0.0000, 0.0000, 0.0224, 0.0555, 0.0000,
         0.1533, 0.2097, 0.0000, 0.0379, 0.0000, 0.0000, 0.1200, 0.1110, 0.4712,
         0.0000, 0.0000, 0.0000, 0.3777, 0.3037, 0.0000, 0.2800, 0.0000, 0.0000,
         0.2735, 0.1855, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.3157, 0.0019, 0.2251, 0.1862, 0.0000, 0.0000, 0.1194,
         0.0000, 0.0257, 0.2784, 0.0272, 0.0483, 0.2141, 0.0000, 0.0000, 0.0000,
         0.0000]], grad_fn=<ReluBackward0>)


In [6]:
# Example Arabic sentence: the same text as the earlier example, but with the
# spaces between words removed.
sentence = "التضمينالسياقييلتقطغنىاللغة."

# Tokenize and obtain contextual embeddings.
# `tokenizer` and `model` come from the earlier model-loading cell. The encoder
# is only used for feature extraction, so skip autograd for its forward pass.
tokens = tokenizer(sentence, return_tensors='pt')
with torch.no_grad():
    outputs = model(**tokens)

# The last layer hidden states often contain contextual embeddings
contextual_embeddings = outputs.last_hidden_state

# Apply global average pooling (GAP) along the sequence dimension (dim=1),
# weighted by the attention mask so padding positions would be ignored; for a
# single unpadded sentence this equals a plain torch.mean over dim=1.
attention_mask = tokens['attention_mask'].unsqueeze(-1).to(contextual_embeddings.dtype)
token_count = attention_mask.sum(dim=1).clamp(min=1)  # guard against division by zero
global_avg_pooled_embedding = (contextual_embeddings * attention_mask).sum(dim=1) / token_count

# Project to the reduced embedding size with the SAME `linear_layer` that was
# created for the first sentence. Building a new torch.nn.Linear here would
# draw different random weights, so the two sentences would be mapped through
# different projections and their embeddings could not be compared.
transformed_embedding = linear_layer(global_avg_pooled_embedding)

# Apply activation function (e.g., ReLU)
transformed_embedding = torch.relu(transformed_embedding)

# For demonstration, print the shape of the transformed embedding
print("Transformed Embedding:")
print(transformed_embedding.shape)

Transformed Embedding:
torch.Size([1, 100])
