In [2]:
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
import pandas as pd

# Load Mental BERT

In [3]:
# Load model and tokenizer
model_name = "mental/mental-bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else 
                    "mps" if torch.backends.mps.is_available() else "cpu")
model = model.to(device)

tokenizer_config.json:   0%|          | 0.00/321 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/639 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
def get_embeddings(texts, batch_size=64):
    """Extract CLS token embeddings in batches"""
    embeddings = []
    
    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i+batch_size]
        
        # Tokenize and convert to tensors
        inputs = tokenizer(
            batch, 
            padding=True, 
            truncation=True, 
            max_length=512, 
            return_tensors="pt"
        ).to(device)
        
        # Get embeddings
        with torch.no_grad():
            outputs = model(**inputs)
        
        # Use CLS token as sentence embedding
        batch_embeddings = outputs.last_hidden_state[:,0,:].cpu().numpy()
        embeddings.append(batch_embeddings)
    
    return np.vstack(embeddings)

# Batch preprocessing of text

In [16]:
# Load preprocessed data
train_df = pd.read_csv('data/train_preprocessed_fill_missing.csv')
test_df = pd.read_csv('data/test_preprocessed_fill_missing.csv')


In [17]:
train_df.head()

Unnamed: 0,id,cleaned_text,target
0,11098,post remove request member hi welcome immediat...,suicidal-thoughts-and-self-harm
1,116,hi nmtb thank post think lot people terrify st...,anxiety
2,7189,hello cas fair anxiety depression work lot com...,anxiety
3,4350,hey everyone discover another mum 's sister de...,anxiety
4,9749,hi everyone guess title say really .. 28 year ...,depression


In [18]:

# Generate embeddings (preserve order)
print("Generating training embeddings...")
# train_embeddings = get_embeddings(train_df['cleaned_text'].tolist())

print("Generating test embeddings...")
test_embeddings = get_embeddings(test_df['cleaned_text'].tolist())

Generating training embeddings...


100%|██████████| 347/347 [12:20<00:00,  2.13s/it]


Generating test embeddings...


 87%|████████▋ | 34/39 [01:13<00:10,  2.15s/it]


TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]

# Save embeddings for later reuse

In [19]:
# Save embeddings with IDs for reference
np.save('data/train_embeddings.npy', train_embeddings)
# np.save('data/test_embeddings.npy', test_embeddings)

In [10]:
train_embeddings.shape

(22151, 768)

In [20]:
34 * 64

2176