# Memory-Efficient Text Embedding

**Problem identified:**
- Max text length: 7,041,805 characters (7M!)
- GPU: 12.43 GB total (but ~10 GB was already in use)
- The embedding model is very large relative to the GPU memory that remains free

**Solutions:**
1. Restart kernel to clear GPU memory
2. Use CPU instead of GPU (slower but stable)
3. Truncate texts to reasonable length (512-2048 tokens)
4. Process with small batch size

In [1]:
import pandas as pd
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# # DIAGNOSTIC: Check memory requirements before running full embedding
# import pandas as pd
# import torch
# from transformers import AutoTokenizer, AutoModel
# import numpy as np

# # Load data
# input_path = "../derived/user_text_and_stances.csv"
# model_name = "Kingsoft-LLM/QZhou-Embedding"

# df = pd.read_csv(input_path)
# raw_texts = df["all_text"].fillna("").astype(str).tolist()

# print(f"Total texts: {len(raw_texts)}")
# print(f"\nText length statistics (characters):")
# text_lengths = [len(t) for t in raw_texts]
# print(f"  Min: {min(text_lengths):,}")
# print(f"  Max: {max(text_lengths):,}")
# print(f"  Mean: {np.mean(text_lengths):,.0f}")
# print(f"  Median: {np.median(text_lengths):,.0f}")

# # Check GPU memory if available
# if torch.cuda.is_available():
#     print(f"\nGPU: {torch.cuda.get_device_name(0)}")
#     print(f"Total GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
#     print(f"Currently allocated: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB")
#     print(f"Currently cached: {torch.cuda.memory_reserved(0) / 1e9:.2f} GB")
# else:
#     print("\nNo GPU available - using CPU")

# # Load tokenizer to check token counts
# print("\nLoading tokenizer to check token lengths...")
# tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# # Sample a few texts to check tokenized lengths
# sample_indices = [0, len(raw_texts)//2, len(raw_texts)-1] if len(raw_texts) > 2 else [0]
# print("\nSample tokenized lengths:")
# for idx in sample_indices:
#     tokens = tokenizer(raw_texts[idx], return_tensors="pt", truncation=False)
#     print(f"  Text {idx}: {tokens['input_ids'].shape[1]:,} tokens")

# # Estimate memory for model
# print("\nLoading model to check memory footprint...")
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# model.to(device)

# if torch.cuda.is_available():
#     torch.cuda.empty_cache()
#     torch.cuda.reset_peak_memory_stats(0)
    
#     # Test with one short text
#     test_text = ["This is a test."]
#     test_input = tokenizer(test_text, padding=True, truncation=True, max_length=512, return_tensors="pt")
#     test_input = {k: v.to(device) for k, v in test_input.items()}
    
#     with torch.no_grad():
#         _ = model(**test_input)
    
#     peak_memory = torch.cuda.max_memory_allocated(0) / 1e9
#     print(f"\nModel + short text (512 tokens) peak memory: {peak_memory:.2f} GB")
    
#     # Estimate for max_length=8192
#     estimated_8192 = peak_memory * (8192 / 512) * 1.5  # Rough scaling factor
#     print(f"Estimated memory for 8192 tokens: {estimated_8192:.2f} GB")
    
#     available = torch.cuda.get_device_properties(0).total_memory / 1e9
#     print(f"\nAvailable GPU memory: {available:.2f} GB")
#     if estimated_8192 > available * 0.8:
#         print("⚠️ WARNING: Texts may be too long for your GPU with max_length=8192")
#         suggested_max_len = int(512 * (available * 0.8 / peak_memory))
#         print(f"Suggested max_length: {suggested_max_len}")

# del model
# if torch.cuda.is_available():
#     torch.cuda.empty_cache()

In [3]:
# STEP 1: Release GPU memory left behind by earlier work in this kernel
import gc
import torch

# Run Python's garbage collector before touching the CUDA cache
gc.collect()

if not torch.cuda.is_available():
    print("No GPU available")
else:
    # Clear PyTorch's GPU cache
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()
    print("GPU cache cleared")

    # Report what device 0 still holds after the cleanup
    print(f"GPU memory allocated: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB")
    print(f"GPU memory cached: {torch.cuda.memory_reserved(0) / 1e9:.2f} GB")

    # More than 1 GB still reserved: suggest a kernel restart
    still_reserved_gb = torch.cuda.memory_reserved(0) / 1e9
    if still_reserved_gb > 1.0:
        print("\n⚠️ WARNING: GPU still has cached memory.")
        print("Consider: Kernel > Restart Kernel to fully clear GPU memory")

GPU cache cleared
GPU memory allocated: 0.00 GB
GPU memory cached: 0.00 GB


In [4]:
from nltk import word_tokenize

In [5]:
# --- File locations and model identifier ---
input_path = "../derived/user_text_and_stances.csv"
model_name = "Kingsoft-LLM/QZhou-Embedding"
output_path = "../derived/user_text_and_stances_qzhou_embedding_Kingsoft-LLM_QZhou-Embedding.csv"

# --- Run settings, tuned for the memory constraints ---
batch_size = 1
max_length = 512  # token cap per text; lowered from 8192, critical for memory
save_every_n_batches = 50  # write results to disk frequently
use_cpu = True  # switch to False to try the GPU (may crash)

# --- Read the input table ---
df = pd.read_csv(input_path)

# Missing texts become empty strings and every entry is coerced to str
text_column = df["all_text"].fillna("").astype(str)
raw_texts = text_column.tolist()
user_ids = df["user_id"].tolist()

print(f"Loaded {len(raw_texts)} texts.")

Loaded 20314 texts.


In [6]:
# Word-level length statistics for the raw texts, using word_tokenize
progress = tqdm(raw_texts, desc="Counting words")
word_counts = list(map(len, map(word_tokenize, progress)))

max_words = max(word_counts)
avg_words = sum(word_counts) / len(word_counts)

print(f"Max text length: {max_words:,} words")
print(f"Average text length: {avg_words:.1f} words")

Counting words: 100%|██████████| 20314/20314 [05:02<00:00, 67.17it/s] 

Max text length: 1,375,556 words
Average text length: 7056.1 words





In [7]:
# Load the tokenizer and the embedding model named in the config cell
print(f"Loading model {model_name}...")
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left", trust_remote_code=True)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)

# Pick the device: CPU when forced via use_cpu, otherwise CUDA if present
if not use_cpu:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
else:
    device = torch.device("cpu")
    print("Using CPU (slower but stable)")

# Move the weights to the chosen device and switch to evaluation mode
model.to(device)
model.eval()

def mean_pool(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """Average the hidden states of the attended tokens of each sequence.

    Args:
        last_hidden_states: Tensor indexed as (batch, sequence, hidden).
        attention_mask: Tensor indexed as (batch, sequence); non-zero entries
            mark real tokens, zeros mark padding.

    Returns:
        Tensor of shape (batch, hidden) holding the per-sequence mean over
        attended positions. A sequence with no attended position yields a
        zero vector.

    Notes:
        The previous implementation sliced the last ``length`` positions,
        which is only right for left-padded batches, and for ``length == 0``
        the slice ``-0:`` selected the whole sequence and then divided by
        zero. Masking handles both padding sides and the empty case.
    """
    # (batch, sequence, 1) boolean mask, broadcast over the hidden dimension
    keep = attention_mask.unsqueeze(-1).bool()

    # masked_fill (not multiplication) so that NaN/inf values sitting in padded
    # slots cannot leak into the sum
    summed = last_hidden_states.masked_fill(~keep, 0.0).sum(dim=1)

    # Clamp to 1 so an all-padding row divides a zero sum by 1 instead of by 0
    token_counts = attention_mask.sum(dim=-1, keepdim=True).clamp(min=1)

    return summed / token_counts

Loading model Kingsoft-LLM/QZhou-Embedding...


Loading checkpoint shards: 100%|██████████| 3/3 [00:01<00:00,  1.78it/s]

Using CPU (slower but stable)





In [8]:
# Embed every text, writing results to output_path in chunks along the way
print("Starting embedding process...")
print(f"Processing {len(raw_texts)} texts with batch_size={batch_size}, max_length={max_length}")

n_texts = len(raw_texts)
batch_starts = range(0, n_texts, batch_size)

# Rows accumulated since the last write to disk
temp_embeddings = []
temp_user_ids = []
first_save = True  # the first write creates the file and emits the header

for batch_idx, i in enumerate(tqdm(batch_starts, desc="Embedding batches")):
    stop = i + batch_size
    batches_done = batch_idx + 1
    batch = raw_texts[i:stop]
    batch_user_ids = user_ids[i:stop]

    # Tokenize the slice, truncating each text to max_length tokens
    encoded = tokenizer(
        batch,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    batch_dict = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Forward pass without gradients (use_cache=False to avoid compatibility issues)
    with torch.no_grad():
        outputs = model(**batch_dict, use_cache=False)

    # Pool the hidden states per text, then L2-normalize each vector
    pooled = mean_pool(outputs.last_hidden_state, batch_dict["attention_mask"])
    batch_embeds = F.normalize(pooled, p=2, dim=1)

    # Buffer the results as plain Python lists
    temp_embeddings.extend(batch_embeds.cpu().tolist())
    temp_user_ids.extend(batch_user_ids)

    if device.type == "cuda":
        torch.cuda.empty_cache()

    # Only write on every save_every_n_batches-th batch or on the final batch
    reached_end = stop >= n_texts
    if batches_done % save_every_n_batches != 0 and not reached_end:
        continue

    # First write overwrites and includes the header; later writes append
    write_mode, write_header = ("w", True) if first_save else ("a", False)
    temp_df = pd.DataFrame(
        {
            "user_id": temp_user_ids,
            "qzhou_embedding": temp_embeddings,
        }
    )
    temp_df.to_csv(output_path, mode=write_mode, header=write_header, index=False)

    # Start fresh buffers for the next chunk
    temp_embeddings = []
    temp_user_ids = []
    first_save = False

    if batches_done % (save_every_n_batches * 5) == 0:
        print(f"Progress: {min(i + batch_size, len(raw_texts))} / {len(raw_texts)} texts")

print("\nEmbedding process completed.")
print(f"Saved embeddings to {output_path}")
print(f"\nNote: Texts were truncated to {max_length} tokens.")

Starting embedding process...
Processing 20314 texts with batch_size=1, max_length=512


Embedding batches:   0%|          | 5/20314 [00:54<61:07:11, 10.83s/it]


KeyboardInterrupt: 

In [None]:
# # DIAGNOSTIC: Check memory requirements before running full embedding
# import pandas as pd
# import torch
# from transformers import AutoTokenizer, AutoModel
# import numpy as np

# # Load data
# input_path = "../derived/user_text_and_stances.csv"
# model_name = "Kingsoft-LLM/QZhou-Embedding"

# df = pd.read_csv(input_path)
# raw_texts = df["all_text"].fillna("").astype(str).tolist()

# print(f"Total texts: {len(raw_texts)}")
# print(f"\nText length statistics (characters):")
# text_lengths = [len(t) for t in raw_texts]
# print(f"  Min: {min(text_lengths):,}")
# print(f"  Max: {max(text_lengths):,}")
# print(f"  Mean: {np.mean(text_lengths):,.0f}")
# print(f"  Median: {np.median(text_lengths):,.0f}")

# # Check GPU memory if available
# if torch.cuda.is_available():
#     print(f"\nGPU: {torch.cuda.get_device_name(0)}")
#     print(f"Total GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
#     print(f"Currently allocated: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB")
#     print(f"Currently cached: {torch.cuda.memory_reserved(0) / 1e9:.2f} GB")
# else:
#     print("\nNo GPU available - using CPU")

# # Load tokenizer to check token counts
# print("\nLoading tokenizer to check token lengths...")
# tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# # Sample a few texts to check tokenized lengths
# sample_indices = [0, len(raw_texts)//2, len(raw_texts)-1] if len(raw_texts) > 2 else [0]
# print("\nSample tokenized lengths:")
# for idx in sample_indices:
#     tokens = tokenizer(raw_texts[idx], return_tensors="pt", truncation=False)
#     print(f"  Text {idx}: {tokens['input_ids'].shape[1]:,} tokens")

# # Estimate memory for model
# print("\nLoading model to check memory footprint...")
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# model.to(device)

# if torch.cuda.is_available():
#     torch.cuda.empty_cache()
#     torch.cuda.reset_peak_memory_stats(0)
    
#     # Test with one short text
#     test_text = ["This is a test."]
#     test_input = tokenizer(test_text, padding=True, truncation=True, max_length=512, return_tensors="pt")
#     test_input = {k: v.to(device) for k, v in test_input.items()}
    
#     with torch.no_grad():
#         _ = model(**test_input)
    
#     peak_memory = torch.cuda.max_memory_allocated(0) / 1e9
#     print(f"\nModel + short text (512 tokens) peak memory: {peak_memory:.2f} GB")
    
#     # Estimate for max_length=8192
#     estimated_8192 = peak_memory * (8192 / 512) * 1.5  # Rough scaling factor
#     print(f"Estimated memory for 8192 tokens: {estimated_8192:.2f} GB")
    
#     available = torch.cuda.get_device_properties(0).total_memory / 1e9
#     print(f"\nAvailable GPU memory: {available:.2f} GB")
#     if estimated_8192 > available * 0.8:
#         print("⚠️ WARNING: Texts may be too long for your GPU with max_length=8192")
#         suggested_max_len = int(512 * (available * 0.8 / peak_memory))
#         print(f"Suggested max_length: {suggested_max_len}")

# del model
# if torch.cuda.is_available():
#     torch.cuda.empty_cache()

In [None]:
# # FIXED VERSION with realistic settings for your GPU
# import pandas as pd
# import torch
# import torch.nn.functional as F
# from transformers import AutoTokenizer, AutoModel
# from tqdm.auto import tqdm
# import gc

# # Paths
# input_path = "../derived/user_text_and_stances.csv"
# model_name = "Kingsoft-LLM/QZhou-Embedding"
# output_path = "../derived/user_text_and_stances_qzhou_embedding_Kingsoft-LLM_QZhou-Embedding.csv"

# # Settings - ADJUSTED FOR YOUR GPU
# batch_size = 1
# max_length = 512  # Reduced from 8192 - this is critical!
# save_every_n_batches = 50  # Save more frequently

# print("IMPORTANT: Clearing GPU memory...")
# torch.cuda.empty_cache()
# gc.collect()

# # Load data
# df = pd.read_csv(input_path)
# raw_texts = df["all_text"].fillna("").astype(str).tolist()
# user_ids = df["user_id"].tolist()

# print(f"Loaded {len(raw_texts)} texts.")
# print(f"Using max_length={max_length} tokens (texts will be truncated if longer)")

# # Tokenizer and model
# print("Loading tokenizer...")
# tokenizer = AutoTokenizer.from_pretrained(
#     model_name,
#     padding_side="left",
#     trust_remote_code=True,
# )

# print("Loading model... (this uses ~10GB GPU memory)")
# model = AutoModel.from_pretrained(
#     model_name,
#     trust_remote_code=True,
#     torch_dtype=torch.float16,  # Use half precision to save memory
# )

# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# model.to(device)
# model.eval()

# if torch.cuda.is_available():
#     print(f"GPU memory after loading model: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB")

# def mean_pool(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
#     seq_lengths = attention_mask.sum(dim=-1)
#     return torch.stack(
#         [
#             last_hidden_states[i, -length:, :].sum(dim=0) / length
#             for i, length in enumerate(seq_lengths)
#         ],
#         dim=0,
#     )

# # Embed with aggressive memory management
# print(f"Using device: {device}")
# print("Starting embedding process...")

# temp_embeddings = []
# temp_user_ids = []
# first_save = True
# error_count = 0

# for batch_idx, i in enumerate(tqdm(range(0, len(raw_texts), batch_size), desc="Embedding batches")):
#     try:
#         batch = raw_texts[i:i + batch_size]
#         batch_user_ids = user_ids[i:i + batch_size]
        
#         batch_dict = tokenizer(
#             batch,
#             padding=True,
#             truncation=True,
#             max_length=max_length,
#             return_tensors="pt",
#         )
#         batch_dict = {k: v.to(device) for k, v in batch_dict.items()}
        
#         with torch.no_grad():
#             outputs = model(**batch_dict)
        
#         batch_embeds = mean_pool(outputs.last_hidden_state, batch_dict["attention_mask"])
#         batch_embeds = F.normalize(batch_embeds, p=2, dim=1)
        
#         # Move to CPU immediately
#         temp_embeddings.extend(batch_embeds.cpu().tolist())
#         temp_user_ids.extend(batch_user_ids)
        
#         # Aggressive cleanup
#         del batch_dict, outputs, batch_embeds
#         if device.type == "cuda":
#             torch.cuda.empty_cache()
        
#         # Save incrementally
#         if (batch_idx + 1) % save_every_n_batches == 0 or i + batch_size >= len(raw_texts):
#             temp_df = pd.DataFrame({
#                 "user_id": temp_user_ids,
#                 "qzhou_embedding": temp_embeddings,
#             })
            
#             temp_df.to_csv(
#                 output_path, 
#                 mode='w' if first_save else 'a', 
#                 header=first_save, 
#                 index=False
#             )
            
#             print(f"✓ Saved checkpoint: {min(i + batch_size, len(raw_texts))} / {len(raw_texts)} texts")
            
#             # Clear temporary storage
#             temp_embeddings = []
#             temp_user_ids = []
#             first_save = False
#             gc.collect()
    
#     except torch.cuda.OutOfMemoryError:
#         error_count += 1
#         print(f"⚠️ OOM error at text {i}, skipping...")
#         if device.type == "cuda":
#             torch.cuda.empty_cache()
#         gc.collect()
#         if error_count > 10:
#             print("Too many errors, stopping.")
#             break

# print("Embedding process completed.")
# print(f"Saved embeddings to {output_path}")
# print(f"Errors encountered: {error_count}")

# # Final cleanup
# del model, tokenizer
# if torch.cuda.is_available():
#     torch.cuda.empty_cache()
# gc.collect()