In [None]:
# Term-to-Type RAG Implementation


import json
import pandas as pd
import numpy as np
import os
import torch
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
import gc
from typing import List, Dict, Any
import shutil

import json
import os
from typing import List, Dict, Any


In [None]:
def load_terms2types_data(file_path: str) -> List[Dict[str, Any]]:
    """Read a terms2types JSON file and return its parsed content.

    Args:
        file_path: Path of the JSON file; it is opened as UTF-8 text.

    Returns:
        Whatever ``json`` decodes from the file (annotated as a list of dicts).
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

def load_taskb_test_data(file_path: str) -> List[Dict[str, Any]]:
    """Read a TaskB test-set file, which is JSON (not plain text), and return it parsed.

    Args:
        file_path: Path of the JSON file; it is opened as UTF-8 text.

    Returns:
        Whatever ``json`` decodes from the file (annotated as a list of dicts).
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

# Valid domains for TaskB
domains = ['MatOnto', 'OBI', 'SWEET']

# Per-domain containers filled below and consumed by the later cells.
train_data = {}
test_data = {}

print("Loading training data...")
for domain in domains:
    # NOTE(review): the train path is absolute while the test path below is relative to the
    # kernel's working directory; both are kept as-is because the intended cwd is not visible here.
    train_path = f'/home/jovyan/rahmatullaev/rand_exps/LLMs4OL-Challenge/2025/TaskB-TermTyping/{domain}/train/term_typing_train_data.json'
    train_data[domain] = load_terms2types_data(train_path)
    print(f"{domain} train: {len(train_data[domain])} examples")

print("\nLoading test data...")
for domain in domains:
    # Test files are named with the lowercase domain.
    domain_lower = domain.lower()
    test_path = f'../../../2025/TaskB-TermTyping/{domain}/test/{domain_lower}_term_typing_test_data.json'

    if os.path.exists(test_path):
        test_json_data = load_taskb_test_data(test_path)
        # Convert to the same {"term", "types"} shape as the train items; test types are unknown.
        test_data[domain] = [{"term": item["term"], "types": []} for item in test_json_data]
        print(f"{domain} test: {len(test_data[domain])} terms")
    else:
        print(f"❌ Test file not found: {test_path}")

# Only claim success when every domain actually has test data; a missing test file is
# tolerated above, so the summary has to say which domains were skipped.
missing_test_domains = [name for name in domains if name not in test_data]
if missing_test_domains:
    print(f"\n⚠️ TaskB data loaded, but test data is missing for: {missing_test_domains}")
else:
    print("\n✅ TaskB data loaded successfully!")
print(f"Train domains: {list(train_data.keys())}")
print(f"Test domains: {list(test_data.keys())}")

# Show one example per split (first domain that has any data).
print("\n📊 Data examples:")
for domain in domains:
    if domain in train_data and train_data[domain]:
        print(f"\n{domain} train example:")
        print(json.dumps(train_data[domain][0], indent=2, ensure_ascii=False))
        break

for domain in domains:
    if domain in test_data and test_data[domain]:
        print(f"\n{domain} test example:")
        print(json.dumps(test_data[domain][0], indent=2, ensure_ascii=False))
        break

In [None]:
# 2. Embedding Model Initialization (Qwen3-Embedding)

def last_token_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    """Pick, for every sequence in the batch, the hidden state of its last attended token.

    Args:
        last_hidden_states: Hidden states indexed as [batch, position, ...].
        attention_mask: Mask indexed as [batch, position]; summed to count attended tokens.

    Returns:
        One hidden-state vector per batch row.
    """
    num_rows = attention_mask.shape[0]

    # If the final position is attended in every row, the batch is left-padded and the
    # last position already holds each sequence's last token.
    if attention_mask[:, -1].sum() == num_rows:
        return last_hidden_states[:, -1]

    # Otherwise index each row at (number of attended tokens - 1).
    last_positions = attention_mask.sum(dim=1) - 1
    row_ids = torch.arange(last_hidden_states.shape[0], device=last_hidden_states.device)
    return last_hidden_states[row_ids, last_positions]

def get_detailed_instruct(task_description: str, query: str) -> str:
    """Build the two-line prompt: an ``Instruct:`` line followed by a ``Query:`` line."""
    instruct_line = f'Instruct: {task_description}'
    query_line = f'Query: {query}'
    return '\n'.join((instruct_line, query_line))

# Single place for the checkpoint id so tokenizer and model cannot drift apart.
EMBEDDING_MODEL_ID = 'Qwen/Qwen3-Embedding-4B'

print("Initializing the embedding model...")

# Left padding keeps every sequence's last token at the final position,
# which is the case last_token_pool checks for first.
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_ID, padding_side='left')

# Load the weights in bfloat16 with FlashAttention-2, then move the model to the GPU.
model = AutoModel.from_pretrained(
    EMBEDDING_MODEL_ID,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
)
model = model.cuda()

print("Model loaded successfully!")

In [None]:
# 3. Functions for Generating Embeddings

def get_embeddings_batch(texts: List[str], model, tokenizer, max_length=8192, batch_size=8):
    """Embed ``texts`` batch by batch and return all embeddings as one CPU tensor.

    Args:
        texts: Input strings to embed.
        model: Model called as ``model(**inputs)``; must expose ``.device`` and return
            an object with ``last_hidden_state``.
        tokenizer: Callable tokenizer returning a dict of tensors that includes
            ``attention_mask``.
        max_length: Truncation length passed to the tokenizer.
        batch_size: Number of texts per forward pass.

    Returns:
        The per-batch embeddings (pooled with ``last_token_pool`` and L2-normalised along
        dim 1) concatenated along dim 0.
    """
    chunks = []
    batch_starts = range(0, len(texts), batch_size)

    for start in tqdm(batch_starts, desc="Processing batches"):
        # Tokenize this slice and move every tensor to the model's device.
        encoded = tokenizer(
            texts[start:start + batch_size],
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt",
        )
        encoded = {name: tensor.to(model.device) for name, tensor in encoded.items()}

        # Inference only: pool, normalise, and keep the result on the CPU.
        with torch.no_grad():
            hidden = model(**encoded).last_hidden_state
            pooled = last_token_pool(hidden, encoded['attention_mask'])
            pooled = F.normalize(pooled, p=2, dim=1)
            chunks.append(pooled.cpu())

        # Drop the per-batch tensors and release cached GPU memory before the next batch.
        del encoded, hidden, pooled
        torch.cuda.empty_cache()
        gc.collect()

    return torch.cat(chunks, dim=0)

def compute_similarity_matrix(embeddings1, embeddings2, batch_size=100):
    """Fill an (n1, n2) matrix of dot products between two embedding sets, tile by tile.

    Args:
        embeddings1: 2-D tensor with n1 rows.
        embeddings2: 2-D tensor with n2 rows and the same width as ``embeddings1``.
        batch_size: Tile edge length used along both axes.

    Returns:
        A tensor created with ``torch.zeros(n1, n2)`` whose entry (i, j) is the dot
        product of row i of ``embeddings1`` and row j of ``embeddings2``.
    """
    rows, cols = embeddings1.shape[0], embeddings2.shape[0]
    scores = torch.zeros(rows, cols)
    col_starts = range(0, cols, batch_size)

    for row_lo in tqdm(range(0, rows, batch_size), desc="Computing similarity"):
        row_hi = min(row_lo + batch_size, rows)
        left_tile = embeddings1[row_lo:row_hi]

        for col_lo in col_starts:
            col_hi = min(col_lo + batch_size, cols)
            right_tile = embeddings2[col_lo:col_hi]
            # Dot products of this row tile against this column tile.
            scores[row_lo:row_hi, col_lo:col_hi] = torch.mm(left_tile, right_tile.T)

    return scores

# Progress marker only: printed once the two helper definitions above have been executed.
print("Embedding utility functions are ready!")


In [None]:
# 4. Domain Processing: Generate Embeddings and Similarity Matrices

def process_domain_embeddings(domain: str, train_data: List[Dict], test_data: List[Dict] = None):
    """Embed one domain's terms and write its similarity matrices as CSV files.

    Uses the notebook-level ``model`` and ``tokenizer``. Two files may be written under
    ``../../../src/taskB/method_v1_1``:

    * ``{domain}_train_train_scores.csv`` - train x train similarities (always).
    * ``{domain}_test_train_scores.csv`` - test x train similarities (only when
      ``test_data`` is non-empty).

    Args:
        domain: Domain name; used in log messages and output file names.
        train_data: Items with ``'term'`` and ``'types'`` keys.
        test_data: Optional items with a ``'term'`` key; ``None`` or empty skips the test stage.
    """
    print(f"\n=== Processing domain: {domain} ===")

    # Instruction prepended to every text before embedding.
    instruction = "Given a term, find similar terms that have related semantic types or categories."

    # Create the output directory up front so a missing folder fails (or is fixed)
    # before the expensive embedding pass rather than at to_csv time.
    output_dir = '../../../src/taskB/method_v1_1'
    os.makedirs(output_dir, exist_ok=True)

    # Train texts carry both the term and its types.
    train_terms = [item['term'] for item in train_data]
    train_texts_with_types = [f"Term: {item['term']}, Types: {', '.join(item['types'])}" for item in train_data]
    train_texts_instruct = [get_detailed_instruct(instruction, text) for text in train_texts_with_types]

    print(f"Train: {len(train_terms)} terms")

    print("Generating train embeddings...")
    train_embeddings = get_embeddings_batch(train_texts_instruct, model, tokenizer, batch_size=8)

    print("Computing Train/Train similarity matrix...")
    train_train_scores = compute_similarity_matrix(train_embeddings, train_embeddings, batch_size=100)

    # Rows and columns are both labelled with the train terms.
    train_train_df = pd.DataFrame(
        train_train_scores.numpy(),
        columns=train_terms,
        index=train_terms
    )
    train_train_path = f'{output_dir}/{domain}_train_train_scores.csv'
    train_train_df.to_csv(train_train_path)
    print(f"Saved: {train_train_path}")

    # The train/train matrix is not needed for the test stage; release it before
    # allocating the test/train matrix to lower peak memory.
    del train_train_scores, train_train_df
    gc.collect()

    if test_data:
        # Test items have no types, so only the term is embedded.
        test_terms = [item['term'] for item in test_data]
        test_texts_with_types = [f"Term: {item['term']}" for item in test_data]
        test_texts_instruct = [get_detailed_instruct(instruction, text) for text in test_texts_with_types]

        print(f"Test: {len(test_terms)} terms")

        print("Generating test embeddings...")
        test_embeddings = get_embeddings_batch(test_texts_instruct, model, tokenizer, batch_size=8)

        print("Computing Test/Train similarity matrix...")
        test_train_scores = compute_similarity_matrix(test_embeddings, train_embeddings, batch_size=100)

        # Rows are test terms, columns are train terms.
        test_train_df = pd.DataFrame(
            test_train_scores.numpy(),
            columns=train_terms,
            index=test_terms
        )
        test_train_path = f'{output_dir}/{domain}_test_train_scores.csv'
        test_train_df.to_csv(test_train_path)
        print(f"Saved: {test_train_path}")

        del test_embeddings, test_train_scores, test_train_df

    # Free up memory before the next domain.
    del train_embeddings
    torch.cuda.empty_cache()
    gc.collect()

    print(f"Finished processing domain: {domain}")

# Process all domains
for domain in ['MatOnto', 'OBI', 'SWEET']:
    # The loading cell skips domains whose test file is missing, so test_data may lack
    # the key; .get() passes None and process_domain_embeddings then skips the test stage.
    process_domain_embeddings(domain, train_data[domain], test_data.get(domain))

In [None]:
import json
import pandas as pd
import os
from tqdm import tqdm
import copy

def _select_rag_examples(scores_df, term, train_lookup, top_k=10, exclude_self=True, label="term"):
    """Return deep copies of the ``top_k`` best-scoring train items for ``term``.

    Candidates are the column labels of ``term``'s row in ``scores_df``, visited in
    descending score order. Labels missing from ``train_lookup`` are skipped *before*
    counting towards ``top_k``; this matters because ``pandas.read_csv`` renames repeated
    column headers (``x`` -> ``x.1``), and such labels never match a train term.

    Args:
        scores_df: Similarity frame; rows are query terms, columns are train terms.
        term: Query term (row label).
        train_lookup: Mapping from train term to its clean item (no ``'RAG'`` key).
        top_k: Maximum number of examples to return.
        exclude_self: Drop the column labelled ``term`` before ranking.
        label: Word used in the error message to describe the query ("term" / "test term").

    Returns:
        A list of at most ``top_k`` item copies; empty if ``term`` has no row or ranking fails.
    """
    if term not in scores_df.index:
        return []
    try:
        scores_row = scores_df.loc[term]

        # A repeated row label yields a DataFrame; use its first row.
        if isinstance(scores_row, pd.DataFrame):
            scores_row = scores_row.iloc[0]

        if exclude_self and term in scores_row.index:
            scores_row = scores_row.drop(term)

        examples = []
        for candidate in scores_row.sort_values(ascending=False).index:
            if len(examples) >= top_k:
                break
            if candidate in train_lookup:
                # Deep copy so the stored examples share nothing with the lookup.
                examples.append(copy.deepcopy(train_lookup[candidate]))
        return examples
    except Exception as e:
        # Best effort: report and continue with no examples for this term.
        print(f"Error processing {label} '{term}': {e}")
        return []


def create_taskb_terms2types_with_rag_fixed(
    data_root: str = '/home/jovyan/rahmatullaev/rand_exps/LLMs4OL-Challenge/2025/TaskB-TermTyping',
    scores_root: str = '/home/jovyan/rahmatullaev/rand_exps/LLMs4OL-Challenge/src/taskB/method_v1_1',
    domains=('MatOnto', 'OBI', 'SWEET'),
    top_k: int = 10,
):
    """Attach a ``'RAG'`` list of similar train items to every TaskB train and test item.

    For each domain this function:

    1. loads ``{data_root}/{domain}/train/term_typing_train_data.json``;
    2. loads ``{scores_root}/{domain}_train_train_scores.csv`` and gives each train item
       its ``top_k`` most similar *other* train items;
    3. overwrites the train JSON in place with the added ``'RAG'`` field;
    4. if both the test JSON and ``{domain}_test_train_scores.csv`` exist, writes
       ``{data_root}/{domain}/test/terms2types.json`` with ``term``, empty ``types`` and
       the ``top_k`` most similar train items.

    RAG entries are copies of train items with any existing ``'RAG'`` key removed, so the
    output never nests RAG lists and re-running does not grow the files.

    Args:
        data_root: Directory containing one sub-directory per domain.
        scores_root: Directory containing the similarity CSV files.
        domains: Domain names to process.
        top_k: Number of RAG examples per item.
    """
    for domain in domains:
        print(f"\n=== Processing {domain} ===")

        # Load TaskB train data
        train_path = f'{data_root}/{domain}/train/term_typing_train_data.json'
        if not os.path.exists(train_path):
            print(f"Train data not found: {train_path}")
            continue

        with open(train_path, 'r', encoding='utf-8') as f:
            train_data_domain = json.load(f)

        # Lookup of clean items (WITHOUT the RAG field, to avoid nested/cyclic references).
        # For repeated terms the last occurrence wins.
        train_dict = {
            item['term']: {k: v for k, v in item.items() if k != 'RAG'}
            for item in train_data_domain
        }

        # Load similarity matrix
        scores_path = f'{scores_root}/{domain}_train_train_scores.csv'
        if not os.path.exists(scores_path):
            print(f"Similarity matrix not found: {scores_path}")
            continue

        # NOTE(review): read_csv applies its default NA parsing to the index column, so a
        # term spelled like "NA" or "null" would presumably lose its row label - confirm
        # whether any domain contains such terms.
        scores_df = pd.read_csv(scores_path, index_col=0)

        # Process train data
        print(f"Processing {len(train_data_domain)} train examples...")
        for item in tqdm(train_data_domain, desc=f"Processing {domain} train"):
            item['RAG'] = _select_rag_examples(
                scores_df, item['term'], train_dict, top_k=top_k, exclude_self=True
            )

        # Save updated train data (overwrites the input file)
        with open(train_path, 'w', encoding='utf-8') as f:
            json.dump(train_data_domain, f, ensure_ascii=False, indent=1)
        print(f"✅ Train data saved: {train_path}")

        # Process test data
        domain_lower = domain.lower()
        test_path = f'{data_root}/{domain}/test/{domain_lower}_term_typing_test_data.json'
        test_scores_path = f'{scores_root}/{domain}_test_train_scores.csv'

        if os.path.exists(test_path) and os.path.exists(test_scores_path):
            with open(test_path, 'r', encoding='utf-8') as f:
                test_json_data = json.load(f)

            test_data_domain = [{"term": item["term"], "types": []} for item in test_json_data]
            test_scores_df = pd.read_csv(test_scores_path, index_col=0)

            print(f"Processing {len(test_data_domain)} test examples...")
            for item in tqdm(test_data_domain, desc=f"Processing {domain} test"):
                # Test terms are not train columns, so nothing needs to be excluded.
                item['RAG'] = _select_rag_examples(
                    test_scores_df, item['term'], train_dict,
                    top_k=top_k, exclude_self=False, label="test term",
                )

            # Save test data with RAG
            test_output_path = f'{data_root}/{domain}/test/terms2types.json'
            os.makedirs(os.path.dirname(test_output_path), exist_ok=True)
            with open(test_output_path, 'w', encoding='utf-8') as f:
                json.dump(test_data_domain, f, ensure_ascii=False, indent=1)
            print(f"✅ Test data saved: {test_output_path}")
        else:
            print("Test data or scores not found:")
            print(f"  Test: {test_path} - {'✅' if os.path.exists(test_path) else '❌'}")
            print(f"  Scores: {test_scores_path} - {'✅' if os.path.exists(test_scores_path) else '❌'}")

        print(f"Finished processing {domain}")

    print("\n🎉 RAG data creation for TaskB completed!")

# Entry-point guard: build the RAG-augmented train/test files only when this code
# is executed as the main module (not when it is imported).
if __name__ == "__main__":
    print("🔄 Running the corrected TaskB function without cyclic references...")
    create_taskb_terms2types_with_rag_fixed()

In [None]:
import json
import os
from collections import defaultdict

def fix_test_data_add_id_and_check_rag(
    data_root: str = '/home/jovyan/rahmatullaev/rand_exps/LLMs4OL-Challenge/2025/TaskB-TermTyping',
    domains=('MatOnto', 'OBI', 'SWEET'),
    expected_rag: int = 10,
):
    """Add the original ``id`` to each generated test record and audit RAG list sizes.

    For every domain the function reads the original test file
    (``{domain_lower}_term_typing_test_data.json``, which carries ``id``) and the generated
    ``terms2types.json`` (which carries ``RAG`` but no ``id``), then rewrites
    ``terms2types.json`` with records of the form ``{id, term, types, RAG}``.

    Ids are matched by position first: when the record at the same index of the original
    file has the same term, its id is used. This keeps repeated terms on their own ids
    instead of collapsing them onto one. A lookup by term is the fallback, and records
    with no match get the placeholder id ``MISSING_ID_<index>``.

    Args:
        data_root: Directory containing one sub-directory per domain.
        domains: Domain names to process.
        expected_rag: RAG list length considered correct; other lengths are reported.

    Returns:
        Dict mapping each processed domain to the list of RAG counts of its matched
        records (records with a placeholder id are not counted).
    """
    rag_stats = defaultdict(list)  # Statistics on number of RAG examples

    for domain in domains:
        print(f"\n=== Fixing {domain} ===")

        # File paths
        domain_lower = domain.lower()
        original_test_path = f'{data_root}/{domain}/test/{domain_lower}_term_typing_test_data.json'
        current_test_path = f'{data_root}/{domain}/test/terms2types.json'

        # Check file existence
        if not os.path.exists(original_test_path):
            print(f"❌ Original test file not found: {original_test_path}")
            continue

        if not os.path.exists(current_test_path):
            print(f"❌ Current test file not found: {current_test_path}")
            continue

        # Load original data (with id)
        with open(original_test_path, 'r', encoding='utf-8') as f:
            original_data = json.load(f)

        # Load current data (with RAG but without id)
        with open(current_test_path, 'r', encoding='utf-8') as f:
            current_data = json.load(f)

        print(f"Original entries: {len(original_data)}")
        print(f"Current entries: {len(current_data)}")

        # Fallback lookup by term (for repeated terms the last occurrence wins).
        original_dict = {item['term']: item for item in original_data}

        fixed_data = []
        missing_ids = []

        for i, current_item in enumerate(current_data):
            term = current_item['term']

            # Prefer the record at the same position; fall back to the term lookup.
            if i < len(original_data) and original_data[i]['term'] == term:
                original_item = original_data[i]
            else:
                original_item = original_dict.get(term)

            fixed_item = {
                'id': original_item['id'] if original_item is not None else f"MISSING_ID_{i}",
                'term': current_item['term'],
                'types': current_item.get('types', []),
                'RAG': current_item.get('RAG', [])
            }

            if original_item is not None:
                # Check number of RAG examples
                rag_count = len(fixed_item['RAG'])
                rag_stats[domain].append(rag_count)

                if rag_count != expected_rag:
                    print(f"⚠️  {domain} - term '{term}' (id: {fixed_item['id']}): {rag_count} RAG examples instead of {expected_rag}")
            else:
                print(f"❌ Term '{term}' not found in original data!")
                missing_ids.append(term)

            fixed_data.append(fixed_item)

        # Save fixed data (overwrites terms2types.json)
        with open(current_test_path, 'w', encoding='utf-8') as f:
            json.dump(fixed_data, f, ensure_ascii=False, indent=1)

        print(f"✅ Fixed {len(fixed_data)} records")
        if missing_ids:
            print(f"❌ Missing id for {len(missing_ids)} terms: {missing_ids}")

        # RAG statistics
        if rag_stats[domain]:
            rag_counts = rag_stats[domain]
            print("📊 RAG statistics:")
            print(f"   Total records: {len(rag_counts)}")
            print(f"   Min RAG: {min(rag_counts)}")
            print(f"   Max RAG: {max(rag_counts)}")
            print(f"   Avg RAG: {sum(rag_counts)/len(rag_counts):.2f}")
            print(f"   Records with {expected_rag} RAG: {rag_counts.count(expected_rag)}")
            print(f"   Records with != {expected_rag} RAG: {len(rag_counts) - rag_counts.count(expected_rag)}")

    print("\n🎉 Fixing completed!")

    # Return RAG statistics for further analysis
    return dict(rag_stats)

# Apply the id fix to the generated test files and keep the per-domain RAG counts.
print("🔄 Running test data fix...")
rag_statistics = fix_test_data_add_id_and_check_rag()

# The counts stay available in this kernel under the name `rag_statistics`.
print("\n📈 RAG statistics saved to variable 'rag_statistics'")
available_domains = list(rag_statistics)
print(f"Available domains: {available_domains}")
