# Labelling

In [None]:
import re
import os
import json

# Root folder that is scanned for paper sub-directories.
RAW_DATA_PATH = os.path.join('..', 'testing')

print(f"Data path: {os.path.abspath(RAW_DATA_PATH)}")

# Every sub-directory name is treated as a paper id; plain files are ignored.
paper_ids = sorted(
    entry
    for entry in os.listdir(RAW_DATA_PATH)
    if os.path.isdir(os.path.join(RAW_DATA_PATH, entry))
)

## 1. Extract References
### 1.1 From .bib and .tex files

In [None]:
def clean_latex_string(text):
    r"""Strip LaTeX markup from *text* and return single-spaced plain text.

    Unwraps ``\bibinfo{type}{content}`` and ``\command{content}`` wrappers
    (nested wrappers included) and drops bare grouping braces such as the
    case-protection braces in ``{B}ayesian``.

    Args:
        text: Raw LaTeX/BibTeX field value; may be ``None`` or empty.

    Returns:
        The cleaned string, or ``""`` when *text* is falsy.
    """
    if not text:
        return ""

    # collapse whitespace first so that a command whose argument spans
    # several lines can still be matched by '.' below
    text = re.sub(r'\s+', ' ', text).strip()

    # remove \bibinfo{type}{content} -> content
    text = re.sub(r'\\bibinfo\{.*?\}\{(.*?)\}', r'\1', text)

    # remove latex commands; one pass only peels the outermost wrapper of
    # \textbf{\emph{x}}, so repeat until nothing changes (every substitution
    # shortens the string, hence the loop terminates)
    command_pattern = re.compile(r'\\[a-zA-Z]+\{(.*?)\}')
    previous = None
    while previous != text:
        previous = text
        text = command_pattern.sub(r'\1', text)

    # drop remaining grouping braces: left in place, "{B}ayesian" would be
    # split into two words by any later punctuation stripping
    text = re.sub(r'[{}]', '', text)

    # brace removal can leave doubled or trailing blanks
    return re.sub(r'\s+', ' ', text).strip()

def get_field(field_name, entry):
    """Return the raw value of *field_name* inside a BibTeX *entry*.

    Three value syntaxes are recognised after ``field_name =``:
    a brace group ``{...}`` with arbitrarily deep nesting, a quoted string
    ``"..."``, and a bare token such as ``2020``.

    Args:
        field_name: Field to look up; matched case-insensitively as a whole
            word, so ``title`` does not match ``booktitle``.
        entry: Text of one BibTeX entry.

    Returns:
        The value without its outer delimiters, or ``None`` when the field
        is absent or its value cannot be parsed.
    """
    # escape the name so it is always matched literally
    key_pattern = re.compile(fr'\b{re.escape(field_name)}\s*=\s*', re.IGNORECASE)
    end = len(entry)

    for key_match in key_pattern.finditer(entry):
        start = key_match.end()
        if start >= end:
            continue

        opener = entry[start]

        if opener == '{':
            # walk to the brace that balances the opening one
            depth = 0
            pos = start
            while pos < end:
                char = entry[pos]
                if char == '\\':
                    # skip the escaped character so "\{" and "\}" are not counted
                    pos += 2
                    continue
                if char == '{':
                    depth += 1
                elif char == '}':
                    depth -= 1
                    if depth == 0:
                        return entry[start + 1:pos]
                pos += 1
            # unbalanced braces: fall through and try the next occurrence
        elif opener == '"':
            closing = entry.find('"', start + 1)
            if closing != -1:
                return entry[start + 1:closing]
        else:
            # bare value, e.g. ``year = 2020``
            bare = re.match(r'[^\s,{}"]+', entry[start:])
            if bare:
                return bare.group(0)

    return None

def extract_from_bib(file_content):
    """Parse the text of a ``.bib`` file into a list of reference dicts.

    Args:
        file_content: Full text of a BibTeX file.

    Returns:
        One dict per entry with the keys ``id``, ``title``, ``authors``,
        ``year`` and ``source_type`` (always ``"bib"``). ``@string``,
        ``@comment`` and ``@preamble`` entries are skipped.
    """
    references = []
    # allow blanks before '@' so that indented entries start a new chunk
    # instead of being swallowed by the entry before them
    raw_entries = re.split(r'^[ \t]*@', file_content, flags=re.MULTILINE)

    for entry in raw_entries:
        entry = entry.strip()
        if not entry or entry.startswith('%'):
            continue

        # "<type>{<key>," opens every regular entry
        key_match = re.search(r'^(\w+)\s*\{\s*([^,]+),', entry)
        if not key_match:
            continue

        ref_type = key_match.group(1)
        ref_id = key_match.group(2).strip()

        if ref_type.lower() in ('string', 'comment', 'preamble'):
            continue

        title_raw = get_field('title', entry)
        year_raw = get_field('year', entry)
        author_raw = get_field('author', entry)

        title = clean_latex_string(title_raw) if title_raw else ""
        year = year_raw.strip() if year_raw else ""

        authors = []
        if author_raw:
            # BibTeX separates author names with the word "and"
            cleaned_authors = clean_latex_string(author_raw)
            authors = [
                name.strip()
                for name in cleaned_authors.split(' and ')
                if name.strip()
            ]

        references.append({
            "id": ref_id,
            "title": title,
            "authors": authors,
            "year": year,
            "source_type": "bib"
        })

    return references

def _bibitem_key(item):
    r"""Return the citation key that opens a ``\bibitem`` chunk, or ``None``.

    *item* is the text following ``\bibitem``: an optional ``[label]``
    (which may itself contain braces) followed by the ``{key}`` group.
    """
    pos = 0
    end = len(item)

    while pos < end and item[pos].isspace():
        pos += 1

    if pos < end and item[pos] == '[':
        # skip the optional label; a ']' only closes it outside any braces
        depth = 0
        pos += 1
        while pos < end:
            char = item[pos]
            if char == '{':
                depth += 1
            elif char == '}':
                depth -= 1
            elif char == ']' and depth <= 0:
                break
            pos += 1
        if pos >= end:
            return None
        pos += 1  # step over the closing ']'

    key_match = re.match(r'\s*\{([^{}]+)\}', item[pos:])
    if not key_match:
        return None

    return key_match.group(1).strip() or None


def extract_from_bibitem(file_content):
    r"""Parse the ``thebibliography`` environment of a LaTeX source.

    Args:
        file_content: Full text of a LaTeX file.

    Returns:
        One dict per ``\bibitem`` with the keys ``id``, ``title``,
        ``authors``, ``year`` and ``source_type`` (always ``"bibitem"``).
        Title, authors and year are only filled from ``\bibinfo{...}{...}``
        annotations; otherwise they stay empty. Returns ``[]`` when no
        ``thebibliography`` environment is found.
    """
    bib_section = re.search(r'\\begin\{thebibliography\}.*?\n(.*?)\\end\{thebibliography\}', file_content, re.DOTALL)
    if not bib_section:
        return []

    # The lookahead keeps longer macro names that merely start with
    # "\bibitem" from splitting an entry. Chunk 0 is whatever precedes the
    # first \bibitem (e.g. \providecommand lines) and is not a reference.
    raw_items = re.split(r'\\bibitem(?![a-zA-Z])', bib_section.group(1))[1:]
    references = []

    for item in raw_items:
        if not item.strip():
            continue

        ref_id = _bibitem_key(item)
        if not ref_id:
            continue

        title_match = re.search(r'\\bibinfo\{title\}\{((?:[^{}]|\{[^{}]*\})*)\}', item, re.DOTALL | re.IGNORECASE)
        title = clean_latex_string(title_match.group(1)) if title_match else ""

        # accept a suffix after the digits, e.g. \bibinfo{year}{2020{\natexlab{a}}}
        year_match = re.search(r'\\bibinfo\{year\}\{\s*([0-9]{4})', item)
        year = year_match.group(1) if year_match else ""

        author_matches = re.findall(r'\\bibinfo\{author\}\{((?:[^{}]|\{[^{}]*\})*)\}', item)
        authors = [clean_latex_string(a) for a in author_matches]

        references.append({
            "id": ref_id,
            "title": title,
            "authors": authors,
            "year": year,
            "source_type": "bibitem"
        })

    return references

In [None]:
def get_paper_references(paper_id):
    """Collect the de-duplicated references found in one paper's folder.

    All ``.bib`` files below the paper folder are parsed first. Only when
    none of them yields a reference are ``.tex`` files containing a
    ``thebibliography`` environment parsed instead.

    Args:
        paper_id: Name of the paper's sub-directory under ``RAW_DATA_PATH``.

    Returns:
        List of reference dicts; when a key occurs more than once, the first
        occurrence (in sorted traversal order) is kept.
    """
    paper_path = os.path.join(RAW_DATA_PATH, paper_id)
    unique_references = {}

    def collect(extension, extractor, required_marker=None):
        """Parse every *extension* file; return True if any produced references."""
        found_any = False

        for root, dirs, files in os.walk(paper_path):
            # os.walk order is otherwise unspecified; sorting makes the
            # "first occurrence wins" de-duplication reproducible
            dirs.sort()
            for file in sorted(files):
                if not file.lower().endswith(extension):
                    continue

                try:
                    with open(os.path.join(root, file), 'r', encoding='utf-8', errors='ignore') as f:
                        content = f.read()

                    if required_marker is not None and required_marker not in content:
                        continue

                    extracted_references = extractor(content)
                except Exception as e:
                    # deliberate best effort: one unreadable file must not
                    # abort the scan of the whole paper
                    print(f"Error parsing {file}: {e}")
                    continue

                for reference in extracted_references:
                    unique_references.setdefault(reference['id'], reference)

                if extracted_references:
                    found_any = True

        return found_any

    # no usable .bib file found, check .tex files
    if not collect(".bib", extract_from_bib):
        collect(".tex", extract_from_bibitem, required_marker="\\begin{thebibliography}")

    return list(unique_references.values())

In [None]:
# paper id -> list of references extracted from that paper's sources
all_papers_refs = {}

for pid in paper_ids:
    print(f"Processing Paper ID: {pid}")

    refs = get_paper_references(pid)
    all_papers_refs[pid] = refs

    print(f"  -> Extracted {len(refs)} references.")
    print("-" * 40)

# DEBUG
k = 5
if paper_ids:
    # use the third paper as the sample, falling back to the last one when
    # fewer than three papers exist (paper_ids[2] would raise IndexError)
    sample_pid = paper_ids[min(2, len(paper_ids) - 1)]
    print(f"\nExample Output for {sample_pid} (First {k} refs):")
    # slice from the front so the output matches the "First k refs" label
    print(json.dumps(all_papers_refs[sample_pid][:k], indent=4))

In [None]:
# DEBUG: dump the references extracted for the sample paper to a JSON file
target_pid = sample_pid
data_to_save = all_papers_refs[target_pid]

debug_dir = os.path.join('..', 'output', 'debug_extracted_refs')
output_filename = f"{target_pid}_extracted_refs_2.json"
output_path = os.path.join(debug_dir, output_filename)

os.makedirs(debug_dir, exist_ok=True)
with open(output_path, 'w', encoding='utf-8') as f:
    f.write(json.dumps(data_to_save, indent=4, ensure_ascii=False))

print(
    f"Successfully saved extracted references for {target_pid} to:",
    os.path.abspath(output_path),
    sep="\n",
)

### 1.2 From references.json

In [None]:
def load_references_json_file(paper_id):
    """Load the target references listed in a paper's ``references.json``.

    The file is expected to hold a JSON object that maps a reference id to
    a metadata object with optional ``title``, ``authors`` and
    ``submission_date`` keys.

    Args:
        paper_id: Name of the paper's sub-directory under ``RAW_DATA_PATH``.

    Returns:
        List of dicts with the keys ``id``, ``title``, ``authors``, ``year``
        and ``source_type`` (always ``"references_json"``). An empty list is
        returned, after printing a message, when the file is missing,
        undecodable or not a JSON object.
    """
    json_path = os.path.join(RAW_DATA_PATH, paper_id, 'references.json')

    if not os.path.exists(json_path):
        print(f"references.json not found for paper {paper_id}")
        return []

    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError):
        # invalid UTF-8 bytes are as unusable as malformed JSON
        print(f"Could not decode JSON for paper {paper_id}")
        return []

    if not isinstance(data, dict):
        print(f"Warning: references.json for {paper_id} is empty or invalid format.")
        return []

    target_references = []

    for arxiv_id, metadata in data.items():
        if not isinstance(metadata, dict):
            # e.g. a null value; it has no fields to read
            print(f"Warning: skipping malformed entry {arxiv_id} in references.json for {paper_id}.")
            continue

        # `or` also normalises explicit JSON nulls to the empty default
        date_str = metadata.get('submission_date') or ""

        target_references.append({
            "id": arxiv_id,
            "title": metadata.get('title') or "",
            "authors": metadata.get('authors') or [],
            # the year is the part of the date before the first '-'
            "year": str(date_str).split('-')[0],
            "source_type": "references_json"
        })

    return target_references

In [None]:
# paper id -> list of target references loaded from references.json
all_papers_target_refs = {}

for pid in paper_ids:
    targets = load_references_json_file(pid)
    all_papers_target_refs[pid] = targets


print(f"\nLoaded targets for {len(all_papers_target_refs)} papers.")

# DEBUG
if paper_ids:
    # use the second paper as the sample, falling back to the only paper
    # when just one exists (paper_ids[1] would raise IndexError)
    sample_pid = paper_ids[min(1, len(paper_ids) - 1)]
    if all_papers_target_refs[sample_pid]:
        print(f"\nExample Target Entry for {sample_pid}:")
        print(json.dumps(all_papers_target_refs[sample_pid][0], indent=4))

## 2. Clean references

In [None]:
# Common English function words (articles, conjunctions, prepositions,
# forms of "to be", demonstratives).
STOP_WORDS = set(
    "a an the and or of to in on at by for "
    "with from as is are was were be been this that".split()
)

In [None]:
def clean_text(text):
    """Normalise *text* for comparison.

    The text is lower-cased, every character other than ``a-z``, ``0-9`` and
    whitespace is replaced by a space, and words listed in ``STOP_WORDS``
    are dropped. The remaining words are joined with single spaces.

    Returns ``""`` when *text* is falsy.
    """
    if not text:
        return ""

    lowered = text.lower()

    # anything that is not a lowercase letter, digit or whitespace becomes a space
    alnum_only = re.sub(r'[^a-z0-9\s]', ' ', lowered)

    return " ".join(word for word in alnum_only.split() if word not in STOP_WORDS)

def tokenize_author_list(authors_list):
    """Return the set of lowercase name tokens found in *authors_list*.

    Args:
        authors_list: Iterable of author names. A single string is treated
            as one author name; falsy entries (``None``, ``""``) are ignored.

    Returns:
        Set of tokens made of the letters ``a-z`` only; every other
        character acts as a separator. Empty set for falsy input.
    """
    if not authors_list:
        return set()

    # a bare string is one author entry; joining it directly would put a
    # space between every character and yield single-letter tokens
    if isinstance(authors_list, str):
        authors_list = [authors_list]

    all_authors = " ".join(str(author) for author in authors_list if author)
    all_authors = all_authors.lower()
    all_authors = re.sub(r'[^a-z\s]', ' ', all_authors)

    # separate all author string into tokens
    return set(all_authors.split())

In [None]:
def clean_references(reference_list):
    """Add ``clean_title`` and ``clean_author_tokens`` to every reference.

    Each dict in *reference_list* is modified in place: its ``title`` goes
    through ``clean_text`` and its ``authors`` through
    ``tokenize_author_list``. The same list object is returned.
    """
    for entry in reference_list:
        raw_title = entry.get('title', '')
        raw_authors = entry.get('authors', [])

        entry['clean_title'] = clean_text(raw_title)
        entry['clean_author_tokens'] = tokenize_author_list(raw_authors)

    return reference_list


In [None]:
# Add the cleaned fields to both collections (the reference dicts are updated in place).
print("Cleaning extracted references (Source)...")
for pid, source_refs in all_papers_refs.items():
    clean_references(source_refs)

print("Cleaning target references (Target)...")
for pid, target_refs in all_papers_target_refs.items():
    clean_references(target_refs)

print("Cleaning completed")

In [None]:
# DEBUG: show the raw and cleaned fields of the first reference on each side
if paper_ids:
    sample_pid = paper_ids[0]

    # check source (bib)
    source_sample = all_papers_refs[sample_pid]
    if source_sample:
        ref = source_sample[0]
        print("--- Source (BibTeX) ---")
        print(f"Original Title:  {ref['title']}")
        print(f"Clean Title:     {ref['clean_title']}")
        print(f"Original Authors:{ref['authors']}")
        print(f"Author Tokens:   {ref['clean_author_tokens']}")
        print()

    # check target (json)
    target_sample = all_papers_target_refs[sample_pid]
    if target_sample:
        target = target_sample[0]
        print("--- Target (JSON) ---")
        print(f"Original Title:  {target['title']}")
        print(f"Clean Title:     {target['clean_title']}")
        print(f"Original Authors:{target['authors']}")
        print(f"Author Tokens:   {target['clean_author_tokens']}")

## 3. Match reference ID

In [None]:
def generate_automatic_labels(source_reference_list, target_reference_list, min_title_length=10):
    """Match source references to target references by identical clean title.

    Args:
        source_reference_list: Cleaned references extracted from the paper;
            each needs ``id`` and ``clean_title``.
        target_reference_list: Cleaned candidate references; each needs
            ``id`` and ``clean_title``.
        min_title_length: Clean titles must be strictly longer than this
            many characters to take part in matching; shorter titles do not
            contain enough information for a reliable match.

    Returns:
        Dict mapping a source reference id to the id of the matched target.
        When several targets share a clean title, the one with the largest
        ``clean_author_tokens`` overlap with the source is chosen; ties go
        to the target listed last.
    """
    # clean title -> every target carrying it (duplicates are kept rather
    # than silently overwriting each other)
    candidates_by_title = {}
    for target in target_reference_list:
        t_title = target.get('clean_title')
        if t_title and len(t_title) > min_title_length:
            candidates_by_title.setdefault(t_title, []).append(target)

    matches = {}

    for source in source_reference_list:
        s_title = source.get('clean_title')
        if not s_title or len(s_title) <= min_title_length:
            continue

        candidates = candidates_by_title.get(s_title)
        if not candidates:
            continue

        source_tokens = set(source.get('clean_author_tokens') or ())

        def author_overlap(candidate):
            return len(source_tokens & set(candidate.get('clean_author_tokens') or ()))

        # max() keeps the first maximum it meets; iterating in reverse makes
        # ties resolve to the last-listed target
        best = max(reversed(candidates), key=author_overlap)
        matches[source['id']] = best['id']

    return matches

In [None]:
all_papers_gt = {} # ground truth for all papers
total_matches = 0
total_refs = 0

print("Generating Automatic Ground Truth...")

for pid in paper_ids:
    sources = all_papers_refs[pid]
    targets = all_papers_target_refs[pid]

    paper_matches = generate_automatic_labels(sources, targets)
    all_papers_gt[pid] = paper_matches

    n_source = len(sources)
    n_match = len(paper_matches)
    total_refs += n_source
    total_matches += n_match

# with no references at all the rate is reported as 0 instead of raising
# ZeroDivisionError
match_rate = total_matches / total_refs if total_refs else 0.0

print("-" * 40)
print(f"Total References Processed: {total_refs}")
print(f"Total Automatic Matches:    {total_matches}")
print(f"Global Match Rate:          {match_rate:.1%}")
print("-" * 40)

## 4. Save ground truth

In [None]:
# Write the automatic labels for all papers to a single JSON file.
filename = "automatic_labels_check_2.json"
debug_gt_dir = os.path.join('..', 'output', 'debug_ground_truth')
output_gt_path = os.path.join(debug_gt_dir, filename)

os.makedirs(debug_gt_dir, exist_ok=True)
with open(output_gt_path, 'w', encoding='utf-8') as f:
    f.write(json.dumps(all_papers_gt, indent=4, ensure_ascii=False))

print(
    "Successfully saved automatic labels to:",
    os.path.abspath(output_gt_path),
    sep="\n",
)