# Notebook 2: Labelling

In [26]:
import re
import os
import json
from difflib import SequenceMatcher

AUTO_DATA_PATH = os.path.join('..', 'data', 'processed', 'parsed', 'auto')
SAMPLE_SIZE = 2000

print(f"Data path: {os.path.abspath(AUTO_DATA_PATH)}")

paper_ids = [d for d in os.listdir(AUTO_DATA_PATH) if os.path.isdir(os.path.join(AUTO_DATA_PATH, d))]
paper_ids.sort()
paper_ids = paper_ids[:SAMPLE_SIZE]

Data path: /Users/thomas200905/Documents/Thomas/HCMUS/Third Year/Semester 7/Intro to Data Science/Milestones/MS02/data/processed/parsed/auto


## 1. Extract References
### 1.1 From .bib and .tex files

In [27]:
def clean_latex_string(text):
    if not text:
        return ""
    
    # remove \bibinfo{type}{content} -> content
    text = re.sub(r'\\bibinfo\{.*?\}\{(.*?)\}', r'\1', text)

    # remove latex commands
    text = re.sub(r'\\[a-zA-Z]+\{(.*?)\}', r'\1', text)

    # remove newlines
    text = text.replace('\n', ' ').strip()
    text = re.sub(r'\s+', ' ', text)

    return text

def get_field(field_name, entry):
    field_pattern = re.compile(fr'\b{field_name}\s*=\s*\{{((?:[^{{}}]|\{{[^{{}}]*\}})*)\}}', re.IGNORECASE | re.DOTALL)
    match = field_pattern.search(entry)
    
    if match:
        return match.group(1)
    
    pat_quotes = re.compile(fr'\b{field_name}\s*=\s*\"(.*?)\"', re.IGNORECASE | re.DOTALL)
    match = pat_quotes.search(entry)
    
    if match:
        return match.group(1)  
    else: 
        return None

def extract_from_bib(file_content):
    references = []
    raw_entries = re.split(r'^@', file_content, flags=re.MULTILINE)
    
    for entry in raw_entries:
        entry = entry.strip()
        if not entry or entry.startswith('%'): 
            continue
            
        key_match = re.search(r'^(\w+)\s*\{\s*([^,]+),', entry)
        if not key_match:
            continue
            
        ref_type = key_match.group(1)
        ref_id = key_match.group(2).strip()

        if ref_type.lower() in ['string', 'comment', 'preamble']:
            continue
        
        title_raw = get_field('title', entry)
        year_raw = get_field('year', entry)
        author_raw = get_field('author', entry)
        
        if title_raw:
            title = clean_latex_string(title_raw) 
        else:
            title = ""
        
        if year_raw:
            year = year_raw 
        else: 
            year = ""
        
        authors = []
        if author_raw:
            raw_authors = clean_latex_string(author_raw)
            authors = [a.strip() for a in raw_authors.split(' and ')]

        references.append({
            "id": ref_id,
            "title": title,
            "authors": authors,
            "year": year,
            "source_type": "bib"
        })
        
    return references

In [28]:
def get_paper_references(paper_id, data_path):
    if not data_path:
        paper_path = os.path.join(AUTO_DATA_PATH, paper_id)
    else:
        paper_path = os.path.join(data_path, paper_id)

    unique_references = {} 
    
    # search for all .bib files
    for root, dirs, files in os.walk(paper_path):
        for file in files:
            if file.endswith(".bib"):
                try:
                    with open(os.path.join(root, file), 'r', encoding='utf-8', errors='ignore') as f:
                        content = f.read()
                        extracted_references = extract_from_bib(content)
                        
                        for reference in extracted_references:
                            if reference['id'] not in unique_references:
                                unique_references[reference['id']] = reference
                except Exception as e:
                    print(f"Error parsing {file}: {e}")
            
    return list(unique_references.values())

In [29]:
all_papers_refs = {}

for pid in paper_ids:
    print(f"Processing Paper ID: {pid}")
    
    refs = get_paper_references(pid, AUTO_DATA_PATH)
    all_papers_refs[pid] = refs
    
    print(f"  -> Extracted {len(refs)} references.")
    print("-" * 40)

# DEBUG
k = 5
if len(paper_ids) > 0:
    sample_pid = paper_ids[2]
    print(f"\nExample Output for {sample_pid} (First {k} refs):")
    print(json.dumps(all_papers_refs[sample_pid][-k:-1], indent=4))

Processing Paper ID: 2211.03001
  -> Extracted 43 references.
----------------------------------------
Processing Paper ID: 2211.03002
  -> Extracted 29 references.
----------------------------------------
Processing Paper ID: 2211.03003
  -> Extracted 68 references.
----------------------------------------
Processing Paper ID: 2211.03005
  -> Extracted 3 references.
----------------------------------------
Processing Paper ID: 2211.03006
  -> Extracted 39 references.
----------------------------------------
Processing Paper ID: 2211.03007
  -> Extracted 0 references.
----------------------------------------
Processing Paper ID: 2211.03008
  -> Extracted 0 references.
----------------------------------------
Processing Paper ID: 2211.03009
  -> Extracted 119 references.
----------------------------------------
Processing Paper ID: 2211.03010
  -> Extracted 0 references.
----------------------------------------
Processing Paper ID: 2211.03011
  -> Extracted 78 references.
--------------

### 1.2 From references.json

In [30]:
def load_references_json_file(paper_id, data_path):
    if not data_path:
        json_path = os.path.join(AUTO_DATA_PATH, paper_id, 'references.json')
    else:
        json_path = os.path.join(data_path, paper_id, 'references.json')
    
    if not os.path.exists(json_path):
        print(f"references.json not found for paper {paper_id}")
        return []
    
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            
        if data is None or not isinstance(data, dict):
            print(f"Warning: references.json for {paper_id} is empty or invalid format.")
            return []
            
    except json.JSONDecodeError:
        print(f"Could not decode JSON for paper {paper_id}")
        return []
        
    target_references = []
    
    for arxiv_id, metadata in data.items():
        
        title = metadata.get('title', "")
        authors = metadata.get('authors', [])
        date_str = metadata.get('submission_date', "")
        year = ""

        if date_str:
            year = date_str.split('-')[0]
            
        target_references.append({
            "id": arxiv_id,             
            "title": title,
            "authors": authors,
            "year": year,
            "source_type": "references_json"
        })
        
    return target_references

In [31]:
all_papers_target_refs = {}

for pid in paper_ids:
    targets = load_references_json_file(pid, AUTO_DATA_PATH)
    all_papers_target_refs[pid] = targets
    

print(f"\nLoaded targets for {len(all_papers_target_refs)} papers.")

# DEBUG
if len(paper_ids) > 0:
    sample_pid = paper_ids[1]
    if all_papers_target_refs[sample_pid]:
        print(f"\nExample Target Entry for {sample_pid}:")
        print(json.dumps(all_papers_target_refs[sample_pid][0], indent=4))

references.json not found for paper 2211.04143
references.json not found for paper 2211.04243
references.json not found for paper 2211.04370

Loaded targets for 2000 papers.

Example Target Entry for 2211.03002:
{
    "id": "2011.09318",
    "title": "Analysis of Cryptocurrency Transactions from a Network Perspective: An Overview",
    "authors": [
        "Jiajing Wu",
        "Jieli Liu",
        "Yijing Zhao",
        "Zibin Zheng"
    ],
    "year": "2020",
    "source_type": "references_json"
}


## 2. Clean references

In [32]:
STOP_WORDS = {
    'a', 'an', 'the', 'and', 'or', 'of', 'to', 'in', 'on', 'at', 'by', 'for', 
    'with', 'from', 'as', 'is', 'are', 'was', 'were', 'be', 'been', 'this', 'that'
}

In [33]:
def clean_text(text):
    if not text:
        return ""
    
    # to lowercase
    text = text.lower()
    
    # remove punctuation
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    
    # filter stop words
    tokens = text.split()
    clean_tokens = [t for t in tokens if t not in STOP_WORDS]
    
    return " ".join(clean_tokens)

def tokenize_author_list(authors_list):
    if not authors_list:
        return set()
    
    all_authors = " ".join(authors_list)
    all_authors = all_authors.lower()
    all_authors = re.sub(r'[^a-z\s]', ' ', all_authors)
    
    # separate all author string into tokens
    tokens = set(all_authors.split())
    
    return tokens

In [34]:
def clean_references(reference_list):
    for reference in reference_list:
        reference['clean_title'] = clean_text(reference.get('title', ''))
        reference['clean_author_tokens'] = tokenize_author_list(reference.get('authors', []))
        
    return reference_list


In [35]:
print("Cleaning extracted references (Source)...")
for pid in all_papers_refs:
    clean_references(all_papers_refs[pid])

print("Cleaning target references (Target)...")
for pid in all_papers_target_refs:
    clean_references(all_papers_target_refs[pid])
    
print("Cleaning completed")

Cleaning extracted references (Source)...
Cleaning target references (Target)...
Cleaning completed


## 3. Match reference ID

In [36]:
def calc_string_similarity(s1, s2):
    if not s1 or not s2: 
        return 0.0
    
    return SequenceMatcher(None, s1, s2).ratio()

def generate_automatic_labels(source_reference_list, target_reference_list, threshold=0.8):
    matches = {}
    
    for src in source_reference_list:
        s_title = src.get('clean_title', '')
        s_id = src['id']
        
        if not s_title or len(s_title) <= 10:
            continue
            
        best_candidate = None
        best_score = 0.0
        
        for cand in target_reference_list:
            c_title = cand.get('clean_title', '')
            if not c_title: continue
            
            score = calc_string_similarity(s_title, c_title)
            
            if score > best_score:
                best_score = score
                best_candidate = cand
        
        if best_score >= threshold and best_candidate:
            s_auth = src.get('clean_author_tokens', set())
            c_auth = best_candidate.get('clean_author_tokens', set())
            
            if s_auth and c_auth:
                auth_score = len(s_auth & c_auth) / len(s_auth | c_auth)
                
                if auth_score < 0.3:
                    continue 

            s_year = str(src.get('year', '')).strip()
            c_year = str(best_candidate.get('year', '')).strip()
            
            if s_year.isdigit() and c_year.isdigit():
                diff = abs(int(s_year) - int(c_year))
                if diff > 0: 
                    continue # reject if years are different

            matches[s_id] = best_candidate['id']
            
    return matches

In [37]:
all_papers_gt = {} # ground truth for all papers
total_matches = 0
total_refs = 0

print("Generating Automatic Ground Truth...")

for pid in paper_ids:
    sources = all_papers_refs[pid]
    targets = all_papers_target_refs[pid]
    
    paper_matches = generate_automatic_labels(sources, targets)
    all_papers_gt[pid] = paper_matches
    
    n_source = len(sources)
    n_match = len(paper_matches)
    total_refs += n_source
    total_matches += n_match

print("-" * 40)
print(f"Total References Processed: {total_refs}")
print(f"Total Automatic Matches:    {total_matches}")
print(f"Global Match Rate:          {total_matches/total_refs:.1%}")
print("-" * 40)

Generating Automatic Ground Truth...
----------------------------------------
Total References Processed: 70582
Total Automatic Matches:    10516
Global Match Rate:          14.9%
----------------------------------------


## 4. Save ground truth

In [38]:
debug_gt_dir = os.path.join('..', 'data', 'processed','ground_truth')
os.makedirs(debug_gt_dir, exist_ok=True)

filename = "auto.json"
output_gt_path = os.path.join(debug_gt_dir, filename)

with open(output_gt_path, 'w', encoding='utf-8') as f:
    json.dump(all_papers_gt, f, indent=4, ensure_ascii=False)

print(f"Saved automatic labels to:")
print(os.path.abspath(output_gt_path))

Saved automatic labels to:
/Users/thomas200905/Documents/Thomas/HCMUS/Third Year/Semester 7/Intro to Data Science/Milestones/MS02/data/processed/ground_truth/auto.json
