# Notebook 2: Labelling

In [16]:
import re
import os
import json

AUTO_DATA_PATH = os.path.join('..', 'data', 'raw', 'auto')

print(f"Data path: {os.path.abspath(AUTO_DATA_PATH)}")

paper_ids = [d for d in os.listdir(AUTO_DATA_PATH) if os.path.isdir(os.path.join(AUTO_DATA_PATH, d))]
paper_ids.sort()

Data path: /Users/thomas200905/Documents/Thomas/HCMUS/Third Year/Semester 7/Intro to Data Science/Milestones/MS02/data/raw/auto


## 1. Extract References
### 1.1 From .bib and .tex files

In [17]:
def clean_latex_string(text):
    if not text:
        return ""
    
    # remove \bibinfo{type}{content} -> content
    text = re.sub(r'\\bibinfo\{.*?\}\{(.*?)\}', r'\1', text)

    # remove latex commands
    text = re.sub(r'\\[a-zA-Z]+\{(.*?)\}', r'\1', text)

    # remove newlines
    text = text.replace('\n', ' ').strip()
    text = re.sub(r'\s+', ' ', text)

    return text

def get_field(field_name, entry):
    field_pattern = re.compile(fr'\b{field_name}\s*=\s*\{{((?:[^{{}}]|\{{[^{{}}]*\}})*)\}}', re.IGNORECASE | re.DOTALL)
    match = field_pattern.search(entry)
    
    if match:
        return match.group(1)
    
    pat_quotes = re.compile(fr'\b{field_name}\s*=\s*\"(.*?)\"', re.IGNORECASE | re.DOTALL)
    match = pat_quotes.search(entry)
    
    if match:
        return match.group(1)  
    else: 
        return None

def extract_from_bib(file_content):
    references = []
    raw_entries = re.split(r'^@', file_content, flags=re.MULTILINE)
    
    for entry in raw_entries:
        entry = entry.strip()
        if not entry or entry.startswith('%'): 
            continue
            
        key_match = re.search(r'^(\w+)\s*\{\s*([^,]+),', entry)
        if not key_match:
            continue
            
        ref_type = key_match.group(1)
        ref_id = key_match.group(2).strip()

        if ref_type.lower() in ['string', 'comment', 'preamble']:
            continue
        
        title_raw = get_field('title', entry)
        year_raw = get_field('year', entry)
        author_raw = get_field('author', entry)
        
        if title_raw:
            title = clean_latex_string(title_raw) 
        else:
            title = ""
        
        if year_raw:
            year = year_raw 
        else: 
            year = ""
        
        authors = []
        if author_raw:
            raw_authors = clean_latex_string(author_raw)
            authors = [a.strip() for a in raw_authors.split(' and ')]

        references.append({
            "id": ref_id,
            "title": title,
            "authors": authors,
            "year": year,
            "source_type": "bib"
        })
        
    return references

def extract_from_bibitem(file_content):
    references = []
    
    raw_items = re.split(r'\\bibitem', file_content)
    
    bibinfo_pattern = re.compile(r'\\bibinfo\{(.*?)\}\{((?:[^{}]|\{[^{}]*\})*)\}', re.DOTALL | re.IGNORECASE)
    title_italic_pattern = re.compile(r'(?:\{\\em\s+|\\emph\{|\\textit\{)((?:[^{}]|\{[^{}]*\})*)\}', re.IGNORECASE | re.DOTALL)
    year_pattern = re.compile(r'\((\d{4})\)')

    for item in raw_items:
        if not item.strip(): continue
        
        id_match = re.search(r'^\{([^}]+)\}', item.strip())
        if not id_match: 
            continue

        ref_id = id_match.group(1).strip()
        
        if r'\bibinfo' in item:
            fields = {'author': [], 'title': '', 'year': ''}
            for key, value in bibinfo_pattern.findall(item):
                key = key.lower()
                clean_val = clean_latex_string(value)
                if key == 'author':
                    fields['author'].append(clean_val)
                else:
                    fields[key] = clean_val
            
            title = fields['title']
            authors = fields['author']
            year = fields['year']
            
        else:
            title_match = title_italic_pattern.search(item)
            if title_match:
                title = clean_latex_string(title_match.group(1))

                title_start_index = item.find(title_match.group(0))
                id_end_index = item.find('}') + 1
                
                if title_start_index > id_end_index:
                    raw_authors = item[id_end_index:title_start_index]
                    clean_auth_str = clean_latex_string(raw_authors)
                    authors = [clean_auth_str]
                else:
                    authors = []
            else:
                id_end_index = item.find('}') + 1
                title = clean_latex_string(item[id_end_index:])
                authors = []

            year_match = year_pattern.search(item)
            year = year_match.group(1) if year_match else ""

        references.append({
            "id": ref_id,
            "title": title,
            "authors": authors,
            "year": year,
            "source_type": "bibitem"
        })
            
    return references

In [18]:
def get_paper_references(paper_id, data_path):
    if not data_path:
        paper_path = os.path.join(AUTO_DATA_PATH, paper_id)
    else:
        paper_path = os.path.join(data_path, paper_id)

    unique_references = {} 
    
    # search for all .bib files
    for root, dirs, files in os.walk(paper_path):
        for file in files:
            if file.endswith(".bib"):
                try:
                    with open(os.path.join(root, file), 'r', encoding='utf-8', errors='ignore') as f:
                        content = f.read()
                        extracted_references = extract_from_bib(content)
                        
                        for reference in extracted_references:
                            if reference['id'] not in unique_references:
                                unique_references[reference['id']] = reference
                except Exception as e:
                    print(f"Error parsing {file}: {e}")

    # search for all .tex files
    for root, dirs, files in os.walk(paper_path):
        for file in files:
            if file.endswith(".tex"):
                try:
                    with open(os.path.join(root, file), 'r', encoding='utf-8', errors='ignore') as f:
                        content = f.read()
                        
                        extracted_references = extract_from_bibitem(content)
                        
                        for reference in extracted_references:
                            if reference['id'] not in unique_references:
                                unique_references[reference['id']] = reference
                except Exception as e:
                    print(f"Error parsing {file}: {e}")
            
    return list(unique_references.values())

In [19]:
all_papers_refs = {}

for pid in paper_ids:
    print(f"Processing Paper ID: {pid}")
    
    refs = get_paper_references(pid, AUTO_DATA_PATH)
    all_papers_refs[pid] = refs
    
    print(f"  -> Extracted {len(refs)} references.")
    print("-" * 40)

# DEBUG
k = 5
if len(paper_ids) > 0:
    sample_pid = paper_ids[2]
    print(f"\nExample Output for {sample_pid} (First {k} refs):")
    print(json.dumps(all_papers_refs[sample_pid][-k:-1], indent=4))

Processing Paper ID: 2211.03001
  -> Extracted 79 references.
----------------------------------------
Processing Paper ID: 2211.03002
  -> Extracted 29 references.
----------------------------------------
Processing Paper ID: 2211.03003
  -> Extracted 73 references.
----------------------------------------
Processing Paper ID: 2211.03004
  -> Extracted 287 references.
----------------------------------------
Processing Paper ID: 2211.03005
  -> Extracted 131 references.
----------------------------------------
Processing Paper ID: 2211.03006
  -> Extracted 39 references.
----------------------------------------
Processing Paper ID: 2211.03007
  -> Extracted 0 references.
----------------------------------------
Processing Paper ID: 2211.03008
  -> Extracted 0 references.
----------------------------------------
Processing Paper ID: 2211.03009
  -> Extracted 130 references.
----------------------------------------
Processing Paper ID: 2211.03010
  -> Extracted 0 references.
-----------

In [20]:
# DEBUG
debug_dir = os.path.join('..', 'output', 'debug_extracted_refs')
os.makedirs(debug_dir, exist_ok=True)

# Select the paper you want to check (using the sample_pid from your previous cell)
target_pid = sample_pid 
data_to_save = all_papers_refs[target_pid]

# Define the filename
output_filename = f"{target_pid}_extracted_refs_2.json"
output_path = os.path.join(debug_dir, output_filename)

# Save to JSON
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(data_to_save, f, indent=4, ensure_ascii=False)

print(f"Successfully saved extracted references for {target_pid} to:")
print(os.path.abspath(output_path))

Successfully saved extracted references for 2211.03003 to:
/Users/thomas200905/Documents/Thomas/HCMUS/Third Year/Semester 7/Intro to Data Science/Milestones/MS02/output/debug_extracted_refs/2211.03003_extracted_refs_2.json


### 1.2 From references.json

In [21]:
def load_references_json_file(paper_id, data_path):
    if not data_path:
        json_path = os.path.join(AUTO_DATA_PATH, paper_id, 'references.json')
    else:
        json_path = os.path.join(data_path, paper_id, 'references.json')
    
    if not os.path.exists(json_path):
        print(f"references.json not found for paper {paper_id}")
        return []
    
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            
        if data is None or not isinstance(data, dict):
            print(f"Warning: references.json for {paper_id} is empty or invalid format.")
            return []
            
    except json.JSONDecodeError:
        print(f"Could not decode JSON for paper {paper_id}")
        return []
        
    target_references = []
    
    for arxiv_id, metadata in data.items():
        
        title = metadata.get('title', "")
        authors = metadata.get('authors', [])
        date_str = metadata.get('submission_date', "")
        year = ""

        if date_str:
            year = date_str.split('-')[0]
            
        target_references.append({
            "id": arxiv_id,             
            "title": title,
            "authors": authors,
            "year": year,
            "source_type": "references_json"
        })
        
    return target_references

In [22]:
all_papers_target_refs = {}

for pid in paper_ids:
    targets = load_references_json_file(pid, AUTO_DATA_PATH)
    all_papers_target_refs[pid] = targets
    

print(f"\nLoaded targets for {len(all_papers_target_refs)} papers.")

# DEBUG
if len(paper_ids) > 0:
    sample_pid = paper_ids[1]
    if all_papers_target_refs[sample_pid]:
        print(f"\nExample Target Entry for {sample_pid}:")
        print(json.dumps(all_papers_target_refs[sample_pid][0], indent=4))


Loaded targets for 498 papers.

Example Target Entry for 2211.03002:
{
    "id": "2011.09318",
    "title": "Analysis of Cryptocurrency Transactions from a Network Perspective: An Overview",
    "authors": [
        "Jiajing Wu",
        "Jieli Liu",
        "Yijing Zhao",
        "Zibin Zheng"
    ],
    "year": "2020",
    "source_type": "references_json"
}


## 2. Clean references

In [23]:
STOP_WORDS = {
    'a', 'an', 'the', 'and', 'or', 'of', 'to', 'in', 'on', 'at', 'by', 'for', 
    'with', 'from', 'as', 'is', 'are', 'was', 'were', 'be', 'been', 'this', 'that'
}

In [24]:
def clean_text(text):
    if not text:
        return ""
    
    # to lowercase
    text = text.lower()
    
    # remove punctuation
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    
    # filter stop words
    tokens = text.split()
    clean_tokens = [t for t in tokens if t not in STOP_WORDS]
    
    return " ".join(clean_tokens)

def tokenize_author_list(authors_list):
    if not authors_list:
        return set()
    
    all_authors = " ".join(authors_list)
    all_authors = all_authors.lower()
    all_authors = re.sub(r'[^a-z\s]', ' ', all_authors)
    
    # separate all author string into tokens
    tokens = set(all_authors.split())
    
    return tokens

In [25]:
def clean_references(reference_list):
    for reference in reference_list:
        reference['clean_title'] = clean_text(reference.get('title', ''))
        reference['clean_author_tokens'] = tokenize_author_list(reference.get('authors', []))
        
    return reference_list


In [26]:
print("Cleaning extracted references (Source)...")
for pid in all_papers_refs:
    clean_references(all_papers_refs[pid])

print("Cleaning target references (Target)...")
for pid in all_papers_target_refs:
    clean_references(all_papers_target_refs[pid])
    
print("Cleaning completed")

Cleaning extracted references (Source)...
Cleaning target references (Target)...
Cleaning completed


In [27]:
# DEBUG
if len(paper_ids) > 0:
    sample_pid = paper_ids[0]
    
    # check source (bib)
    if all_papers_refs[sample_pid]:
        ref = all_papers_refs[sample_pid][0]
        print(f"--- Source (BibTeX) ---")
        print(f"Original Title:  {ref['title']}")
        print(f"Clean Title:     {ref['clean_title']}")
        print(f"Original Authors:{ref['authors']}")
        print(f"Author Tokens:   {ref['clean_author_tokens']}")
        print("")

    # check target (json)
    if all_papers_target_refs[sample_pid]:
        target = all_papers_target_refs[sample_pid][0]
        print(f"--- Target (JSON) ---")
        print(f"Original Title:  {target['title']}")
        print(f"Clean Title:     {target['clean_title']}")
        print(f"Original Authors:{target['authors']}")
        print(f"Author Tokens:   {target['clean_author_tokens']}")

--- Source (BibTeX) ---
Original Title:  Information Visualization, Visual Data Mining, and Its Application to Drug Design
Clean Title:     information visualization visual data mining its application drug design
Original Authors:['Georges Grinstein', 'Daniel Keim', 'Matthew Ward']
Author Tokens:   {'matthew', 'keim', 'daniel', 'georges', 'grinstein', 'ward'}

--- Target (JSON) ---
Original Title:  PoVRPoint: Authoring Presentations in Mobile Virtual Reality
Clean Title:     povrpoint authoring presentations mobile virtual reality
Original Authors:['Verena Biener', 'Travis Gesslein', 'D. Schneider', 'Felix Kawala', 'Alexander Otte', 'P. Kristensson', 'M. Pahud', 'E. Ofek', 'Cuauhtli Campos', 'Matjavz Kljun', 'Klen vCopivc Pucihar', 'Jens Grubert']
Author Tokens:   {'cuauhtli', 'otte', 'campos', 'pucihar', 'm', 'jens', 'schneider', 'd', 'kristensson', 'grubert', 'alexander', 'gesslein', 'e', 'felix', 'verena', 'kljun', 'biener', 'travis', 'klen', 'pahud', 'vcopivc', 'ofek', 'p', 'matjav

## 3. Match reference ID

In [28]:
def generate_automatic_labels(source_reference_list, target_reference_list):
    target_map = {}
    for reference in target_reference_list:
        # avoid titles that are too short, short titles doesnt contain enough info
        if reference['clean_title'] and len(reference['clean_title']) > 10: 
            target_map[reference['clean_title']] = reference['id']
            
    matches = {}
    
    for reference in source_reference_list:
        s_title = reference['clean_title']
        s_id = reference['id']
        
        if not s_title or len(s_title) <= 10:
            continue
            
        if s_title in target_map:
            matched_target_reference = target_map[s_title]
            matches[s_id] = matched_target_reference
            
    return matches

In [29]:
all_papers_gt = {} # ground truth for all papers
total_matches = 0
total_refs = 0

print("Generating Automatic Ground Truth...")

for pid in paper_ids:
    sources = all_papers_refs[pid]
    targets = all_papers_target_refs[pid]
    
    paper_matches = generate_automatic_labels(sources, targets)
    all_papers_gt[pid] = paper_matches
    
    n_source = len(sources)
    n_match = len(paper_matches)
    total_refs += n_source
    total_matches += n_match

print("-" * 40)
print(f"Total References Processed: {total_refs}")
print(f"Total Automatic Matches:    {total_matches}")
print(f"Global Match Rate:          {total_matches/total_refs:.1%}")
print("-" * 40)

Generating Automatic Ground Truth...
----------------------------------------
Total References Processed: 399500
Total Automatic Matches:    4471
Global Match Rate:          1.1%
----------------------------------------


## 4. Save ground truth

In [30]:
debug_gt_dir = os.path.join('..', 'data', 'processed','ground_truth')
os.makedirs(debug_gt_dir, exist_ok=True)

filename = "auto.json"
output_gt_path = os.path.join(debug_gt_dir, filename)

with open(output_gt_path, 'w', encoding='utf-8') as f:
    json.dump(all_papers_gt, f, indent=4, ensure_ascii=False)

print(f"Saved automatic labels to:")
print(os.path.abspath(output_gt_path))

Saved automatic labels to:
/Users/thomas200905/Documents/Thomas/HCMUS/Third Year/Semester 7/Intro to Data Science/Milestones/MS02/data/processed/ground_truth/auto.json
