### Deep Data Cleaning

#### Deduplication Steps

##### Step 1 - ID-based Deduplication
- **Process**: Remove duplicate papers based on exact `paper_id` matching
- **Rules**:
  - Papers with identical `paper_id` are considered duplicates
  - Only one copy of each unique `paper_id` is retained
  - Papers without `paper_id` are temporarily preserved for further processing

##### Step 2 - Title Hash Deduplication
- **Process**: Eliminate papers with identical titles using MD5 hashing
- **Rules**:
  - Paper titles are normalized (lowercased, trimmed) before hashing
  - MD5 hash is computed for each normalized title
  - Papers with identical title hashes are considered duplicates
  - Only one paper per unique title hash is retained

##### Step 3 - Title Similarity Deduplication
- **Process**: Remove papers with highly similar titles using Jaccard similarity
- **Rules**:
  - Titles are tokenized into words after normalization
  - Jaccard similarity is calculated between title word sets
  - Papers with similarity score ≥ 0.9 are considered potential duplicates
  - Priority rules for retaining duplicates:
    1. Keep the paper with more recent `publish_year`
    2. If `publish_year` is identical or unavailable, retain the first occurrence
- **Jaccard Similarity Formula**:  
  `similarity = |Title_A ∩ Title_B| / |Title_A ∪ Title_B|`

In [None]:
# strict_deduplication.py
import json
import hashlib
from collections import defaultdict

def load_jsonl(file_path):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        file_path: Path of the UTF-8 encoded JSONL file to read.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    papers = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            # A blank line (typically a trailing newline at the end of the
            # file) is not a record; json.loads would raise on it.
            if not line.strip():
                continue
            papers.append(json.loads(line))
    return papers

def save_jsonl(papers, file_path):
    """Write *papers* to *file_path* as JSON Lines.

    Each record is serialized on its own line; non-ASCII characters are
    written as-is (UTF-8) rather than escaped. An existing file is
    overwritten.
    """
    with open(file_path, "w", encoding="utf-8") as out:
        out.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in papers
        )

def safe_get_title(paper):
    """Return the paper's title normalized for comparison.

    A missing or ``None`` title yields ``""``. Any non-string title is
    converted with ``str`` first. The result has surrounding whitespace
    removed and is lowercased.
    """
    raw = paper.get("title")
    if raw is None:
        return ""
    text = raw if isinstance(raw, str) else str(raw)
    return text.strip().lower()

# 1. Exact ID Deduplication
def dedup_by_id(papers):
    """Drop papers whose ``paper_id`` has already been seen.

    The first paper carrying a given ``paper_id`` is kept. Papers with a
    missing or falsy ``paper_id`` are always kept so later steps can still
    process them. Prints the resulting count and returns the kept papers
    in their original order.
    """
    kept = []
    known_ids = set()
    for record in papers:
        identifier = record.get("paper_id")
        if identifier:
            if identifier in known_ids:
                continue  # duplicate of an earlier paper
            known_ids.add(identifier)
        kept.append(record)
    print(f"Step 1 - After ID deduplication: {len(kept)} papers")
    return kept

# 2. Title Hash Deduplication (exact match)
def dedup_by_title_hash(papers):
    """Drop papers whose normalized title exactly matches an earlier one.

    Titles are normalized with ``safe_get_title`` and compared through their
    MD5 digest. The first paper for each digest is kept. Papers whose
    normalized title is empty are always kept. Prints the resulting count
    and returns the kept papers in their original order.
    """
    kept = []
    known_digests = set()
    for record in papers:
        normalized = safe_get_title(record)
        if normalized:
            digest = hashlib.md5(normalized.encode("utf-8")).hexdigest()
            if digest in known_digests:
                continue  # same title as an earlier paper
            known_digests.add(digest)
        # Reached for first-seen titles and for papers without a title.
        kept.append(record)
    print(f"Step 2 - After title hash deduplication: {len(kept)} papers")
    return kept

# 3. Title Similarity Deduplication (intersection ratio threshold)
def _extract_year(paper):
    """Return the publication year of *paper* as a number, or 0 if unknown.

    Reads ``publish_year`` and falls back to ``publish_date``. Numeric values
    are returned unchanged; for strings the first run of four digits is used;
    any other type counts as unknown.
    """
    import re  # imported here because the script's top-level imports do not include re

    raw = paper.get("publish_year") or paper.get("publish_date") or 0
    if isinstance(raw, (int, float)):
        return raw
    if isinstance(raw, str):
        match = re.search(r"\d{4}", raw)
        return int(match.group()) if match else 0
    # Lists, dicts, etc. cannot be ordered against a number; treat as unknown.
    return 0


def dedup_by_title_similarity(papers, threshold=0.9):
    """Remove papers whose titles are near-duplicates by Jaccard similarity.

    Each title is normalized with ``safe_get_title`` and split on whitespace
    into a set of words. Two papers are duplicates when
    ``|A & B| / |A | B| >= threshold``. Of such a pair the paper with the
    more recent year is kept; when the years are equal (or both unknown) the
    earlier paper in the list is kept. Papers with an empty title are never
    compared and are always kept.

    Args:
        papers: List of paper dictionaries.
        threshold: Minimum Jaccard similarity for two titles to count as
            duplicates.

    Returns:
        A new list with the surviving papers in their original order. The
        resulting count is also printed.
    """
    n = len(papers)
    # Tokenize titles and parse years once up front rather than once per
    # compared pair; the pairwise comparison itself is still quadratic.
    token_sets = [set(safe_get_title(paper).split()) for paper in papers]
    years = [_extract_year(paper) for paper in papers]

    to_remove = set()
    for i in range(n):
        if i in to_remove:
            continue
        set_i = token_sets[i]
        if not set_i:
            continue  # no title: nothing to compare
        for j in range(i + 1, n):
            if j in to_remove:
                continue
            set_j = token_sets[j]
            if not set_j:
                continue
            similarity = len(set_i & set_j) / len(set_i | set_j)
            if similarity < threshold:
                continue
            if years[i] >= years[j]:
                to_remove.add(j)
            else:
                to_remove.add(i)
                break  # i is gone; stop comparing it against later papers

    unique_papers = [papers[k] for k in range(n) if k not in to_remove]
    print(f"Step 3 - After title similarity deduplication: {len(unique_papers)} papers")
    return unique_papers

# Main Workflow
if __name__ == "__main__":
    # Script entry point: load, report diagnostics, run the three
    # deduplication passes in order, then save the result.
    input_file = "../Data_Collection/merged_papers.jsonl"
    output_file = "merged_papers_dedup.jsonl"

    papers = load_jsonl(input_file)
    print(f"Original paper count: {len(papers)}")

    # Diagnostic: how many records have no title at all.
    none_titles = len([p for p in papers if p.get("title") is None])
    print(f"Papers with None title: {none_titles}")

    # Show title-less records, but only among the first ten papers.
    for i, p in enumerate(papers[:10]):
        if p.get("title") is not None:
            continue
        print(f"Paper {i} has None title: {p.get('paper_id', 'No ID')}")

    # Each pass consumes the output of the previous one.
    papers = dedup_by_id(papers)
    papers = dedup_by_title_hash(papers)
    papers = dedup_by_title_similarity(papers, threshold=0.9)

    save_jsonl(papers, output_file)
    print(f"Final deduplicated paper count: {len(papers)}")

Original paper count: 7397
Papers with None title: 1
Step 1 - After ID deduplication: 7397 papers
Step 2 - After title hash deduplication: 6259 papers
Step 3 - After title similarity deduplication: 6242 papers
Final deduplicated paper count: 6242


#### Text Cleaning

Perform deep text cleaning on the title and abstract (remove LaTeX, HTML, and special characters, and normalize whitespace).

In [None]:
# text_cleaning.py
import json
import re

def load_jsonl(file_path):
    """Read a UTF-8 JSON Lines file and return its records as a list.

    Blank or whitespace-only lines (such as a trailing newline) are skipped
    instead of being passed to ``json.loads``, which would raise on them.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

def save_jsonl(papers, file_path):
    """Serialize each paper as one JSON line in *file_path* (UTF-8, overwrite)."""
    with open(file_path, "w", encoding="utf-8") as handle:
        for entry in papers:
            serialized = json.dumps(entry, ensure_ascii=False)
            handle.write(f"{serialized}\n")

def _clean_abstract(text):
    """Strip LaTeX, HTML entities and non-ASCII characters from *text*."""
    # Display math ($$...$$) must be removed before inline math ($...$):
    # the inline pattern would otherwise match the empty "$$" delimiters
    # themselves and leave the formula body in the text. DOTALL lets a
    # display formula span several lines.
    text = re.sub(r'\$\$.*?\$\$', '', text, flags=re.DOTALL)

    # Inline math: $...$ (single line only).
    text = re.sub(r'\$.*?\$', '', text)

    # LaTeX commands with a braced argument, e.g. \command{...}; the
    # argument is removed together with the command.
    text = re.sub(r'\\[a-zA-Z]+\{.*?\}', '', text)

    # Remaining standalone LaTeX commands and the whitespace after them.
    text = re.sub(r'\\[a-zA-Z]+\s*', '', text)

    # Named HTML entities such as &amp; (numeric entities are left alone).
    text = re.sub(r'&[a-zA-Z]+;', '', text)

    # Each run of non-ASCII characters becomes a single space.
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)

    # Collapse all whitespace runs and trim the ends.
    return re.sub(r'\s+', ' ', text).strip()


def clean_text(papers):
    """Clean the title, abstract and authors of every paper in place.

    * ``title``: whitespace runs are collapsed to one space and the ends
      trimmed.
    * ``abstract``: LaTeX math and commands, named HTML entities and
      non-ASCII characters are removed, whitespace is normalized, and
      ``abstract_source`` is set to ``"original_cleaned"``.
    * ``authors`` (when a list): entries are converted to stripped strings;
      ``None`` and empty entries are dropped.

    Fields that are missing or ``None`` are left untouched.

    Args:
        papers: List of paper dictionaries; the dictionaries are modified.

    Returns:
        The same ``papers`` list.
    """
    for paper in papers:
        if paper.get('title') is not None:
            paper['title'] = re.sub(r'\s+', ' ', str(paper['title'])).strip()

        if paper.get('abstract') is not None:
            paper['abstract'] = _clean_abstract(str(paper['abstract']))
            paper['abstract_source'] = "original_cleaned"

        authors = paper.get('authors')
        if isinstance(authors, list):
            stripped = (str(author).strip() for author in authors if author is not None)
            paper['authors'] = [name for name in stripped if name]

    return papers

if __name__ == "__main__":
    # Script entry point: clean the deduplicated papers and save the result.
    input_file = "merged_papers_dedup.jsonl"
    output_file = "papers_cleaned_text.jsonl"

    papers = clean_text(load_jsonl(input_file))
    save_jsonl(papers, output_file)

    # Cleaning never drops records, so this count equals the input count.
    print("Text cleaning completed successfully.")
    print(f"Original count: {len(papers)} papers")
    print(f"Output saved to: {output_file}")

Text cleaning completed successfully.
Original count: 6242 papers
Output saved to: papers_cleaned_text.jsonl


#### Filter out articles with exceptionally low citations

Optionally remove papers whose citation count falls below a chosen threshold to ensure data quality.    
Because our project focuses on newly published papers and values timeliness, some of the relatively recent papers we collected have not yet been cited.    
We have therefore kept all of them (threshold of 0). If a specific minimum citation count is required, the threshold can be adjusted accordingly.

In [None]:
# citation_filter.py
import json

def load_jsonl(file_path):
    """Parse a UTF-8 JSON Lines file into a list, one item per record.

    Lines that are empty after stripping whitespace are ignored, so a
    trailing newline or blank separator line does not abort the load.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    papers = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            stripped = line.strip()
            if stripped:
                papers.append(json.loads(stripped))
    return papers

def save_jsonl(papers, file_path):
    """Dump *papers* to *file_path*, one JSON document per line (UTF-8)."""
    with open(file_path, "w", encoding="utf-8") as sink:
        for item in papers:
            # print appends the newline that terminates each record.
            print(json.dumps(item, ensure_ascii=False), file=sink)

def filter_by_citations(papers, min_citations=0):
    """Keep only papers with at least *min_citations* citations.

    A paper whose ``citation_count`` is missing or ``None`` is treated as
    having 0 citations.

    Args:
        papers: List of paper dictionaries.
        min_citations: Minimum ``citation_count`` a paper needs to be kept.

    Returns:
        A new list with the papers that pass, in their original order. The
        before/after counts are also printed.
    """
    # `or 0` also covers an explicit null citation_count; comparing None
    # with an int would raise TypeError.
    filtered = [
        p for p in papers
        if (p.get("citation_count") or 0) >= min_citations
    ]
    print(f"Citation filter: {len(papers)} -> {len(filtered)}")
    return filtered

if __name__ == "__main__":
    # Script entry point: apply the citation threshold and save the result.
    input_file = "papers_cleaned_text.jsonl"
    output_file = "papers_filtered_citations.jsonl"

    # A threshold of 0 keeps every paper.
    papers = filter_by_citations(load_jsonl(input_file), min_citations=0)
    save_jsonl(papers, output_file)


Citation filter: 6242 -> 6242


#### Clean other content like field_of_study

Unify the capitalization of `fields_of_study` and remove duplicate and null (empty) values.

In [11]:
# fields_of_study_clean.py
import json

def load_jsonl(file_path):
    """Return the records stored in a UTF-8 JSON Lines file.

    Whitespace-only lines are skipped; every other line must hold one
    complete JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    papers = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            if line.isspace():
                # e.g. a blank trailing line; json.loads would raise on it
                continue
            papers.append(json.loads(line))
    return papers

def save_jsonl(papers, file_path):
    """Write every paper to *file_path* as a newline-terminated JSON object."""
    with open(file_path, "w", encoding="utf-8") as out_file:
        for doc in papers:
            out_file.write(json.dumps(doc, ensure_ascii=False))
            out_file.write("\n")

def clean_fields_of_study(papers):
    """Normalize each paper's ``fields_of_study`` list in place.

    Every string entry is stripped and title-cased; empty strings, ``None``
    and other non-string entries are dropped; duplicates are removed while
    keeping the first occurrence, so the output order is reproducible.
    Papers whose ``fields_of_study`` is missing or not a list are left
    untouched.

    Args:
        papers: List of paper dictionaries; the dictionaries are modified.

    Returns:
        The same ``papers`` list.
    """
    for paper in papers:
        fields = paper.get('fields_of_study')
        if not isinstance(fields, list):
            continue
        # Non-string entries (notably None) have no .strip() and are skipped.
        normalized = (f.strip().title() for f in fields if isinstance(f, str))
        # dict.fromkeys removes duplicates in first-seen order; a set would
        # give an order that can differ from run to run.
        paper['fields_of_study'] = list(
            dict.fromkeys(name for name in normalized if name)
        )
    return papers

if __name__ == "__main__":
    # Script entry point: normalize fields_of_study and save the result.
    input_file = "papers_filtered_citations.jsonl"
    output_file = "papers_cleaned_fields.jsonl"

    papers = clean_fields_of_study(load_jsonl(input_file))
    save_jsonl(papers, output_file)


#### Do the format alignment

Align all records to a uniform set of fields to facilitate subsequent data processing, model training, and similar downstream tasks.

In [12]:
# format_alignment.py
import json

# Fields every output record carries, in output order.
REQUIRED_FIELDS = [
    "source",
    "paper_id",
    "title",
    "abstract",
    "abstract_source",
    "authors",
    "publish_year",
    "venue",
    "citation_count",
    "fields_of_study",
    "url",
]

def load_jsonl(file_path):
    """Load all records from a UTF-8 JSON Lines file.

    Blank and whitespace-only lines are ignored so that a trailing newline
    or an accidental empty line does not raise ``json.JSONDecodeError``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    papers = []
    with open(file_path, "r", encoding="utf-8") as f:
        non_blank_lines = (line for line in f if line.strip())
        for line in non_blank_lines:
            papers.append(json.loads(line))
    return papers

def save_jsonl(papers, file_path):
    """Store *papers* in *file_path* in JSON Lines format (UTF-8, overwrite)."""
    json_lines = (json.dumps(row, ensure_ascii=False) + "\n" for row in papers)
    with open(file_path, "w", encoding="utf-8") as target:
        target.writelines(json_lines)

def align_format(papers):
    """Project every paper onto ``REQUIRED_FIELDS``.

    Returns a new list of new dictionaries holding exactly the required
    fields, in ``REQUIRED_FIELDS`` order. A field absent from the input
    defaults to ``[]`` for ``authors`` and ``fields_of_study`` and to ``""``
    for everything else. Values that are present (including ``None``) are
    copied unchanged; any other input keys are dropped.
    """
    list_valued = ("authors", "fields_of_study")
    return [
        {
            field: paper.get(field, [] if field in list_valued else "")
            for field in REQUIRED_FIELDS
        }
        for paper in papers
    ]

if __name__ == "__main__":
    # Script entry point: align records to the required fields and save.
    input_file = "papers_cleaned_fields.jsonl"
    output_file = "papers_final_aligned.jsonl"

    papers = align_format(load_jsonl(input_file))
    save_jsonl(papers, output_file)
    print(f"Format alignment done. Papers count: {len(papers)}")


Format alignment done. Papers count: 6242
