In [28]:
### Import relevant libraries
import pandas as pd
from pathlib import Path
from datetime import date
import numpy as np
from tqdm import tqdm

from pdf_text_extraction import (
        extract_text_from_pdf,
        extract_main_content,
        extract_references,
        process_pdf_with_metadata,
        process_pdf_without_metadata
    )

import warnings

# Suppress openpyxl extension warnings (common with Excel files from different sources)
warnings.filterwarnings('ignore', category=UserWarning, module='openpyxl')

In [25]:
# =============================================================================
# 1) Configuration
# =============================================================================

DATA_DIR = Path("../data")
OUTPUT_DIR = DATA_DIR  # CSV backup saved to data folder


# LNI to year mapping (necesseary for files without metadata or for files with metadataa, in casee the "dc.date.issued" column in the metadata excel files has no values)
LNI_MAPPING = {
    "lni37": 2003,
    "lni52": 2004,
    "lni66": 2005,
    "lni87": 2006,
    "lni111": 2007,
    "lni132": 2008,
    "lni153": 2009,
    "lni169": 2010,
    "lni188": 2011,
    "lni207": 2012,
    "lni218": 2013,
    "lni233": 2014,
    "lni247": 2015,
    "lni262": 2016,
    "lni273": 2017,
    "lni284": 2018,
    "lni297": 2019,
    "lni308": 2020,
    "lni316": 2021,
    "lni322": 2022,
    "lni338": 2023,
    "lni356": 2024,
    "lni369": 2025,
}

# Full proceedings PDFs to exclude (manually identified)
FULL_PROCEEDINGS_SET = {
    DATA_DIR / "lni153/lni-p-153-komplett.pdf",
    DATA_DIR / "lni169/lni-p-169-komplett.pdf",
    DATA_DIR / "lni188/lni-p-188-komplett.pdf",
    DATA_DIR / "lni207/lni-p-207-komplett.pdf",
    DATA_DIR / "lni218/lni-p-218-komplett.pdf",
    DATA_DIR / "lni233/lni-p-233-komplett.pdf",
    DATA_DIR / "lni247/lni-p-247-komplett.pdf",
    DATA_DIR / "lni262/lni-p-262-komplett.pdf",
    DATA_DIR / "lni273/lni-p-273-komplett.pdf",
    DATA_DIR / "lni284/proceedings_complete.pdf",
    DATA_DIR / "lni297/DELFI2019_Tagungsband_komplett.pdf",
    DATA_DIR / "lni297/DELFI2019_Tagungsband_komplett_Onlineversion.pdf",
    DATA_DIR / "lni308/DELFI2020_Proceedings_komplett.pdf",
    DATA_DIR / "lni316/DELFI_2021-Proceedings.pdf",
    DATA_DIR / "lni322/DELFI_2022_Proceedings_FINAL.pdf",
    DATA_DIR / "lni338/Komplettband.pdf",
    DATA_DIR / "lni356/DELFI_2024_ProceedingsComplete_alt.pdf",
    DATA_DIR / "lni356/DELFI_2024_ProceedingsComplete.pdf",
    DATA_DIR / "lni356/proceedings.pdf",
    DATA_DIR / "lni369/DELFI2025_ProceedingsComplete.pdf",
}

# Terms to exclude (covers, prefaces, etc.)
EXCLUSION_TERMS = ["cover", "vorwort", "preface", "foreword"]

# Processing settings
MIN_PAGES = 3  # Only process papers with > 2 pages

print(f"Full proceedings to exclude: {len(FULL_PROCEEDINGS_SET)}")

Full proceedings to exclude: 20


In [26]:
# =============================================================================
# 2) Helper Functions
# =============================================================================


def should_exclude_pdf(pdf_path: Path) -> bool:
    """
    Check if PDF should be excluded from annotation.
    
    Excludes:
    - Full proceedings (from manual set)
    - Covers, prefaces, forewords (by filename keyword)
    
    Returns:
        True if PDF should be excluded, False otherwise
    """
    # Check manual exclusion set
    if pdf_path in FULL_PROCEEDINGS_SET:
        return True
    
    # Check filename keywords (case-insensitive)
    filename_lower = pdf_path.name.lower()
    if any(term in filename_lower for term in EXCLUSION_TERMS):
        return True
    
    return False



def parse_page_range(page_str) -> tuple[int | None, int | None]:
    """
    Parse 'start-end' page range format into (start_page, end_page).
    
    Handles mci.reference.pages format like '101-112'.
    
    Args:
        page_str: String in format 'start-end' or None/NaN
    
    Returns:
        Tuple of (start_page, end_page) as integers, or (None, None) if parsing fails
    """
    if pd.isna(page_str) or not page_str:
        return None, None
    
    parts = str(page_str).strip().split('-')
    if len(parts) == 2:
        try:
            return int(parts[0].strip()), int(parts[1].strip())
        except ValueError:
            return None, None
    return None, None


def load_metadata_excel(metadata_path: Path) -> pd.DataFrame:
    """
    Load metadata Excel file with special handling for DeLFI2018.
    
    Args:
        metadata_path: Path to metadata Excel file
    
    Returns:
        DataFrame with metadata
    """
    if 'DeLFI2018' in metadata_path.name:
        return pd.read_excel(metadata_path, sheet_name='Einreichung_GI') # the first sheet "Arbeitsversion" contains dirty data
    else:
        return pd.read_excel(metadata_path)
    

# Quick test of parse_page_range
assert parse_page_range("101-112") == (101, 112)
assert parse_page_range("1-10") == (1, 10)
assert parse_page_range(None) == (None, None)
assert parse_page_range("") == (None, None)
print("\n✓ parse_page_range tests passed")



✓ parse_page_range tests passed


### 1) Pdf files with metadata

In [30]:
# =============================================================================
# 3) Discover and Load Metadata Files
# =============================================================================

# Find all metadata Excel files
metadata_files = sorted(DATA_DIR.glob("*/metadata*.xlsx"))

print(f"Found {len(metadata_files)} metadata files:\n")

# Load each metadata file into a dictionary: {lni_folder: DataFrame}
metadata_dict = {}

for metadata_path in metadata_files:
    lni_folder = metadata_path.parent.name
    df = load_metadata_excel(metadata_path)
    metadata_dict[lni_folder] = {
        'path': metadata_path,
        'df': df,
        'n_rows': len(df)
    }
    print(f"  {lni_folder}: {metadata_path.name} ({len(df)} rows)")


print(f"\n{'='*60}")
print(f"Total LNI folders with metadata: {len(metadata_dict)}")
print(f"Total papers in metadata: {sum(m['n_rows'] for m in metadata_dict.values())}")

# Quick inspection: show columns from first and last file to see variation
first_lni = list(metadata_dict.keys())[0]
last_lni = list(metadata_dict.keys())[-1]

print(f"\n{'='*60}")
print(f"Column comparison (first vs last):")
print(f"\n{first_lni} columns ({len(metadata_dict[first_lni]['df'].columns)}):")
print(f"  {list(metadata_dict[first_lni]['df'].columns)}")
print(f"\n{last_lni} columns ({len(metadata_dict[last_lni]['df'].columns)}):")
print(f"  {list(metadata_dict[last_lni]['df'].columns)}")


Found 17 metadata files:

  lni132: metadata-lni-132.xlsx (39 rows)
  lni153: metadata-lni-153.xlsx (25 rows)
  lni188: metadata-lni-188.xlsx (23 rows)
  lni207: metadata-lni-207.xlsx (31 rows)
  lni218: metadata-lni-218.xlsx (34 rows)
  lni284: metadata-DeLFI2018.xlsx (47 rows)
  lni297: metadata-lni-297.xlsx (60 rows)
  lni308: metadata-delfi-2020.xlsx (64 rows)
  lni316: metadata-lni-316.xlsx (67 rows)
  lni322: metadata_lni-322.xlsx (50 rows)
  lni338: metadata_lni-338.xlsx (67 rows)
  lni356: metadata_lni-356.xlsx (58 rows)
  lni369: metadata_lni-369.xlsx (61 rows)
  lni37: metadata-lni-37.xlsx (52 rows)
  lni52: metadata-lni-52.xlsx (57 rows)
  lni66: metadata-lni-66.xlsx (55 rows)
  lni87: metadata-lni-87.xlsx (44 rows)

Total LNI folders with metadata: 17
Total papers in metadata: 834

Column comparison (first vs last):

lni132 columns (20):
  ['dc.title', 'dc.contributor.author', 'dc.language.iso', 'dc.relation.ispartof', 'dc.contributor.editor', 'mci.reference.pages', 'dc.des

In [33]:
# =============================================================================
# 4) Normalize Metadata Columns
# =============================================================================

# Column mapping: source column(s) -> MySQL column name
# For columns with variants (mci.* vs gi.*), we list alternatives
COLUMN_MAPPING = {
    # Direct mappings (same in all files)
    'dc.title': 'title',
    'dc.contributor.author': 'authors',
    'dc.language.iso': 'language',
    'dc.relation.ispartof': 'proceeding_title',
    'dc.contributor.editor': 'editors',
    'dc.description.abstract': 'abstract',
    'dc.subject': 'subject',
    'filename': 'filename',
    'dc.identifier.doi': 'doi',
    'dc.identifier.isbn': 'isbn',
    'dc.identifier.issn': 'issn',
    'dc.relation.ispartofseries': 'series_title',
    'dc.publisher': 'publisher',
    'dc.pubPlace': 'publication_place',
    'dc.date.issued': 'year',
    'dc.type': 'publication_type',
}

# Columns with mci.* / gi.* variants
VARIANT_COLUMNS = {
    'conference_date': ['mci.conference.date', 'gi.conference.date'],
    'conference_location': ['mci.conference.location', 'gi.conference.location'],
    'session_title': ['mci.conference.sessiontitle', 'gi.conference.sessiontitle'],
}

# Special columns (require parsing or only in some files)
# - mci.reference.pages -> start_page, end_page (15 of 17 files)
# - gi.citation.startPage, gi.citation.endPage (2 of 17 files)
# - gi.conference.review -> peer_review_status (only in gi.* files)


def normalize_metadata_df(df: pd.DataFrame, lni_folder: str) -> pd.DataFrame:
    """
    Normalize a metadata DataFrame to match MySQL column names.
    
    Handles:
    - Direct column renames
    - mci.* vs gi.* variants
    - Page range parsing (mci.reference.pages -> start_page, end_page)
    - gi.citation.startPage/endPage handling
    
    Args:
        df: Raw metadata DataFrame
        lni_folder: LNI folder name (for logging)
    
    Returns:
        Normalized DataFrame with MySQL column names
    """
    normalized = pd.DataFrame()
    
    # 1) Direct mappings
    for src_col, mysql_col in COLUMN_MAPPING.items():
        if src_col in df.columns:
            normalized[mysql_col] = df[src_col]
        else:
            normalized[mysql_col] = None
    
    # 2) Variant columns (mci.* / gi.*)
    for mysql_col, variants in VARIANT_COLUMNS.items():
        for variant in variants:
            if variant in df.columns:
                normalized[mysql_col] = df[variant]
                break
        else:
            normalized[mysql_col] = None
    
    # 3) Page columns - handle both formats
    if 'mci.reference.pages' in df.columns:
        # Parse "101-112" format
        pages = df['mci.reference.pages'].apply(parse_page_range)
        normalized['start_page'] = pages.apply(lambda x: x[0])
        normalized['end_page'] = pages.apply(lambda x: x[1])
    elif 'gi.citation.startPage' in df.columns and 'gi.citation.endPage' in df.columns:
        # Direct columns (convert to int, handling NaN)
        normalized['start_page'] = pd.to_numeric(df['gi.citation.startPage'], errors='coerce').astype('Int64')
        normalized['end_page'] = pd.to_numeric(df['gi.citation.endPage'], errors='coerce').astype('Int64')
    else:
        normalized['start_page'] = None
        normalized['end_page'] = None
    
    # 4) Peer review status (only in gi.* files)
    if 'gi.conference.review' in df.columns:
        normalized['peer_review_status'] = df['gi.conference.review']
    else:
        normalized['peer_review_status'] = None
    
    return normalized



# Apply normalization to all metadata DataFrames
normalized_metadata = {}
all_normalized_dfs = []

for lni_folder, data in metadata_dict.items():
    df_normalized = normalize_metadata_df(data['df'], lni_folder)
    normalized_metadata[lni_folder] = df_normalized
    all_normalized_dfs.append(df_normalized)
    
# Combine all into one DataFrame
df_all_metadata = pd.concat(all_normalized_dfs, ignore_index=True)

print(f"Normalized metadata for {len(normalized_metadata)} LNI folders")
print(f"Total rows: {len(df_all_metadata)}")
print(f"\nColumns ({len(df_all_metadata.columns)}):")
print(df_all_metadata.columns.tolist())

# Quick check: show sample row
print(f"\nSample row (first paper):")
print(df_all_metadata.iloc[0].to_dict())


Normalized metadata for 17 LNI folders
Total rows: 834

Columns (22):
['title', 'authors', 'language', 'proceeding_title', 'editors', 'abstract', 'subject', 'filename', 'doi', 'isbn', 'issn', 'series_title', 'publisher', 'publication_place', 'year', 'publication_type', 'conference_date', 'conference_location', 'session_title', 'start_page', 'end_page', 'peer_review_status']

Sample row (first paper):
{'title': 'Gibt es eine Net Generation? Widerlegung einer Mystifizierung', 'authors': 'Schulmeister, Rolf', 'language': 'de', 'proceeding_title': 'DeLFI 2008: Die 6. e-Learning Fachtagung Informatik', 'editors': 'Seehusen, Silke; Lucke, Ulrike; Fischer, Stefan', 'abstract': 'Es ist von einer Net Generation, von der Generation @, der Generation Y oder den Millenials die Rede, und es werden Mutmaßungen über die Rolle der Net Generation für die Lehre angestellt. Der Beitrag ist als kritische Analyse solcher Behauptungen und Mutmaßungen zu verstehen und als Diskurs zur Medien- nutzung aus de

In [34]:
# =============================================================================
# 5) Verify all MySQL columns are present (except id, text, references)
# =============================================================================

MYSQL_COLUMNS = [
    # 'id',  # AUTO_INCREMENT - generated by MySQL
    'title',
    'authors', 
    'year',
    'abstract',
    # 'text',  # Extracted from PDF
    # 'references',  # Extracted from PDF
    'start_page',
    'end_page',
    'subject',
    'filename',
    'editors',
    'doi',
    'isbn',
    'issn',
    'proceeding_title',
    'series_title',
    'publisher',
    'publication_place',
    'conference_date',
    'conference_location',
    'session_title',
    'publication_type',
    'language',
    'peer_review_status',
]

# Check which columns are present/missing
df_columns = set(df_all_metadata.columns)
required_columns = set(MYSQL_COLUMNS)

present = required_columns & df_columns
missing = required_columns - df_columns
extra = df_columns - required_columns

print(f"\n{'='*60}")
print("MySQL COLUMN VERIFICATION")
print(f"{'='*60}")
print(f"Required columns: {len(MYSQL_COLUMNS)}")
print(f"Present in DataFrame: {len(present)}")

if missing:
    print(f"\n❌ MISSING columns ({len(missing)}):")
    for col in sorted(missing):
        print(f"   - {col}")
else:
    print(f"\n✓ All required columns present!")

if extra:
    print(f"\nExtra columns (not in MySQL schema): {sorted(extra)}")



MySQL COLUMN VERIFICATION
Required columns: 22
Present in DataFrame: 22

✓ All required columns present!


In [35]:
# =============================================================================
# 6) Match PDFs with Metadata Entries
# =============================================================================

# For each LNI folder with metadata, find matching PDF files
pdf_metadata_pairs = []  # List of (pdf_path, metadata_row) tuples
unmatched_pdfs = []      # PDFs in folder but not in metadata
unmatched_metadata = []  # Metadata entries without matching PDF

for lni_folder, df_meta in normalized_metadata.items():
    lni_path = DATA_DIR / lni_folder
    
    # Get all PDFs in this folder
    pdf_files = {p.name: p for p in lni_path.glob("*.pdf")}
    
    # Get filenames from metadata
    metadata_filenames = set(df_meta['filename'].dropna().tolist())
    
    # Match each metadata entry to its PDF
    for idx, row in df_meta.iterrows():
        filename = row['filename']
        
        if pd.isna(filename):
            unmatched_metadata.append((lni_folder, idx, "NaN filename"))
            continue
            
        if filename in pdf_files:
            pdf_path = pdf_files[filename]
            
            # Check exclusion BEFORE adding to pairs
            if should_exclude_pdf(pdf_path):
                continue  # Skip excluded PDFs
                
            pdf_metadata_pairs.append((pdf_path, row))
        else:
            unmatched_metadata.append((lni_folder, filename, "PDF not found"))
    
    # Find PDFs without metadata (for info only)
    for pdf_name, pdf_path in pdf_files.items():
        if pdf_name not in metadata_filenames:
            if not should_exclude_pdf(pdf_path):  # Only report non-excluded
                unmatched_pdfs.append((lni_folder, pdf_name))

# Summary
print(f"{'='*60}")
print("PDF-METADATA MATCHING SUMMARY")
print(f"{'='*60}")
print(f"Successfully matched: {len(pdf_metadata_pairs)} papers")
print(f"Unmatched metadata entries: {len(unmatched_metadata)}")
print(f"PDFs without metadata (non-excluded): {len(unmatched_pdfs)}")

# Show details if there are issues
if unmatched_metadata:
    print(f"\n⚠️  Metadata entries without matching PDF:")
    for item in unmatched_metadata[:10]:  # Show first 10
        print(f"   {item}")
    if len(unmatched_metadata) > 10:
        print(f"   ... and {len(unmatched_metadata) - 10} more")

if unmatched_pdfs:
    print(f"\n⚠️  PDFs without metadata (will be processed separately):")
    for item in unmatched_pdfs[:10]:  # Show first 10
        print(f"   {item}")
    if len(unmatched_pdfs) > 10:
        print(f"   ... and {len(unmatched_pdfs) - 10} more")

# Distribution by year
print(f"\n{'='*60}")
print("DISTRIBUTION BY YEAR")
print(f"{'='*60}")
year_counts = {}
for pdf_path, row in pdf_metadata_pairs:
    year = row['year']
    year_counts[year] = year_counts.get(year, 0) + 1

for year in sorted(year_counts.keys()):
    print(f"  {year}: {year_counts[year]} papers")


PDF-METADATA MATCHING SUMMARY
Successfully matched: 815 papers
Unmatched metadata entries: 7
PDFs without metadata (non-excluded): 6

⚠️  Metadata entries without matching PDF:
   ('lni132', 38, 'NaN filename')
   ('lni284', 'Proceedings_complete.pdf', 'PDF not found')
   ('lni37', 51, 'NaN filename')
   ('lni52', 'GI.-.Proceedings.52-51.pdf', 'PDF not found')
   ('lni52', 56, 'NaN filename')
   ('lni66', 54, 'NaN filename')
   ('lni87', 43, 'NaN filename')

⚠️  PDFs without metadata (will be processed separately):
   ('lni132', '433.pdf')
   ('lni316', 'DELFI_2021_375-376.pdf')
   ('lni316', 'DELFI_2021_23-24.pdf')
   ('lni316', 'DELFI_2021_1-14.pdf')
   ('lni316', 'DELFI_2021_349-350.pdf')
   ('lni316', 'DELFI_2021_15-16.pdf')

DISTRIBUTION BY YEAR
  2003: 51 papers
  2004: 55 papers
  2005: 54 papers
  2006: 43 papers
  2008.0: 38 papers
  2009: 24 papers
  2011: 22 papers
  2012: 30 papers
  2013: 33 papers
  2018: 46 papers
  2019: 59 papers
  2020: 62 papers
  2021: 66 papers
  2

### 2) Pdf files without metadata