In [89]:
### Import relevant libraries
import pandas as pd
from pathlib import Path
from datetime import date
import numpy as np
from tqdm import tqdm

import importlib
import pdf_text_extraction
importlib.reload(pdf_text_extraction) #to solve caching issues
from pdf_text_extraction import (
        extract_text_from_pdf,
        extract_main_content,
        extract_references,
        get_page_count,
        process_pdf_with_metadata,
        process_pdf_without_metadata
    )

import warnings

# Suppress openpyxl extension warnings (common with Excel files from different sources)
warnings.filterwarnings('ignore', category=UserWarning, module='openpyxl')

In [90]:
# =============================================================================
# 1) Configuration
# =============================================================================

DATA_DIR = Path("../data")
OUTPUT_DIR = DATA_DIR  # CSV backup is saved to the data folder


# LNI folder name -> publication year. Necessary for files without metadata, and
# for files with metadata in case the "dc.date.issued" column of the metadata
# Excel file has no value for a row.
LNI_MAPPING = {
    "lni37": 2003,
    "lni52": 2004,
    "lni66": 2005,
    "lni87": 2006,
    "lni111": 2007,
    "lni132": 2008,
    "lni153": 2009,
    "lni169": 2010,
    "lni188": 2011,
    "lni207": 2012,
    "lni218": 2013,
    "lni233": 2014,
    "lni247": 2015,
    "lni262": 2016,
    "lni273": 2017,
    "lni284": 2018,
    "lni297": 2019,
    "lni308": 2020,
    "lni316": 2021,
    "lni322": 2022,
    "lni338": 2023,
    "lni356": 2024,
    "lni369": 2025,
}

# Full-proceedings PDFs to exclude (manually identified). Entries are bare
# filenames without a folder, and are compared case-sensitively against the
# PDF's filename.
FULL_PROCEEDINGS_SET = {
    "lni-p-153-komplett.pdf",
    "lni-p-169-komplett.pdf",
    "lni-p-188-komplett.pdf",
    "lni-p-207-komplett.pdf",
    "lni-p-218-komplett.pdf",
    "lni-p-233-komplett.pdf",
    "lni-p-247-komplett.pdf",
    "lni-p-262-komplett.pdf",
    "lni-p-273-komplett.pdf",
    "proceedings_complete.pdf",
    "DELFI2019_Tagungsband_komplett.pdf",
    "DELFI2019_Tagungsband_komplett_Onlineversion.pdf",
    "DELFI2020_Proceedings_komplett.pdf",
    "DELFI_2021-Proceedings.pdf",
    "DELFI_2022_Proceedings_FINAL.pdf",
    "Komplettband.pdf",
    "DELFI_2024_ProceedingsComplete_alt.pdf",
    "DELFI_2024_ProceedingsComplete.pdf",
    "proceedings.pdf",
    "DELFI2025_ProceedingsComplete.pdf",
}

# Filename keywords that mark non-paper PDFs (covers, prefaces, etc.).
# They are matched as substrings of the lower-cased filename.
EXCLUSION_TERMS = ["cover", "vorwort", "preface", "foreword"]

# Processing settings
MIN_PAGES = 3  # Minimum page count of a paper, i.e. only papers with > 2 pages

print(f"Full proceedings to exclude: {len(FULL_PROCEEDINGS_SET)}")

Full proceedings to exclude: 20


In [91]:
# =============================================================================
# 2) Helper Functions
# =============================================================================


def should_exclude_pdf(pdf_path: Path) -> bool:
    """
    Decide whether a PDF must be left out of the annotation set.

    A PDF is left out when either rule applies:
    - its filename is listed in FULL_PROCEEDINGS_SET (exact, case-sensitive match)
    - its lower-cased filename contains one of the EXCLUSION_TERMS
      (covers, prefaces, forewords)

    Args:
        pdf_path: Path of the PDF; only its filename is inspected

    Returns:
        True if the PDF should be excluded, False otherwise
    """
    name = pdf_path.name

    # Rule 1: manually curated list of full-proceedings files
    if name in FULL_PROCEEDINGS_SET:
        return True

    # Rule 2: keyword contained anywhere in the filename, ignoring case
    lowered = name.lower()
    for term in EXCLUSION_TERMS:
        if term in lowered:
            return True

    return False



def parse_page_range(page_str) -> tuple[int | None, int | None]:
    """
    Parse 'start-end' page range format into (start_page, end_page).
    
    Handles mci.reference.pages format like '101-112'.
    
    Args:
        page_str: String in format 'start-end' or None/NaN
    
    Returns:
        Tuple of (start_page, end_page) as integers, or (None, None) if parsing fails
    """
    if pd.isna(page_str) or not page_str:
        return None, None
    
    parts = str(page_str).strip().split('-')
    if len(parts) == 2:
        try:
            return int(parts[0].strip()), int(parts[1].strip())
        except ValueError:
            return None, None
    return None, None


def load_metadata_excel(metadata_path: Path) -> pd.DataFrame:
    """
    Read one metadata workbook into a DataFrame.

    Workbooks whose filename contains 'DeLFI2018' are read from the sheet
    'Einreichung_GI', because their first sheet ("Arbeitsversion") contains
    dirty data. Every other workbook is read with pandas' default sheet.

    Args:
        metadata_path: Path to the metadata Excel file

    Returns:
        DataFrame with the metadata rows
    """
    read_kwargs = {}
    if 'DeLFI2018' in metadata_path.name:
        read_kwargs['sheet_name'] = 'Einreichung_GI'
    return pd.read_excel(metadata_path, **read_kwargs)
    

# Sanity checks for parse_page_range: (input, expected (start, end)) pairs
for _page_input, _expected_pages in (
    ("101-112", (101, 112)),
    ("1-10", (1, 10)),
    (None, (None, None)),
    ("", (None, None)),
):
    assert parse_page_range(_page_input) == _expected_pages
print("\n✓ parse_page_range tests passed")



✓ parse_page_range tests passed


### 1) PDF files with metadata

In [92]:
# =============================================================================
# 3) Discover and Load Metadata Files
# =============================================================================

# Collect every metadata workbook located one folder level below DATA_DIR
metadata_files = sorted(DATA_DIR.glob("*/metadata*.xlsx"))

print(f"Found {len(metadata_files)} metadata files:\n")

# Per LNI folder: workbook path, raw DataFrame and row count
# -> {lni_folder: {'path': ..., 'df': ..., 'n_rows': ...}}
metadata_dict = {}

for metadata_path in metadata_files:
    df = load_metadata_excel(metadata_path)
    lni_folder = metadata_path.parent.name  # the folder name identifies the LNI volume
    metadata_dict[lni_folder] = dict(path=metadata_path, df=df, n_rows=len(df))
    print(f"  {lni_folder}: {metadata_path.name} ({metadata_dict[lni_folder]['n_rows']} rows)")


print("\n" + "=" * 60)
print(f"Total LNI folders with metadata: {len(metadata_dict)}")
print(f"Total papers in metadata: {sum(entry['n_rows'] for entry in metadata_dict.values())}")

# Quick inspection: show the columns of the first and the last loaded file to see
# how much the column sets vary between volumes.
print(f"\n{'='*60}")
print("Column comparison (first vs last):")

if metadata_dict:
    loaded_folders = list(metadata_dict)
    first_lni = loaded_folders[0]
    last_lni = loaded_folders[-1]

    print(f"\n{first_lni} columns ({len(metadata_dict[first_lni]['df'].columns)}):")
    print(f"  {list(metadata_dict[first_lni]['df'].columns)}")
    print(f"\n{last_lni} columns ({len(metadata_dict[last_lni]['df'].columns)}):")
    print(f"  {list(metadata_dict[last_lni]['df'].columns)}")
else:
    # Without this guard, indexing the empty key list raises IndexError and
    # hides the real problem (no metadata workbook matched the glob pattern).
    print("\n  No metadata files were loaded - nothing to compare.")


Found 17 metadata files:

  lni132: metadata-lni-132.xlsx (39 rows)
  lni153: metadata-lni-153.xlsx (25 rows)
  lni188: metadata-lni-188.xlsx (23 rows)
  lni207: metadata-lni-207.xlsx (31 rows)
  lni218: metadata-lni-218.xlsx (34 rows)
  lni284: metadata-DeLFI2018.xlsx (47 rows)
  lni297: metadata-lni-297.xlsx (60 rows)
  lni308: metadata-delfi-2020.xlsx (64 rows)
  lni316: metadata-lni-316.xlsx (67 rows)
  lni322: metadata_lni-322.xlsx (50 rows)
  lni338: metadata_lni-338.xlsx (67 rows)
  lni356: metadata_lni-356.xlsx (58 rows)
  lni369: metadata_lni-369.xlsx (61 rows)
  lni37: metadata-lni-37.xlsx (52 rows)
  lni52: metadata-lni-52.xlsx (57 rows)
  lni66: metadata-lni-66.xlsx (55 rows)
  lni87: metadata-lni-87.xlsx (44 rows)

Total LNI folders with metadata: 17
Total papers in metadata: 834

Column comparison (first vs last):

lni132 columns (20):
  ['dc.title', 'dc.contributor.author', 'dc.language.iso', 'dc.relation.ispartof', 'dc.contributor.editor', 'mci.reference.pages', 'dc.des

In [93]:
# =============================================================================
# 4) Normalize Metadata Columns
# =============================================================================

# Column mapping: source column(s) -> MySQL column name
# For columns with variants (mci.* vs gi.*), we list alternatives
COLUMN_MAPPING = {
    # Direct mappings (same in all files)
    'dc.title': 'title',
    'dc.contributor.author': 'authors',
    'dc.language.iso': 'language',
    'dc.relation.ispartof': 'proceeding_title',
    'dc.contributor.editor': 'editors',
    'dc.description.abstract': 'abstract',
    'dc.subject': 'subject',
    'filename': 'filename',
    'dc.identifier.doi': 'doi',
    'dc.identifier.isbn': 'isbn',
    'dc.identifier.issn': 'issn',
    'dc.relation.ispartofseries': 'series_title',
    'dc.publisher': 'publisher',
    'dc.pubPlace': 'publication_place',
    'dc.date.issued': 'year',
    'dc.type': 'publication_type',
}

# Columns with mci.* / gi.* variants (the first variant found in a file is used)
VARIANT_COLUMNS = {
    'conference_date': ['mci.conference.date', 'gi.conference.date'],
    'conference_location': ['mci.conference.location', 'gi.conference.location'],
    'session_title': ['mci.conference.sessiontitle', 'gi.conference.sessiontitle'],
}

# Special columns (require parsing or only in some files)
# - mci.reference.pages -> start_page, end_page (15 of 17 files)
# - gi.citation.startPage, gi.citation.endPage (2 of 17 files)
# - gi.conference.review -> peer_review_status (only in gi.* files)


def normalize_metadata_df(df: pd.DataFrame, lni_folder: str) -> pd.DataFrame:
    """
    Normalize a metadata DataFrame to match MySQL column names.

    Handles:
    - Direct column renames (COLUMN_MAPPING)
    - mci.* vs gi.* variants (VARIANT_COLUMNS)
    - Page range parsing (mci.reference.pages -> start_page, end_page)
    - gi.citation.startPage/endPage handling
    - gi.conference.review -> peer_review_status

    Source columns that a file does not provide become all-null columns, so the
    result always has the same columns and the same number of rows as `df`.

    Args:
        df: Raw metadata DataFrame
        lni_folder: LNI folder name (intended for logging; currently not used)

    Returns:
        Normalized DataFrame with MySQL column names, indexed like `df`
    """
    # Start from the input's index. An index-less frame would accept the scalar
    # `None` assignments below as zero-row columns, so an input lacking every
    # mapped column would come back with no rows at all.
    normalized = pd.DataFrame(index=df.index)

    # 1) Direct mappings
    for src_col, mysql_col in COLUMN_MAPPING.items():
        if src_col in df.columns:
            normalized[mysql_col] = df[src_col]
        else:
            normalized[mysql_col] = None

    # 2) Variant columns (mci.* / gi.*): take the first variant that exists
    for mysql_col, variants in VARIANT_COLUMNS.items():
        for variant in variants:
            if variant in df.columns:
                normalized[mysql_col] = df[variant]
                break
        else:
            # for/else: no variant was found in this file
            normalized[mysql_col] = None

    # 3) Page columns - handle both formats
    if 'mci.reference.pages' in df.columns:
        # Parse "101-112" format into a (start, end) tuple per row
        pages = df['mci.reference.pages'].apply(parse_page_range)
        normalized['start_page'] = pages.apply(lambda x: x[0])
        normalized['end_page'] = pages.apply(lambda x: x[1])
    elif 'gi.citation.startPage' in df.columns and 'gi.citation.endPage' in df.columns:
        # Direct columns: unparseable values become <NA> in a nullable integer column
        normalized['start_page'] = pd.to_numeric(df['gi.citation.startPage'], errors='coerce').astype('Int64')
        normalized['end_page'] = pd.to_numeric(df['gi.citation.endPage'], errors='coerce').astype('Int64')
    else:
        normalized['start_page'] = None
        normalized['end_page'] = None

    # 4) Peer review status (only in gi.* files)
    if 'gi.conference.review' in df.columns:
        normalized['peer_review_status'] = df['gi.conference.review']
    else:
        normalized['peer_review_status'] = None

    return normalized



# Normalize every loaded metadata DataFrame, keyed by its LNI folder
normalized_metadata = {
    lni_folder: normalize_metadata_df(data['df'], lni_folder)
    for lni_folder, data in metadata_dict.items()
}
all_normalized_dfs = list(normalized_metadata.values())

# Stack the per-folder frames into one table with a fresh running index
df_all_metadata = pd.concat(all_normalized_dfs, ignore_index=True)

print(f"Normalized metadata for {len(normalized_metadata)} LNI folders")
print(f"Total rows: {len(df_all_metadata)}")
print(f"\nColumns ({len(df_all_metadata.columns)}):")
print(df_all_metadata.columns.tolist())

# Quick check: show the first row as a plain dict
print("\nSample row (first paper):")
print(df_all_metadata.iloc[0].to_dict())


Normalized metadata for 17 LNI folders
Total rows: 834

Columns (22):
['title', 'authors', 'language', 'proceeding_title', 'editors', 'abstract', 'subject', 'filename', 'doi', 'isbn', 'issn', 'series_title', 'publisher', 'publication_place', 'year', 'publication_type', 'conference_date', 'conference_location', 'session_title', 'start_page', 'end_page', 'peer_review_status']

Sample row (first paper):
{'title': 'Gibt es eine Net Generation? Widerlegung einer Mystifizierung', 'authors': 'Schulmeister, Rolf', 'language': 'de', 'proceeding_title': 'DeLFI 2008: Die 6. e-Learning Fachtagung Informatik', 'editors': 'Seehusen, Silke; Lucke, Ulrike; Fischer, Stefan', 'abstract': 'Es ist von einer Net Generation, von der Generation @, der Generation Y oder den Millenials die Rede, und es werden Mutmaßungen über die Rolle der Net Generation für die Lehre angestellt. Der Beitrag ist als kritische Analyse solcher Behauptungen und Mutmaßungen zu verstehen und als Diskurs zur Medien- nutzung aus de

In [94]:
# =============================================================================
# 5) Verify all MySQL columns are present (except id, text, references)
# =============================================================================

# Target schema columns that must come from the metadata. The commented-out
# entries are filled elsewhere (database id, text extracted from the PDF).
MYSQL_COLUMNS = [
    # 'id',  # AUTO_INCREMENT - generated by MySQL
    'title',
    'authors',
    'year',
    'abstract',
    # 'text',  # Extracted from PDF
    # 'references',  # Extracted from PDF
    'start_page',
    'end_page',
    'subject',
    'filename',
    'editors',
    'doi',
    'isbn',
    'issn',
    'proceeding_title',
    'series_title',
    'publisher',
    'publication_place',
    'conference_date',
    'conference_location',
    'session_title',
    'publication_type',
    'language',
    'peer_review_status',
]

# Compare the schema columns with what the combined metadata frame provides
df_columns = set(df_all_metadata.columns)
required_columns = set(MYSQL_COLUMNS)

present = required_columns.intersection(df_columns)
missing = required_columns.difference(df_columns)
extra = df_columns.difference(required_columns)

print("\n" + "=" * 60)
print("MySQL COLUMN VERIFICATION")
print("=" * 60)
print(f"Required columns: {len(MYSQL_COLUMNS)}")
print(f"Present in DataFrame: {len(present)}")

if not missing:
    print("\n✓ All required columns present!")
else:
    print(f"\n❌ MISSING columns ({len(missing)}):")
    for col in sorted(missing):
        print(f"   - {col}")

if extra:
    print(f"\nExtra columns (not in MySQL schema): {sorted(extra)}")



MySQL COLUMN VERIFICATION
Required columns: 22
Present in DataFrame: 22

✓ All required columns present!


In [95]:
# =============================================================================
# 6) Match PDFs with Metadata Entries
# =============================================================================

# For each LNI folder with metadata, pair every metadata row with its PDF on disk
pdf_metadata_pairs = []  # (pdf_path, metadata_row) tuples of usable papers
unmatched_pdfs = []      # (lni_folder, pdf_name): PDF in folder but not in metadata
unmatched_metadata = []  # (lni_folder, row index or filename, reason): row without PDF

for lni_folder, df_meta in normalized_metadata.items():
    lni_path = DATA_DIR / lni_folder

    # PDFs directly inside this folder, keyed by bare filename
    pdf_files = {p.name: p for p in lni_path.glob("*.pdf")}

    # Filenames known to the metadata (rows without a filename are ignored)
    metadata_filenames = set(df_meta['filename'].dropna().tolist())

    # Metadata -> PDF direction
    for idx, row in df_meta.iterrows():
        filename = row['filename']

        if pd.isna(filename):
            unmatched_metadata.append((lni_folder, idx, "NaN filename"))
        elif filename not in pdf_files:
            unmatched_metadata.append((lni_folder, filename, "PDF not found"))
        elif not should_exclude_pdf(pdf_files[filename]):
            # Excluded PDFs (full proceedings, covers, ...) are dropped silently
            pdf_metadata_pairs.append((pdf_files[filename], row))

    # PDF -> metadata direction (for info only); excluded PDFs are not reported
    unmatched_pdfs.extend(
        (lni_folder, pdf_name)
        for pdf_name, pdf_path in pdf_files.items()
        if pdf_name not in metadata_filenames and not should_exclude_pdf(pdf_path)
    )

# Summary of the matching step
print("=" * 60)
print("PDF-METADATA MATCHING SUMMARY")
print("=" * 60)
print(f"Successfully matched: {len(pdf_metadata_pairs)} papers")
print(f"Unmatched metadata entries: {len(unmatched_metadata)}")
print(f"PDFs without metadata (non-excluded): {len(unmatched_pdfs)}")

# Show details if there are issues: at most the first 10 entries per list
for heading, items in (
    ("\n⚠️  Metadata entries without matching PDF:", unmatched_metadata),
    ("\n⚠️  PDFs without metadata (will be processed separately):", unmatched_pdfs),
):
    if not items:
        continue
    print(heading)
    for item in items[:10]:
        print(f"   {item}")
    if len(items) > 10:
        print(f"   ... and {len(items) - 10} more")

# Distribution by year
# The raw 'year' values can be floats (e.g. 2008.0) or NaN, depending on the
# workbook. They are normalised before counting so that each year gets exactly
# one integer-labelled bucket and the sort order is well defined.
print(f"\n{'='*60}")
print("DISTRIBUTION BY YEAR")
print(f"{'='*60}")
year_counts = {}
for pdf_path, row in pdf_metadata_pairs:
    year = row['year']
    if pd.isna(year):
        # No year in the metadata: derive it from the LNI folder (None if unknown)
        year = LNI_MAPPING.get(pdf_path.parent.name)
    else:
        try:
            year = int(float(year))
        except (TypeError, ValueError):
            pass  # keep an unexpected value as-is so it stays visible in the report
    year_counts[year] = year_counts.get(year, 0) + 1

# Sort by text form so mixed value types cannot raise; unknown years go last
for year in sorted(year_counts, key=lambda y: (y is None, str(y))):
    label = "unknown" if year is None else year
    print(f"  {label}: {year_counts[year]} papers")


PDF-METADATA MATCHING SUMMARY
Successfully matched: 815 papers
Unmatched metadata entries: 7
PDFs without metadata (non-excluded): 6

⚠️  Metadata entries without matching PDF:
   ('lni132', 38, 'NaN filename')
   ('lni284', 'Proceedings_complete.pdf', 'PDF not found')
   ('lni37', 51, 'NaN filename')
   ('lni52', 'GI.-.Proceedings.52-51.pdf', 'PDF not found')
   ('lni52', 56, 'NaN filename')
   ('lni66', 54, 'NaN filename')
   ('lni87', 43, 'NaN filename')

⚠️  PDFs without metadata (will be processed separately):
   ('lni132', '433.pdf')
   ('lni316', 'DELFI_2021_375-376.pdf')
   ('lni316', 'DELFI_2021_23-24.pdf')
   ('lni316', 'DELFI_2021_1-14.pdf')
   ('lni316', 'DELFI_2021_349-350.pdf')
   ('lni316', 'DELFI_2021_15-16.pdf')

DISTRIBUTION BY YEAR
  2003: 51 papers
  2004: 55 papers
  2005: 54 papers
  2006: 43 papers
  2008.0: 38 papers
  2009: 24 papers
  2011: 22 papers
  2012: 30 papers
  2013: 33 papers
  2018: 46 papers
  2019: 59 papers
  2020: 62 papers
  2021: 66 papers
  2

#### Metadata entries without matching PDF


-    ('lni132', 38, 'NaN filename')

    - is row 40 in the metadata xlsx file -> contains the full proceedings, but has no filename -> is not important since it would be excluded in the next step anyway

-    ('lni284', 'Proceedings_complete.pdf', 'PDF not found')
    - filename in the metadata xlsx file is wrong -> is "Proceedings_complete.pdf" -> but should be "proceedings_complete.pdf" -> is not important since it would be excluded in the next step anyway

- ('lni37', 51, 'NaN filename')

    - is row 53 in the metadata xlsx file -> contains the full proceedings, but has no filename -> is not important since it would be excluded in the next step anyway

-  ('lni52', 'GI.-.Proceedings.52-51.pdf', 'PDF not found')

    - is actually missing -> the pdf is not part of the folder -> was not part of the data provided in the cloud -> is only n = 1 -> not important 

-    ('lni52', 56, 'NaN filename')

    - is row 58 in the metadata xlsx file -> contains the full proceedings, but has no filename -> is not important since it would be excluded in the next step anyway


- ('lni66', 54, 'NaN filename')

  - is row 56 in the metadata xlsx file -> contains the full proceedings, but has no filename; and the lni folder does not contain a full proceedings pdf -> is not important since it would be excluded in the next step anyway

-   ('lni87', 43, 'NaN filename')

    - is row 45 in the metadata xlsx file -> contains the full proceedings, but has no filename; and the lni folder does not contain a full proceedings pdf -> is not important since it would be excluded in the next step anyway

#### PDFs without metadata (will be processed separately)

-    ('lni132', '433.pdf')

    - pdf file is actually missing in the metadata xlsx file -> can be processed later with all the other papers without metadata 

- lni316 papers: 

    -    ('lni316', 'DELFI_2021_1-14.pdf') -> is the table of contents -> is also not needed later -> does not need to be processed at all 

    -    ('lni316', 'DELFI_2021_15-16.pdf') -> empty pages with "keynote" heading and nothing else -> but will be excluded (2 pages) anyway 

    -    ('lni316', 'DELFI_2021_23-24.pdf') -> only contains "Forschungs-, Praxis- und Positionsbeiträge" and nothing else -> but will be excluded (2 pages) anyway 

    -    ('lni316', 'DELFI_2021_349-350.pdf') -> only contains "Posterbeiträge" heading and nothing else -> but will be excluded (2 pages) anyway 

    -    ('lni316', 'DELFI_2021_375-376.pdf') -> only contains "Demonstrationsbeiträge" heading and nothing else -> but will be excluded (2 pages) anyway 


In [96]:
# Rows of the combined, normalized metadata that have no year value
# (this frame carries no lni_folder column, so only filename and title are shown)
missing_year = df_all_metadata.loc[df_all_metadata['year'].isna()]
print(missing_year[['filename', 'title']].to_numpy())


[[nan 'DeLFI 2008: Die 6. e-Learning Fachtagung Informatik']
 ['Hieke-Keuning.pdf'
  'The interplay between rich and big data in programming education research']]


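- The first row (no filename) is the full-proceedings entry of lni132 that was already reported above as "NaN filename"; it is never matched to a PDF, so its missing year has no effect.

- 'Hieke-Keuning.pdf' is from lni356 -> the column "dc.date.issued" lacks a value for this row -> the year should be 2024

This paper's year can be filled using LNI_MAPPING[lni_folder] as a fallback in step 9 (combining metadata and extracted content).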

In [97]:
# ============================================================
# 7) Filter to relevant PDFs (> 2 pages, not excluded)
# ============================================================

# Apply filters to pdf_metadata_pairs
filtered_pairs = []
excluded_by_name = []   # full proceedings, or filenames containing "cover", "vorwort", "preface", "foreword"
excluded_by_pages = []  # (lni_folder, filename, page_count) of too-short PDFs

for pdf_path, meta_row in pdf_metadata_pairs:
    # Filter 1: Exclusion by filename pattern
    if should_exclude_pdf(pdf_path):
        excluded_by_name.append((pdf_path.parent.name, pdf_path.name))
        continue

    # Filter 2: Page count - the threshold comes from the MIN_PAGES setting in
    # the configuration cell instead of a second, hard-coded copy of the number
    page_count = get_page_count(pdf_path)
    if page_count < MIN_PAGES:
        excluded_by_pages.append((pdf_path.parent.name, pdf_path.name, page_count))
        continue

    # Passed all filters
    filtered_pairs.append((pdf_path, meta_row))

# ============================================================
# SUMMARY
# ============================================================
print("=" * 60)
print("FILTERING SUMMARY")
print("=" * 60)
print(f"  Original matched pairs: {len(pdf_metadata_pairs)}")
print(f"  Excluded by filename:   {len(excluded_by_name)}")
print(f"  Excluded by page count: {len(excluded_by_pages)}")
print(f"  Remaining after filter: {len(filtered_pairs)}")
print()

if excluded_by_name:
    print("Files excluded by filename pattern:")
    for lni, fname in excluded_by_name:
        print(f"    {lni}: {fname}")
    print()

if excluded_by_pages:
    # Label follows the configured threshold (prints "<= 2 pages" for MIN_PAGES = 3)
    print(f"Files excluded by page count (<= {MIN_PAGES - 1} pages):")
    for lni, fname, pages in excluded_by_pages:
        print(f"    {lni}: {fname} ({pages} pages)")


FILTERING SUMMARY
  Original matched pairs: 815
  Excluded by filename:   0
  Excluded by page count: 212
  Remaining after filter: 603

Files excluded by page count (<= 2 pages):
    lni132: 425.pdf (2 pages)
    lni132: 427.pdf (2 pages)
    lni132: 429.pdf (2 pages)
    lni132: 431.pdf (2 pages)
    lni207: 13.pdf (1 pages)
    lni207: 14.pdf (1 pages)
    lni284: proceedings_01.pdf (1 pages)
    lni284: proceedings_02.pdf (1 pages)
    lni284: proceedings_27.pdf (2 pages)
    lni284: proceedings_28.pdf (2 pages)
    lni284: proceedings_29.pdf (2 pages)
    lni284: proceedings_30.pdf (2 pages)
    lni284: proceedings_31.pdf (2 pages)
    lni284: proceedings_32.pdf (2 pages)
    lni284: proceedings_33.pdf (2 pages)
    lni284: proceedings_34.pdf (2 pages)
    lni284: proceedings_35.pdf (2 pages)
    lni284: proceedings_36.pdf (2 pages)
    lni284: proceedings_37.pdf (2 pages)
    lni284: proceedings_38.pdf (2 pages)
    lni284: proceedings_39.pdf (2 pages)
    lni284: proceedings_40.

- The n = 212 PDFs excluded by page count is plausible: across all PDF files (with and without metadata files, n = 1,075) there should be n = 248 papers with <= 2 pages.

-  Excluded by filename:   0 -> all full-proceedings PDFs were already filtered out in the preceding filename-matching step! (see the cell output below)

In [98]:
# Debug: Check for suspiciously large papers (potential full proceedings).
# Each PDF's page count is read once and reused for the filter and the report,
# instead of opening every PDF twice.
suspicious = [
    (pdf_path.parent.name, pdf_path.name, pages)
    for pdf_path, pages in (
        (path, get_page_count(path)) for path, _ in filtered_pairs
    )
    if pages > 25
]

print(f"Papers with > 25 pages: {len(suspicious)}")
# Longest papers first
for lni, fname, pages in sorted(suspicious, key=lambda x: -x[2]):
    print(f"  {lni}: {fname} ({pages} pages)")


Papers with > 25 pages: 0


In [100]:
# ============================================================
# 8) Extract text, main content, references for each PDF
# ============================================================

# Result containers
extraction_results = []  # dicts: pdf_path, meta_row, text, references
extraction_errors = []   # (lni_folder, filename, reason)
corrupted_pdfs = []      # (lni_folder, filename)

print(f"Extracting content from {len(filtered_pairs)} PDFs...")
print()

for pdf_path, meta_row in tqdm(filtered_pairs, desc="Extracting"):
    try:
        # Raw text first; nothing else can be done without it
        raw_text = extract_text_from_pdf(pdf_path)

        if raw_text is None or not raw_text.strip():
            extraction_errors.append((pdf_path.parent.name, pdf_path.name, "Empty text"))
            continue

        # Split the raw text into main content and reference list
        main_content = extract_main_content(raw_text)
        references = extract_references(raw_text)

        if main_content == "Corrupted text":
            # Marker value for garbled text: keep the file out of the results
            corrupted_pdfs.append((pdf_path.parent.name, pdf_path.name))
        else:
            extraction_results.append(dict(
                pdf_path=pdf_path,
                meta_row=meta_row,
                text=main_content,
                references=references,
            ))

    except Exception as e:
        # Best effort: record the failure and continue with the next PDF
        extraction_errors.append((pdf_path.parent.name, pdf_path.name, str(e)))

# ============================================================
# SUMMARY
# ============================================================
print()
print("=" * 60)
print("EXTRACTION SUMMARY")
print("=" * 60)
print(f"  Successfully extracted: {len(extraction_results)}")
print(f"  Corrupted PDFs:         {len(corrupted_pdfs)}")
print(f"  Extraction errors:      {len(extraction_errors)}")
print()

# Both detail lists are capped at their first 10 entries
if len(corrupted_pdfs) > 0:
    print("Corrupted PDFs (garbled encoding) - EXCLUDED from results:")
    for lni, fname in corrupted_pdfs[:10]:
        print(f"    {lni}: {fname}")
    if len(corrupted_pdfs) > 10:
        print(f"    ... and {len(corrupted_pdfs) - 10} more")
    print()

if len(extraction_errors) > 0:
    print("Extraction errors:")
    for lni, fname, err in extraction_errors[:10]:
        print(f"    {lni}: {fname} - {err}")
    if len(extraction_errors) > 10:
        print(f"    ... and {len(extraction_errors) - 10} more")


Extracting content from 603 PDFs...



Extracting:   1%|          | 5/603 [00:00<00:15, 39.81it/s]

         Extraction not possible. Returning None.


Extracting:   5%|▌         | 31/603 [00:00<00:14, 40.08it/s]

         Extraction not possible. Returning None.


Extracting:   6%|▌         | 37/603 [00:00<00:13, 42.23it/s]

         Extraction not possible. Returning None.


Extracting:  13%|█▎        | 79/603 [00:01<00:11, 44.91it/s]

         Extraction not possible. Returning None.


Extracting:  24%|██▎       | 142/603 [00:03<00:07, 63.71it/s]

         Extraction not possible. Returning None.


Extracting:  85%|████████▌ | 515/603 [00:09<00:01, 55.81it/s]

         Extraction not possible. Returning None.


Extracting:  96%|█████████▌| 577/603 [00:11<00:00, 44.45it/s]

         Extraction not possible. Returning None.
         Extraction not possible. Returning None.


Extracting: 100%|██████████| 603/603 [00:11<00:00, 51.77it/s]


EXTRACTION SUMMARY
  Successfully extracted: 595
  Corrupted PDFs:         8
  Extraction errors:      0

Corrupted PDFs (garbled encoding) - EXCLUDED from results:
    lni132: 29.pdf
    lni132: 281.pdf
    lni132: 413.pdf
    lni188: 151.pdf
    lni218: 277.pdf
    lni52: GI.-.Proceedings.52-5.pdf
    lni66: GI-Proceedings.66-43.pdf
    lni87: GI-Proceedings-87-7.pdf






In [101]:
# ============================================================
# 9) Combine metadata + extracted content into final DataFrame
# ============================================================

# Metadata columns of a record, in output order. 'year', 'start_page' and
# 'end_page' are replaced by their cleaned values; the rest is copied as-is.
metadata_fields = (
    'title', 'authors', 'year', 'abstract', 'start_page', 'end_page',
    'subject', 'filename', 'editors', 'doi', 'isbn', 'issn',
    'proceeding_title', 'series_title', 'publisher', 'publication_place',
    'conference_date', 'conference_location', 'session_title',
    'publication_type', 'language', 'peer_review_status',
)

# One record (dict) per successfully extracted PDF
records = []

for result in extraction_results:
    pdf_path = result['pdf_path']
    meta = result['meta_row']

    # The LNI folder is the source of the year fallback
    lni_folder = pdf_path.parent.name

    # Year: metadata value as int; if missing, look the folder up in LNI_MAPPING
    year_val = meta.get('year')
    if pd.isna(year_val):
        year = LNI_MAPPING.get(lni_folder, None)
        if year is not None:
            print(f"  Fixed missing year: {meta.get('filename')} → {year} (from {lni_folder})")
    else:
        year = int(float(year_val))

    # Pages: plain ints, or None where the metadata has no value
    start_page_val = meta.get('start_page')
    end_page_val = meta.get('end_page')
    start_page = None if pd.isna(start_page_val) else int(start_page_val)
    end_page = None if pd.isna(end_page_val) else int(end_page_val)

    cleaned = {'year': year, 'start_page': start_page, 'end_page': end_page}

    # Record matching the MySQL schema: extracted content first, then metadata
    record = {'text': result['text'], 'references': result['references']}
    for field in metadata_fields:
        record[field] = cleaned[field] if field in cleaned else meta.get(field)
    records.append(record)

# Create DataFrame
df_final = pd.DataFrame(records)

# ============================================================
# VERIFICATION
# ============================================================
print("=" * 60)
print("YEAR VERIFICATION")
print("=" * 60)
print(f"Year dtype: {df_final['year'].dtype}")
print(f"NaN years: {df_final['year'].isna().sum()}")
print()
print("Year distribution (should be integers only):")
print(df_final['year'].value_counts().sort_index())

print()
print("=" * 60)
print("PAGE VERIFICATION")
print("=" * 60)
print(f"start_page dtype: {df_final['start_page'].dtype}")
print(f"end_page dtype: {df_final['end_page'].dtype}")
print(f"Sample pages: {df_final[['start_page', 'end_page']].head(3).values.tolist()}")

print()
print("=" * 60)
print("FINAL DATAFRAME SUMMARY")
print("=" * 60)
print(f"Total papers: {len(df_final)}")
print(f"Columns: {len(df_final.columns)}")


  Fixed missing year: Hieke-Keuning.pdf → 2024 (from lni356)
YEAR VERIFICATION
Year dtype: int64
NaN years: 0

Year distribution (should be integers only):
year
2003    51
2004    29
2005    41
2006    29
2008    31
2009    24
2011    21
2012    28
2013    32
2018    24
2019    32
2020    41
2021    44
2022    24
2023    29
2024    57
2025    58
Name: count, dtype: int64

PAGE VERIFICATION
start_page dtype: int64
end_page dtype: int64
Sample pages: [[15, 28], [41, 52], [53, 64]]

FINAL DATAFRAME SUMMARY
Total papers: 595
Columns: 24


In [102]:
# ============================================================
# 10) Quality Sampling and Inspection
# ============================================================
# Draw a reproducible random subset of df_final, print a short preview of
# every sampled paper, then summarise field coverage and page-range sanity.

SAMPLE_SIZE = 50
np.random.seed(42)
n_draw = min(SAMPLE_SIZE, len(df_final))  # never request more rows than exist
sample_indices = np.random.choice(len(df_final), size=n_draw, replace=False)
df_sample = df_final.iloc[sample_indices].copy()

divider = "=" * 60
print(divider)
print(f"QUALITY SAMPLE ({len(df_sample)} papers)")
print(divider)

for paper_no, (_, paper) in enumerate(df_sample.iterrows(), start=1):
    print(f"\n--- Paper {paper_no} ---")

    # Title: truncated to 80 characters when longer
    title = paper['title']
    if pd.notna(title) and len(str(title)) > 80:
        print(f"Title: {title[:80]}...")
    else:
        print(f"Title: {title}")

    print(f"Year: {paper['year']}")

    # Authors: truncated to 60 characters when longer
    authors = paper['authors']
    if pd.notna(authors) and len(str(authors)) > 60:
        print(f"Authors: {str(authors)[:60]}...")
    else:
        print(f"Authors: {authors}")

    print(f"Filename: {paper['filename']}")
    print(f"Pages: {paper['start_page']} - {paper['end_page']}")

    # Text quality check: length plus a single-line preview
    body = paper['text']
    if pd.isna(body):
        print("Text: NULL")
    else:
        body_preview = body[:150].replace('\n', ' ')
        print(f"Text ({len(body)} chars): {body_preview}...")

    # References quality check: missing and empty values are reported together
    ref_block = paper['references']
    if pd.isna(ref_block) or len(str(ref_block)) == 0:
        print("References: NULL/Empty")
    else:
        ref_preview = ref_block[:100].replace('\n', ' ')
        print(f"References ({len(ref_block)} chars): {ref_preview}...")

# ============================================================
# QUICK STATS
# ============================================================
sample_total = len(df_sample)
filled = df_sample.notna().sum()  # non-null count per column

print("\n" + divider)
print("SAMPLE STATISTICS")
print(divider)
print(f"Papers with text: {filled['text']}/{sample_total}")
print(f"Papers with references: {filled['references']}/{sample_total}")
print(f"Papers with abstract: {filled['abstract']}/{sample_total}")
print(f"Papers with start_page: {filled['start_page']}/{sample_total}")
print(f"Papers with end_page: {filled['end_page']}/{sample_total}")
print(f"Avg text length: {df_sample['text'].str.len().mean():.0f} chars")

print("\n" + divider)
print("PAGE RANGE VALIDATION")
print(divider)
# A range is invalid when both pages are present and start lies after end
first_page = df_sample['start_page']
last_page = df_sample['end_page']
reversed_range = first_page.notna() & last_page.notna() & (first_page > last_page)
invalid_pages = df_sample[reversed_range]
print(f"Invalid page ranges (start > end): {len(invalid_pages)}")

QUALITY SAMPLE (50 papers)

--- Paper 1 ---
Title: Interaktive Unterstützung bei der Begehung eines Kunstlehrpfads: Die WaldSkulptu...
Year: 2019
Authors: Seiffert, Sven-Bjarne; Lingnau, Andreas 
Filename: DELFI2019_221_Interaktive_Unterstuetzung_bei_der_Begehung_eines_Kunstlehrpfads_-_Die_WaldSkulpturenWeg-App.pdf
Pages: 253 - 258
Text (16388 chars): 1  Einleitung  Geographisch durch den Kamm des Rothaargebirges getrennt und kulturhistorisch – insbesondere durch die Reformation – unterschiedlich ge...
References (847 chars): [CO14]  Cook, M; van Riemsdijk, M.: Agents of memorialization: Gunter Demnig's  Stolpersteine and th...

--- Paper 2 ---
Title:  Eine Kategorisierung und Katalogisierung von AR & VR Projekten für die (Hoch-) ...
Year: 2021
Authors: Horn, Florian; Dietze, Andreas;  Doerner, Ralf;  Grimm, Paul...
Filename: DELFI_2021_79-84.pdf
Pages: 79 - 84
Text (10156 chars): 1  Einleitung  Die Nutzung von Virtual Reality (VR) und Augmented Reality (AR) für die (Hoch- )schullehre 

In [103]:
# ============================================================
# 11) NA Analysis by Column
# ============================================================
# For every column of df_final, count missing values, print a table sorted
# by missing share, and group the columns into completeness buckets.

total_rows = len(df_final)
rule = "=" * 60

print(rule)
print("NA/NULL ANALYSIS BY COLUMN")
print(rule)
print(f"Total papers: {total_rows}")
print()

# Calculate NA stats for each column (kept in original column order)
na_stats = []
for col in df_final.columns:
    missing = df_final[col].isna().sum()
    na_stats.append({
        'column': col,
        'non_null': total_rows - missing,
        'null': missing,
        'null_pct': (missing / total_rows) * 100,
    })

# Highest null percentage first; sorted() is stable, so ties keep column order
na_stats_sorted = sorted(na_stats, key=lambda entry: entry['null_pct'], reverse=True)

# Display results
print(f"{'Column':<25} {'Non-Null':>10} {'Null':>10} {'Null %':>10}")
print("-" * 60)
for entry in na_stats_sorted:
    print(f"{entry['column']:<25} {entry['non_null']:>10} {entry['null']:>10} {entry['null_pct']:>9.1f}%")

# ============================================================
# COLUMNS BY COMPLETENESS CATEGORY
# ============================================================
print()
print(rule)
print("COLUMNS BY COMPLETENESS")
print(rule)

# Single pass over the unsorted stats; bucket boundaries are
# 0, (0, 10], (10, 50] and > 50 percent missing
complete, mostly_complete, partial, mostly_empty = [], [], [], []
for entry in na_stats:
    pct = entry['null_pct']
    if pct == 0:
        complete.append(entry['column'])
    elif 0 < pct <= 10:
        mostly_complete.append(entry['column'])
    elif 10 < pct <= 50:
        partial.append(entry['column'])
    elif pct > 50:
        mostly_empty.append(entry['column'])

print(f"\n✓ Complete (0% null): {len(complete)} columns")
for name in complete:
    print(f"    {name}")

print(f"\n◐ Mostly complete (1-10% null): {len(mostly_complete)} columns")
for name in mostly_complete:
    print(f"    {name}")

print(f"\n◔ Partial (11-50% null): {len(partial)} columns")
for name in partial:
    print(f"    {name}")

print(f"\n○ Mostly empty (>50% null): {len(mostly_empty)} columns")
for name in mostly_empty:
    print(f"    {name}")


NA/NULL ANALYSIS BY COLUMN
Total papers: 595

Column                      Non-Null       Null     Null %
------------------------------------------------------------
peer_review_status                86        509      85.5%
doi                              200        395      66.4%
subject                          311        284      47.7%
issn                             451        144      24.2%
isbn                             480        115      19.3%
publication_place                508         87      14.6%
session_title                    530         65      10.9%
references                       586          9       1.5%
abstract                         592          3       0.5%
text                             595          0       0.0%
title                            595          0       0.0%
authors                          595          0       0.0%
year                             595          0       0.0%
start_page                       595          0       0.0%
end_page

- All columns that must be NOT NULL (title, authors, year, text, filename) contain no missing values, so no further cleaning is required for them.

In [104]:
# Second manual spot check: draw a reproducible random sample of df_final
# and display it as the cell output.

SAMPLE_SIZE = 30
np.random.seed(59)  # reseeds NumPy's global RNG; sample() below gets its own random_state
df_sample_manual = df_final.sample(SAMPLE_SIZE, random_state=59)
df_sample_manual

Unnamed: 0,text,references,title,authors,year,abstract,start_page,end_page,subject,filename,editors,doi,isbn,issn,proceeding_title,series_title,publisher,publication_place,conference_date,conference_location,session_title,publication_type,language,peer_review_status
302,1 \nEinleitung \nSelbstreguliert lernen zu kön...,"[Ba09] \nBannert, M.: Promoting self-regulated...",Help me to help myself: Eine Feldstudie zur Wi...,"Radtke, Anna; Osinski, Meike; Scheffel, Maren;...",2023,Lerndatenanalysen eröffnen in digital gestützt...,29,40,"Selbstreguliertes Lernen, SRL-Kompetenzen, Ler...",11.pdf,"Röpke, René; Schroeder, Ulrik",10.18420/delfi2023-11,978-3-88579-732-6,,21. Fachtagung Bildungstechnologien (DELFI),Lecture Notes in Informatics (LNI) - Proceedin...,Gesellschaft für Informatik e.V.,,11.-13. September 2023,Aachen,Best-Paper-Kandidaten,Text/Conference Paper,de,full
402,1\nIntroduction\nThe acronym FAIR refers to re...,[As]\nAssociation for Computing Machinery: ACM...,A Publication Framework for Research Data in t...,"Schulz, Sandra; Schiffner, Daniel; Kiesler, Na...",2025,The interdisciplinary nature of the Educationa...,203,211,"Research Data Publication Framework, Open Data...",paper-18.pdf,"Köppen, Veit; Strickroth, Sven",10.18420/delfi2025_18,,,23. Fachtagung Bildungstechnologien (DELFI 2025),Lecture Notes in Informatics (LNI) - Proceedin...,Gesellschaft für Informatik e.V.,,8.–11. September 2025,Freiberg,Open Science/Software,Text/Conference Paper,en,full
382,1\nIntroduction\nWith the advance of Campus Ma...,"[BCR18]\nBogarín, A.; Cerezo, R.; Romero, C.: ...",BuddyAnalytics: A dashboard and reporting tool...,"Görzen, Sergej; Röpke, René; Schroeder, Ulrik",2024,With students leaving digital traces in Campus...,527,531,"Cohort Monitoring, Study Program Analysis, Cur...",Sergej-Goerzen.pdf,"Schulz, Sandra; Kiesler, Natalie",10.18420/delfi2024_53,,,Proceedings of DELFI 2024,DELFI,Gesellschaft für Informatik e.V.,Bonn,09.-11. September 2024,Fulda,Demo und Poster Session,Text/Conference demo,en,
15,1 Motivation\nVorlesungsaufzeichnungen werden ...,,Weg von der klassischen Frontalveranstaltung -...,"Wichelhaus, Svenja; Schüler, Thomas; Morisse,...",2008,Dieser Beitrag beschreibt eine qualitative Eva...,209,220,,209.pdf,"Seehusen, Silke; Lucke, Ulrike; Fischer, Stefan",,978-3-88579-226-0,1617-5468,DeLFI 2008: Die 6. e-Learning Fachtagung Infor...,Lecture Notes in Informatics (LNI) - Proceedin...,Gesellschaft für Informatik e. V.,Bonn,07. - 10. September 2008,Lübeck,Regular Research Papers,Text/Conference Paper,de,
182,1 \nIntroduction \nAccording to the Ljubljana ...,"[Ab12] \nAbelson, H.; Adida, B.; Linksvayer, M...",Simplifying license attribution for OER with e...,"Lechtenbörger, Jens",2019,Open Educational Resources (OER) come with dif...,205,216,"Open Educational Resource, OER, Attribution, C...",DELFI2019_280_Simplifying_license_attribution_...,"Pinkwart, Niels; Konert, Johannes",10.18420/delfi2019_280,978-3-88579-691-6,1617-5468,DELFI 2019,Lecture Notes in Informatics (LNI) - Proceedin...,Gesellschaft für Informatik e.V.,Bonn,16.-19. September 2019,"Berlin, Germany",Recht & Ethik,Text/Conference Paper,en,
497,...,,Zum Design inspirationaler Lernräume,"Wagner, Ina",2004,,23,26,,GI.-.Proceedings.52-2.pdf,"Engels, Gregor; Seehusen, Silke",,3-88579-381-4,1617-5468,DeLFI 2004: Die 2. e-Learning Fachtagung Infor...,Lecture Notes in Informatics (LNI) - Proceedin...,Gesellschaft für Informatik e.V.,Bonn,6.-8. September 2004,Paderborn,Regular Research Papers,Text/Conference Paper,de,
170,1 \nEinleitung \nAutomatisiert bewertbare Prog...,"[AB06] \nAttali, Y.; Burstein, J.: Automated E...",Ein Format für Bewertungsvorschriften in autom...,"Garmann, Robert",2019,Automatisiert bewertbare Programmieraufgaben d...,103,114,"Automatisierte Bewertung, Programmieraufgaben,...",DELFI2019_73_Ein_Format_fuer_Bewertungsvorschr...,"Pinkwart, Niels; Konert, Johannes",10.18420/delfi2019_73,978-3-88579-691-6,1617-5468,DELFI 2019,Lecture Notes in Informatics (LNI) - Proceedin...,Gesellschaft für Informatik e.V.,Bonn,16.-19. September 2019,"Berlin, Germany",Automatisierung & Generierung,Text/Conference Paper,de,
94,1 Einleitung\nDer von uns verfolgte Zweck best...,"[Bi10]\nBisitz, S.; Wenzel, J.; Riegler, P.: C...",Aktivierende Online-Lehre in der Mathematik mi...,"Bisitz, Stefan; Jensen, Nils",2012,Im Erfahrungsbericht spezifizieren wir unseren...,219,224,,219.pdf,"Desel, Jörg; Haake, Jörg M.; Spannagel, Christian",,978-3-88579-601-5,1617-5468,DeLFI 2012: Die 10. e-Learning Fachtagung Info...,Lecture Notes in Informatics (LNI) - Proceedin...,Gesellschaft für Informatik e.V.,Bonn,24.-26 September 2012,Hagen,Regular Research Papers,Text/Conference Paper,de,
30,1\nEinf¨uhrung und Motivation\nIn den vergange...,"[Ap01]\nApache, Sample dom.DOMAddLines. Intern...",XML-basierte dreidimensionale Animation von Al...,"Baker, Ashraf Abu; Grunwald, Dirk; Kappes, Stefan",2008,Der zunehmende Gewinn an Bedeutung und Akzepta...,401,412,,401.pdf,"Seehusen, Silke; Lucke, Ulrike; Fischer, Stefan",,978-3-88579-226-0,1617-5468,DeLFI 2008: Die 6. e-Learning Fachtagung Infor...,Lecture Notes in Informatics (LNI) - Proceedin...,Gesellschaft für Informatik e. V.,Bonn,07. - 10. September 2008,Lübeck,Regular Research Papers,Text/Conference Paper,de,
8,1 Motivation\nDas Internet bietet eine Vielfal...,"[Be07]\nBenz, B. F., Polushkina, S., Schmitz, ...",Einsatz und Evaluation eines Zielmanagement-We...,"Scholl, Philipp; Benz, Bastian; Böhnstedt, Do...",2008,Das effiziente und effektive selbstständige L...,125,136,,125.pdf,"Seehusen, Silke; Lucke, Ulrike; Fischer, Stefan",,978-3-88579-226-0,1617-5468,DeLFI 2008: Die 6. e-Learning Fachtagung Infor...,Lecture Notes in Informatics (LNI) - Proceedin...,Gesellschaft für Informatik e. V.,Bonn,07. - 10. September 2008,Lübeck,Regular Research Papers,Text/Conference Paper,de,


- Missing values (NaN) must be converted to None before the data is inserted into the MySQL database table, so that they are handled in a MySQL-compatible way.

In [105]:
# ============================================================
# 12) Save to CSV
# ============================================================
# Build the export frame (missing values as None) and write it to a dated
# CSV file inside DATA_DIR / "preprocessed".

# Convert NaN to None for MySQL compatibility.
# The cast to object comes first: without it, numeric columns cannot hold None
# and `where(..., None)` silently leaves NaN in place.
df_export = df_final.astype(object).where(df_final.notna(), None) # see e.g.: https://stackoverflow.com/questions/14162723/replacing-pandas-or-numpy-nan-with-a-none-to-use-with-mysqldb

# Create preprocessed subdirectory if it doesn't exist
# (parents=True also creates DATA_DIR itself should it be missing)
OUTPUT_DIR = DATA_DIR / "preprocessed"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Generate filename with today's date
today = date.today().isoformat()  # Format: YYYY-MM-DD
csv_filename = f"delfi_paper_with_metadata_{today}.csv"
csv_path = OUTPUT_DIR / csv_filename

# Save the export frame to CSV. None is written as an empty field, exactly
# like NaN, so the file content matches what df_final would produce while
# the frame that is saved is the same one prepared for the database.
df_export.to_csv(csv_path, index=False, encoding='utf-8')


In [106]:
# ============================================================
# Debug: Check NOT NULL columns in saved CSV
# ============================================================
# Read the CSV back from disk and report, for each required column, either
# the rows with a missing value or an OK line.

df_check = pd.read_csv(csv_path)

NOT_NULL_COLUMNS = ['title', 'authors', 'year', 'text', 'filename']

for required_col in NOT_NULL_COLUMNS:
    is_missing = df_check[required_col].isna()
    if not is_missing.any():
        print(f"'{required_col}': OK (no missing values)")
        continue
    # Show only the identifying columns of the offending rows
    null_rows = df_check.loc[is_missing, ['filename', 'title', 'authors', 'year']]
    print(f"\n'{required_col}' has {len(null_rows)} missing value(s):")
    print(null_rows.to_string())


'title': OK (no missing values)
'authors': OK (no missing values)
'year': OK (no missing values)
'text': OK (no missing values)
'filename': OK (no missing values)


### 2) PDF files without metadata