In [1]:
# Load libraries

import pandas as pd 
import os 
from pathlib import Path
import warnings
import numpy as np

## 1) Metadata files inspection

In [3]:
# pattern of files: metadata-lni-37.xlsx, metadata-lni-52.xlsx, metadata_lni-338, metadata-delfi-2020.xlsx

# Ignore pandas read excel warnings
warnings.filterwarnings("ignore")

# Get current wd
os.getcwd()

# Get the data directory as Path object
data_dir = Path("../data")

# List all sub-directories (contain the annual publications)
data_subdir = [a.name for a in data_dir.iterdir() if a.is_dir()]

print("Sub-directories in current directory:")
print(data_subdir)
print("Number of sub-directories:")
print(len(data_subdir))

# Create list to store metadatafiles
list_metadata_files = []

# Pattern to find all metadata files (starts with "metadata" and ends with .xlsx)
metadata_paths = list(data_dir.glob("*/metadata*.xlsx"))

# Loop through metadata paths
for metadata_path in metadata_paths:
    print(f" {metadata_path}")
    # Append to list
    list_metadata_files.append(metadata_path)

print("Numter of metadata files found:")
print(len(list_metadata_files))

Sub-directories in current directory:
['lni132', 'lni338', 'lni169', 'lni52', 'lni308', 'lni37', 'lni218', 'lni233', 'lni269', 'lni111', 'lni322', 'lni188', 'lni153', 'lni66', 'lni369', 'lni356', 'lni284', 'lni247', 'lni262', 'lni207', 'lni297', 'lni87', 'lni316']
Number of sub-directories:
23
 ../data/lni132/metadata-lni-132.xlsx
 ../data/lni338/metadata_lni-338.xlsx
 ../data/lni52/metadata-lni-52.xlsx
 ../data/lni308/metadata-delfi-2020.xlsx
 ../data/lni37/metadata-lni-37.xlsx
 ../data/lni218/metadata-lni-218.xlsx
 ../data/lni322/metadata_lni-322.xlsx
 ../data/lni188/metadata-lni-188.xlsx
 ../data/lni153/metadata-lni-153.xlsx
 ../data/lni66/metadata-lni-66.xlsx
 ../data/lni369/metadata_lni-369.xlsx
 ../data/lni356/metadata_lni-356.xlsx
 ../data/lni284/metadata-DeLFI2018.xlsx
 ../data/lni207/metadata-lni-207.xlsx
 ../data/lni297/metadata-lni-297.xlsx
 ../data/lni87/metadata-lni-87.xlsx
 ../data/lni316/metadata-lni-316.xlsx
Numter of metadata files found:
17


In [4]:
# Check, whether all metadata files have the same columns 

## List to store column names
list_of_columns = []

for metadata_file in list_metadata_files:
    df = pd.read_excel(metadata_file)
    list_of_columns.append(set(df.columns))

## Check if all sets are equal
first = list_of_columns[0]
all_same = all(cols == first for cols in list_of_columns)

print("All files have the same columns:", all_same)

All files have the same columns: False


In [5]:
# Check, which columns are different
all_cols = []
for f in list_metadata_files:
    df = pd.read_excel(f)
    all_cols.append(set(df.columns))

common = set.intersection(*all_cols)
union  = set.union(*all_cols)

print("Common columns in ALL files:", common)
print("All unique columns across files:", union)
print("Columns that vary (union - common):", union - common)


Common columns in ALL files: {'dc.subject', 'dc.description.abstract', 'dc.relation.ispartofseries', 'dc.type', 'dc.relation.ispartof', 'dc.publisher', 'dc.title', 'dc.language.iso', 'filename', 'dc.contributor.author', 'dc.contributor.editor', 'dc.date.issued'}
All unique columns across files: {'dc.identifier.pissn', 'dc.description.abstract', 'dc.publisher', 'gi.tag', 'dc.pubPlace', 'gi.citation.publisherPlace', 'filename', 'dc.type', 'ID', 'mci.reference.pages', 'dc.relation.ispartof', 'dc.language.iso', 'gi.conference.date', 'dc.identifier.isbn', 'mci.document.quality', 'gi.citation.startPage', 'dc.contributor.editor', 'dc.date.issued', 'gi.conference.sessiontitle', 'dc.subject', 'dc.relation.ispartofseries', 'dc.title', 'mci.conference.sessiontitle', 'dc.identifier.issn', 'Beitragsart', 'gi.citation.endPage', 'mci.conference.date', 'mci.conference.location', 'gi.conference.review', 'dc.identifier.doi', 'gi.conference.location', 'Nr.', 'dc.contributor.author'}
Columns that vary (un

- Pattern: use of "gi." vs. "mci."

### 1.1) Decide which of all columns from the metadatafiles to keep for MySQL "paper" table

In [6]:
# Print out varying number and types of columns:

## Store pandas df's in a list
df_metadata_list = []

for metadata_file in list_metadata_files:
    # Special handling for DeLFI2018 file - use correct tab
    if 'DeLFI2018' in metadata_file.name:
        df = pd.read_excel(metadata_file, sheet_name='Einreichung_GI') # the first sheet "Arbeitsversion" contains dirty data
    else:
        df = pd.read_excel(metadata_file)
    
    print(f"Number of columns: {len(df.columns)}")
    print(f"Columns: {df.columns.tolist()}")
    df_metadata_list.append(df)


Number of columns: 20
Columns: ['dc.title', 'dc.contributor.author', 'dc.language.iso', 'dc.relation.ispartof', 'dc.contributor.editor', 'mci.reference.pages', 'dc.description.abstract', 'dc.subject', 'filename', 'dc.identifier.doi', 'dc.identifier.isbn', 'dc.identifier.issn', 'dc.relation.ispartofseries', 'dc.publisher', 'dc.pubPlace', 'dc.date.issued', 'mci.conference.date', 'mci.conference.location', 'mci.conference.sessiontitle', 'dc.type']
Number of columns: 23
Columns: ['dc.title', 'dc.contributor.author', 'dc.language.iso', 'dc.relation.ispartof', 'dc.contributor.editor', 'dc.relation.ispartofseries', 'dc.publisher', 'gi.citation.publisherPlace', 'dc.date.issued', 'dc.description.abstract', 'dc.subject', 'gi.tag', 'dc.identifier.doi', 'dc.identifier.pissn', 'dc.identifier.isbn', 'gi.citation.startPage', 'gi.citation.endPage', 'gi.conference.date', 'gi.conference.location', 'gi.conference.sessiontitle', 'dc.type', 'gi.conference.review', 'filename']
Number of columns: 19
Columns:

- 'dc.title' -> keep (column name in MySQL: "title")

- 'dc.contributor.author' -> keep (column name in MySQL: "authors")

- 'dc.language.iso' -> keep (column name in MySQL: "language")

- 'dc.relation.ispartof' -> keep (column name in MySQL: "proceeding_title") -> states the year and number of the DeLFI conference 

- 'dc.contributor.editor' -> keep (column name in MySQL: "editors") -> the editors are different for each DeLFI proceeding/year, but are the same for all papers within one DeLFI proceeding

- 'mci.reference.pages' / 'gi.citation.startPage', 'gi.citation.endPage' -> keep (column names in MySQL: "start_page" and "end_page")

    - metadata files (15 of 17) with "mci.reference.pages" need parsing

- 'dc.description.abstract' -> keep (column name in MySQL: "abstract")

- 'dc.subject' -> keep (column name in MySQL: "subject") -> for some years the metadata files contain the topic/subject/associated keywords of a paper

- 'filename' -> keep (column name in MySQL: "filename") -> valuable for provenance, debugging, and reproducibility

    - While individual filenames may duplicate, (year, filename) should be unique:

        - UNIQUE KEY `unique_year_filename` (`year`, `filename`)


- 'dc.identifier.doi' -> keep (column name in MySQL: "doi")

- 'dc.identifier.isbn' -> keep (column name in MySQL: "isbn")

- 'dc.identifier.issn' -> keep (column name in MySQL: "issn")

- 'dc.relation.ispartofseries' -> keep (column name in MySQL: "series_title") -> contains the volume number of the LNI proceedings

- 'dc.publisher' -> keep (column name in MySQL: "publisher")

- 'dc.pubPlace' -> keep (column name in MySQL: "publication_place")

- 'dc.date.issued' -> keep (column name in MySQL: "year")

- 'mci.conference.date'/'gi.conference.date' -> keep (column name in MySQL: "conference_date")

- 'mci.conference.location'/'gi.conference.location' -> keep (column name in MySQL: "conference_location")

- 'mci.conference.sessiontitle'/'gi.conference.sessiontitle' -> keep (column name in MySQL: "session_title")

- 'dc.type' -> keep (column name in MySQL: "publication_type") -> important for filtering out irrelevant paper for LLM annotation, e.g., conference abstract

- 'gi.conference.review' -> keep (column name in MySQL: "peer_review_status")

In [7]:
# 1) Filename
## Check, whether each metadatafile contains the "filename" column. This is important 
## to map the metadata to each pdf file 

# Initialize tracking
files_with_filename = []
files_without_filename = []

# Check each metadata file
for metadata_file in list_metadata_files:
    df = pd.read_excel(metadata_file)
    
    if 'filename' in df.columns:
        files_with_filename.append(metadata_file.name)
    else:
        files_without_filename.append(metadata_file.name)

# Report results
print(f"Total metadata files checked: {len(list_metadata_files)}")
print(f"Files WITH 'filename' column: {len(files_with_filename)}")
print(f"Files WITHOUT 'filename' column: {len(files_without_filename)}")
print()

if files_without_filename:
    print("⚠️  WARNING: The following files are missing the 'filename' column:")
    for filename in files_without_filename:
        print(f"  - {filename}")
else:
    print("✓ All metadata files contain the 'filename' column")

Total metadata files checked: 17
Files WITH 'filename' column: 17
Files WITHOUT 'filename' column: 0

✓ All metadata files contain the 'filename' column


In [8]:
## Test, whether there are duplicate filenames within one year (should not exist)

# Add this to your notebook to test
for year_group in df_metadata_list:
    duplicates = year_group[year_group.duplicated(subset=['filename'], keep=False)]
    if not duplicates.empty:
        print(f"Duplicate filenames within same year: {duplicates[['filename', 'dc.title']]}")


In [9]:
## Test for NaN filenames and identify which metadata file contains them

print("Checking for missing (NaN) filenames in each metadata file:\n")

for i, (metadata_file, df) in enumerate(zip(list_metadata_files, df_metadata_list)):
    # Check for NaN values in filename column
    missing_filenames = df[df['filename'].isna()]
    
    if not missing_filenames.empty:
        print(f"⚠️  File: {metadata_file.name}")
        print(f"   Path: {metadata_file}")
        print(f"   Number of rows with NaN filename: {len(missing_filenames)}")
        print(f"   Row indices: {missing_filenames.index.tolist()}")
        print(f"\n   Details of rows with missing filenames:")
        print(missing_filenames[['dc.title', 'dc.contributor.author', 'dc.date.issued']].to_string())
        print("\n" + "="*80 + "\n")

print("✓ Check complete")


Checking for missing (NaN) filenames in each metadata file:

⚠️  File: metadata-lni-132.xlsx
   Path: ../data/lni132/metadata-lni-132.xlsx
   Number of rows with NaN filename: 1
   Row indices: [38]

   Details of rows with missing filenames:
                                               dc.title dc.contributor.author  dc.date.issued
38  DeLFI 2008: Die 6. e-Learning Fachtagung Informatik                   NaN             NaN


⚠️  File: metadata-lni-52.xlsx
   Path: ../data/lni52/metadata-lni-52.xlsx
   Number of rows with NaN filename: 1
   Row indices: [56]

   Details of rows with missing filenames:
                                               dc.title dc.contributor.author  dc.date.issued
56  DeLFI 2004: Die 2. e-Learning Fachtagung Informatik                   NaN            2004


⚠️  File: metadata-lni-37.xlsx
   Path: ../data/lni37/metadata-lni-37.xlsx
   Number of rows with NaN filename: 1
   Row indices: [51]

   Details of rows with missing filenames:
                   

All of the rows with missing filenames are the name/title of the respective year of the DeLFI proceeding. These are years, where not a full pdf with all papers of that year exist. But for all of these years, each single paper has a filename. Hence, there is no problem.

In [10]:
# 2) Inspect 'dc.identifier.doi', 'dc.identifier.isbn', 'dc.identifier.issn'

## Define the identifier columns to check
identifier_cols = ['dc.identifier.doi', 'dc.identifier.isbn', 'dc.identifier.issn']

print("=" * 80)
print("PART 1: Check which metadata files contain the identifier columns")
print("=" * 80)

# Track files with/without each column
files_with_all_identifiers = []
files_missing_identifiers = []

for metadata_file, df in zip(list_metadata_files, df_metadata_list):
    available_cols = set(df.columns)
    missing_cols = [col for col in identifier_cols if col not in available_cols]
    
    if not missing_cols:
        files_with_all_identifiers.append(metadata_file.name)
    else:
        files_missing_identifiers.append({
            'file': metadata_file.name,
            'missing': missing_cols,
            'has': [col for col in identifier_cols if col in available_cols]
        })

# Report: Files WITH all identifier columns
print(f"\n✓ Files WITH all three identifier columns: {len(files_with_all_identifiers)}/{len(list_metadata_files)}")
for filename in files_with_all_identifiers:
    print(f"  - {filename}")

# Report: Files MISSING one or more identifier columns
print(f"\n⚠️  Files MISSING one or more identifier columns: {len(files_missing_identifiers)}/{len(list_metadata_files)}")
if files_missing_identifiers:
    for item in files_missing_identifiers:
        print(f"\n  File: {item['file']}")
        print(f"    Has: {item['has']}")
        print(f"    Missing: {item['missing']}")

print("\n" + "=" * 80)
print("PART 2: Analyze value distributions for files with all identifier columns")
print("=" * 80)

for metadata_file, df in zip(list_metadata_files, df_metadata_list):
    # Only analyze files that have all three identifier columns
    if metadata_file.name not in files_with_all_identifiers:
        continue
    
    print(f"\n{'=' * 80}")
    print(f"File: {metadata_file.name}")
    print(f"Total rows: {len(df)}")
    print(f"{'=' * 80}")
    
    for col in identifier_cols:
        print(f"\n  Column: {col}")
        
        # Count non-null values
        non_null_count = df[col].notna().sum()
        null_count = df[col].isna().sum()
        
        print(f"    Non-null values: {non_null_count}/{len(df)}")
        print(f"    Null/NaN values: {null_count}/{len(df)}")
        
        if non_null_count == 0:
            print(f"    ✓ Pattern: Column is ALWAYS empty (all NaN)")
        else:
            # Get unique values (excluding NaN)
            unique_values = df[col].dropna().unique()
            print(f"    Unique non-null values: {len(unique_values)}")
            
            if len(unique_values) == 1:
                print(f"    ✓ Pattern: All non-null rows have SAME value")
                print(f"    Value: '{unique_values[0]}'")
            else:
                print(f"    ⚠️  Pattern: Multiple different values found")
                print(f"    Values: {unique_values[:5]}")  # Show first 5
                if len(unique_values) > 5:
                    print(f"    ... and {len(unique_values) - 5} more")
            
            # Value distribution
            value_counts = df[col].value_counts(dropna=False)
            print(f"    Value distribution:")
            for val, count in value_counts.head(5).items():
                val_display = 'NaN' if pd.isna(val) else f"'{val}'"
                print(f"      {val_display}: {count} rows ({count/len(df)*100:.1f}%)")

print("\n" + "=" * 80)
print("SUMMARY: Identifier column patterns across all files")
print("=" * 80)

summary = {
    'doi_always_empty': [],
    'doi_has_values': [],
    'isbn_constant': [],
    'isbn_varies': [],
    'issn_constant': [],
    'issn_varies': []
}

for metadata_file, df in zip(list_metadata_files, df_metadata_list):
    if metadata_file.name not in files_with_all_identifiers:
        continue
    
    # Check DOI pattern
    if df['dc.identifier.doi'].notna().sum() == 0:
        summary['doi_always_empty'].append(metadata_file.name)
    else:
        summary['doi_has_values'].append(metadata_file.name)
    
    # Check ISBN pattern
    isbn_unique = df['dc.identifier.isbn'].dropna().nunique()
    if isbn_unique <= 1:
        summary['isbn_constant'].append(metadata_file.name)
    else:
        summary['isbn_varies'].append(metadata_file.name)
    
    # Check ISSN pattern
    issn_unique = df['dc.identifier.issn'].dropna().nunique()
    if issn_unique <= 1:
        summary['issn_constant'].append(metadata_file.name)
    else:
        summary['issn_varies'].append(metadata_file.name)

print(f"\nDOI patterns:")
print(f"  Files where DOI is ALWAYS empty: {len(summary['doi_always_empty'])}")
print(f"  Files where DOI has values: {len(summary['doi_has_values'])}")
if summary['doi_has_values']:
    for f in summary['doi_has_values']:
        print(f"    - {f}")

print(f"\nISBN patterns:")
print(f"  Files where ISBN is constant (same value for all rows): {len(summary['isbn_constant'])}")
print(f"  Files where ISBN varies across rows: {len(summary['isbn_varies'])}")
if summary['isbn_varies']:
    for f in summary['isbn_varies']:
        print(f"    - {f}")

print(f"\nISSN patterns:")
print(f"  Files where ISSN is constant (same value for all rows): {len(summary['issn_constant'])}")
print(f"  Files where ISSN varies across rows: {len(summary['issn_varies'])}")
if summary['issn_varies']:
    for f in summary['issn_varies']:
        print(f"    - {f}")

print("\n✓ Analysis complete")

PART 1: Check which metadata files contain the identifier columns

✓ Files WITH all three identifier columns: 6/17
  - metadata-lni-132.xlsx
  - metadata-delfi-2020.xlsx
  - metadata_lni-322.xlsx
  - metadata-DeLFI2018.xlsx
  - metadata-lni-297.xlsx
  - metadata-lni-316.xlsx

⚠️  Files MISSING one or more identifier columns: 11/17

  File: metadata_lni-338.xlsx
    Has: ['dc.identifier.doi', 'dc.identifier.isbn']
    Missing: ['dc.identifier.issn']

  File: metadata-lni-52.xlsx
    Has: ['dc.identifier.isbn', 'dc.identifier.issn']
    Missing: ['dc.identifier.doi']

  File: metadata-lni-37.xlsx
    Has: ['dc.identifier.isbn', 'dc.identifier.issn']
    Missing: ['dc.identifier.doi']

  File: metadata-lni-218.xlsx
    Has: ['dc.identifier.isbn', 'dc.identifier.issn']
    Missing: ['dc.identifier.doi']

  File: metadata-lni-188.xlsx
    Has: ['dc.identifier.isbn', 'dc.identifier.issn']
    Missing: ['dc.identifier.doi']

  File: metadata-lni-153.xlsx
    Has: ['dc.identifier.isbn', 'dc.id

Keep all three columns, since all metadata files have at least one of the three columns. Some have two of the three columns and some have all three. Just allow the columns to also have NULL values.

In [11]:
## 3) Check, whether all relevant columns of the metadata files have been considered for MySQL "paper" table creation.
## Make sure no relevant column has been forgotten.

# Define all columns that have been considered (from markdown notes above)
considered_columns = {
    # Core paper metadata
    'dc.title',                    # -> title
    'dc.contributor.author',       # -> authors
    'dc.language.iso',             # -> language
    'dc.description.abstract',     # -> abstract
    'dc.subject',                  # -> subject
    'filename',                    # -> filename

    # Proceeding/series info
    'dc.relation.ispartof',        # -> proceeding_title
    'dc.relation.ispartofseries',  # -> series_title
    'dc.contributor.editor',       # -> editors

    # Identifiers
    'dc.identifier.doi',           # -> doi
    'dc.identifier.isbn',          # -> isbn
    'dc.identifier.issn',          # -> issn
    'dc.identifier.pissn',         # -> issn (variant)

    # Publication info
    'dc.publisher',                # -> publisher
    'dc.pubPlace',                 # -> publication_place
    'gi.citation.publisherPlace',  # -> publication_place (variant)
    'dc.date.issued',              # -> year

    # Page info (variants)
    'mci.reference.pages',         # -> pages
    'gi.citation.startPage',       # -> pages (variant)
    'gi.citation.endPage',         # -> pages (variant)

    # Conference info (mci. and gi. variants)
    'mci.conference.date',         # -> conference_date
    'gi.conference.date',          # -> conference_date (variant)
    'mci.conference.location',     # -> conference_location
    'gi.conference.location',      # -> conference_location (variant)
    'mci.conference.sessiontitle', # -> session_title
    'gi.conference.sessiontitle',  # -> session_title (variant)

    # Document type and review
    'dc.type',                     # -> publication_type
    'gi.conference.review',        # -> peer_review_status
}

# Collect all unique columns from all metadata files
all_columns_in_files = set()
for df in df_metadata_list:
    all_columns_in_files.update(df.columns)

# Find columns that exist in files but haven't been considered
unconsidered_columns = all_columns_in_files - considered_columns

# Report results
print("=" * 80)
print("COLUMN COVERAGE CHECK: Metadata Files vs. MySQL Table Decisions")
print("=" * 80)

print(f"\nTotal unique columns across all 17 metadata files: {len(all_columns_in_files)}")
print(f"Columns considered for MySQL table: {len(considered_columns)}")
print(f"Columns NOT yet considered: {len(unconsidered_columns)}")

if unconsidered_columns:
    print("\n" + "=" * 80)
    print("⚠️  UNCONSIDERED COLUMNS - Require Decision:")
    print("=" * 80)

    for col in sorted(unconsidered_columns):
        # Find which files contain this column
        files_with_col = []
        for metadata_file, df in zip(list_metadata_files, df_metadata_list):
            if col in df.columns:
                files_with_col.append(metadata_file.name)
        
        print(f"\n  Column: '{col}'")
        print(f"    Present in: {len(files_with_col)}/17 metadata files")
        print(f"    Files: {files_with_col}")

        # Show sample values from first file that has this column
        for df in df_metadata_list:
            if col in df.columns:
                sample_values = df[col].dropna().unique()[:3]
                if len(sample_values) > 0:
                    print(f"    Sample values: {list(sample_values)}")
                break
else:
    print("\n" + "=" * 80)
    print("✓ All columns from metadata files have been considered!")
    print("=" * 80)

# Summary of considered columns
print("\n" + "=" * 80)
print("CONSIDERED COLUMNS SUMMARY:")
print("=" * 80)
for col in sorted(considered_columns):
    files_with_col = sum(1 for df in df_metadata_list if col in df.columns)
    status = "✓" if files_with_col > 0 else "○"
    print(f"  {status} '{col}' (in {files_with_col}/17 files)")

print("\n✓ Column coverage check complete")

COLUMN COVERAGE CHECK: Metadata Files vs. MySQL Table Decisions

Total unique columns across all 17 metadata files: 30
Columns considered for MySQL table: 28
Columns NOT yet considered: 2

⚠️  UNCONSIDERED COLUMNS - Require Decision:

  Column: 'gi.tag'
    Present in: 2/17 metadata files
    Files: ['metadata_lni-338.xlsx', 'metadata_lni-369.xlsx']
    Sample values: ['keynote']

  Column: 'mci.document.quality'
    Present in: 1/17 metadata files
    Files: ['metadata_lni-356.xlsx']
    Sample values: ['digidoc']

CONSIDERED COLUMNS SUMMARY:
  ✓ 'dc.contributor.author' (in 17/17 files)
  ✓ 'dc.contributor.editor' (in 17/17 files)
  ✓ 'dc.date.issued' (in 17/17 files)
  ✓ 'dc.description.abstract' (in 17/17 files)
  ✓ 'dc.identifier.doi' (in 9/17 files)
  ✓ 'dc.identifier.isbn' (in 15/17 files)
  ✓ 'dc.identifier.issn' (in 14/17 files)
  ✓ 'dc.identifier.pissn' (in 2/17 files)
  ✓ 'dc.language.iso' (in 17/17 files)
  ✓ 'dc.pubPlace' (in 15/17 files)
  ✓ 'dc.publisher' (in 17/17 files)

#### Manual inspection of the metadata files with unconsidered columns:


1) Column: 'gi.tag': 'metadata_lni-338.xlsx', 'metadata_lni-369.xlsx'

    - values within "gi.tag" column in "metadata_lni-338.xlsx":
        - "keynote"
        - Thus, has the same information as the "session_title" column

    - no values within the "gi.tag" column in 'metadata_lni-369.xlsx'

    - Hence, this column either contains redundant or no information. Thus, the column will not be added to MySQL


2) Column: 'mci.document.quality': 'metadata_lni-356.xlsx

    - All rows had the same values: "digidoc". Most likely means digital document. Since only one file contains this column and the information is not relevant for the task and database, it will not be added to MySQL.

### Meeting Julian: 

- minimales set an relevanten columns nehmen, die immer vorkommen -> mit kleinem set anfangen.  -> alles verpflichtend: NOT NULL 

- paper identification: deu oder metadata files: isbn, issn
  -> bei nicht relevanten columns: kein NOT NULL 
    - pro proceedings: 1 isbn 

- neue delfi papers: research DEU -> 1 pro proceeding?

- welche Felder dürfen unique sein und welche nicht? Integrität Datenbank
    - Text, Titel etc. -> unique sein
    - deu -> über Kombination unique machen (mehrere columns: )





### 1.2) Check character length distribution of string columns with datatype "varchar" to determine the maximum size of "varchar" for each relevant column (e.g., authors, title, dc_subject, filename, session_title, )

In [12]:
def analyze_varchar_length(column_name, metadata_files, dataframes, mysql_column_name=None):
    """
    Analyze character length distribution for a column to determine VARCHAR size.

    Args:
        column_name: Name of column in metadata files (e.g., 'dc.title')
        metadata_files: List of metadata file paths
        dataframes: List of loaded DataFrames (df_metadata_list)
        mysql_column_name: Optional MySQL column name for display

    Returns:
        dict with min, median, max, recommended VARCHAR size
    """
    if mysql_column_name is None:
        mysql_column_name = column_name

    all_lengths = []
    files_with_column = []
    dtype_info = {}

    for metadata_file, df in zip(metadata_files, dataframes):
        if column_name not in df.columns:
            continue

        files_with_column.append(metadata_file.name)

        # Record dtype
        col_dtype = str(df[column_name].dtype)
        dtype_info[metadata_file.name] = col_dtype

        # Get non-null values and convert to string for length calculation
        values = df[column_name].dropna()

        # Calculate character lengths
        for val in values:
            if pd.notna(val):
                str_val = str(val)
                all_lengths.append(len(str_val))

    if not all_lengths:
        print(f"Column '{column_name}' not found or has no values.")
        return None

    # Calculate statistics
    min_len = min(all_lengths)
    max_len = max(all_lengths)
    median_len = int(np.median(all_lengths))
    mean_len = int(np.mean(all_lengths))

    # Recommend VARCHAR size (round up max to nearest 50 or 100, with buffer)
    if max_len <= 50:
        recommended = 50
    elif max_len <= 100:
        recommended = 100
    elif max_len <= 200:
        recommended = 200
    elif max_len <= 300:
        recommended = 300
    elif max_len <= 500:
        recommended = 500
    elif max_len <= 1000:
        recommended = 1000
    else:
        recommended = ((max_len // 500) + 1) * 500  # Round up to nearest 500

    # Print results
    print("=" * 80)
    print(f"VARCHAR LENGTH ANALYSIS: '{column_name}' -> MySQL '{mysql_column_name}'")
    print("=" * 80)
    print(f"\nFiles containing this column: {len(files_with_column)}/17")
    print(f"Total values analyzed: {len(all_lengths)}")

    print(f"\nData types found:")
    for file, dtype in dtype_info.items():
        print(f"  {file}: {dtype}")

    print(f"\nCharacter length statistics:")
    print(f"  Minimum: {min_len}")
    print(f"  Median:  {median_len}")
    print(f"  Mean:    {mean_len}")
    print(f"  Maximum: {max_len}")

    print(f"\n✓ Recommended VARCHAR size: {recommended}")

    # Show longest value for context
    if max_len > 0:
        for metadata_file, df in zip(metadata_files, dataframes):
            if column_name in df.columns:
                values = df[column_name].dropna()
                for val in values:
                    if len(str(val)) == max_len:
                        preview = str(val)[:100] + "..." if len(str(val)) > 100 else str(val)
                        print(f"\nLongest value preview: '{preview}'")
                        break
                break

    return {
        'column': column_name,
        'mysql_column': mysql_column_name,
        'min': min_len,
        'median': median_len,
        'mean': mean_len,
        'max': max_len,
        'recommended': recommended,
        'count': len(all_lengths)
    }

In [13]:
# 1) Analyze 'dc.title' -> MySQL 'title'
analyze_varchar_length('dc.title', list_metadata_files, df_metadata_list, 'title')

VARCHAR LENGTH ANALYSIS: 'dc.title' -> MySQL 'title'

Files containing this column: 17/17
Total values analyzed: 834

Data types found:
  metadata-lni-132.xlsx: object
  metadata_lni-338.xlsx: object
  metadata-lni-52.xlsx: object
  metadata-delfi-2020.xlsx: object
  metadata-lni-37.xlsx: object
  metadata-lni-218.xlsx: object
  metadata_lni-322.xlsx: object
  metadata-lni-188.xlsx: object
  metadata-lni-153.xlsx: object
  metadata-lni-66.xlsx: object
  metadata_lni-369.xlsx: object
  metadata_lni-356.xlsx: object
  metadata-DeLFI2018.xlsx: object
  metadata-lni-207.xlsx: object
  metadata-lni-297.xlsx: object
  metadata-lni-87.xlsx: object
  metadata-lni-316.xlsx: object

Character length statistics:
  Minimum: 10
  Median:  82
  Mean:    83
  Maximum: 310

✓ Recommended VARCHAR size: 500


{'column': 'dc.title',
 'mysql_column': 'title',
 'min': 10,
 'median': 82,
 'mean': 83,
 'max': 310,
 'recommended': 500,
 'count': 834}

In [18]:
# 2) Analyze 'dc.contributor.author' -> MySQL 'authors'
analyze_varchar_length('dc.contributor.author', list_metadata_files, df_metadata_list, 'authors')

VARCHAR LENGTH ANALYSIS: 'dc.contributor.author' -> MySQL 'authors'

Files containing this column: 17/17
Total values analyzed: 823

Data types found:
  metadata-lni-132.xlsx: object
  metadata_lni-338.xlsx: object
  metadata-lni-52.xlsx: object
  metadata-delfi-2020.xlsx: object
  metadata-lni-37.xlsx: object
  metadata-lni-218.xlsx: object
  metadata_lni-322.xlsx: object
  metadata-lni-188.xlsx: object
  metadata-lni-153.xlsx: object
  metadata-lni-66.xlsx: object
  metadata_lni-369.xlsx: object
  metadata_lni-356.xlsx: object
  metadata-DeLFI2018.xlsx: object
  metadata-lni-207.xlsx: object
  metadata-lni-297.xlsx: object
  metadata-lni-87.xlsx: object
  metadata-lni-316.xlsx: object

Character length statistics:
  Minimum: 7
  Median:  49
  Mean:    53
  Maximum: 310

✓ Recommended VARCHAR size: 500


{'column': 'dc.contributor.author',
 'mysql_column': 'authors',
 'min': 7,
 'median': 49,
 'mean': 53,
 'max': 310,
 'recommended': 500,
 'count': 823}

In [14]:
# 3) Analyze 'dc.language.iso' -> MySQL 'language'
analyze_varchar_length('dc.language.iso', list_metadata_files, df_metadata_list, 'language')

VARCHAR LENGTH ANALYSIS: 'dc.language.iso' -> MySQL 'language'

Files containing this column: 17/17
Total values analyzed: 833

Data types found:
  metadata-lni-132.xlsx: object
  metadata_lni-338.xlsx: object
  metadata-lni-52.xlsx: object
  metadata-delfi-2020.xlsx: object
  metadata-lni-37.xlsx: object
  metadata-lni-218.xlsx: object
  metadata_lni-322.xlsx: object
  metadata-lni-188.xlsx: object
  metadata-lni-153.xlsx: object
  metadata-lni-66.xlsx: object
  metadata_lni-369.xlsx: object
  metadata_lni-356.xlsx: object
  metadata-DeLFI2018.xlsx: object
  metadata-lni-207.xlsx: object
  metadata-lni-297.xlsx: object
  metadata-lni-87.xlsx: object
  metadata-lni-316.xlsx: object

Character length statistics:
  Minimum: 2
  Median:  2
  Mean:    2
  Maximum: 5

✓ Recommended VARCHAR size: 50


{'column': 'dc.language.iso',
 'mysql_column': 'language',
 'min': 2,
 'median': 2,
 'mean': 2,
 'max': 5,
 'recommended': 50,
 'count': 833}

- Since max is 5, VARCHAR(10) will be used. 50 is unnecessarily high

In [15]:
# 4) Analyze 'dc.relation.ispartof' -> MySQL 'proceeding_title'
analyze_varchar_length('dc.relation.ispartof', list_metadata_files, df_metadata_list, 'proceeding_title')

VARCHAR LENGTH ANALYSIS: 'dc.relation.ispartof' -> MySQL 'proceeding_title'

Files containing this column: 17/17
Total values analyzed: 825

Data types found:
  metadata-lni-132.xlsx: object
  metadata_lni-338.xlsx: object
  metadata-lni-52.xlsx: object
  metadata-delfi-2020.xlsx: object
  metadata-lni-37.xlsx: object
  metadata-lni-218.xlsx: object
  metadata_lni-322.xlsx: object
  metadata-lni-188.xlsx: object
  metadata-lni-153.xlsx: object
  metadata-lni-66.xlsx: object
  metadata_lni-369.xlsx: object
  metadata_lni-356.xlsx: object
  metadata-DeLFI2018.xlsx: object
  metadata-lni-207.xlsx: object
  metadata-lni-297.xlsx: object
  metadata-lni-87.xlsx: object
  metadata-lni-316.xlsx: object

Character length statistics:
  Minimum: 10
  Median:  51
  Mean:    47
  Maximum: 89

✓ Recommended VARCHAR size: 100


{'column': 'dc.relation.ispartof',
 'mysql_column': 'proceeding_title',
 'min': 10,
 'median': 51,
 'mean': 47,
 'max': 89,
 'recommended': 100,
 'count': 825}

- Take 200 instead of 100 to have some buffer

In [16]:
# 5) Analyze 'dc.relation.ispartofseries' -> MySQL 'series_title'
analyze_varchar_length('dc.relation.ispartofseries', list_metadata_files, df_metadata_list, 'series_title')

VARCHAR LENGTH ANALYSIS: 'dc.relation.ispartofseries' -> MySQL 'series_title'

Files containing this column: 17/17
Total values analyzed: 833

Data types found:
  metadata-lni-132.xlsx: object
  metadata_lni-338.xlsx: object
  metadata-lni-52.xlsx: object
  metadata-delfi-2020.xlsx: object
  metadata-lni-37.xlsx: object
  metadata-lni-218.xlsx: object
  metadata_lni-322.xlsx: object
  metadata-lni-188.xlsx: object
  metadata-lni-153.xlsx: object
  metadata-lni-66.xlsx: object
  metadata_lni-369.xlsx: object
  metadata_lni-356.xlsx: object
  metadata-DeLFI2018.xlsx: object
  metadata-lni-207.xlsx: object
  metadata-lni-297.xlsx: object
  metadata-lni-87.xlsx: object
  metadata-lni-316.xlsx: object

Character length statistics:
  Minimum: 5
  Median:  62
  Mean:    57
  Maximum: 62

✓ Recommended VARCHAR size: 100

Longest value preview: 'Lecture Notes in Informatics (LNI) - Proceedings, Volume P-132'


{'column': 'dc.relation.ispartofseries',
 'mysql_column': 'series_title',
 'min': 5,
 'median': 62,
 'mean': 57,
 'max': 62,
 'recommended': 100,
 'count': 833}

In [17]:
# 6) Analyze 'dc.contributor.editor' -> MySQL 'editors'
analyze_varchar_length('dc.contributor.editor', list_metadata_files, df_metadata_list, 'editors')

VARCHAR LENGTH ANALYSIS: 'dc.contributor.editor' -> MySQL 'editors'

Files containing this column: 17/17
Total values analyzed: 834

Data types found:
  metadata-lni-132.xlsx: object
  metadata_lni-338.xlsx: object
  metadata-lni-52.xlsx: object
  metadata-delfi-2020.xlsx: object
  metadata-lni-37.xlsx: object
  metadata-lni-218.xlsx: object
  metadata_lni-322.xlsx: object
  metadata-lni-188.xlsx: object
  metadata-lni-153.xlsx: object
  metadata-lni-66.xlsx: object
  metadata_lni-369.xlsx: object
  metadata_lni-356.xlsx: object
  metadata-DeLFI2018.xlsx: object
  metadata-lni-207.xlsx: object
  metadata-lni-297.xlsx: object
  metadata-lni-87.xlsx: object
  metadata-lni-316.xlsx: object

Character length statistics:
  Minimum: 29
  Median:  47
  Mean:    47
  Maximum: 95

✓ Recommended VARCHAR size: 100


{'column': 'dc.contributor.editor',
 'mysql_column': 'editors',
 'min': 29,
 'median': 47,
 'mean': 47,
 'max': 95,
 'recommended': 100,
 'count': 834}

- Also take 200 instead of 100 to have some buffer

In [18]:
# 7) 
# 7.1) Analyze 'mci.reference.pages' -> MySQL 'pages'
analyze_varchar_length('mci.reference.pages', list_metadata_files, df_metadata_list, 'pages')

VARCHAR LENGTH ANALYSIS: 'mci.reference.pages' -> MySQL 'pages'

Files containing this column: 15/17
Total values analyzed: 692

Data types found:
  metadata-lni-132.xlsx: object
  metadata-lni-52.xlsx: object
  metadata-delfi-2020.xlsx: object
  metadata-lni-37.xlsx: object
  metadata-lni-218.xlsx: object
  metadata_lni-322.xlsx: object
  metadata-lni-188.xlsx: object
  metadata-lni-153.xlsx: object
  metadata-lni-66.xlsx: object
  metadata_lni-356.xlsx: object
  metadata-DeLFI2018.xlsx: object
  metadata-lni-207.xlsx: object
  metadata-lni-297.xlsx: object
  metadata-lni-87.xlsx: object
  metadata-lni-316.xlsx: object

Character length statistics:
  Minimum: 2
  Median:  7
  Mean:    6
  Maximum: 7

✓ Recommended VARCHAR size: 50

Longest value preview: '101-112'


{'column': 'mci.reference.pages',
 'mysql_column': 'pages',
 'min': 2,
 'median': 7,
 'mean': 6,
 'max': 7,
 'recommended': 50,
 'count': 692}

In [19]:
# 7.2) Analyze 'gi.citation.startPage' -> MySQL 'pages'
analyze_varchar_length('gi.citation.startPage', list_metadata_files, df_metadata_list, 'pages')

VARCHAR LENGTH ANALYSIS: 'gi.citation.startPage' -> MySQL 'pages'

Files containing this column: 2/17
Total values analyzed: 127

Data types found:
  metadata_lni-338.xlsx: int64
  metadata_lni-369.xlsx: float64

Character length statistics:
  Minimum: 1
  Median:  3
  Mean:    3
  Maximum: 5

✓ Recommended VARCHAR size: 50


{'column': 'gi.citation.startPage',
 'mysql_column': 'pages',
 'min': 1,
 'median': 3,
 'mean': 3,
 'max': 5,
 'recommended': 50,
 'count': 127}

In [20]:
# 7.3) Analyze 'gi.citation.endPage' -> MySQL 'pages'
analyze_varchar_length('gi.citation.endPage', list_metadata_files, df_metadata_list, 'pages')

VARCHAR LENGTH ANALYSIS: 'gi.citation.endPage' -> MySQL 'pages'

Files containing this column: 2/17
Total values analyzed: 127

Data types found:
  metadata_lni-338.xlsx: int64
  metadata_lni-369.xlsx: float64

Character length statistics:
  Minimum: 2
  Median:  3
  Mean:    3
  Maximum: 5

✓ Recommended VARCHAR size: 50


{'column': 'gi.citation.endPage',
 'mysql_column': 'pages',
 'min': 2,
 'median': 3,
 'mean': 3,
 'max': 5,
 'recommended': 50,
 'count': 127}

In [21]:
# 8) Analyze 'dc.subject' -> MySQL 'subject'
analyze_varchar_length('dc.subject', list_metadata_files, df_metadata_list, 'subject')

VARCHAR LENGTH ANALYSIS: 'dc.subject' -> MySQL 'subject'

Files containing this column: 17/17
Total values analyzed: 462

Data types found:
  metadata-lni-132.xlsx: float64
  metadata_lni-338.xlsx: object
  metadata-lni-52.xlsx: float64
  metadata-delfi-2020.xlsx: object
  metadata-lni-37.xlsx: object
  metadata-lni-218.xlsx: object
  metadata_lni-322.xlsx: object
  metadata-lni-188.xlsx: float64
  metadata-lni-153.xlsx: float64
  metadata-lni-66.xlsx: object
  metadata_lni-369.xlsx: object
  metadata_lni-356.xlsx: object
  metadata-DeLFI2018.xlsx: object
  metadata-lni-207.xlsx: float64
  metadata-lni-297.xlsx: object
  metadata-lni-87.xlsx: float64
  metadata-lni-316.xlsx: object

Character length statistics:
  Minimum: 14
  Median:  75
  Mean:    81
  Maximum: 228

✓ Recommended VARCHAR size: 300


{'column': 'dc.subject',
 'mysql_column': 'subject',
 'min': 14,
 'median': 75,
 'mean': 81,
 'max': 228,
 'recommended': 300,
 'count': 462}

In [22]:
# 9) Analyze 'filename' -> MySQL 'filename'
analyze_varchar_length('filename', list_metadata_files, df_metadata_list, 'filename')

VARCHAR LENGTH ANALYSIS: 'filename' -> MySQL 'filename'

Files containing this column: 17/17
Total values analyzed: 829

Data types found:
  metadata-lni-132.xlsx: object
  metadata_lni-338.xlsx: object
  metadata-lni-52.xlsx: object
  metadata-delfi-2020.xlsx: object
  metadata-lni-37.xlsx: object
  metadata-lni-218.xlsx: object
  metadata_lni-322.xlsx: object
  metadata-lni-188.xlsx: object
  metadata-lni-153.xlsx: object
  metadata-lni-66.xlsx: object
  metadata_lni-369.xlsx: object
  metadata_lni-356.xlsx: object
  metadata-DeLFI2018.xlsx: object
  metadata-lni-207.xlsx: object
  metadata-lni-297.xlsx: object
  metadata-lni-87.xlsx: object
  metadata-lni-316.xlsx: object

Character length statistics:
  Minimum: 5
  Median:  20
  Mean:    23
  Maximum: 143

✓ Recommended VARCHAR size: 200


{'column': 'filename',
 'mysql_column': 'filename',
 'min': 5,
 'median': 20,
 'mean': 23,
 'max': 143,
 'recommended': 200,
 'count': 829}

In [23]:
# 10) Analyze 'dc.identifier.doi' -> MySQL 'doi'
analyze_varchar_length('dc.identifier.doi', list_metadata_files, df_metadata_list, 'doi')

VARCHAR LENGTH ANALYSIS: 'dc.identifier.doi' -> MySQL 'doi'

Files containing this column: 9/17
Total values analyzed: 294

Data types found:
  metadata-lni-132.xlsx: float64
  metadata_lni-338.xlsx: object
  metadata-delfi-2020.xlsx: float64
  metadata_lni-322.xlsx: object
  metadata_lni-369.xlsx: object
  metadata_lni-356.xlsx: object
  metadata-DeLFI2018.xlsx: float64
  metadata-lni-297.xlsx: object
  metadata-lni-316.xlsx: float64

Character length statistics:
  Minimum: 18
  Median:  21
  Mean:    21
  Maximum: 22

✓ Recommended VARCHAR size: 50


{'column': 'dc.identifier.doi',
 'mysql_column': 'doi',
 'min': 18,
 'median': 21,
 'mean': 21,
 'max': 22,
 'recommended': 50,
 'count': 294}

In [24]:
# 11) Analyze 'dc.identifier.isbn' -> MySQL 'isbn'
analyze_varchar_length('dc.identifier.isbn', list_metadata_files, df_metadata_list, 'isbn')

VARCHAR LENGTH ANALYSIS: 'dc.identifier.isbn' -> MySQL 'isbn'

Files containing this column: 15/17
Total values analyzed: 714

Data types found:
  metadata-lni-132.xlsx: object
  metadata_lni-338.xlsx: object
  metadata-lni-52.xlsx: object
  metadata-delfi-2020.xlsx: object
  metadata-lni-37.xlsx: object
  metadata-lni-218.xlsx: object
  metadata_lni-322.xlsx: object
  metadata-lni-188.xlsx: object
  metadata-lni-153.xlsx: object
  metadata-lni-66.xlsx: object
  metadata-DeLFI2018.xlsx: object
  metadata-lni-207.xlsx: object
  metadata-lni-297.xlsx: object
  metadata-lni-87.xlsx: object
  metadata-lni-316.xlsx: object

Character length statistics:
  Minimum: 13
  Median:  17
  Mean:    16
  Maximum: 17

✓ Recommended VARCHAR size: 50

Longest value preview: '978-3-88579-226-0'


{'column': 'dc.identifier.isbn',
 'mysql_column': 'isbn',
 'min': 13,
 'median': 17,
 'mean': 16,
 'max': 17,
 'recommended': 50,
 'count': 714}

- Since the ISBN is variable, but the maximum character length is 17, VARCHAR(20) will be used

In [25]:
# 12) Analyze 'dc.identifier.issn' -> MySQL 'issn'
analyze_varchar_length('dc.identifier.issn', list_metadata_files, df_metadata_list, 'issn')

VARCHAR LENGTH ANALYSIS: 'dc.identifier.issn' -> MySQL 'issn'

Files containing this column: 14/17
Total values analyzed: 647

Data types found:
  metadata-lni-132.xlsx: object
  metadata-lni-52.xlsx: object
  metadata-delfi-2020.xlsx: object
  metadata-lni-37.xlsx: object
  metadata-lni-218.xlsx: object
  metadata_lni-322.xlsx: object
  metadata-lni-188.xlsx: object
  metadata-lni-153.xlsx: object
  metadata-lni-66.xlsx: object
  metadata-DeLFI2018.xlsx: object
  metadata-lni-207.xlsx: object
  metadata-lni-297.xlsx: object
  metadata-lni-87.xlsx: object
  metadata-lni-316.xlsx: object

Character length statistics:
  Minimum: 9
  Median:  9
  Mean:    9
  Maximum: 9

✓ Recommended VARCHAR size: 50

Longest value preview: '1617-5468'


{'column': 'dc.identifier.issn',
 'mysql_column': 'issn',
 'min': 9,
 'median': 9,
 'mean': 9,
 'max': 9,
 'recommended': 50,
 'count': 647}

- Since the ISSN is fix, CHAR(9) will be used

In [26]:
# 13) Analyze 'dc.publisher' -> MySQL 'publisher'
analyze_varchar_length('dc.publisher', list_metadata_files, df_metadata_list, 'publisher')

VARCHAR LENGTH ANALYSIS: 'dc.publisher' -> MySQL 'publisher'

Files containing this column: 17/17
Total values analyzed: 833

Data types found:
  metadata-lni-132.xlsx: object
  metadata_lni-338.xlsx: object
  metadata-lni-52.xlsx: object
  metadata-delfi-2020.xlsx: object
  metadata-lni-37.xlsx: object
  metadata-lni-218.xlsx: object
  metadata_lni-322.xlsx: object
  metadata-lni-188.xlsx: object
  metadata-lni-153.xlsx: object
  metadata-lni-66.xlsx: object
  metadata_lni-369.xlsx: object
  metadata_lni-356.xlsx: object
  metadata-DeLFI2018.xlsx: object
  metadata-lni-207.xlsx: object
  metadata-lni-297.xlsx: object
  metadata-lni-87.xlsx: object
  metadata-lni-316.xlsx: object

Character length statistics:
  Minimum: 32
  Median:  32
  Mean:    32
  Maximum: 33

✓ Recommended VARCHAR size: 50

Longest value preview: 'Gesellschaft für Informatik e. V.'


{'column': 'dc.publisher',
 'mysql_column': 'publisher',
 'min': 32,
 'median': 32,
 'mean': 32,
 'max': 33,
 'recommended': 50,
 'count': 833}

In [27]:
# 14) Analyze 'dc.pubPlace' -> MySQL 'publication_place'
analyze_varchar_length('dc.pubPlace', list_metadata_files, df_metadata_list, 'publication_place')

VARCHAR LENGTH ANALYSIS: 'dc.pubPlace' -> MySQL 'publication_place'

Files containing this column: 15/17
Total values analyzed: 705

Data types found:
  metadata-lni-132.xlsx: object
  metadata-lni-52.xlsx: object
  metadata-delfi-2020.xlsx: object
  metadata-lni-37.xlsx: object
  metadata-lni-218.xlsx: object
  metadata_lni-322.xlsx: object
  metadata-lni-188.xlsx: object
  metadata-lni-153.xlsx: object
  metadata-lni-66.xlsx: object
  metadata_lni-356.xlsx: object
  metadata-DeLFI2018.xlsx: object
  metadata-lni-207.xlsx: object
  metadata-lni-297.xlsx: object
  metadata-lni-87.xlsx: object
  metadata-lni-316.xlsx: object

Character length statistics:
  Minimum: 4
  Median:  4
  Mean:    4
  Maximum: 4

✓ Recommended VARCHAR size: 50

Longest value preview: 'Bonn'


{'column': 'dc.pubPlace',
 'mysql_column': 'publication_place',
 'min': 4,
 'median': 4,
 'mean': 4,
 'max': 4,
 'recommended': 50,
 'count': 705}

- Since the values are fix (always 'Bonn'), CHAR(4) will be used

In [28]:
# 15) Analyze 'dc.date.issued' -> MySQL 'year'
analyze_varchar_length('dc.date.issued', list_metadata_files, df_metadata_list, 'year')

VARCHAR LENGTH ANALYSIS: 'dc.date.issued' -> MySQL 'year'

Files containing this column: 17/17
Total values analyzed: 832

Data types found:
  metadata-lni-132.xlsx: float64
  metadata_lni-338.xlsx: int64
  metadata-lni-52.xlsx: int64
  metadata-delfi-2020.xlsx: int64
  metadata-lni-37.xlsx: int64
  metadata-lni-218.xlsx: int64
  metadata_lni-322.xlsx: int64
  metadata-lni-188.xlsx: int64
  metadata-lni-153.xlsx: int64
  metadata-lni-66.xlsx: int64
  metadata_lni-369.xlsx: int64
  metadata_lni-356.xlsx: float64
  metadata-DeLFI2018.xlsx: int64
  metadata-lni-207.xlsx: int64
  metadata-lni-297.xlsx: int64
  metadata-lni-87.xlsx: int64
  metadata-lni-316.xlsx: int64

Character length statistics:
  Minimum: 4
  Median:  4
  Mean:    4
  Maximum: 6

✓ Recommended VARCHAR size: 50

Longest value preview: '2008.0'


{'column': 'dc.date.issued',
 'mysql_column': 'year',
 'min': 4,
 'median': 4,
 'mean': 4,
 'max': 6,
 'recommended': 50,
 'count': 832}

- Expected the max value to be 4. For the year column the datetype YEAR(4) will be used and not VARCHAR

In [29]:
# 16) 
# 16.1) Analyze 'mci.conference.date' -> MySQL 'conference_date'
analyze_varchar_length('mci.conference.date', list_metadata_files, df_metadata_list, 'conference_date')

VARCHAR LENGTH ANALYSIS: 'mci.conference.date' -> MySQL 'conference_date'

Files containing this column: 15/17
Total values analyzed: 705

Data types found:
  metadata-lni-132.xlsx: object
  metadata-lni-52.xlsx: object
  metadata-delfi-2020.xlsx: object
  metadata-lni-37.xlsx: object
  metadata-lni-218.xlsx: object
  metadata_lni-322.xlsx: object
  metadata-lni-188.xlsx: object
  metadata-lni-153.xlsx: object
  metadata-lni-66.xlsx: object
  metadata_lni-356.xlsx: object
  metadata-DeLFI2018.xlsx: object
  metadata-lni-207.xlsx: object
  metadata-lni-297.xlsx: object
  metadata-lni-87.xlsx: object
  metadata-lni-316.xlsx: object

Character length statistics:
  Minimum: 20
  Median:  22
  Mean:    21
  Maximum: 24

✓ Recommended VARCHAR size: 50

Longest value preview: '07. - 10. September 2008'


{'column': 'mci.conference.date',
 'mysql_column': 'conference_date',
 'min': 20,
 'median': 22,
 'mean': 21,
 'max': 24,
 'recommended': 50,
 'count': 705}

In [30]:
# 16) 
# 16.2) Analyze 'gi.conference.date' -> MySQL 'conference_date'
analyze_varchar_length('gi.conference.date', list_metadata_files, df_metadata_list, 'conference_date')

VARCHAR LENGTH ANALYSIS: 'gi.conference.date' -> MySQL 'conference_date'

Files containing this column: 2/17
Total values analyzed: 128

Data types found:
  metadata_lni-338.xlsx: object
  metadata_lni-369.xlsx: object

Character length statistics:
  Minimum: 21
  Median:  22
  Mean:    21
  Maximum: 22

✓ Recommended VARCHAR size: 50

Longest value preview: '11.-13. September 2023'


{'column': 'gi.conference.date',
 'mysql_column': 'conference_date',
 'min': 21,
 'median': 22,
 'mean': 21,
 'max': 22,
 'recommended': 50,
 'count': 128}

In [32]:
# 17) 
# 17.1) Analyze ''mci.conference.location' -> MySQL 'conference_location'
analyze_varchar_length('mci.conference.location', list_metadata_files, df_metadata_list, 'conference_location')

VARCHAR LENGTH ANALYSIS: 'mci.conference.location' -> MySQL 'conference_location'

Files containing this column: 15/17
Total values analyzed: 705

Data types found:
  metadata-lni-132.xlsx: object
  metadata-lni-52.xlsx: object
  metadata-delfi-2020.xlsx: object
  metadata-lni-37.xlsx: object
  metadata-lni-218.xlsx: object
  metadata_lni-322.xlsx: object
  metadata-lni-188.xlsx: object
  metadata-lni-153.xlsx: object
  metadata-lni-66.xlsx: object
  metadata_lni-356.xlsx: object
  metadata-DeLFI2018.xlsx: object
  metadata-lni-207.xlsx: object
  metadata-lni-297.xlsx: object
  metadata-lni-87.xlsx: object
  metadata-lni-316.xlsx: object

Character length statistics:
  Minimum: 5
  Median:  7
  Mean:    9
  Maximum: 20

✓ Recommended VARCHAR size: 50


{'column': 'mci.conference.location',
 'mysql_column': 'conference_location',
 'min': 5,
 'median': 7,
 'mean': 9,
 'max': 20,
 'recommended': 50,
 'count': 705}

In [33]:
# 17.2) Analyze 'gi.conference.location' -> MySQL 'conference_location'
analyze_varchar_length('gi.conference.location', list_metadata_files, df_metadata_list, 'conference_location')

VARCHAR LENGTH ANALYSIS: 'gi.conference.location' -> MySQL 'conference_location'

Files containing this column: 2/17
Total values analyzed: 128

Data types found:
  metadata_lni-338.xlsx: object
  metadata_lni-369.xlsx: object

Character length statistics:
  Minimum: 6
  Median:  6
  Mean:    6
  Maximum: 8

✓ Recommended VARCHAR size: 50


{'column': 'gi.conference.location',
 'mysql_column': 'conference_location',
 'min': 6,
 'median': 6,
 'mean': 6,
 'max': 8,
 'recommended': 50,
 'count': 128}

In [34]:
# 18.1) Analyze 'mci.conference.sessiontitle' -> MySQL 'session_title'
analyze_varchar_length('mci.conference.sessiontitle', list_metadata_files, df_metadata_list, 'session_title')

VARCHAR LENGTH ANALYSIS: 'mci.conference.sessiontitle' -> MySQL 'session_title'

Files containing this column: 15/17
Total values analyzed: 591

Data types found:
  metadata-lni-132.xlsx: object
  metadata-lni-52.xlsx: object
  metadata-delfi-2020.xlsx: float64
  metadata-lni-37.xlsx: object
  metadata-lni-218.xlsx: object
  metadata_lni-322.xlsx: float64
  metadata-lni-188.xlsx: object
  metadata-lni-153.xlsx: object
  metadata-lni-66.xlsx: object
  metadata_lni-356.xlsx: object
  metadata-DeLFI2018.xlsx: object
  metadata-lni-207.xlsx: object
  metadata-lni-297.xlsx: object
  metadata-lni-87.xlsx: object
  metadata-lni-316.xlsx: object

Character length statistics:
  Minimum: 7
  Median:  23
  Mean:    22
  Maximum: 49

✓ Recommended VARCHAR size: 50


{'column': 'mci.conference.sessiontitle',
 'mysql_column': 'session_title',
 'min': 7,
 'median': 23,
 'mean': 22,
 'max': 49,
 'recommended': 50,
 'count': 591}

In [35]:
# 18.2) Analyze 'gi.conference.sessiontitle' -> MySQL 'session_title'
analyze_varchar_length('gi.conference.sessiontitle', list_metadata_files, df_metadata_list, 'session_title')

VARCHAR LENGTH ANALYSIS: 'gi.conference.sessiontitle' -> MySQL 'session_title'

Files containing this column: 2/17
Total values analyzed: 128

Data types found:
  metadata_lni-338.xlsx: object
  metadata_lni-369.xlsx: object

Character length statistics:
  Minimum: 8
  Median:  15
  Mean:    19
  Maximum: 45

✓ Recommended VARCHAR size: 50

Longest value preview: 'Learning Analytics und Künstliche Intelligenz'


{'column': 'gi.conference.sessiontitle',
 'mysql_column': 'session_title',
 'min': 8,
 'median': 15,
 'mean': 19,
 'max': 45,
 'recommended': 50,
 'count': 128}

- Will make VARCHAR(100) to have some buffer

In [36]:
# 19) Analyze 'dc.type'  -> MySQL 'publication:type'
analyze_varchar_length('dc.type' , list_metadata_files, df_metadata_list, 'publication_type')

VARCHAR LENGTH ANALYSIS: 'dc.type' -> MySQL 'publication_type'

Files containing this column: 17/17
Total values analyzed: 831

Data types found:
  metadata-lni-132.xlsx: object
  metadata_lni-338.xlsx: object
  metadata-lni-52.xlsx: object
  metadata-delfi-2020.xlsx: object
  metadata-lni-37.xlsx: object
  metadata-lni-218.xlsx: object
  metadata_lni-322.xlsx: object
  metadata-lni-188.xlsx: object
  metadata-lni-153.xlsx: object
  metadata-lni-66.xlsx: object
  metadata_lni-369.xlsx: object
  metadata_lni-356.xlsx: object
  metadata-DeLFI2018.xlsx: object
  metadata-lni-207.xlsx: object
  metadata-lni-297.xlsx: object
  metadata-lni-87.xlsx: object
  metadata-lni-316.xlsx: object

Character length statistics:
  Minimum: 15
  Median:  21
  Mean:    21
  Maximum: 24

✓ Recommended VARCHAR size: 50


{'column': 'dc.type',
 'mysql_column': 'publication_type',
 'min': 15,
 'median': 21,
 'mean': 21,
 'max': 24,
 'recommended': 50,
 'count': 831}

In [37]:
# 20) Analyze 'gi.conference.review'  -> MySQL 'peer_review_status'
analyze_varchar_length('gi.conference.review' , list_metadata_files, df_metadata_list, 'peer_review_status')

VARCHAR LENGTH ANALYSIS: 'gi.conference.review' -> MySQL 'peer_review_status'

Files containing this column: 2/17
Total values analyzed: 124

Data types found:
  metadata_lni-338.xlsx: object
  metadata_lni-369.xlsx: object

Character length statistics:
  Minimum: 4
  Median:  4
  Mean:    4
  Maximum: 4

✓ Recommended VARCHAR size: 50

Longest value preview: 'full'


{'column': 'gi.conference.review',
 'mysql_column': 'peer_review_status',
 'min': 4,
 'median': 4,
 'mean': 4,
 'max': 4,
 'recommended': 50,
 'count': 124}

- fix length -> will use CHAR(4)