In [15]:
# Load libraries

import pandas as pd 
import os 
from pathlib import Path
import warnings

## 1) Metadata files inspection

In [None]:
# pattern of files: metadata-lni-37.xlsx, metadata-lni-52.xlsx, metadata_lni-338, metadata-delfi-2020.xlsx

# Ignore pandas read excel warnings
warnings.filterwarnings("ignore")

# Get current wd
os.getcwd()

# Get the data directory as Path object
data_dir = Path("../data")

# List all sub-directories (contain the annual publications)
data_subdir = [a.name for a in data_dir.iterdir() if a.is_dir()]

print("Sub-directories in current directory:")
print(data_subdir)
print("Number of sub-directories:")
print(len(data_subdir))

# Create list to store metadatafiles
list_metadata_files = []

# Pattern to find all metadata files (starts with "metadata" and ends with .xlsx)
metadata_paths = list(data_dir.glob("*/metadata*.xlsx"))

# Loop through metadata paths
for metadata_path in metadata_paths:
    print(f" {metadata_path}")
    # Append to list
    list_metadata_files.append(metadata_path)

print("Numter of metadata files found:")
print(len(list_metadata_files))

Sub-directories in current directory:
['lni132', 'lni338', 'lni52', 'lni308', 'lni37', 'lni218', 'lni322', 'lni188', 'lni153', 'lni66', 'lni369', 'lni356', 'lni284', 'lni207', 'lni297', 'lni87', 'lni316']
Number of sub-directories:
17
 ../data/lni132/metadata-lni-132.xlsx
 ../data/lni338/metadata_lni-338.xlsx
 ../data/lni52/metadata-lni-52.xlsx
 ../data/lni308/metadata-delfi-2020.xlsx
 ../data/lni37/metadata-lni-37.xlsx
 ../data/lni218/metadata-lni-218.xlsx
 ../data/lni322/metadata_lni-322.xlsx
 ../data/lni188/metadata-lni-188.xlsx
 ../data/lni153/metadata-lni-153.xlsx
 ../data/lni66/metadata-lni-66.xlsx
 ../data/lni369/metadata_lni-369.xlsx
 ../data/lni356/metadata_lni-356.xlsx
 ../data/lni284/metadata-DeLFI2018.xlsx
 ../data/lni207/metadata-lni-207.xlsx
 ../data/lni297/metadata-lni-297.xlsx
 ../data/lni87/metadata-lni-87.xlsx
 ../data/lni316/metadata-lni-316.xlsx
Numter of metadata files found:
17


In [22]:
# Check, whether all metadata files have the same columns 

## List to store column names
list_of_columns = []

for metadata_file in list_metadata_files:
    df = pd.read_excel(metadata_file)
    list_of_columns.append(set(df.columns))

## Check if all sets are equal
first = list_of_columns[0]
all_same = all(cols == first for cols in list_of_columns)

print("All files have the same columns:", all_same)

All files have the same columns: False


In [None]:
# Check, which columns are different
all_cols = []
for f in list_metadata_files:
    df = pd.read_excel(f)
    all_cols.append(set(df.columns))

common = set.intersection(*all_cols)
union  = set.union(*all_cols)

print("Common columns in ALL files:", common)
print("All unique columns across files:", union)
print("Columns that vary (union - common):", union - common)


Common columns in ALL files: {'dc.title', 'dc.date.issued', 'dc.contributor.editor', 'dc.relation.ispartofseries', 'dc.contributor.author', 'dc.relation.ispartof', 'dc.language.iso', 'dc.publisher', 'dc.description.abstract', 'dc.subject', 'filename', 'dc.type'}
All unique columns across files: {'gi.conference.sessiontitle', 'dc.pubPlace', 'mci.conference.date', 'gi.conference.review', 'dc.title', 'dc.date.issued', 'Nr.', 'dc.contributor.editor', 'gi.citation.startPage', 'gi.conference.location', 'mci.document.quality', 'gi.conference.date', 'dc.subject', 'dc.relation.ispartofseries', 'mci.conference.sessiontitle', 'dc.contributor.author', 'dc.relation.ispartof', 'dc.description.abstract', 'dc.identifier.doi', 'gi.citation.publisherPlace', 'filename', 'dc.identifier.pissn', 'gi.tag', 'ID', 'dc.identifier.isbn', 'mci.conference.location', 'dc.identifier.issn', 'Beitragsart', 'gi.citation.endPage', 'mci.reference.pages', 'dc.language.iso', 'dc.publisher', 'dc.type'}
Columns that vary (un

- Pattern: use of "gi." vs. "mci."

In [None]:
# Print out varying number and types of columns:

## Store pandas df's in a list
df_metadata_list = []

for metadata_file in list_metadata_files:
    df = pd.read_excel(metadata_file)
    print(f"Number of columns: {len(df.columns)}")
    print(f"Columns: {df.columns.tolist()}")
    df_metadata_list.append(df)

Number of columns: 20
Columns: ['dc.title', 'dc.contributor.author', 'dc.language.iso', 'dc.relation.ispartof', 'dc.contributor.editor', 'mci.reference.pages', 'dc.description.abstract', 'dc.subject', 'filename', 'dc.identifier.doi', 'dc.identifier.isbn', 'dc.identifier.issn', 'dc.relation.ispartofseries', 'dc.publisher', 'dc.pubPlace', 'dc.date.issued', 'mci.conference.date', 'mci.conference.location', 'mci.conference.sessiontitle', 'dc.type']
Number of columns: 23
Columns: ['dc.title', 'dc.contributor.author', 'dc.language.iso', 'dc.relation.ispartof', 'dc.contributor.editor', 'dc.relation.ispartofseries', 'dc.publisher', 'gi.citation.publisherPlace', 'dc.date.issued', 'dc.description.abstract', 'dc.subject', 'gi.tag', 'dc.identifier.doi', 'dc.identifier.pissn', 'dc.identifier.isbn', 'gi.citation.startPage', 'gi.citation.endPage', 'gi.conference.date', 'gi.conference.location', 'gi.conference.sessiontitle', 'dc.type', 'gi.conference.review', 'filename']
Number of columns: 19
Columns:

### To-Do's:

- 1) manually inspecting different column names: "gi." vs "mci." -> same content?
    - patterns? which years have use which?

- 2) checking the df's without "filename" column -> is extremely important for mapping files with metadata

- 3) inspecting some distributions:
    - amount of papers over the years
    - language distribution over the years
    - dc.type -> relevant for filtering 
    - gi./msc. conference.sessiontitle

- 4) deciding which columns to keep for all papers with metadata
    - also keeping in mind which columns will be part of table for all papers without metadata:
        - id
        - authors
        - title
        - year
        - pages?
        - abstract?
        - text
        - references?