# Create Document-Level Metadata

Here we create the document-level metadata by processing the problem-level metadata for every corpus in a split. Building it per corpus lets us include only the problems for which we have a full dataset.

In [18]:
import sys

import pandas as pd

from from_root import from_root
from pathlib import Path

sys.path.insert(0, str(from_root("src")))

from read_and_write_docs import read_jsonl, read_rds, write_rds
from utils import apply_temp_doc_id, build_metadata_df

In [19]:
# Root folder of the author-verification datasets. `build_split_metadata` reads
# `<base_loc>/<split>/metadata.rds` and the per-corpus JSONL files beneath it.
# NOTE: this is an absolute, machine-specific path -- change it to match your environment.
base_loc = "/Volumes/BCross/datasets/author_verification"

## Function to create the metadata

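The function below builds the document-level metadata for a single split (`training` or `test`), one corpus at a time, and saves the combined result as `doc_level_metadata.rds` in that split's folder. A corpus that cannot be processed is reported as `[FAILED]` and skipped.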

In [20]:
def build_split_metadata(
    base_loc: str | Path,
    split: str,  # "training" or "test"
    *,
    verbose: bool = True,
    write_file: bool = True
) -> pd.DataFrame:
    base_loc = Path(base_loc)

    if split not in {"training", "test"}:
        raise ValueError("split must be 'training' or 'test'")

    metadata_rds_loc = base_loc / split / "metadata.rds"
    metadata_save_loc = base_loc / split / "doc_level_metadata.rds"
    
    metadata = read_rds(metadata_rds_loc)
    if "corpus" not in metadata.columns:
        raise ValueError("metadata is missing required column: 'corpus'")

    corpora = metadata["corpus"].dropna().drop_duplicates().tolist()
    metadata_list: list[pd.DataFrame] = []

    for corpus in corpora:
        if verbose:
            print(f"Current corpus is: {corpus}")

        try:
            filtered_metadata = metadata[metadata["corpus"] == corpus]

            known_path = base_loc / split / corpus / "known_raw.jsonl"
            unknown_path = base_loc / split / corpus / "unknown_raw.jsonl"

            known_data = read_jsonl(str(known_path))
            unknown_data = read_jsonl(str(unknown_path))

            known_data = apply_temp_doc_id(known_data)
            unknown_data = apply_temp_doc_id(unknown_data)

            agg_metadata = build_metadata_df(filtered_metadata, known_data, unknown_data)
            
            # add split as data_type, placed immediately before 'problem'
            agg_metadata["data_type"] = split
            cols = agg_metadata.columns.tolist()
            cols.insert(cols.index("problem"), cols.pop(cols.index("data_type")))
            agg_metadata = agg_metadata[cols]
            agg_metadata['filename'] = agg_metadata['known_doc_id'] + ' vs ' + agg_metadata['unknown_doc_id'] + '.xlsx'
            
            metadata_list.append(agg_metadata)

        except Exception as e:
            print(f"[FAILED] corpus='{corpus}' split='{split}': {type(e).__name__}: {e}")
            continue

    if not metadata_list:
        if verbose:
            print("No corpora succeeded; returning empty DataFrame.")
        return pd.DataFrame()

    metadata_df = pd.concat(metadata_list, ignore_index=True)
    
    if write_file:
        print("Saving file")
        write_rds(metadata_df, metadata_save_loc)
    else:
        return metadata_df


## Create the Training Metadata

In [21]:
# Build and save the document-level metadata for the training split.
# The trailing semicolon keeps the cell output limited to the progress log.
build_split_metadata(
    base_loc=base_loc,
    split="training",
    verbose=True,
    write_file=True,
);

Current corpus is: ACL
Current corpus is: Amazon
Current corpus is: The Apricity
Current corpus is: Koppel's Blogs
Current corpus is: Enron
Current corpus is: Perverted Justice
Current corpus is: Reddit
Current corpus is: StackExchange
Current corpus is: The Telegraph
Current corpus is: Yelp
Current corpus is: All-the-news
[FAILED] corpus='All-the-news' split='training': IndexError: single positional indexer is out-of-bounds
Current corpus is: IMDB
Current corpus is: TripAdvisor
Current corpus is: Wiki
Saving file


## Create the Test Metadata

In [22]:
# Build and save the document-level metadata for the test split.
# The trailing semicolon keeps the cell output limited to the progress log.
build_split_metadata(
    base_loc=base_loc,
    split="test",
    verbose=True,
    write_file=True,
);

Current corpus is: ACL
Current corpus is: Amazon
Current corpus is: The Apricity
Current corpus is: Koppel's Blogs
Current corpus is: Enron
Current corpus is: Perverted Justice
Current corpus is: Reddit
Current corpus is: StackExchange
Current corpus is: The Telegraph
Current corpus is: Yelp
Current corpus is: All-the-news
Current corpus is: IMDB
Current corpus is: TripAdvisor
Current corpus is: Wiki
Saving file
