# Create the Corpus Problem Lists

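This notebook builds the lists of known and unknown document IDs for a chosen corpus and data type (for example `Enron` / `test`) and saves them as `known_doc_list.txt` and `unknown_doc_list.txt` in that corpus's dataset folder.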

In [58]:
import sys
import pandas as pd

from from_root import from_root

sys.path.insert(0, str(from_root("src")))

from utils import get_base_location, build_metadata_df, apply_temp_doc_id
from read_and_write_docs import read_jsonl, read_rds

In [59]:

# --- Run configuration -------------------------------------------------------
# Which split and which corpus the problem lists are built for.
data_type   = "test"
corpus      = "Enron"

# Storage root is resolved by the helper rather than hard-coded, so the same
# notebook also runs on a Windows laptop without path edits.
nas_base_loc = get_base_location()

# Output: the document lists are written into the corpus folder of the split.
save_loc = f"{nas_base_loc}/datasets/author_verification/{data_type}/{corpus}"

# Inputs: the raw known/unknown documents live in the corpus folder ...
known_loc = f"{nas_base_loc}/datasets/author_verification/{data_type}/{corpus}/known_raw.jsonl"
unknown_loc = f"{nas_base_loc}/datasets/author_verification/{data_type}/{corpus}/unknown_raw.jsonl"
# ... while the metadata file sits one level up, at the split level (no corpus in its path).
metadata_loc = f"{nas_base_loc}/datasets/author_verification/{data_type}/metadata.rds"

## Read Data

In [60]:
# Raw documents for the selected corpus
known = read_jsonl(known_loc)
unknown = read_jsonl(unknown_loc)

# The metadata file is stored per split rather than per corpus, so keep only
# the rows whose 'corpus' value matches the one configured above.
metadata = read_rds(metadata_loc)
is_selected_corpus = metadata['corpus'] == corpus
filtered_metadata = metadata[is_selected_corpus]

## Create Metadata

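Quite a convoluted process: `apply_temp_doc_id` is run twice, once for the known documents and once for the unknown documents, and the ID columns are renamed before and after each pass so that the helper always sees the column it is working on as `orig_doc_id`.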

In [61]:
# Combine the corpus metadata with the known and unknown documents into one frame
complete_metadata = build_metadata_df(filtered_metadata, known, unknown)

# apply_temp_doc_id needs a 'text' column to exist; an empty one is enough here
complete_metadata["text"] = ""

# Known side: expose the known ids as 'orig_doc_id' (apparently the column the
# helper works from - confirm in utils), generate the new ids, then file both
# the original and the generated ids under known-specific names.
complete_metadata = complete_metadata.rename(columns={"known_doc_id": "orig_doc_id"})
complete_metadata = apply_temp_doc_id(complete_metadata)
complete_metadata = complete_metadata.rename(
    columns={"orig_doc_id": "orig_known_doc_id", "doc_id": "known_doc_id"}
)

# Unknown side: the same three steps, this time for the unknown ids.
complete_metadata = complete_metadata.rename(columns={"unknown_doc_id": "orig_doc_id"})
complete_metadata = apply_temp_doc_id(complete_metadata)
complete_metadata = complete_metadata.rename(
    columns={"orig_doc_id": "orig_unknown_doc_id", "doc_id": "unknown_doc_id"}
)

# Keep only the columns needed for the problem lists, in a fixed order
problem_columns = ["sample_id", "problem", "corpus", "known_doc_id", "unknown_doc_id"]
complete_metadata = complete_metadata[problem_columns]

## View the data

In [62]:
# Preview the first five problems to sanity-check the generated known/unknown ids
complete_metadata.head(n=5)

Unnamed: 0,sample_id,problem,corpus,known_doc_id,unknown_doc_id
0,1,Kevin.hyatt vs Kevin.hyatt,Enron,kevin_hyatt_mail_1,kevin_hyatt_mail_2
1,2,Kevin.hyatt vs Kevin.hyatt,Enron,kevin_hyatt_mail_3,kevin_hyatt_mail_2
2,3,Kevin.hyatt vs Kevin.hyatt,Enron,kevin_hyatt_mail_4,kevin_hyatt_mail_2
3,4,Kevin.hyatt vs Kevin.hyatt,Enron,kevin_hyatt_mail_5,kevin_hyatt_mail_2
4,5,Kevin.hyatt vs Kimberly.watson,Enron,kevin_hyatt_mail_1,kimberly_watson_mail_3


## Get Number of Rows in the Dataset

The row count printed below is the value that needs to be entered in the jobscript.

In [63]:
# One row per verification problem, so the frame length is the number the jobscript needs
num_rows_for_jobscript = len(complete_metadata)
print(f"Number of rows needed in jobscript: {num_rows_for_jobscript}")

Number of rows needed in jobscript: 340


## Save the Lists

In [64]:
# Cast the ids to strings so each list is written as plain text, one id per line
known_ids = complete_metadata["known_doc_id"].astype(str)
unknown_ids = complete_metadata["unknown_doc_id"].astype(str)

# No index and no header: each file contains nothing but the ids, in row order
known_ids.to_csv(f"{save_loc}/known_doc_list.txt", index=False, header=False)
unknown_ids.to_csv(f"{save_loc}/unknown_doc_list.txt", index=False, header=False)
print("Wrote known_doc_list.txt and unknown_doc_list.txt")

Wrote known_doc_list.txt and unknown_doc_list.txt
