In [1]:
import pandas as pd
import numpy as np
from pymongo import MongoClient
from warnings import warn

# No connection arguments are passed, so the installed pymongo's default
# host/port is used (presumably a local MongoDB instance; verify for your setup).
client = MongoClient()

In [2]:
# System-specific setup: the database and collection names used on this machine.

db = client["forsaken"]
book_schemas_collection = db["rawSchemas"]
book_links_collection = db["rawLinks"]

In [3]:
def convert_fields_to_projection_without_id(fields=()):
    """Build a MongoDB projection that includes ``fields`` and excludes ``_id``.

    Parameters
    ----------
    fields : iterable of str, optional
        Names of the fields to include. Defaults to an empty tuple (an
        immutable default, so nothing can leak between calls). ``None`` is
        treated the same as an empty iterable.

    Returns
    -------
    dict
        ``{"_id": 0, <field>: 1, ...}``. ``_id`` is always mapped to 0, even
        when ``"_id"`` itself appears in ``fields``.
    """
    if fields is None:
        fields = ()

    projection = {"_id": 0}
    for field in fields:
        # Skip "_id" so a caller-supplied entry cannot switch the id back on.
        if field != "_id":
            projection[field] = 1

    return projection

In [4]:
def get_entries(collection, projection):
    """Return every document of ``collection`` as a list, limited to ``projection``."""
    cursor = collection.find(projection=projection)
    return [entry for entry in cursor]

In [5]:
# Fields requested from every book schema document.
book_schema_fields = [
    "title",
    "era",
    "compDate",
    "dependence",
    "categories",
]

In [6]:
# Build both projections first: schemas are limited to the selected fields,
# links keep every field except "_id".
book_schemas_projection = convert_fields_to_projection_without_id(book_schema_fields)
book_links_projection = convert_fields_to_projection_without_id()

# Then fetch the documents (schemas first, links second).
book_schemas_list = get_entries(book_schemas_collection, book_schemas_projection)
book_links_list = get_entries(book_links_collection, book_links_projection)

In [7]:
# Turn the fetched document lists into DataFrames (one row per list entry).
book_schemas_dataframe = pd.DataFrame(book_schemas_list)
book_links_dataframe = pd.DataFrame(book_links_list)

In [8]:
# Short aliases. These are plain assignments, not copies: in-place changes made
# through bs / bl also change book_schemas_dataframe / book_links_dataframe.
bs = book_schemas_dataframe
bl = book_links_dataframe

In [9]:
def regex_replace_in_field(pattern, replacement, field, dataframe):
    """Apply a regex substitution to one column of ``dataframe``, updating the frame.

    Parameters
    ----------
    pattern : str
        Regular expression to search for in each value of the column.
    replacement : str
        Replacement text; may use backreferences such as ``r"\\1"``.
    field : str
        Name of the column to rewrite.
    dataframe : pandas.DataFrame
        Frame whose ``field`` column is replaced with the substituted values.

    Returns
    -------
    None
        The change is made on ``dataframe`` itself.
    """
    # Assign the result back instead of calling replace(..., inplace=True) on
    # dataframe[field]: the chained in-place form acts on an intermediate
    # Series, which pandas warns about and which does not update the frame
    # when Copy-on-Write is active.
    dataframe[field] = dataframe[field].replace(pattern, replacement, regex=True)

In [10]:
from functools import partial

# Bind the column and frame once; each call then only needs (pattern, replacement).
replace_date = partial(regex_replace_in_field, field="compDate", dataframe=bs)

# Replacement templates: keep the captured number as-is, or prefix it with "-".
positive_capture = r"\1"
negative_capture = r"-\1"

# "<start>-<end>": keep only <start>.
earlier_date_in_range_pattern = r"(-?\d+)-\d+"
# "c." / "ca." followed by a number: keep only the number.
date_from_circa_date_pattern = r"ca?\.\s*(-?\d+)"
# "<n> BC" / "<n> BCE": becomes "-<n>".
date_from_bce_date_pattern = r"(\d+) BCE?"

# Apply the substitutions one after another: range, then circa, then BCE.
for _pattern, _replacement in (
    (earlier_date_in_range_pattern, positive_capture),
    (date_from_circa_date_pattern, positive_capture),
    (date_from_bce_date_pattern, negative_capture),
):
    replace_date(_pattern, _replacement)

In [11]:
# Start-anchored pattern (used with Series.str.match) for text that contains a
# character which is neither a digit nor "-".
invalid_date_pattern = r"\d*[^-\d]+\d*"

compDate_raw = bs["compDate"]
# Work on a text view so the check does not raise when the column is no longer
# string-typed (for example when this cell is re-run after the numeric conversion).
compDate_text = compDate_raw.astype(str)

# Missing values and blank strings hold no date to dismiss, so they are not reported.
has_value = compDate_raw.notna() & compDate_text.str.strip().ne("")
matches_invalid_pattern = compDate_text.str.match(invalid_date_pattern, na=False)
# The pattern alone misses malformed values such as "12-" or "-", which
# pd.to_numeric(errors="coerce") still turns into NaN; test the parse itself too.
fails_numeric_parse = pd.to_numeric(compDate_raw, errors="coerce").isna()

missed_entries = bs[has_value & (matches_invalid_pattern | fails_numeric_parse)]
missed_compDates = missed_entries["compDate"]

if len(missed_compDates) > 0:
    warn(f"These compDate values will be dismissed and converted to NaN: {list(missed_compDates)}")

In [12]:
# Parse compDate as numbers; anything that cannot be parsed becomes NaN (errors="coerce").
numeric_dates = pd.to_numeric(bs["compDate"], errors="coerce")
# Nullable integer dtype, so missing dates can coexist with integer years.
int_dates = numeric_dates.astype("Int64")
bs["compDate"] = int_dates

In [13]:
# Era code -> full era name.
era_string = {
    "A": "Amoraim",
    "AH": "Ahronim",
    "CO": "Contemporary",
    "RI": "Rishonim",
    "T": "Tanaim",
}

# Era code -> integer rank (1 through 5).
era_int = dict(T=1, A=2, RI=3, AH=4, CO=5)

In [14]:
# Swap era codes for their integers; values that are not keys of era_int are left as they are.
era_as_mixed_int = bs["era"].replace(era_int)
# Whatever is still non-numeric after the swap becomes NaN (errors="coerce").
era_as_numeric = pd.to_numeric(era_as_mixed_int, errors="coerce")
# Nullable integer dtype keeps the missing eras as missing values.
era_as_int = era_as_numeric.astype("Int64")

bs["era"] = era_as_int

In [15]:
def _contains_commentary(category_list):
    """Return whether "Commentary" is in ``category_list``.

    Values that do not support ``in`` (e.g. NaN or None for a document without
    a ``categories`` field) count as not containing it instead of raising.
    """
    try:
        return "Commentary" in category_list
    except TypeError:
        return False


# The guard keeps the cell re-runnable: "dependence" is dropped at the end, so
# a second pass without the guard would fail on the missing column.
if "is_commentary" not in bs.columns:
    is_dependence_commentary = bs.dependence == "Commentary"

    # Have to use apply because pandas doesn't have operations that support lists in columns.
    has_category_commentary = bs.categories.apply(_contains_commentary)

    # Have to assign to temporary variable, direct assignment makes the whole column False.
    is_any_commentary = is_dependence_commentary | has_category_commentary

    bs["is_commentary"] = is_any_commentary
    bs.drop("dependence", axis=1, inplace=True)

In [16]:
def _contains_modern_works(category_list):
    """Return whether "Modern Works" is in ``category_list``.

    Values that do not support ``in`` (e.g. NaN or None for a document without
    a ``categories`` field) count as not containing it instead of raising.
    """
    try:
        return "Modern Works" in category_list
    except TypeError:
        return False


bs["is_modern"] = bs.categories.apply(_contains_modern_works)

In [17]:
# Drop all titles without any dating info: rows where both era and compDate are
# missing. Rows flagged is_modern or is_commentary are kept even if undated.

no_date_mask = bs.era.isna() & bs.compDate.isna() & ~bs.is_modern & ~bs.is_commentary
no_date_entries = bs[no_date_mask]
no_date_titles = no_date_entries["title"]

# Report what is about to be removed before removing it.
if len(no_date_entries) > 0:
    warn(f"The following entries have no dating info and will be dropped: {no_date_titles}")
    
# In place, so the frame that bs refers to is modified directly.
bs.drop(no_date_entries.index, inplace=True)

1495         Kinnot for Tisha B'Av (Ashkenaz)
1792           Machzor Rosh Hashanah Ashkenaz
1793    Machzor Rosh Hashanah Ashkenaz Linear
1794             Machzor Rosh Hashanah Sefard
1796              Machzor Yom Kippur Ashkenaz
1797       Machzor Yom Kippur Ashkenaz Linear
1798                Machzor Yom Kippur Sefard
3423        Seder Tisha B'Av (Edot HaMizrach)
3453                  Selichot Edot HaMizrach
3454            Selichot Nusach Ashkenaz Lita
3455                    Selichot Nusach Polin
Name: title, dtype: object
  warn(f"The following entries have no dating info and will be dropped: {no_date_titles}")


In [18]:
# Links whose two endpoints are the same text are removed from bl in place.
self_link_mask = bl["Text 1"].eq(bl["Text 2"])
self_link_entries = bl.loc[self_link_mask]

bl.drop(index=self_link_entries.index, inplace=True)

In [19]:
# Order the books by compDate, then is_modern, era and is_commentary (all ascending).
# ignore_index=True renumbers the rows 0..n-1 in the sorted order.
# NOTE(review): rows with a missing compDate sort after every dated row under
# pandas' default na_position, regardless of era; confirm this is intended.
bs.sort_values(["compDate", "is_modern", "era", "is_commentary"], ascending=True, ignore_index=True, inplace=True)
# Keep the sorted position as a regular column so it can be merged onto the links.
bs["index"] = bs.index

In [20]:
# Two (index, title) lookup frames, one for each side of a link.
minimal_bs_text_1 = bs[["index", "title"]]
minimal_bs_text_2 = bs[["index", "title"]]

In [21]:
# Rename the lookup columns to the names used on each side of the links frame:
# "title" becomes the merge key, "index" becomes that side's publication order.
minimal_bs_text_1 = minimal_bs_text_1.rename(columns = {"title": "Text 1", "index": "Publication Order 1"})
minimal_bs_text_2 = minimal_bs_text_2.rename(columns = {"title": "Text 2", "index": "Publication Order 2"})

In [22]:
# Attach the publication order of each side of every link.
# merge() is called without `how`, so pandas' default inner join applies: links
# whose "Text 1" / "Text 2" has no matching title in bs are dropped here.
# Note that bl is rebound to a new frame, so it no longer aliases book_links_dataframe.
bl = bl.merge(minimal_bs_text_1, on="Text 1")
bl = bl.merge(minimal_bs_text_2, on="Text 2")

In [23]:
# Remove links where either side has no publication order.
# NOTE(review): if bl was produced by inner merges, neither column can contain
# NaN and this mask is always empty; confirm whether a left merge was intended.
links_to_nonexisting_books_mask = bl["Publication Order 1"].isna() | bl["Publication Order 2"].isna()
links_to_nonexisting_books_entries = bl[links_to_nonexisting_books_mask]

bl.drop(links_to_nonexisting_books_entries.index, inplace=True)

In [24]:
# Positive where Text 1 has the larger publication order, negative where Text 2 does.
text_date_diff = bl["Publication Order 1"] - bl["Publication Order 2"]

In [25]:
# Run only once: after the first pass the source columns no longer exist.
if not {"Text Title", "Cited Text Title"} <= set(bl.columns):
    text_1_is_later = text_date_diff.gt(0)
    text_1_is_not_later = text_date_diff.le(0)

    # The text with the larger publication order goes to "Text Title",
    # the other one to "Cited Text Title".
    text_title = np.where(text_1_is_later, bl["Text 1"], bl["Text 2"])
    cited_title = np.where(text_1_is_not_later, bl["Text 1"], bl["Text 2"])

    bl.insert(0, "Text Title", text_title)
    bl.insert(1, "Cited Text Title", cited_title)

    # The ordered title columns replace the raw pair and its helper columns.
    bl.drop(
        columns=["Text 1", "Text 2", "Publication Order 1", "Publication Order 2"],
        inplace=True,
    )