In [121]:
import pandas as pd
import numpy as np
from pymongo import MongoClient

# No connection arguments are given, so pymongo's default connection settings apply.
client = MongoClient()

In [2]:
# System-specific setup: the database and collection names below belong to
# this particular MongoDB deployment.
db = client["forsaken"]
book_schemas_collection = db["rawSchemas"]
book_links_collection = db["rawLinks"]

In [3]:
def convert_fields_to_projection_without_id(fields):
    """Build a projection dict that includes each of `fields` and excludes `_id`.

    Args:
        fields: iterable of field names to include.

    Returns:
        A dict whose first key is "_id" (mapped to 0) followed by every name
        from `fields` mapped to 1. If "_id" is itself listed in `fields`, its
        entry is overwritten with 1.
    """
    projection = {"_id": 0}
    projection.update((name, 1) for name in fields)
    return projection

In [4]:
def get_entries(collection, projection):
    """Return the results of `collection.find` as a list.

    Args:
        collection: object exposing a `find(projection=...)` method
            (a pymongo collection in this notebook).
        projection: passed through unchanged as the `projection` keyword.

    Returns:
        A list holding everything yielded by the `find` call. No filter is
        passed, only the projection.
    """
    cursor = collection.find(projection=projection)
    return list(cursor)

In [5]:
# Document fields to request for each book schema entry.
book_schema_fields = [
    "title",
    "era",
    "compDate",
    "dependence",
    "categories",
]

In [6]:
# Build a projection for the fields in book_schema_fields (with _id suppressed),
# then materialise the matching entries of book_schemas_collection as a list.
projection = convert_fields_to_projection_without_id(book_schema_fields)
book_schemas_list = get_entries(book_schemas_collection, projection)

In [155]:
# Turn the list of fetched entries into a DataFrame for cleaning and analysis.
book_schemas_dataframe = pd.DataFrame(book_schemas_list)

In [156]:
# Short alias for the same DataFrame object (no copy is made), so any change
# made through `bs` is also visible through `book_schemas_dataframe`.
bs = book_schemas_dataframe

In [133]:
def regex_replace_in_field(pattern, replacement, field, dataframe):
    """Apply a regex substitution to a single column of `dataframe`, in place.

    Only the column named by `field` is modified; every other column is left
    untouched. Previously the `field` argument was ignored and the
    substitution ran over the whole frame.

    Args:
        pattern: regular expression to search for.
        replacement: replacement template (may use backreferences such as
            r"\\1").
        field: name of the column to rewrite.
        dataframe: the pandas DataFrame to update; it is mutated.

    Returns:
        None. The result is written back into `dataframe[field]`.

    Raises:
        KeyError: if `field` is not a column of `dataframe`.
    """
    # Assign the replaced column back rather than using inplace=True on a
    # column selection, which may operate on a temporary copy.
    dataframe[field] = dataframe[field].replace(pattern, replacement, regex=True)

In [166]:
from functools import partial

# Pre-bind the target column name and the frame so that each call below only
# has to pass (pattern, replacement).
replace_date = partial(regex_replace_in_field, field="compDate", dataframe=bs)

# Replacement templates: keep capture group 1 as-is, or prefix it with a minus sign.
positive_capture = r"\1"
negative_capture = r"-\1"

# The three substitutions run in this order, so each later pattern sees the
# output of the earlier ones.

# "<a>-<b>" -> "<a>": for a range, keep only the first (optionally negative) number.
earlier_date_in_range_pattern = r"(-?\d+)-\d+"
replace_date(earlier_date_in_range_pattern, positive_capture)

# "c. <n>" / "ca. <n>" -> "<n>": strip the circa prefix.
date_from_circa_date_pattern = r"ca?\.\s*(-?\d+)"
replace_date(date_from_circa_date_pattern, positive_capture)

# "<n> BC" / "<n> BCE" -> "-<n>": express BC/BCE years as negative numbers.
date_from_bce_date_pattern = r"(\d+) BCE?"
replace_date(date_from_bce_date_pattern, negative_capture)

In [180]:
# Report every compDate value that the numeric conversion will turn into NaN.
#
# The mask is derived from pd.to_numeric itself instead of a hand-written
# regex: a value is "missed" when it is present (not null) but cannot be parsed
# as a number. This also catches values that start with "-" or contain a stray
# hyphen (e.g. "-100 x", "12-"), and it keeps working if the column is already
# numeric when the cell is re-run.
from warnings import warn

parsed_compDates = pd.to_numeric(bs["compDate"], errors="coerce")
unparseable_mask = bs["compDate"].notna() & parsed_compDates.isna()

missed_entries = bs[unparseable_mask]
missed_compDates = missed_entries["compDate"]

if len(missed_compDates) > 0:
    warn(f"These values will be dismissed and converted to NaN: {list(missed_compDates)}")

In [188]:
# Parse compDate as numbers; errors="coerce" turns anything unparseable into NaN.
numeric_dates = pd.to_numeric(bs["compDate"], errors="coerce")
# Cast to the nullable "Int64" dtype so missing values can sit alongside integers.
int_dates = numeric_dates.astype("Int64")
# Overwrite the column in `bs` with the converted values.
bs["compDate"] = int_dates

In [201]:
# compDate values of the rows whose era is "AH", sorted from highest to lowest.
bs.loc[bs["era"] == "AH", "compDate"].sort_values(ascending=False)

2814    1930
746     1930
3437    1925
1395    1920
1394    1920
        ... 
4597    <NA>
4598    <NA>
4599    <NA>
4600    <NA>
4601    <NA>
Name: compDate, Length: 3102, dtype: Int64

In [199]:
# Inspect a single row; note that iloc selects by integer position, not by index label.
bs.iloc[2816]

title                    Otzar Midrashim
categories    [Midrash, Aggadic Midrash]
era                                   AH
compDate                             800
dependence                           NaN
Name: 2816, dtype: object