# Give a date range

Goal: Take dates from Karrow's index that are imprecise in some way and turn each of them into a range (an earliest and a latest estimated year).

In [None]:
import pandas as pd
import numpy as np
import logging

# Set up logging at INFO level so the progress counts that the cells below
# report with logging.info() are not filtered out
logging.basicConfig(level=logging.INFO)

In [None]:
# Load the file from "Index to Place and Date" combined with the mapmakers' names.
# The 'date' column is read explicitly as text: it holds plain years ("1548") as
# well as free-text values ("1548-49", "1548 or 49"), and the cells below call
# string methods on every value, so pandas must not infer a numeric type for
# any part of the column. Missing dates are still read as NaN.
index_with_mapmakers = pd.read_csv(
    '../outputs/index_to_place_and_date_and_mapmaker_names.csv',
    dtype={'date': str},
)
index_with_mapmakers

In [None]:
# Create the two estimate columns for map publication dates (earliest and
# latest), initialised to NaN so that unfilled rows can be found later
for estimate_column in ('earliest_date_estimate', 'latest_date_estimate'):
    index_with_mapmakers[estimate_column] = np.nan
index_with_mapmakers

In [None]:
# Iterate through the dates and fill in the easiest estimates: dates that are a single year.
# NOTE: in general, iterating row by row is poor practice in pandas, but in this case it makes
# the code easier, and the data set is small, so performance is not a concern.

total_data_count = len(index_with_mapmakers)

for index, date in index_with_mapmakers['date'].items():
    # a missing date comes back from pandas as NaN (a float), which has no string methods
    if pd.isna(date):
        continue
    date = str(date).strip()

    # If the date is an integer (i.e. a single year) between 1400 and 1900, use it as both the
    # earliest and the latest estimate. isdecimal() is used rather than isnumeric() because it
    # only accepts characters that int() can parse (isnumeric() also accepts e.g. "½").
    if date.isdecimal() and 1400 <= int(date) <= 1900:
        index_with_mapmakers.at[index, 'earliest_date_estimate'] = int(date)
        index_with_mapmakers.at[index, 'latest_date_estimate'] = int(date)

# report out the number of single year dates that we found
single_year_dates_count = index_with_mapmakers['earliest_date_estimate'].notnull().sum()

logging.info(f'Found {single_year_dates_count} single year dates out of {total_data_count} total data points')

# Spot-check one row that should not have been filled in by this pass.
# The position is only looked up when the table is long enough to contain it.
spot_check_position = 2318
if spot_check_position < total_data_count:
    logging.info(f"Value that should be empty or NaN or not a number: {index_with_mapmakers['earliest_date_estimate'].iloc[spot_check_position]}")
index_with_mapmakers

In [None]:
# Now, fill in the straightforward ranges (dates written as "<year>-<year or abbreviation>")

for index, date in index_with_mapmakers['date'].items():
    # a missing date comes back from pandas as NaN (a float), which cannot be searched for '-'
    if pd.isna(date):
        continue
    date = str(date)

    # only dates that contain a dash are treated as ranges
    if '-' not in date:
        continue

    # split the date range; only the first two pieces are used, and surrounding
    # whitespace is dropped so that "1548 - 49" is read like "1548-49"
    date_range = date.split('-')
    earliest_date = date_range[0].strip()
    latest_date = date_range[1].strip()

    # the start of the range has to be a single year between 1400 and 1900
    # (isdecimal() only accepts characters that int() can parse)
    if not (earliest_date.isdecimal() and 1400 <= int(earliest_date) <= 1900):
        continue
    index_with_mapmakers.at[index, 'earliest_date_estimate'] = int(earliest_date)

    if not latest_date.isdecimal():
        # if the latest date is not a number (but the earliest date was a number), then it's probably a typo
        logging.warning(f"Latest date is not a number: {earliest_date} - {latest_date}")
        continue

    # Karrow generally used abbreviations for the second half of the date range, so we need to
    # convert them to full years. The number of digits written (not the numeric value) decides
    # how many trailing digits of the earliest year are replaced, so that "1605-09" becomes
    # 1609 instead of 16009.
    if len(latest_date) < len(earliest_date):
        full_latest_date = int(earliest_date[:-len(latest_date)] + latest_date)
    else:
        # already a full year
        full_latest_date = int(latest_date)

    if full_latest_date > int(earliest_date):
        index_with_mapmakers.at[index, 'latest_date_estimate'] = full_latest_date
    else:
        # an end year that is not after the start year is probably a typo (or a range that
        # crosses a century boundary); leave the latest estimate empty for manual review
        logging.warning(f"Latest date is less than earliest date: {earliest_date} - {latest_date}")

# report out the number of range dates that we found
range_dates_count = index_with_mapmakers['earliest_date_estimate'].notnull().sum() - single_year_dates_count

logging.info(f'Found {range_dates_count} range dates out of {total_data_count} total data points')
logging.info(f"Now, {single_year_dates_count+range_dates_count} out of {total_data_count} total data points have date estimates")

index_with_mapmakers

In [None]:
# Collect (and display) the rows that still have no earliest-date estimate
empty_dates = index_with_mapmakers.loc[index_with_mapmakers['earliest_date_estimate'].isna()]
empty_dates

In [None]:
# handle the dates in the format "1548 or 49"

for index, date in empty_dates['date'].items():
    # a missing date comes back from pandas as NaN (a float), which cannot be searched for "or"
    if pd.isna(date):
        continue
    date = str(date)

    # check if there's an "or" in the date
    if 'or' not in date:
        continue

    pieces = date.split('or')
    earliest_date = pieces[0].strip()
    latest_date = pieces[1].strip()

    # "or" also occurs inside ordinary words (e.g. "before 1550"); such dates are not in the
    # "<year> or <year>" format, so leave them untouched instead of failing in int()
    if not (earliest_date.isdecimal() and latest_date.isdecimal()):
        continue

    index_with_mapmakers.at[index, 'earliest_date_estimate'] = int(earliest_date)

    # The second year is usually abbreviated. The number of digits written (not the numeric
    # value) decides how many trailing digits of the first year are replaced, so that
    # "1605 or 06" becomes 1606 instead of 16006.
    if len(latest_date) < len(earliest_date):
        full_latest_date = int(earliest_date[:-len(latest_date)] + latest_date)
    else:
        # already a full year
        full_latest_date = int(latest_date)

    if full_latest_date > int(earliest_date):
        index_with_mapmakers.at[index, 'latest_date_estimate'] = full_latest_date
    else:
        # the alternative year is not after the first one; leave the latest estimate empty
        logging.warning(f"Unable to parse date that contains 'or': {date}")

# report out the number of "or" dates that we found
or_dates_count = index_with_mapmakers['earliest_date_estimate'].notnull().sum() - single_year_dates_count - range_dates_count

logging.info(f'Found {or_dates_count} "or" dates out of {total_data_count} total data points')
logging.info(f"Now, {single_year_dates_count+range_dates_count+or_dates_count} out of {total_data_count} total data points have date estimates")

index_with_mapmakers