# Extraction Playground
February 15, 2024

A notebook to figure out the right regular expressions to extract information from the Index to Place and Date of Publication. 

Goal: Output a CSV in which each row is a map and the columns include the place and the date of publication.

In [None]:
import re
import pandas as pd

In [None]:
# Read the whole index into one string.
# The text contains non-ASCII characters (em dashes, accented letters), so the
# encoding is given explicitly instead of relying on the platform default.
# NOTE(review): assumes the file is saved as UTF-8 - confirm.
with open("index_to_place_and_date.txt", encoding="utf-8") as f:
    index = f.read()

In [None]:
# Show the raw text that was read from the index file.
index

## Extract each city 

Goal 1: Use regular expressions (?) to pull out each city

In [None]:
# A city entry looks like:
# — Antwerp:
#
# The dash in front of the city name can be one of two look-alike characters:
# the em dash "—" or the CJK character "一". Each pattern captures the city
# name, i.e. the text between the dash and the first colon that follows it.
# The match is lazy (.*?) so it stops at the first colon, not the last one on the line.

city_regex_short_dash = r"— ?(.*?):"
city_regex_long_dash = r"一 ?(.*?):"

# One capture group shared by both dash variants, so re.findall returns a flat
# list of city names rather than a list of (group1, group2) tuples.
city_regex = r"[—一] ?(.*?):"

In [None]:
# A short excerpt in the same layout as the index, used to try out the regex:
# a country name on its own line, then one "— City:" entry per city whose body
# is a ";"-separated list of "year, id, id, ..." groups. Entries wrap across
# lines, sometimes in the middle of an id (e.g. "49/" at the end of a line).
sample_string = """Austria
— Güssing: 1583, 26/E
— Salzburg: 1551, 68/1; 1650, 68/1.6
— Vienna: 1494, 24/А; 1508, 24/B; 1520, 7/1;
1522, 24/C; 1541, 28/G.3; 1545?, 49/455;
1546, 40/4; 1547, 40/4.1, 40/6, 40/7; 1549,
40/4.2; 1552, 40/5.3; 1556, 49/D, 49/7; 1557,
40/4.8; 1558, 49/11, 49/12, 49/14, 49/G, 49/
H, 65/B, 65/D; by 1561, 49/J, 49/15-26;
1566, 65/1, 65/2; 1571, 65/3; 1692, 16/H.1;
1746, 65/1.3
Germany
— Augsburg: 1518, 9/A; 1521?, 64/C; 1530, 3/A;
1547, 16/E; 1621, 79/8.9; 1684, 8/2.7a, 8/
2.7b; 1776, 54/1.4; 1784, 54/1.4; 1866, 8/2.5
— Berlin: 1766, 8/2.13
— Celle: 1583, 55/A; 1593, 55/7""" 
sample_string

In [None]:
# Collect every match of the city pattern in the sample text.
re.compile(city_regex).findall(sample_string)

In [None]:

# Placeholder: no pattern has been written yet.
# NOTE(review): presumably meant to match the entries inside one country - confirm or remove.
within_country_regex = ""

# Attempt 2: Mostly string splitting

Strategy for parsing
1. Split the string based on the countries 
2. Grab the country, remove it from the rest of the string
3. Split by cities 
4. For each city, grab the name and remove it from the rest of the string
5. For each city, split into years. Each year's group of entries ends with a semicolon, unless it is the last group for that city
6. For each year, strip whitespace (since entries are separated by commas) and split on commas
7. For each entry, grab the mapmaker (first part), map/book, and then any notes (like editions or alternate years?)


In [None]:
# Re-read the index so this attempt starts from the untouched text.
# The text contains non-ASCII characters (em dashes, accented letters), so the
# encoding is given explicitly instead of relying on the platform default.
# NOTE(review): assumes the file is saved as UTF-8 - confirm.
with open("index_to_place_and_date.txt", encoding="utf-8") as f:
    index = f.read()

In [None]:
# Cut the index into chunks wherever a blank line occurs (one chunk per country).
blank_line = "\n\n"
by_country = index.split(blank_line)
by_country

In [None]:
# Parse the per-country chunks in by_country into one row per first-edition map.
#
# Each chunk is expected to have the country name on its first line, followed
# by city entries introduced by a dash:
#
#     Austria
#     — Salzburg: 1551, 68/1; 1650, 68/1.6
#
# A city's body is a ";"-separated list of "date, entry, entry, ..." groups and
# each entry is "mapmaker_id/object_id".
#
# Results: index_data (column name -> list of values, filled row by row) and
# index_df, the same data as a pandas DataFrame.
#
# The country, city and date are peeled off the front of each piece with
# str.partition. str.replace would remove *every* occurrence of the name, so a
# city containing its country's name, or an entry repeating the date, would be
# silently corrupted.
# columns are: mapmaker, map, country, city, date
index_data = {"mapmaker_id": [], "first_ed_map": [], "country": [], "city": [], "date": []}

# Only grab first edition maps (denoted with a bare 1-3 digit number) for now,
# either on their own or as a "start-end" range of such numbers.
# Known gap: a first edition map that is followed by a year in brackets does
# not match either pattern and is skipped.
first_ed_map_regex = "[0-9]{1,3}"
range_of_maps_regex = "([0-9]{1,3})-([0-9]{1,3})"


def _add_row(mapmaker_id, first_ed_map, country, city, date):
    """Append one record to the column lists of index_data."""
    index_data["mapmaker_id"].append(mapmaker_id)
    index_data["first_ed_map"].append(first_ed_map)
    index_data["country"].append(country)
    index_data["city"].append(city)
    index_data["date"].append(date)


def _expand_map_range(first, last):
    """Return every map number from first to last (inclusive) as strings.

    first and last are digit strings. last may be abbreviated to its trailing
    digits, e.g. "12-7" means 12-17 and "120-3" means 120-123; the missing
    leading digits are copied from first.

    Returns None when the range cannot be interpreted: last is not greater
    than first and is not shorter either, or the expanded end is still below
    the start.
    """
    if int(last) <= int(first):
        missing_digits = len(first) - len(last)
        if missing_digits <= 0:
            return None
        last = first[:missing_digits] + last
        if int(last) < int(first):
            return None
    return [str(number) for number in range(int(first), int(last) + 1)]


for country_data in by_country:
    country_data = country_data.strip()
    if not country_data:
        # nothing but whitespace, e.g. extra blank lines between countries
        continue

    # the country name is the first line; the rest is the data for its cities
    country_name, _, country_data = country_data.partition("\n")
    country_name = country_name.strip()
    country_data = country_data.strip()

    # split the data by city; either of the two dash-like characters starts a city
    by_city = re.split("—|一", country_data)

    for city_data in by_city[1:]:  # skip the first element (text before the first dash)
        # the city name is everything up to the first colon
        city_name, _, city_data = city_data.partition(":")
        city_name = city_name.strip()
        print("Next city")
        print(city_name)

        # split the data by date
        by_date = city_data.split(";")

        for date_data in by_date:
            # the date is everything up to the first comma
            date, _, date_data = date_data.partition(",")
            date = date.strip()
            print(date)

            # split the data by entry
            by_entry = date_data.split(",")

            for entry_data in by_entry:
                # an entry can be wrapped across two lines, so drop the newline
                entry_data = entry_data.strip().replace("\n", "")
                print(entry_data)

                id_parts = entry_data.split("/")
                if len(id_parts) < 2:
                    print("Error: object id in incorrect format")
                    print(entry_data)
                    print()
                    continue
                mapmaker_id = id_parts[0]
                object_id = id_parts[1]

                # a single first edition map
                if re.fullmatch(first_ed_map_regex, object_id):
                    _add_row(mapmaker_id, object_id, country_name, city_name, date)
                    continue

                # a range of first edition maps, e.g. 2-4, 13-28, 12-7, 120-3
                range_match = re.fullmatch(range_of_maps_regex, object_id)
                if range_match:
                    map_range = list(range_match.groups())
                    map_numbers = _expand_map_range(map_range[0], map_range[1])
                    if map_numbers is None:
                        print("Error: range of maps not handled")
                        print(map_range)
                        print(entry_data)
                        print()
                        continue
                    for map_number in map_numbers:
                        _add_row(mapmaker_id, map_number, country_name, city_name, date)


index_df = pd.DataFrame(index_data)



   
    

In [None]:
# Show the parsed table before it is written to disk.
index_df

In [None]:
# Write the parsed table to disk, leaving out the DataFrame's row index.
output_path = "index_to_place_and_date_v1.csv"
index_df.to_csv(output_path, index=False)