# Simple Entity Recognition

We are going to extract a few classes of information and filter our results down to 100 records.

In [1]:
from dotenv import load_dotenv
import logging
import pandas as pd

# Set up logging: configure the root logger to emit INFO and above, then create
# a logger named after this module for use in the rest of the notebook.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load settings via python-dotenv (presumably from a local .env file — the path
# is not given here, so the library default applies).
load_dotenv()

True

In [2]:
# load some data
import html
import re
import pandas as pd
import numpy as np

from typing import Optional

# Read the tab-separated conservation-area table and use its 'id' column as the
# DataFrame index. Later cells read the area_name and area_info columns from it.
mo_conservation = pd.read_csv('assets/mo_conservation.tsv.gz', sep='\t', index_col='id')

# Regex patterns that identify size-related information, tried in order.
# The integer part is `\d+` (optionally followed by ",ddd" groups) rather than
# `\d{1,3}`: with the narrower form a number written without thousands
# separators was silently truncated ("1234 acre" -> 234, "acres 15000" -> 150).
size_patterns = [
    # "Acres 1,234.5" -- the number follows the word
    re.compile(r'acres\s+(\d+(?:,\d{3})*(?:\.\d+)?)', re.IGNORECASE),
    # "1,234.5 acre(s)" / "1,234.5-acre" -- the number precedes the word
    re.compile(r'(\d+(?:,\d{3})*(?:\.\d+)?)[ -]acre', re.IGNORECASE),
]

def match_acres_area(info: str) -> Optional[float]:
    """
    Extract an acreage figure from free-form area text.

    The patterns in ``size_patterns`` are tried in order and the first one that
    matches anywhere in the text wins.

    Args:
        info (str): Descriptive text for an area. Non-string values (for example
            the NaN pandas uses for an empty cell) are treated as "no size found".

    Returns:
        Optional[float]: The matched number with thousands separators removed,
        or None when no pattern matches.
    """
    # Missing cells arrive as float NaN / None; regex search would raise on them.
    if not isinstance(info, str):
        return None

    for pattern in size_patterns:
        match = pattern.search(info)
        if match:
            return float(match.group(1).replace(',', ''))
    return None

# Keywords checked against an area name, in priority order. The first keyword
# contained in the name becomes the location type, so the order is significant:
# e.g. "Forestry Office" and "Regional Office" must be tested before the bare
# "Office", and "Access" before "Conservation Area".
_LOCATION_KEYWORDS = (
    "Management Lands",
    "Towersite",
    "Fish Hatchery",
    "Forestry Office",
    "Nature Center",
    "Natural Area",
    "Wildlife Area",
    "Park",
    "Pond",
    "Headquarters",
    "Regional Office",
    "Range",
    "Office",
    "Discovery Center",
    "Education Area",
    "Education Center",
    "Service Center",
    "Prairie",
    "Reservation",
    "Blue Hole",
    "Reservoir",
    "Forest",
    "Bend",
    "Nursery",
    "Preserve",
    "Tract",
    "Access",
    "Conservation Area",
    "Lake",
    "Memorial",
)

def match_location_type(title: str) -> str:
    """
    Classify an area by the first known keyword that occurs in its name.

    Matching is a case-sensitive substring test against each entry of
    ``_LOCATION_KEYWORDS`` in order.

    Args:
        title (str): The area name to classify.

    Returns:
        str: The matching keyword, or 'Unknown' when none of them occurs.
    """
    return next(
        (keyword for keyword in _LOCATION_KEYWORDS if keyword in title),
        "Unknown",
    )

def fix_html(value: str) -> str:
    """
    Replace HTML character references in a name with the characters they encode.

    Args:
        value (str): Text that may contain references such as ``&#039;`` or ``&amp;``.

    Returns:
        str: The text with every character reference decoded.
    """
    # e.g. "O&#039;Fallon" becomes "O'Fallon"
    decoded = html.unescape(value)
    return decoded

# Fix our names
mo_conservation['area_name'] = mo_conservation['area_name'].apply(fix_html)

# Apply the functions to extract size and location-type information
mo_conservation['acres'] = mo_conservation['area_info'].apply(match_acres_area)
mo_conservation['location_type'] = mo_conservation['area_name'].apply(match_location_type)

# Report every row the extractors could not handle so the patterns can be extended.
# The masks are computed once and kept out of the f-strings: reusing the enclosing
# quote character inside a replacement field is a SyntaxError before Python 3.12.
missing_acres = mo_conservation['acres'].isna()
print(f"{missing_acres.sum()} missing acres")
for r in mo_conservation[missing_acres]['area_info']:
    print(r)

unknown_location = mo_conservation['location_type'] == "Unknown"
print(f"{unknown_location.sum()} unknown locations")
for r in mo_conservation[unknown_location]['area_name']:
    print(r)

# Last expression of the cell: shows how many rows fall under each location type
mo_conservation.value_counts('location_type')

0 missing acres
0 unknown locations


location_type
Conservation Area    357
Access               250
Lake                 134
Park                  71
Prairie               50
Towersite             29
Management Lands      24
Wildlife Area         21
Natural Area          20
Bend                  19
Pond                  17
Forest                10
Range                  8
Regional Office        6
Fish Hatchery          6
Headquarters           5
Nature Center          4
Forestry Office        4
Reservoir              4
Office                 3
Education Center       3
Memorial               2
Reservation            2
Blue Hole              2
Education Area         1
Discovery Center       1
Service Center         1
Tract                  1
Preserve               1
Name: count, dtype: int64

In [4]:
# Location type -> sample-group number, listed group by group.
_SAMPLE_GROUP_BY_TYPE = {
    # group 0
    'Access': 0,
    'Bend': 0,
    'Conservation Area': 0,
    'Forest': 0,
    'Lake': 0,
    'Natural Area': 0,
    'Park': 0,
    'Pond': 0,
    'Prairie': 0,
    'Reservation': 0,
    'Reservoir': 0,
    'Tract': 0,
    # group 1
    'Forestry Office': 1,
    'Headquarters': 1,
    'Office': 1,
    'Regional Office': 1,
    'Service Center': 1,
    # group 2
    'Blue Hole': 2,
    'Management Lands': 2,
    # group 3
    'Discovery Center': 3,
    'Education Area': 3,
    'Education Center': 3,
    'Fish Hatchery': 3,
    'Memorial': 3,
    'Nature Center': 3,
    'Range': 3,
    # group 4
    'Preserve': 4,
    'Wildlife Area': 4,
    # group 5
    'Towersite': 5,
}

def sample_groups(location_type: str) -> int:
    """
    Look up the sample-group number assigned to a location type.

    Args:
        location_type (str): A location type label such as 'Towersite' or 'Lake'.

    Returns:
        int: The group number (0-5) for a known type, or -1 for any label that
        is not in the table.
    """
    try:
        return _SAMPLE_GROUP_BY_TYPE[location_type]
    except KeyError:
        return -1

# Tag every row with its numeric sample group
mo_conservation['sample_group'] = mo_conservation['location_type'].apply(sample_groups)

# Display names for the groups that get a subcategory of their own
group_names = {
    1: 'Administration',
    2: 'Natural Areas',
    3: 'Education',
    4: 'Wildlife',
    5: 'Towersites',
}

def _subcategory_for(group_id):
    # Any group number without a display name is reported as 'Unclassified'
    return group_names.get(group_id, 'Unclassified')

mo_conservation['category'] = "Missouri Conservation Areas"
mo_conservation['subcategory'] = mo_conservation['sample_group'].apply(_subcategory_for)

# Rows with a positive group number make up the sampling pool
sample_mask = mo_conservation['sample_group'] > 0
pool_size = sample_mask.sum()
print(f"{pool_size} in sample pool")
mo_conservation.value_counts('sample_group')

121 in sample pool


sample_group
0    935
5     29
2     26
3     25
4     22
1     19
Name: count, dtype: int64

In [5]:
# Build the sampling pool: rows whose sample group is positive, renumbered from 0
# and copied so later writes do not touch mo_conservation.
pool_rows = mo_conservation[mo_conservation['sample_group'] > 0]
sample_pool = pool_rows.reset_index(drop=True).copy()

# Selection flag, filled in by the sampling step (0 = not selected)
sample_pool['selected'] = 0
sample_pool.head()

Unnamed: 0,area_id,area_name,area_info,acres,location_type,sample_group,category,subcategory,selected
0,runge-conservation-nature-center,Runge Conservation Nature Center,Runge Conservation Nature Center Runge Conser...,0.0,Nature Center,3,Missouri Conservation Areas,Education,0
1,conservation-commission-headquarters,Conservation Commission Headquarters,Conservation Commission Headquarters Conserva...,153.7,Headquarters,1,Missouri Conservation Areas,Administration,0
2,proctor-towersite,Proctor Towersite,Proctor Towersite Proctor Towersite Area Map1...,115.0,Towersite,5,Missouri Conservation Areas,Towersites,0
3,camdenton-conservation-service-center,Camdenton Conservation Service Center,Camdenton Conservation Service Center Camdent...,45.5,Service Center,1,Missouri Conservation Areas,Administration,0
4,freeburg-towersite,Freeburg Towersite,Freeburg Towersite Freeburg Towersite Visitor...,11.8,Towersite,5,Missouri Conservation Areas,Towersites,0


In [7]:
import numpy as np

pd.set_option('display.max_rows', 5000)

# Stratified random sample of the pool: aim for `per_group` rows from each of the
# five sample groups (1-5). When a group cannot supply its quota, the shortfall is
# added to the next group's quota so the overall total is preserved where possible.
per_group = 20

# Reset the flag so that re-running this cell starts from a clean selection
sample_pool['selected'] = 0

need = per_group
for i in range(1, 6):
    candidates = sample_pool[sample_pool['sample_group'] == i].index.values

    # Take as many as are wanted, capped by what this group actually has
    n = min(need, len(candidates))
    if n < need:
        print(f"Not enough candidates ({n}) for group {i}, overflowing to next group")

    # Carry the unmet part of the *current* quota forward. Measuring the shortfall
    # against `need` (not the base quota) keeps an already carried-over deficit
    # from being dropped when a later group is short as well.
    need = per_group + (need - n)

    # Draw n distinct row labels for this group.
    # NOTE(review): no seed is set in this cell, so the selection (and the file
    # written below) presumably changes on every run — confirm that is intended.
    selected_indices = np.random.choice(candidates, n, replace=False)

    # Flag the chosen rows
    sample_pool.loc[selected_indices, 'selected'] = 1

# Keep only the selected rows, in a stable display order, with the columns to export
result_df = (
    sample_pool[sample_pool['selected'] == 1]
    .sort_values(['category', 'subcategory', 'location_type', 'area_name'])
    .reset_index(drop=True)
    [['area_id', 'area_name', 'category', 'subcategory', 'location_type', 'area_info']]
    .copy()
)
result_df.to_csv('assets/random_100_areas.tsv', sep='\t', index_label='id')
result_df


Not enough candidates (19) for group 1, overflowing to next group


Unnamed: 0,area_id,area_name,category,subcategory,location_type,area_info
0,bolivar-forestry-office,Bolivar Forestry Office,Missouri Conservation Areas,Administration,Forestry Office,Bolivar Forestry Office Bolivar Forestry Offi...
1,branson-forestry-office,Branson Forestry Office,Missouri Conservation Areas,Administration,Forestry Office,Branson Forestry Office Branson Forestry Offi...
2,lebanon-forestry-office,Lebanon Forestry Office,Missouri Conservation Areas,Administration,Forestry Office,Lebanon Forestry Office Lebanon Forestry Offi...
3,new-madrid-forestry-office,New Madrid Forestry Office,Missouri Conservation Areas,Administration,Forestry Office,New Madrid Forestry Office New Madrid Forestr...
4,clearwater-district-headquarters,Clearwater District Headquarters,Missouri Conservation Areas,Administration,Headquarters,Clearwater District Headquarters Clearwater D...
5,conservation-commission-headquarters,Conservation Commission Headquarters,Missouri Conservation Areas,Administration,Headquarters,Conservation Commission Headquarters Conserva...
6,gasconade-district-headquarters,Gasconade District Headquarters,Missouri Conservation Areas,Administration,Headquarters,Gasconade District Headquarters Gasconade Dis...
7,neosho-district-headquarters,Neosho District Headquarters,Missouri Conservation Areas,Administration,Headquarters,Neosho District Headquarters Neosho District ...
8,perryville-district-headquarters,Perryville District Headquarters,Missouri Conservation Areas,Administration,Headquarters,Perryville District Headquarters Perryville D...
9,alton-forestry-sub-office,Alton Forestry Sub-Office,Missouri Conservation Areas,Administration,Office,Alton Forestry Sub-Office Alton Forestry Sub-...


In [7]:
pd.set_option('display.max_rows', 5000)

# Rank every 'Lake' row by acreage, largest first, keeping only the export columns
lake_columns = ['area_id', 'area_name', 'location_type', 'acres', 'area_info']
is_lake = mo_conservation['location_type'] == 'Lake'
lakes = (
    mo_conservation[is_lake]
    .sort_values('acres', ascending=False)
    .reset_index(drop=True)[lake_columns]
)

# Write the first 100 rows, then read the file back so `lakes` holds exactly
# what was saved
lakes[:100].to_csv('assets/top_100_lakes.tsv', sep='\t', index_label='id')
lakes = pd.read_csv('assets/top_100_lakes.tsv', sep='\t', index_col='id')
lakes

Unnamed: 0_level_0,area_id,area_name,location_type,acres,area_info
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,lake-jacomo-jackson-county,Lake Jacomo (Jackson County),Lake,970.0,Lake Jacomo (Jackson County) Lake Jacomo (Jac...
1,jackson-county-longview-lake,Jackson County (Longview Lake),Lake,953.0,Jackson County (Longview Lake) Jackson County...
2,fellows-lake-springfield-city-utilities,Fellows Lake (Springfield City Utilities),Lake,820.0,Fellows Lake (Springfield City Utilities) Fel...
3,hazel-creek-lake-kirksville,Hazel Creek Lake (Kirksville),Lake,530.0,Hazel Creek Lake (Kirksville) Hazel Creek Lak...
4,hazel-hill-lake,Hazel Hill Lake,Lake,502.0,Hazel Hill Lake Hazel Hill Lake The Conservat...
5,holden-city-lake,Holden City Lake,Lake,380.0,Holden City Lake Holden City Lake This area o...
6,nodaway-county-community-lake,Nodaway County Community Lake,Lake,320.0,Nodaway County Community Lake Nodaway County ...
7,sugar-creek-lake-moberly,Sugar Creek Lake (Moberly),Lake,320.0,Sugar Creek Lake (Moberly) Sugar Creek Lake (...
8,perry-county-community-lake,Perry County Community Lake,Lake,309.5,Perry County Community Lake Perry County Comm...
9,harrison-county-lake,Harrison County Lake,Lake,280.0,Harrison County Lake Harrison County Lake Har...
