In [None]:
#Black Teacher Archive - School Finder
##Inputs historical school listings and identifies current locations
##https://curiosity.lib.harvard.edu/black-teacher-archive

In [None]:
#GEONAMES Search Script
##Takes schools table and returns lat/long + GeoNames name
###Test Input @: 
###https://docs.google.com/spreadsheets/d/1VHF2QYGxQdMeTR1kGhoncK21sidmrk6Mis0h5VY4AiE/edit#gid=2140551881

#Libraries
import pandas as pd
import requests

# Running tally of GeoNames API calls made during this session
request_counter = 0


def increment_request_counter():
    """Add one to the module-level ``request_counter``."""
    global request_counter
    request_counter = request_counter + 1

def is_county_in_hierarchy(username, geonameId, target_county, timeout=30):
    """Check whether a GeoNames place has the given county in its hierarchy.

    Queries the GeoNames ``hierarchyJSON`` endpoint for ``geonameId`` and
    looks for ``target_county`` as a case-insensitive substring of the name
    of any place in the response.

    Args:
        username: GeoNames account name sent with the request.
        geonameId: Identifier of the place whose hierarchy is fetched.
        target_county: County name to look for. Blank or non-string values
            (e.g. NaN from an empty CSV cell) never match.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        True if a place in the hierarchy contains the county name; False
        otherwise, including when the request fails or cannot be parsed.
    """
    # An empty string is a substring of every name and NaN has no .lower();
    # reject both before spending an API request on them.
    if not isinstance(target_county, str) or not target_county.strip():
        return False
    county = target_county.strip().lower()

    hierarchy_url = "http://api.geonames.org/hierarchyJSON"
    params = {
        'geonameId': geonameId,
        'username': username
    }
    increment_request_counter()  # Increment the request counter
    try:
        response = requests.get(hierarchy_url, params=params, timeout=timeout)
        if response.status_code != 200:
            return False
        hierarchy_data = response.json()
    except (requests.RequestException, ValueError) as error:
        # A single failed lookup should not abort a long batch run.
        print(f"GeoNames hierarchy lookup failed for {geonameId}: {error}")
        return False

    # `or ''` guards against a place whose 'name' is present but null.
    return any(
        county in (place.get('name') or '').lower()
        for place in hierarchy_data.get('geonames', [])
    )

def get_school_info(username, school, target_county, fuzzy, state_code='TN'):
    """Look up a school in GeoNames and return its coordinates and metadata.

    The first word of ``school`` is combined in turn with the rest of the
    name, "School", "College", "Academy" and the rest of the name plus
    "(historical)". Each query is sent to the GeoNames ``searchJSON``
    endpoint, and the first result that has feature code ``SCH`` and lies in
    ``target_county`` (per ``is_county_in_hierarchy``) is returned.

    Args:
        username: GeoNames account name sent with every request.
        school: School name as listed in the input table.
        target_county: County the school must belong to.
        fuzzy: Value passed through as the GeoNames ``fuzzy`` parameter.
        state_code: Value sent as ``adminCode1``; defaults to ``'TN'``.

    Returns:
        ``(lat, lng, name, fcode, fcl)`` of the first match, or a tuple of
        five ``None`` values when nothing matches or ``school`` is blank or
        not a string.
    """
    not_found = (None, None, None, None, None)

    # Blank or NaN names cannot be split into a base word; nothing to search.
    if not isinstance(school, str) or not school.split():
        return not_found

    # Extract the first word of the school's name for broader search
    base_school_name, *extension_words = school.split()
    school_name_extension = " ".join(extension_words)

    # Educational institution types to include in the search
    institution_types = [
        school_name_extension,
        "School",
        "College",
        "Academy",
        school_name_extension + " (historical)",
    ]
    # Normalise whitespace (single-word names would otherwise leave stray
    # spaces) and drop repeated queries while keeping the original order, so
    # a name like "X School" is not searched twice.
    queries = list(dict.fromkeys(
        " ".join(f"{base_school_name} {institution_type}".split())
        for institution_type in institution_types
    ))

    search_url = "http://api.geonames.org/searchJSON"
    for query in queries:
        print("Searching ", query, "\n")
        search_params = {
            'q': query,
            'country': 'US',
            'adminCode1': state_code,
            'username': username,
            'fuzzy': fuzzy,
            'maxRows': 100
        }
        increment_request_counter()
        try:
            search_response = requests.get(search_url, params=search_params, timeout=30)
            if search_response.status_code != 200:
                continue
            search_data = search_response.json()
        except (requests.RequestException, ValueError) as error:
            # Skip this query instead of aborting the whole batch.
            print(f"GeoNames search failed for {query}: {error}")
            continue

        for result in search_data.get('geonames', []):
            # Test the feature code first: it is free, whereas the county
            # check costs one extra API request per candidate.
            if result.get('fcode', '') != 'SCH':
                continue
            if is_county_in_hierarchy(username, result['geonameId'], target_county):
                return (
                    result['lat'],
                    result['lng'],
                    result.get('name', ''),
                    result.get('fcode', ''),
                    result.get('fcl', ''),
                )

    return not_found

# Load your CSV file into a DataFrame
csv_file_path = '...csv'  # Update with the actual path
df = pd.read_csv(csv_file_path)

geonames_username = '...'  # Update with your actual GeoNames username
fuzzy_value = 0.7  # Adjust based on your needs

# Create every output column up front. Otherwise a run with zero matches
# writes a CSV without them, and the HMDB cell's row['GeoNames Name'] lookup
# fails with a KeyError.
for column in ('Latitude', 'Longitude'):
    if column not in df.columns:
        df[column] = float('nan')
for column in ('GeoNames Name', 'Feature Code', 'Feature Class'):
    # Force object dtype so strings can be written even when the column was
    # read back from a previous run as all-NaN floats.
    df[column] = df[column].astype(object) if column in df.columns else None

# Query GeoNames once per row and record the first matching school
for index, row in df.iterrows():
    school = row['SCHOOL']
    county = row['COUNTY']

    lat, lng, geoname_name, fcode, fcl = get_school_info(geonames_username, school, county, fuzzy_value)
    # Compare against None explicitly; truthiness would conflate a missing
    # result with an empty or zero coordinate.
    if lat is not None and lng is not None:
        # Store coordinates as numbers rather than the raw response values
        df.at[index, 'Latitude'] = float(lat)
        df.at[index, 'Longitude'] = float(lng)
        df.at[index, 'GeoNames Name'] = geoname_name
        df.at[index, 'Feature Code'] = fcode
        df.at[index, 'Feature Class'] = fcl
    else:
        print(f"No valid results found for {school} in {county}")

# Save the updated DataFrame back to CSV
updated_csv_file_path = '...csv'  # Update with the actual path
df.to_csv(updated_csv_file_path, index=False)
print(f"Updated CSV file saved to {updated_csv_file_path}.")

# Print the total number of requests made to the GeoNames API
print(f"Total requests made to GeoNames API: {request_counter}")


In [None]:
#Historical Marker Database Search
##Searches HMDB(.org) for remaining (i.e., missing) schools and returns lat/long

#Libraries
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re

# Function to parse latitude and longitude from text
def parse_lat_long(text):
    """Convert a "latitude, longitude" string in degree notation to decimals.

    Each half may contain degrees only, degrees and minutes, or degrees,
    minutes and seconds, optionally marked with ``°``, ``′`` and ``″``.
    A value is negated when its half contains ``S`` (latitude) or ``W``
    (longitude), or starts with a minus sign.

    Args:
        text: String such as ``"36° 9.912′ N, 86° 46.482′ W"``.

    Returns:
        ``(lat, lng)`` as signed decimal-degree floats.

    Raises:
        ValueError: If there is no comma separating the two halves, or a
            half does not contain one to three numbers.
    """
    # Replace the unit marks with spaces rather than deleting them, so that
    # "36°9.912′" stays two numbers instead of fusing into "369.912".
    for mark in ('°', '′', '″'):
        text = text.replace(mark, ' ')

    parts = text.split(',')
    if len(parts) < 2:
        raise ValueError(f"Expected 'latitude, longitude' but got: {text!r}")

    def to_decimal(component, negative_hemisphere):
        # Unsigned magnitudes only; the sign is applied once at the end so it
        # covers the minutes and seconds as well as the degrees.
        numbers = [float(n) for n in re.findall(r"\d*\.\d+|\d+", component)]
        if not 1 <= len(numbers) <= 3:
            raise ValueError(f"Cannot parse coordinate: {component!r}")
        degrees, minutes, seconds = numbers + [0.0] * (3 - len(numbers))
        value = degrees + (minutes / 60) + (seconds / 3600)
        if negative_hemisphere in component or component.startswith('-'):
            value = -value
        return value

    lat = to_decimal(parts[0].strip(), 'S')
    lng = to_decimal(parts[1].strip(), 'W')

    return lat, lng

# Function to extract latitude and longitude from the HMDB page
def extract_lat_long(driver):
    """Read the coordinates shown after the "Location." label on the page.

    Waits up to 10 seconds for the label span inside ``#mainblock``, takes
    the text of the node that follows it, and searches that text for a
    degrees/minutes coordinate pair, which is handed to ``parse_lat_long``.

    Returns:
        ``(lat, lng)`` from ``parse_lat_long``, or ``(None, None)`` when no
        coordinate pair is found or any step raises.
    """
    label_xpath = '//*[@id="mainblock"]/article/span[contains(text(), "Location.")]'
    coordinate_pattern = r'([-\d.]+°\s*[\d.]+′\s*[NS]),\s*([-\d.]+°\s*[\d.]+′\s*[EW])'

    try:
        label_present = EC.presence_of_element_located((By.XPATH, label_xpath))
        label = WebDriverWait(driver, 10).until(label_present)

        neighbour = label.find_element(By.XPATH, 'following-sibling::node()')
        location_text = neighbour.text.strip()
        print(f"Location text: {location_text}")

        found = re.search(coordinate_pattern, location_text)
        if not found:
            return None, None

        lat_lng_text = found.group()
        print(f"Lat/Long text: {lat_lng_text}")
        print('\n')
        lat, lng = parse_lat_long(lat_lng_text)
        return lat, lng
    except Exception:
        # Best effort: any failure (timeout, missing node, parse error) is
        # reported to the caller as "no coordinates".
        return None, None

# Function to perform the HMDB search and extract lat/long
def perform_hmdb_search(driver, keyword, country, state, county):
    """Run a filtered keyword search on the page currently open in ``driver``.

    Opens the filter panel, types ``keyword`` into the keyword box and
    ``country``/``state``/``county`` into the matching filter inputs, and
    clicks the search button. After a short pause it scans the ``<h1>`` and
    ``<h2>`` elements for one whose text contains ``keyword``
    (case-insensitive).

    Args:
        driver: Selenium WebDriver; this function does not navigate, so the
            caller must already have the search page loaded.
        keyword: Text to search for and to match against the headers.
        country: Value typed into the ``FilterCountry`` input.
        state: Value typed into the ``FilterState`` input.
        county: Value typed into the ``FilterCounty`` input.

    Returns:
        ``(header_text, lat, lng)`` for the first matching header, where
        ``lat``/``lng`` come from ``extract_lat_long`` and may be ``None``;
        ``(None, None, None)`` if no header matches or ``keyword`` is blank.
    """
    # An empty keyword is a substring of every header, so the first header on
    # the page would be reported as a match. Treat blank input as "no match".
    if not isinstance(keyword, str) or not keyword.strip():
        return None, None, None

    wait = WebDriverWait(driver, 10)

    show_filters = wait.until(EC.element_to_be_clickable((By.XPATH, "//a[contains(text(), 'show filters')]")))
    show_filters.click()

    keyword_search_label = wait.until(EC.presence_of_element_located((By.XPATH, "//h4[text()='Keyword Search']")))
    search_box = keyword_search_label.find_element(By.XPATH, "./following-sibling::div//input[@type='text']")
    search_box.send_keys(keyword)

    country_input = wait.until(EC.visibility_of_element_located((By.NAME, "FilterCountry")))
    country_input.send_keys(country)

    state_input = wait.until(EC.visibility_of_element_located((By.NAME, "FilterState")))
    state_input.send_keys(state)

    county_input = wait.until(EC.visibility_of_element_located((By.NAME, "FilterCounty")))
    county_input.send_keys(county)

    search_button = wait.until(EC.element_to_be_clickable((By.ID, "TheButton1")))
    search_button.click()

    time.sleep(2)  # Adjust the sleep time if necessary
    headers = driver.find_elements(By.TAG_NAME, "h1")
    headers += driver.find_elements(By.TAG_NAME, "h2")

    wanted = keyword.lower()
    for header in headers:
        if wanted in header.text.lower():
            print(f"Found match: {header.text}")  # Debugging statement
            lat, lng = extract_lat_long(driver)
            return header.text, lat, lng

    return None, None, None

# Load your CSV file into a DataFrame
csv_file_path = '...csv'
df = pd.read_csv(csv_file_path)

# Defined before the browser starts so the cleanup block can always save
updated_csv_file_path = '...csv'

# Initialize the Selenium WebDriver
driver = webdriver.Safari()
try:
    driver.get('https://www.hmdb.org/search.asp')
    wait = WebDriverWait(driver, 10)

    start_time = time.time()

    for index, row in df.iterrows():
        school_name = row['SCHOOL']
        country = 'United States'
        state = row['STATE'] if 'STATE' in row else 'Tennessee'
        county = row['COUNTY']

        # Only search rows without a GeoNames entry. .get() treats a missing
        # 'GeoNames Name' column as "no entry" instead of raising KeyError.
        if not pd.isna(row.get('GeoNames Name')):
            continue

        # A blank or NaN name cannot be split into a search keyword.
        if not isinstance(school_name, str) or not school_name.strip():
            print(f"Skipping row {index}: no school name")
            continue

        # Search on the first two words of the name
        base_school_name = " ".join(school_name.split()[:2])
        header, lat, lng = perform_hmdb_search(driver, base_school_name, country, state, county)
        # Compare coordinates against None: 0.0 is a valid value but falsy.
        if header and lat is not None and lng is not None:
            df.at[index, 'HMDB Name'] = header
            df.at[index, 'Latitude'] = lat
            df.at[index, 'Longitude'] = lng
        else:
            print(f"No matches found for {school_name} in {county}, {state}")
            print('\n')

        # Reload a fresh search form for the next row
        driver.get('https://www.hmdb.org/search.asp')
        time.sleep(2)  # Give some time for the page to load properly

    elapsed_time = time.time() - start_time
    print(f"Time elapsed: {elapsed_time} seconds")
finally:
    # Runs on success and on failure: keep whatever was collected so far and
    # always close the browser, even if saving itself fails.
    try:
        df.to_csv(updated_csv_file_path, index=False)
        print(f"Updated CSV file saved to {updated_csv_file_path}")
    finally:
        driver.quit()


In [None]:
#TO DO
#Find Nearest Address:
##https://www.geonames.org/maps/us-reverse-geocoder.html#findNearestAddress
#Table image OCR?
#Process "State Association Black Schools" docs
#update github README

In [None]:
##Cook 2024
###mncook.net