<a href="https://colab.research.google.com/github/Akshaykumarmundrathi/Oklahoma-Well-Locations/blob/main/oklahoma_well_location_map.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# ===============================================
# 🔹 Oklahoma Well Locations Extraction + Map (v2)
# ===============================================
# Run once (Colab only):
!pip install PyMuPDF pandas openpyxl tqdm folium

# ------------------------------------------------
# STEP 1 — Imports
# ------------------------------------------------
import os, re, fitz, pandas as pd
from tqdm import tqdm
import folium
from folium.plugins import MarkerCluster

# ------------------------------------------------



In [3]:

# STEP 2 — Input PDF and output filenames
# ------------------------------------------------
pdf_path = "/content/PDFs_Well_Location_20250523_140405.pdf"     # your input PDF
out_csv  = "oklahoma_wells_text_extract_with_latlon.csv"
out_html = "oklahoma_wells_map.html"

# ------------------------------------------------
# STEP 3 — Helper: extract lat/lon from randymajors link
# ------------------------------------------------
def extract_lat_lon_from_link_text(link_text):
    """
    Extract latitude/longitude from a randymajors.org URL string.
    Handles various encodings like y=, y;, etc.
    Input:  full link string
    Output: (lat, lon) floats or (None, None)
    """
    pattern = re.compile(
        r"x\s*=\s*([-+]?\d*\.\d+|\d+)[^y]*y[;=]\s*([-+]?\d*\.\d+|\d+)",
        re.IGNORECASE
    )
    link_text = link_text.replace("\n", "")
    m = pattern.search(link_text)
    if not m:
        return None, None
    try:
        lon = float(m.group(1))
        lat = float(m.group(2))
        return lat, lon
    except Exception:
        return None, None

# ------------------------------------------------
# STEP 4 — Read entire PDF text
# ------------------------------------------------
try:
    doc = fitz.open(pdf_path)
except Exception as e:
    raise SystemExit(f"❌ Cannot open PDF: {pdf_path}\n{e}")

all_text = "\n".join(page.get_text("text") for page in doc)
doc.close()

# ------------------------------------------------
# STEP 5 — Split into individual report blocks
# ------------------------------------------------
raw_blocks = re.split(r"(?i)report\s*for\s*[:\-]?\s*", all_text)
blocks = [b.strip() for b in raw_blocks if b.strip()]

print(f"📄 Found {len(blocks)} report blocks in PDF")

# ------------------------------------------------
# STEP 6 — Parse each report for metadata and coords
# ------------------------------------------------
records = []

for block in tqdm(blocks, desc="Processing reports"):
    lines = [ln.strip() for ln in block.split("\n") if ln.strip()]
    report_name = lines[0] if lines else None
    county = section = township = rng = link = lat = lon = None

    # Look for County/TRS info
    for i, line in enumerate(lines):
        if "county" in line.lower():
            # Match patterns with flexible spacing
            county_match   = re.search(r"County\s*:\s*([A-Za-z \-]+)", line, re.IGNORECASE)
            section_match  = re.search(r"Section\s*:\s*([0-9]{1,3})", line, re.IGNORECASE)
            township_match = re.search(r"Township\s*:\s*([0-9]+)\s*([NS])", line, re.IGNORECASE)
            range_match    = re.search(r"Range\s*:\s*([0-9]+)\s*([EW])", line, re.IGNORECASE)

            if county_match:
                county = county_match.group(1).strip()
            if section_match:
                section = section_match.group(1).strip()
            if township_match:
                township = township_match.group(1).strip() + " " + township_match.group(2)
            if range_match:
                rng = range_match.group(1).strip() + " " + range_match.group(2)

            # Try next line for link
            if i + 1 < len(lines):
                next_line = lines[i + 1]
                if "randymajors.org" in next_line:
                    link = next_line.strip()
                    lat, lon = extract_lat_lon_from_link_text(next_line)
            break

    # If link not found yet, scan entire block
    if not link:
        link_search = re.search(r"https?://[^\s]*randymajors\.org[^\s]+", block)
        if link_search:
            link = link_search.group(0)
            lat, lon = extract_lat_lon_from_link_text(link)

    records.append({
        "Report Name": report_name,
        "County": county,
        "Section": section,
        "Township": township,
        "Range": rng,
        "Link": link,
        "Latitude": lat,
        "Longitude": lon
    })

# ------------------------------------------------
# STEP 7 — Diagnostic info
# ------------------------------------------------
print(f"\nRecords collected: {len(records)}")
if len(records) > 0:
    print(f"Keys in first record: {list(records[0].keys())}")
else:
    print("⚠️ No report blocks were parsed — check PDF text content.")

# ------------------------------------------------
# STEP 8 — Save extracted data
# ------------------------------------------------
df = pd.DataFrame(records)

lat_cols = [col for col in df.columns if col.lower() in ("latitude", "lat")]
lon_cols = [col for col in df.columns if col.lower() in ("longitude", "lon")]

if lat_cols and lon_cols:
    coord_count = df[lat_cols + lon_cols].dropna().shape[0]
else:
    coord_count = 0

df.to_csv(out_csv, index=False)
print(f"✅ Extracted {len(df)} reports.")
print(f"✅ {coord_count} have coordinates.")
print("Columns:", df.columns.tolist())
print(df.head(10))

# ------------------------------------------------
# STEP 9 — Create interactive Folium map
# ------------------------------------------------
oklahoma_center = [35.4676, -97.5164]
m = folium.Map(location=oklahoma_center, zoom_start=7, tiles="OpenStreetMap")
marker_cluster = MarkerCluster().add_to(m)

for _, row in df.iterrows():
    if pd.notna(row.get("Latitude")) and pd.notna(row.get("Longitude")):
        popup_html = (
            f"<b>{row.get('Report Name')}</b><br>"
            f"County: {row.get('County')}<br>"
            f"Section: {row.get('Section')}<br>"
            f"Township: {row.get('Township')}<br>"
            f"Range: {row.get('Range')}<br>"
            f"Lat: {row.get('Latitude')}<br>"
            f"Lon: {row.get('Longitude')}<br>"
            f"<a href='{row.get('Link')}' target='_blank'>Open in RandyMajors</a>"
        )
        folium.Marker(
            location=[row["Latitude"], row["Longitude"]],
            popup=popup_html,
            icon=folium.Icon(color="red", icon="tint", prefix="fa")
        ).add_to(marker_cluster)

m.save(out_html)
print(f"✅ Map saved as '{out_html}'.  Open this file in a browser to view markers.")


📄 Found 3002 report blocks in PDF


Processing reports: 100%|██████████| 3002/3002 [00:00<00:00, 27255.14it/s]


Records collected: 3002
Keys in first record: ['Report Name', 'County', 'Section', 'Township', 'Range', 'Link', 'Latitude', 'Longitude']





✅ Extracted 3002 reports.
✅ 0 have coordinates.
Columns: ['Report Name', 'County', 'Section', 'Township', 'Range', 'Link', 'Latitude', 'Longitude']
                Report Name County Section Township Range  \
0  00000000_BUNCH 1_1400935   None    None     None  None   
1  00000000_No Data_1030447   None    None     None  None   
2  00000000_No Data_1220780   None    None     None  None   
3  00000000_No Data_1220867   None    None     None  None   
4  00000000_No Data_1220868   None    None     None  None   
5  00000000_No Data_1220958   None    None     None  None   
6  00000000_No Data_1220959   None    None     None  None   
7  00000000_No Data_1221148   None    None     None  None   
8  00000000_No Data_1221162   None    None     None  None   
9  00000000_No Data_1221203   None    None     None  None   

                                                Link Latitude Longitude  
0                                               None     None      None  
1  https://www.randymajors.org/t

In [None]:
# ========================================
# 🔹 Oklahoma Well Locations Extraction + Interactive Map (Improved)
# ========================================

# Step 0: Install dependencies (run once)
!pip install PyMuPDF pytesseract pdf2image pandas openpyxl requests pyproj opencv-python-headless tqdm folium

import os
import re
import fitz  # PyMuPDF
import pandas as pd
from tqdm import tqdm
import folium
from folium.plugins import MarkerCluster

In [6]:


# Input/output settings
pdf_path = "/content/PDFs_Well_Location_20250523_140405.pdf"
out_csv = "oklahoma_wells_text_extract_with_latlon.csv"
out_html = "oklahoma_wells_map.html"

# Robust regex to extract latitude and longitude from randymajors link (handles y= and y;=)
def extract_lat_lon_from_link_text(link_text):
    pattern = re.compile(
        r"x=([-+]?\d*\.\d+|\d+)[;&]?\s*y;?=([-+]?\d*\.\d+|\d+)",
        re.IGNORECASE,
    )
    link_text = link_text.replace('\n', '')
    m = pattern.search(link_text)
    if not m:
        return None, None
    try:
        lon = float(m.group(1))
        lat = float(m.group(2))
        return lat, lon
    except Exception:
        return None, None

# Read entire PDF text
try:
    doc = fitz.open(pdf_path)
except Exception as e:
    raise SystemExit(f"❌ Cannot open PDF: {pdf_path}\n{e}")

all_text = "\n".join(page.get_text("text") for page in doc)
doc.close()

# Split into reports by 'Report for:' header
raw_blocks = re.split(r"Report for:\s*", all_text, flags=re.IGNORECASE)
blocks = [b.strip() for b in raw_blocks if b.strip()]

records = []
for block in tqdm(blocks, desc="Processing reports"):
    lines = [ln.strip() for ln in block.splitlines() if ln.strip()]
    report_name = lines[0] if lines else None
    county = section = township = rng = link = lat = lon = None

    # Flexible field extraction to handle different spacings and colon placements
    for i, line in enumerate(lines):
        if "County" in line and "Section" in line and "Township" in line and "Range" in line:
            county_match = re.search(r"County\s*[:]? *([^,]+)", line, re.IGNORECASE)
            section_match = re.search(r"Section\s*[:]? *([0-9]{1,3})", line, re.IGNORECASE)
            township_match = re.search(r"Township\s*[:]? *([\d]+[ ]?[NS])", line, re.IGNORECASE)
            range_match = re.search(r"Range\s*[:]? *([\d]+[ ]?[EW])", line, re.IGNORECASE)

            county = county_match.group(1).strip() if county_match else None
            section = section_match.group(1).strip() if section_match else None
            township = township_match.group(1).strip() if township_match else None
            rng = range_match.group(1).strip() if range_match else None

        # Flexible link detection anywhere following these lines
        if "randymajors.org" in line.lower():
            lat_tmp, lon_tmp = extract_lat_lon_from_link_text(line)
            if lat_tmp and lon_tmp:
                lat, lon = lat_tmp, lon_tmp
                link = line

    records.append({
        "Report Name": report_name,
        "County": county,
        "Section": section,
        "Township": township,
        "Range": rng,
        "Link": link,
        "Latitude": lat,
        "Longitude": lon
    })

df = pd.DataFrame(records)

# Crash-proof diagnostics: check columns before printing
required_cols = ["Report Name", "County", "Section", "Township", "Range", "Latitude", "Longitude"]
available_cols = [col for col in required_cols if col in df.columns]

print(f"✅ Extracted {len(df)} reports.")
print(f"{df[['Latitude','Longitude']].dropna().shape[0]} reports have coordinates.")
print("Sample data columns present:", available_cols)
print(df[available_cols].head())

# Save CSV
df.to_csv(out_csv, index=False)

# Create interactive map with marker clustering
oklahoma_center = [35.4676, -97.5164]
m = folium.Map(location=oklahoma_center, zoom_start=7, tiles="OpenStreetMap")
marker_cluster = MarkerCluster().add_to(m)

for _, row in df.iterrows():
    if pd.notna(row["Latitude"]) and pd.notna(row["Longitude"]):
        popup_text = (
            f"<b>{row['Report Name']}</b><br>"
            f"County: {row['County']}<br>"
            f"Section: {row['Section']}<br>"
            f"Township: {row['Township']}<br>"
            f"Range: {row['Range']}<br>"
            f"Lat: {row['Latitude']}<br>"
            f"Lon: {row['Longitude']}"
        )
        folium.Marker(
            location=[row["Latitude"], row["Longitude"]],
            popup=popup_text,
            icon=folium.Icon(color="red", icon="tint", prefix="fa"),
        ).add_to(marker_cluster)

m.save(out_html)
print(f"✅ Map saved as '{out_html}'. Open it in a browser to view.")



Processing reports: 100%|██████████| 3002/3002 [00:00<00:00, 51299.05it/s]


✅ Extracted 3002 reports.
1155 reports have coordinates.
Sample data columns present: ['Report Name', 'County', 'Section', 'Township', 'Range', 'Latitude', 'Longitude']
                Report Name           County Section Township Range  \
0  00000000_BUNCH 1_1400935             None    None     None  None   
1  00000000_No Data_1030447  Okmulgee County      11     14 N  12 E   
2  00000000_No Data_1220780             None    None     None  None   
3  00000000_No Data_1220867     Tulsa County      34     19 N  12 E   
4  00000000_No Data_1220868  Okmulgee County      21     15 N  12 E   

    Latitude  Longitude  
0        NaN        NaN  
1  35.701756 -96.004664  
2        NaN        NaN  
3  36.076980 -96.026723  
4  35.767142 -96.050405  
✅ Map saved as 'oklahoma_wells_map.html'. Open it in a browser to view.
