#### VMT

In [None]:
import pdfplumber
import pandas as pd
import re
from pathlib import Path

In [None]:
# Folder where the *tvt.pdf files are.
input_dir = Path(".")
# Name of the combined CSV written after all PDFs have been processed.
output_csv = "FHWA_VMT_All.csv"

# Month abbreviation -> month number. Keys are lower-case, so lower-case the
# text before looking it up. "sept" is an alternate spelling of "sep".
month_map = {
    "jan": 1, "feb": 2, "mar": 3, "apr": 4, "may": 5, "jun": 6,
    "jul": 7, "aug": 8, "sep": 9, "sept": 9, "oct": 10, "nov": 11, "dec": 12,
}

def extract_month_year_from_filename(filename: str):
    """
    Extract the year and month encoded in a filename such as
    25augtvt.pdf, 24SeptTVT.pdf or 23JANTVT.pdf.

    The name must start with a two-digit year, which is read as 20YY. The
    month is the first month abbreviation found anywhere in the lower-cased
    name, translated to a number through ``month_map``.

    Returns a ``(year, month)`` tuple:
      * ``(None, None)`` when the name does not start with two digits,
      * ``(year, None)`` when no month abbreviation is found,
      * ``(year, month)`` otherwise.
    """
    stem = filename.lower()

    # year: the two leading digits, interpreted as 20YY
    year_match = re.match(r"(\d{2})", stem)
    if not year_match:
        return None, None
    year = 2000 + int(year_match.group(1))

    # month: "sept" is listed before "sep" because regex alternation takes the
    # first alternative that matches; the other order would never capture the
    # four-letter spelling.
    month_match = re.search(
        r"(jan|feb|mar|apr|may|jun|jul|aug|sept|sep|oct|nov|dec)", stem
    )
    if not month_match:
        return year, None  # month unknown; the caller receives None for it

    return year, month_map.get(month_match.group(1))


def extract_vmt_from_pdf(pdf_path: Path) -> pd.DataFrame:
    """
    Extract one row per state from the table on the sixth page of a PDF.

    The page text is split into whitespace-normalised lines. Scanning starts
    at the first line containing "Connecticut" and stops before the first
    line, at or after that start, containing "TOTALS". Every scanned line that
    looks like ``<letters> <integer or "-"> <number> ...`` yields a row.

    Parameters
    ----------
    pdf_path : Path
        PDF file to read. Year and month are taken from ``pdf_path.name`` via
        ``extract_month_year_from_filename``.

    Returns
    -------
    pd.DataFrame
        One row per matched line with the keys ``year``, ``month``, ``state``
        and ``vmt`` (a float, or None when the captured figure is not
        numeric). An empty DataFrame is returned, after printing a message,
        when the PDF cannot be opened, has fewer than six pages, yields no
        text, or contains no "Connecticut" line.
    """
    year, month = extract_month_year_from_filename(pdf_path.name)

    try:
        with pdfplumber.open(pdf_path) as pdf:
            # The table is read from the sixth page (0-based index 5).
            if len(pdf.pages) <= 5:
                print(f"Warning: {pdf_path.name} has fewer than 6 pages.")
                return pd.DataFrame()

            text = pdf.pages[5].extract_text()
    except Exception as e:
        # Best-effort batch job: report the unreadable file and carry on.
        print(f"Error opening {pdf_path.name}: {e}")
        return pd.DataFrame()

    if not text:
        print(f"Warning: No text extracted from {pdf_path.name} page 6.")
        return pd.DataFrame()

    # clean lines: drop blank ones and collapse runs of whitespace
    lines = [
        re.sub(r"\s+", " ", raw.strip())
        for raw in text.split("\n")
        if raw.strip()
    ]

    # find state table boundaries
    start = next(
        (i for i, line in enumerate(lines) if "Connecticut" in line), None
    )
    if start is None:
        print(f"Warning: Could not find state start in {pdf_path.name}")
        return pd.DataFrame()

    # Look for the end marker only from the start line onwards, so a "TOTALS"
    # appearing earlier on the page cannot produce an empty slice.
    end = next(
        (i for i in range(start, len(lines)) if "TOTALS" in lines[i]),
        len(lines),
    )

    # only extract state + VMT
    # group 1: name (letters and spaces)
    # group 2: an integer or "-", skipped (presumably a station count; confirm
    #          against the table layout)
    # group 3: digits with thousands separators, taken as the VMT figure
    row_re = re.compile(
        r"([A-Za-z\s]+)\s+"
        r"(\d+|-)\s+"
        r"([\d,]+)\s+"
    )

    rows = []
    for line in lines[start:end]:
        m = row_re.match(line)
        if not m:
            continue

        figure = m.group(3).replace(",", "")
        try:
            vmt = float(figure)
        except ValueError:
            # e.g. the group captured only commas
            vmt = None

        rows.append({
            "year": year,
            "month": month,
            "state": m.group(1).strip(),
            "vmt": vmt,
        })

    return pd.DataFrame(rows)


# loop for all files
# Path.glob("*tvt.pdf") is case-sensitive on POSIX systems, so a name such as
# "24SeptTVT.pdf" would be skipped. Compare the lower-cased name instead.
pdf_paths = sorted(
    p for p in input_dir.glob("*") if p.name.lower().endswith("tvt.pdf")
)

all_dfs = []
for pdf_path in pdf_paths:
    df = extract_vmt_from_pdf(pdf_path)
    if df.empty:
        print(f"Skipped {pdf_path.name} (no data extracted)")
        continue
    all_dfs.append(df)
    print(f"Extracted {pdf_path.name} ({len(df)} rows)")

# combine and save results
if all_dfs:
    final = pd.concat(all_dfs, ignore_index=True)
    final.to_csv(output_csv, index=False)
    print(f"\nSaved combined file: {output_csv} ({len(final)} rows)")
else:
    print("No data extracted from any PDF.")
