In [38]:
# Extract function: load the most recently modified .xlsx file from DATA_DIR

In [39]:
# 👇 Set up sys.path so the config package can be imported
import sys
import os

# Project root that contains the `config` package imported below.
# The REAL_ESTATE_PROJECT_ROOT environment variable overrides the default so the
# notebook can run on another machine without editing this cell; when it is
# unset, the original hard-coded location is used unchanged.
PROJECT_ROOT = os.path.expanduser(
    os.environ.get(
        "REAL_ESTATE_PROJECT_ROOT",
        "/Users/borismartinez/Documents/real-estate",
    )
)
# Guarded so re-running the cell does not stack duplicate sys.path entries.
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

# 👇 Now import from config
from config.paths import DATA_DIR, FILENAME_DATE_FORMAT, DEFAULT_EXTRACT_LABEL, PARQUET_ENABLED

# 👇 Dependencies for the extract loader defined below
import pandas as pd
from datetime import datetime

def load_latest_xlsx_by_modified_date(dtype=str) -> pd.DataFrame:
    """Load the most recently modified .xlsx workbook in DATA_DIR.

    The newest workbook (by filesystem modification time) is renamed to
    ``<date>_<DEFAULT_EXTRACT_LABEL>.xlsx``, where ``<date>`` is the file's
    modification date formatted with FILENAME_DATE_FORMAT. It is then read with
    ``pd.read_excel`` and an ``extract_date`` column holding that date is added.
    When PARQUET_ENABLED is truthy, a Parquet copy is written next to the
    workbook.

    Args:
        dtype: Forwarded to ``pd.read_excel``. The default ``str`` asks pandas
            to read every column as strings.

    Returns:
        The loaded DataFrame, including the added ``extract_date`` column.

    Raises:
        FileNotFoundError: If DATA_DIR contains no usable .xlsx workbook.
    """
    candidates = []
    for entry in os.listdir(DATA_DIR):
        # Excel creates "~$<name>.xlsx" owner/lock files while a workbook is
        # open. They are not real workbooks and would otherwise win the
        # "most recently modified" comparison, so skip them.
        if entry.startswith("~$") or not entry.lower().endswith(".xlsx"):
            continue
        full_path = os.path.join(DATA_DIR, entry)
        if os.path.isfile(full_path):
            candidates.append(full_path)
    if not candidates:
        raise FileNotFoundError(f"No .xlsx files found in {DATA_DIR}")

    latest_file = max(candidates, key=os.path.getmtime)
    extract_date = datetime.fromtimestamp(os.path.getmtime(latest_file)).date()
    date_str = extract_date.strftime(FILENAME_DATE_FORMAT)

    clean_filename = f"{date_str}_{DEFAULT_EXTRACT_LABEL}.xlsx"
    clean_path = os.path.join(DATA_DIR, clean_filename)

    original_name = os.path.basename(latest_file)
    if original_name != clean_filename:
        # os.replace overwrites an existing target on every platform, whereas
        # os.rename overwrites on POSIX but raises on Windows. An older
        # same-day extract with the clean name is therefore superseded by the
        # newer file.
        os.replace(latest_file, clean_path)
        print(f"Renamed '{original_name}' → '{clean_filename}'")

    df = pd.read_excel(clean_path, dtype=dtype)
    df["extract_date"] = extract_date

    if PARQUET_ENABLED:
        # Imported lazily so pyarrow is only required when Parquet output is on.
        import pyarrow as pa
        import pyarrow.parquet as pq

        # splitext swaps only the final extension; str.replace would also
        # rewrite any ".xlsx" that appears in a directory name.
        parquet_path = os.path.splitext(clean_path)[0] + ".parquet"
        table = pa.Table.from_pandas(df, preserve_index=False, safe=False)
        pq.write_table(table, parquet_path)
        print(f"Saved Parquet version: {parquet_path}")

    return df

In [None]:
# df = load_latest_xlsx_by_modified_date()
# df.head()

  warn("Workbook contains no default style, apply openpyxl's default")


Saved Parquet version: /Users/borismartinez/Documents/real-estate/data/20250924_extract.parquet


Unnamed: 0,Address,Unit #,City,State,Zip,County,FIPS,APN,Owner Occupied,Owner 1 First Name,...,Pre-FC Auction Opening Bid,Pre-FC Trustee-Attorney Name,Pre-FC Trustee Ref Number,Pre-FC Attorney Case Number,Pre-FC Trustee-Attorney Address,Pre-FC Borrower 1 Name,Marketing Lists,Date Added to List,Method of Add,extract_date
0,4941 River Glen Dr #7,7.0,Las Vegas,NV,89103,Clark,32003,163-24-612-007,No,Nonnette,...,,,,,,,1,2025-09-24 17:48:16.043593,Manual,2025-09-24
1,7885 W Flamingo Rd #1050,1050.0,Las Vegas,NV,89147,Clark,32003,163-21-516-094,No,Wendv,...,,,,,,,1,2025-09-24 17:47:55.032821,Manual,2025-09-24
2,3021 Aloha Ave,,Las Vegas,NV,89121,Clark,32003,161-08-715-009,No,,...,,,,,,,1,2025-09-24 17:49:20.075063,Manual,2025-09-24
3,3037 Aloha Ave,,Las Vegas,NV,89121,Clark,32003,161-08-715-013,No,,...,,,,,,,1,2025-09-24 17:49:20.075063,Manual,2025-09-24
4,4670 Monterey Cir #1,1.0,Las Vegas,NV,89169,Clark,32003,162-21-714-089,No,Bonita,...,,,,,,,1,2025-09-24 17:47:55.032821,Manual,2025-09-24


In [None]:
# for col in df.columns:
#     print(f"{col} TEXT,")

Address TEXT,
Unit # TEXT,
City TEXT,
State TEXT,
Zip TEXT,
County TEXT,
FIPS TEXT,
APN TEXT,
Owner Occupied TEXT,
Owner 1 First Name TEXT,
Owner 1 Last Name TEXT,
Owner 2 First Name TEXT,
Owner 2 Last Name TEXT,
Mailing Care of Name TEXT,
Mailing Address TEXT,
Mailing Unit # TEXT,
Mailing City TEXT,
Mailing State TEXT,
Mailing Zip TEXT,
Mailing Zip+4 TEXT,
Mailing County TEXT,
Property Type TEXT,
Bedrooms TEXT,
Total Bathrooms TEXT,
Building Sqft TEXT,
Lot Size Sqft TEXT,
Effective Year Built TEXT,
Vacant TEXT,
HOA Present TEXT,
Total Assessed Value TEXT,
Assessed Improvement Value TEXT,
Improvement to Tax Value % TEXT,
Last Sale Date TEXT,
Last Sale Recording Date TEXT,
Last Sale Amount TEXT,
Last Cash Buyer TEXT,
Prior Sale Date TEXT,
Prior Sale Amount TEXT,
Prior Sale Cash Buyer TEXT,
Loan 1 Date TEXT,
Loan 1 Balance TEXT,
Loan 1 Type TEXT,
Loan 1 Lender TEXT,
Loan 1 Rate TEXT,
Loan 1 Rate Type TEXT,
Loan 2 Date TEXT,
Loan 2 Balance TEXT,
Loan 2 Type TEXT,
Loan 2 Lender TEXT,
Loan 