In [1]:
import pandas as pd
import numpy as np
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.notebook import tqdm
import seaborn as sns
from statsmodels.stats.weightstats import DescrStatsW
from matplotlib import pyplot as plt
import glob

In [2]:
# Gather the path of every FVS output database matching the pattern below,
# then show how many were found.
db_pattern = "../models/usfia/fvs_outputs/*.db"
dbs = glob.glob(db_pattern)
len(dbs)

400221

In [3]:
def scrape_sqlite(path_to_db, **kwargs):
    """
    Load every table of a sqlite database into its own Pandas DataFrame.

    The table names are looked up in `sqlite_master`, then each table is read
    in full with Pandas `read_sql_table`.

    Args:
        path_to_db (str): location of the sqlite database file
        **kwargs: forwarded unchanged to every `read_sql_table` call

    Returns:
        res (dict): maps each table name to the DataFrame holding its contents.
    """
    # Pandas receives the database as a URL string rather than an open connection.
    db_url = f"sqlite:///{path_to_db}"

    table_listing = pd.read_sql(
        "SELECT name FROM sqlite_master WHERE type='table'", db_url
    )
    table_names = table_listing["name"].values

    frames = {}
    for table_name in table_names:
        frames[table_name] = pd.read_sql_table(table_name, db_url, **kwargs)
    return frames

In [4]:
# Tables to keep from each database; any other table a scrape returns is ignored.
TABLES = ["FVS_Cases", "FVS_CalibStats", "FVS_Summary"]
# One list of DataFrames per kept table, filled in completion order.
dfs = {t: [] for t in TABLES}
# Databases that could not be scraped, mapped to the exception that was raised.
failed = {}

with ThreadPoolExecutor(128) as executor:
    with tqdm(total=len(dbs)) as pbar:
        # Key every future by its database path so a failure can be traced
        # back to the file that caused it.
        jobs = {executor.submit(scrape_sqlite, db): db for db in dbs}
        for job in as_completed(jobs):
            # Popping also removes this cell's own reference to the finished
            # future, so `jobs` does not pin every result (including tables
            # outside TABLES) for the rest of the session.
            db = jobs.pop(job)
            try:
                res = job.result()
            except Exception as exc:
                # `result()` re-raises whatever the worker raised; record it and
                # keep going instead of aborting the whole load on one bad file.
                failed[db] = exc
                res = None
            if res is not None:
                for t in TABLES:
                    if t in res:
                        dfs[t].append(res[t])
            pbar.update()

if failed:
    print(f"{len(failed)} of {len(dbs)} databases could not be read; see `failed`.")

  0%|          | 0/400221 [00:00<?, ?it/s]

In [10]:
# Stack all collected FVS_CalibStats frames into one table (rows renumbered
# from 0), write it to CSV without the index column, and preview the first rows.
calib_parts = dfs["FVS_CalibStats"]
calib = pd.concat(calib_parts, ignore_index=True)
calib.to_csv(
    "../data/processed/usfia_fvs_calibstats.csv",
    header=True,
    index=False,
)
calib.head()

Unnamed: 0,CaseID,StandID,TreeSize,SpeciesFVS,SpeciesPLANTS,SpeciesFIA,NumTrees,ScaleFactor,StdErrRatio,WeightToInput,ReadCorMult
0,973503c6-e24e-4e02-852f-71583ea0b042,00131999080201179000431,LG,TS,NYBI,694,6,1.266709,0.362172,0.842223,1.324072
1,973503c6-e24e-4e02-852f-71583ea0b042,00131999080201179000431,SM,TS,NYBI,694,5,0.593122,,,0.593122
2,b9257b30-152d-4c89-879c-6c14fdab3f62,00272007130402061203771,LG,QA,POTR5,746,10,0.856385,0.835733,0.711943,0.804316
3,b9257b30-152d-4c89-879c-6c14fdab3f62,00272007130402061203771,LG,PB,BEPA,375,5,1.010776,1.109627,0.447708,1.024229
4,57916b6a-0732-413d-8afc-7de38e3fd2fd,00272016150301137252081,LG,QA,POTR5,746,59,1.175822,0.594758,0.886386,1.200488


In [11]:
# Stack all collected FVS_Cases frames into one table (rows renumbered
# from 0), write it to CSV without the index column, and preview the first rows.
cases_parts = dfs["FVS_Cases"]
cases = pd.concat(cases_parts, ignore_index=True)
cases.to_csv(
    "../data/processed/usfia_fvs_cases.csv",
    header=True,
    index=False,
)
cases.head()

Unnamed: 0,CaseID,Stand_CN,StandID,MgmtID,RunTitle,KeywordFile,SamplingWt,Variant,Version,RV,Groups,RunDateTime
0,973503c6-e24e-4e02-852f-71583ea0b042,238814838010854,00131999080201179000431,NONE,,/mnt/data/github/calibrate_fvs/models/usfia/ke...,4377.50928,SN,FS2023.3,20230728,,2023-11-20-22:11:13
1,6bb8e698-c8e1-4809-bad7-96964592637f,70334058010538,00232003050505021031941,NONE,,/mnt/data/github/calibrate_fvs/models/usfia/ke...,5646.60547,NE,FS2023.3,20230728,,2023-11-21-05:25:34
2,a2cec0d0-e9ac-47d3-92cc-5a3290de6b95,51658703020004,00352005030102057874171,NONE,,/mnt/data/github/calibrate_fvs/models/usfia/ke...,5968.20654,CR,FS2023.3,20230728,,2023-11-21-11:57:39
3,2784b588-125e-4898-a6ca-63c2b75790ed,722469697290487,00552019090502099204271,NONE,,/mnt/data/github/calibrate_fvs/models/usfia/ke...,2810.7439,LS,FS2023.3,20230728,,2023-11-22-01:10:48
4,b9257b30-152d-4c89-879c-6c14fdab3f62,156002713010661,00272007130402061203771,NONE,,/mnt/data/github/calibrate_fvs/models/usfia/ke...,2203.0481,LS,FS2023.3,20230728,,2023-11-21-08:30:59


In [12]:
# Stack all collected FVS_Summary frames into one table (rows renumbered
# from 0), write it to CSV without the index column, and preview the first rows.
summ_parts = dfs["FVS_Summary"]
summ = pd.concat(summ_parts, ignore_index=True)
summ.to_csv(
    "../data/processed/usfia_fvs_summary.csv",
    header=True,
    index=False,
)
summ.head()

Unnamed: 0,CaseID,StandID,Year,Age,Tpa,BA,SDI,CCF,TopHt,QMD,...,ATCCF,ATTopHt,ATQMD,PrdLen,Acc,Mort,MAI,ForTyp,SizeCls,StkCls
0,a2cec0d0-e9ac-47d3-92cc-5a3290de6b95,00352005030102057874171,2010,19,2299,20,71,25,8,1.271596,...,25,8,1.271596,10,2,0,0.0,926,3,4
1,a2cec0d0-e9ac-47d3-92cc-5a3290de6b95,00352005030102057874171,2020,29,2269,45,137,49,10,1.901678,...,49,10,1.901678,0,0,0,0.0,926,3,3
2,f08436b5-5949-477f-9633-bcd619ff03ad,00062009050906037737111,2009,0,0,0,0,0,0,0.0,...,0,0,0.0,10,0,0,0.0,999,5,5
3,f08436b5-5949-477f-9633-bcd619ff03ad,00062009050906037737111,2019,10,0,0,0,0,0,0.0,...,0,0,0.0,0,0,0,0.0,999,5,5
4,5bd108c2-9bed-4133-bbd5-886dff6c40ee,00022016030201261523652,2017,257,3029,142,278,213,64,2.931952,...,213,64,2.931952,10,26,11,7.980545,270,1,3
