In [1]:
import pandas as pd
import sqlite3

In [2]:
# Define DB
conn = sqlite3.connect(r'C:\Users\bbrown\Documents\python_scripts\db-build-trial\db_trial_v1\ga_harvest.db')
c = conn.cursor()

In [3]:
# Create a mock-up GIS dataset
data = {"INTAKE_ID": "BBB241127",
        "UNIQUE_ID": "BBB241127TH01",
        "STATE": "GA",
        "COUNTY": "Heard",
        "COMP": None,
        "TRACT": "River",
        "STAND": "1",
        "CLIENT_ACR": 36,
        "GA_ACRES": 50,
        "EST_AGE": 23,
        "SPECIES": "Loblolly Pine",
        "ORIGIN": "Planted",
        "PRE_BA": 100,
        "PRE_TPA": 120,
        "PRE_GTA": 112,
        "PLN_BA": 50,
        "PLN_TPA": 50,
        "PLN_GTA": 50,
        "GT_CURRENT": 400,
        "GT_FUTURE": 0,
        "RP_REQUEST": 9,
        "COMMENTS": None
        }

df = pd.DataFrame(data, index=[0])

In [4]:
df

Unnamed: 0,INTAKE_ID,UNIQUE_ID,STATE,COUNTY,COMP,TRACT,STAND,CLIENT_ACR,GA_ACRES,EST_AGE,...,PRE_BA,PRE_TPA,PRE_GTA,PLN_BA,PLN_TPA,PLN_GTA,GT_CURRENT,GT_FUTURE,RP_REQUEST,COMMENTS
0,BBB241127,BBB241127TH01,GA,Heard,,River,1,36,50,23,...,100,120,112,50,50,50,400,0,9,


In [5]:
df.columns

Index(['INTAKE_ID', 'UNIQUE_ID', 'STATE', 'COUNTY', 'COMP', 'TRACT', 'STAND',
       'CLIENT_ACR', 'GA_ACRES', 'EST_AGE', 'SPECIES', 'ORIGIN', 'PRE_BA',
       'PRE_TPA', 'PRE_GTA', 'PLN_BA', 'PLN_TPA', 'PLN_GTA', 'GT_CURRENT',
       'GT_FUTURE', 'RP_REQUEST', 'COMMENTS'],
      dtype='object')

In [6]:
client_df = df[
    ['UNIQUE_ID',
     'TRACT',
     'COMP',
     'STAND',
     'CLIENT_ACR',
     'EST_AGE',
     'COUNTY',
     'STATE',
     'SPECIES',
     'ORIGIN',
     'PRE_BA',
     'PRE_TPA',
     'PRE_GTA',
     'PLN_BA',
     'PLN_TPA',
     'PLN_GTA',
     'GT_CURRENT',
     'GT_FUTURE',
     'RP_REQUEST'
     ]
].copy()

client_df.rename(columns={
    'UNIQUE_ID': 'client_uid',
    'COMP': 'cmp',
    'EST_AGE': 'age',
    'GT_CURRENT': 'client_gt',
    'GT_FUTURE': 'future_gt',
    'RP_REQUEST': 'reporting_period'
}, inplace=True
)

client_df.columns = [x.lower().replace(' ', '_') for x in client_df.columns]

In [7]:
client_df

Unnamed: 0,client_uid,tract,cmp,stand,client_acr,age,county,state,species,origin,pre_ba,pre_tpa,pre_gta,pln_ba,pln_tpa,pln_gta,client_gt,future_gt,reporting_period
0,BBB241127TH01,River,,1,36,23,Heard,GA,Loblolly Pine,Planted,100,120,112,50,50,50,400,0,9


In [13]:
# normalize the species, origin, tract, county, and state

# first pull the associated tables from the DB
for table in ['species', 'origin', 'tract', 'county', 'state']:
    mapping_df = pd.read_sql(f"SELECT * from {table}", conn)
    mapping_map = dict(zip(mapping_df.iloc[:, 0], mapping_df.iloc[:, 1]))
    
    # Check to make sure the value in client_df is appropriate (should be string for all of these)
    if client_df[f'{table}'].dtype != str:
        error = f"Dtype of column {table} is incorrect"
        print(error)
    else:
        # lowercase the value from the client_df
        client_df[f'{table}'] = client_df[f'{table}'].str.lower()
        
        # See if the client_df value is in the mapping_map
        
        client_df[f"{table}"] = client_df[f"{table}"].map(mapping_map)
    
    




Dtype of column species is incorrect
Dtype of column origin is incorrect
Dtype of column tract is incorrect
Dtype of column county is incorrect
Dtype of column state is incorrect


In [9]:
client_df

Unnamed: 0,client_uid,tract,cmp,stand,client_acr,age,county,state,species,origin,pre_ba,pre_tpa,pre_gta,pln_ba,pln_tpa,pln_gta,client_gt,future_gt,reporting_period
0,BBB241127TH01,10,,1,36,23,4,0,,,100,120,112,50,50,50,400,0,9


In [None]:
mapping_map

## Thoughts

- I want to make sure the data going into the database is not already there
- I want to make sure that the data in the data frame is of the correct type
- I want to make sure that the data in the database is protected and not goign to get lost.

In [None]:
client_df.to_sql('client_intake', conn, if_exists='append', index=False)