In [1]:
import sys
sys.path.append("/home/srelins/docs/cy_fdm_builder/src")
from testing_helpers import *
from FDM_builder import *

## Current work in progress:

Date parser and a helper that adds a parsed-date column to source tables:

In [None]:
def add_uuid_to_table(full_table_id):
    """Rewrite a table as itself plus a leading ``uuid`` column.

    Runs ``SELECT GENERATE_UUID() AS uuid, *`` over the table and hands the
    same id to ``run_sql_query`` as the destination.

    Args:
        full_table_id: table id as plain text (no backticks); it is quoted
            here for the query and passed unquoted as the destination.

    NOTE(review): presumably ``run_sql_query`` overwrites the destination
    table - confirm. The query will fail if the table already has a ``uuid``
    column, since the result would contain two columns with that name.
    """
    # Backtick the id so the query does not depend on every path component
    # being a valid unquoted identifier.
    sql = f"""
        SELECT GENERATE_UUID() AS uuid, *
        FROM `{full_table_id}`
    """
    run_sql_query(sql, destination=full_table_id)
    
    
def parse_date(x, dayfirst=True, yearfirst=True):
    """Convert a date-like value to a ``datetime.datetime``.

    Args:
        x: value to convert. ``datetime.date`` / ``datetime.datetime``
            instances are converted directly; anything else is passed through
            ``str`` and handed to ``parse``.
        dayfirst: forwarded to ``parse`` for string inputs.
        yearfirst: forwarded to ``parse`` for string inputs.

    Returns:
        For date/datetime inputs, a ``datetime.datetime`` at midnight on the
        same calendar day. Otherwise whatever ``parse`` returns
        (presumably ``dateutil.parser.parse`` - confirm against the imports).
    """
    if isinstance(x, datetime.date):
        # Already a real date (datetime.datetime is a datetime.date subclass):
        # keep the calendar day and drop any time-of-day. Going back through
        # str() would yield an ISO "YYYY-MM-DD" string, which a day-first
        # parse can read as year-day-month.
        return datetime.datetime(x.year, x.month, x.day)
    return parse(str(x), dayfirst=dayfirst, yearfirst=yearfirst)
    
    
def _build_date_select_sql(date_source, column_types, full_table_id):
    """Build the query selecting ``uuid`` and the raw, unparsed date value."""
    if isinstance(date_source, (list, tuple)):
        if len(date_source) != 3:
            raise ValueError(
                "date_source sequences must have exactly 3 entries, "
                f"got {len(date_source)}"
            )
        parts = []
        for part in date_source:
            if part not in column_types:
                # Not a column of the table: embed it as a string literal.
                parts.append(f'"{part}"')
            elif column_types[part] == "STRING":
                parts.append(part)
            else:
                # CONCAT needs strings, so cast non-STRING columns.
                parts.append(f"CAST({part} AS STRING)")
        date_expr = "CONCAT(" + ', "-", '.join(parts) + ")"
    else:
        date_expr = date_source
    return f"""
        SELECT uuid, {date_expr} AS date
        FROM `{full_table_id}`
    """


def get_datetime_df(date_source, full_table_id):
    """Read raw date values from a table and parse them.

    Args:
        date_source: either a single column name / SQL expression, or a
            sequence of three entries that are joined with "-" in the order
            given. Entries that are not columns of the table are used as
            string literals.
        full_table_id: id of the table to read, as plain text (no backticks);
            the table must already have a ``uuid`` column.

    Returns:
        DataFrame with columns ``uuid`` and ``parsed_date``. Rows whose raw
        date is NULL get ``NaT`` rather than being passed to the parser.

    Raises:
        ValueError: if ``date_source`` is a list/tuple whose length is not 3.
    """
    table = CLIENT.get_table(full_table_id)
    column_types = {field.name: field.field_type for field in table.schema}
    sql = _build_date_select_sql(date_source, column_types, full_table_id)

    dates_df = pd.read_gbq(query=sql, project_id=PROJECT)

    # Measure lengths on the string form of non-null values only: NULLs (e.g.
    # CONCAT with a NULL part) and non-string column types have no len().
    raw_lens = dates_df["date"].dropna().astype(str).str.len()
    if len(raw_lens) > 0 and (raw_lens <= 8).all():
        print("WARNING: 2 character years are ambiguous e.g. 75 will be parsed\n"
              "as 1975 but 70 will be parsed as 2070. Consider converting year.")

    dates_df["parsed_date"] = dates_df["date"].apply(
        lambda raw: parse_date(raw) if pd.notna(raw) else pd.NaT
    )
    return dates_df[["uuid", "parsed_date"]]
            
    
def add_parsed_date_to_table(date_source, table_id, dataset_id):
    """Add a ``parsed_date`` column to ``PROJECT.dataset_id.table_id``.

    Steps: add a temporary ``uuid`` column to the table, parse the dates
    client-side, upload them to a ``tmp_dates`` table in the same dataset,
    join them back onto the source table by ``uuid``, then drop the ``uuid``
    column and delete ``tmp_dates``.

    Args:
        date_source: passed through to ``get_datetime_df``.
        table_id: name of the table to modify.
        dataset_id: dataset containing the table.

    NOTE(review): ``tmp_dates`` is a fixed name, so two runs against the same
    dataset at the same time would overwrite each other's upload.
    """
    full_table_id = f"{PROJECT}.{dataset_id}.{table_id}"
    temp_dates_id = f"{PROJECT}.{dataset_id}.tmp_dates"

    add_uuid_to_table(full_table_id)
    try:
        dates_df = get_datetime_df(date_source, full_table_id)
        dates_df.to_gbq(destination_table=temp_dates_id,
                        project_id=PROJECT,
                        table_schema=[{"name": "parsed_date", "type": "DATE"}],
                        if_exists="replace")

        join_dates_sql = f"""
            SELECT dates.parsed_date, src.*
            FROM `{full_table_id}` AS src
            LEFT JOIN `{temp_dates_id}` AS dates
            ON src.uuid = dates.uuid
        """
        run_sql_query(join_dates_sql, destination=full_table_id)
    finally:
        # Always undo the scaffolding, even when parsing or the join failed;
        # a leftover uuid column would make the next add_uuid_to_table call
        # produce a duplicate column name.
        drop_uuid_sql = f"""
            ALTER TABLE `{full_table_id}`
            DROP COLUMN uuid
        """
        try:
            run_sql_query(drop_uuid_sql)
        finally:
            # The temp table may never have been created if the upload failed.
            # Assumes CLIENT is a google.cloud.bigquery.Client - confirm.
            CLIENT.delete_table(temp_dates_id, not_found_ok=True)

## Testing:

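**TODO:** resolve the ambiguity in the input date format. `parse_date` currently parses with both `dayfirst=True` and `yearfirst=True`, so a short string such as `10-1-5` can be read in more than one way.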

In [None]:
# Presumably (re)creates the CY_TESTS_* datasets/tables used by the cells
# below; not defined in this notebook, so it comes from one of the star
# imports - confirm.
build_test_environment()

In [None]:
# Case 1 setup: wrap src_table_1 for the CY_TESTS_FDM dataset and run its
# build process.
table_1 = FDMTable(
    source_table_full_id=f"{PROJECT}.CY_TESTS_SRC.src_table_1",
    dataset_id="CY_TESTS_FDM",
)
table_1.build()

In [None]:
# Case 1: the date comes from a single column named "date".
# NOTE(review): this calls the FDMTable method, not the notebook-level
# add_parsed_date_to_table function - confirm the two are kept in sync.
table_1.add_parsed_date_to_table("date")

In [None]:
# Case 2 setup: wrap src_table_2 for the CY_TESTS_FDM dataset and run its
# build process.
table_2 = FDMTable(
    source_table_full_id=f"{PROJECT}.CY_TESTS_SRC.src_table_2",
    dataset_id="CY_TESTS_FDM",
)
table_2.build()

In [None]:
# Case 2: the date is given as three parts, in year / month / day order.
# NOTE(review): this calls the FDMTable method, not the notebook-level
# add_parsed_date_to_table function - confirm the two are kept in sync.
table_2.add_parsed_date_to_table(["year", "month", "day"])

In [3]:
# Case 3 setup: wrap src_table_3 for the CY_TESTS_FDM dataset and run its
# build process.
table_3 = FDMTable(
    source_table_full_id=f"{PROJECT}.CY_TESTS_SRC.src_table_3",
    dataset_id="CY_TESTS_FDM",
)
table_3.build()

# Case 3: year and month parts plus a third entry "1". In the notebook-level
# get_datetime_df, entries that are not table columns are used as string
# literals, so this presumably fixes the day to 1 - confirm the FDMTable
# method behaves the same way.
table_3.add_parsed_date_to_table(["year", "month", "1"])

		 ##### BUILDING TABLE src_table_3 #####
________________________________________________________________________________

1. Copying src_table_3 to CY_TESTS_FDM

	* src_table_3 already exists in CY_TESTS_FDM.

	NOTE: Working from the existing version of src_table_3
	in CY_TESTS_FDM. If you wish to begin from scratch with a
	fresh copy, drop the existing table in CY_TESTS_FDM and run
	.build() again.

2. Checking identifier name syntax:

	* person_id found - syntax correct
	* EDRN found - syntax correct

3. Adding person_id column:

	* src_table_3 already contains person_id column

	NOTE: EDRN also found in src_table_3. If you
	wish to rebuild the person_id column from EDRN, drop the existing
	person_id column in src_table_3 and run .build() again

________________________________________________________________________________

	 ##### BUILD PROCESS FOR src_table_3 COMPLETE! #####



In [10]:
# Ambiguity check: with dayfirst=True, "10-1-5" is read as day=10, month=1,
# year=2005 (see the output below).
parse("10-1-5", dayfirst=True)

datetime.datetime(2005, 1, 10, 0, 0)