### Load libraries

In [None]:
# import pandas as pd
import os
import errno
import yaml
import time

# Import the project's ETL helper module (etl_functions.py).
# os.path.abspath() resolves the bare file name against the current working
# directory, so the directory placed at the front of sys.path is the directory
# the notebook is run from; etl_functions.py is expected to live there.
import sys
file = 'etl_functions.py'
sys.path.insert(0,os.path.dirname(os.path.abspath(file)))
import etl_functions

### Define variables

In [None]:
# YAML configuration file that is handed to every etl_functions call below.
config_et_file_name = 'config_extract-transform.yml'

# Names of the configuration blocks, grouped by the pipeline stage that
# passes them to etl_functions.

# -- Extract ---------------------------------------------------------------
item_FILES_TO_PROCESS = 'FILES_TO_PROCESS'
item_ADDITIONAL_INFORMATION_FILES = 'ADDITIONAL_INFORMATION_FILES'
item_JOIN_FILES_COMMON_COLUMNS = 'JOIN_FILES_COMMON_COLUMNS'

# -- Transform -------------------------------------------------------------
item_COLUMNS_TO_DROP = 'COLUMNS_TO_DROP'
item_UPDATE_COLUMN_NAMES = 'UPDATE_COLUMN_NAMES'
item_NEW_COLUMNS = 'NEW_COLUMNS'
item_UPDATE_COLUMN_VALUES = 'UPDATE_COLUMN_VALUES'

# -- Primary key handling --------------------------------------------------
item_PRIMARY_KEY_COLUMN = 'PRIMARY_KEY_COLUMN'
item_CREATE_PRIMARY_KEY_IF_NEEDED = 'CREATE_PRIMARY_KEY_IF_NEEDED'
item_UPDATE_PRIMARY_KEY_VALUES = 'UPDATE_PRIMARY_KEY_VALUES'

# -- Export ----------------------------------------------------------------
item_OUTPUT_FILE_NAME = 'OUTPUT_FILE_NAME'

### 1. Extract

#### 1.1 Dataframe column names checking
Compares the column names across all files specified in the _FILES_TO_PROCESS_ block of the **config_et.yml** file.

In [None]:
# Run the column-name comparison for the files configured under the
# FILES_TO_PROCESS block of the configuration file.
etl_functions.compare_column_names(
    config_et_file_name,
    item_FILES_TO_PROCESS,
)

#### 1.2 Join data files and concatenate additional information 
Concatenates all files stored in the path specified in the _FILES_TO_PROCESS_ block and then joins them with the additional information from the files specified in the _ADDITIONAL_INFORMATION_FILES_ block, using the columns specified in the _JOIN_FILES_COMMON_COLUMNS_ block of the **config_et.yml** file.

In [None]:
# Step 1.2: hand the config file plus the block names for the data files,
# the column renames, the additional-information files and the join columns
# to the helper; its result feeds the first transform step.
extract_and_join_files = etl_functions.extract_and_join_files(
    config_et_file_name,
    item_FILES_TO_PROCESS,
    item_UPDATE_COLUMN_NAMES,
    item_ADDITIONAL_INFORMATION_FILES,
    item_JOIN_FILES_COMMON_COLUMNS,
)

### 2. Transform

#### 2.1 Drop unwanted columns from the data frame produced in the previous step
Drop the columns that you do not want to keep in your final data frame, using the column names specified in the _COLUMNS_TO_DROP_ block of the **config_et.yml** file.

In [None]:
# Step 2.1: apply the COLUMNS_TO_DROP block to the output of step 1.2.
drop_not_used_columns = etl_functions.drop_not_used_columns(
    config_et_file_name,
    item_COLUMNS_TO_DROP,
    extract_and_join_files,
)

#### 2.2 Update column names
Update column names according to what is specified in the _UPDATE_COLUMN_NAMES_ block of the **config_et.yml** file.

In [None]:
# Step 2.2: apply the UPDATE_COLUMN_NAMES block to the output of step 2.1.
update_column_names = etl_functions.update_column_names(
    config_et_file_name,
    item_UPDATE_COLUMN_NAMES,
    drop_not_used_columns,
)

#### 2.3 Insert new columns

Insert new columns and fill their rows according to what is specified in the _NEW_COLUMNS_ block of the **config_et.yml** file.

In [None]:
# Step 2.3: apply the NEW_COLUMNS block to the output of step 2.2.
add_new_columns = etl_functions.add_new_columns(
    config_et_file_name,
    item_NEW_COLUMNS,
    update_column_names,
)

#### 2.4 Update column values
Update the column values specified in the _UPDATE_COLUMN_VALUES_ block of the **config_et.yml** file.

In [None]:
# Step 2.4: apply the UPDATE_COLUMN_VALUES block to the output of step 2.3.
update_column_values = etl_functions.update_column_values(
    config_et_file_name,
    item_UPDATE_COLUMN_VALUES,
    add_new_columns,
)

#### 2.5 Create a primary key if it is needed. 

This is especially useful for files whose data were not collected using [AgTC](https://github.com/Purdue-LuisVargas/AgTC).

In [None]:
# Step 2.5: pass the CREATE_PRIMARY_KEY_IF_NEEDED and PRIMARY_KEY_COLUMN
# block names together with the output of step 2.4.
create_primary_key = etl_functions.create_primary_key_if_needed(
    config_et_file_name,
    item_CREATE_PRIMARY_KEY_IF_NEEDED,
    item_PRIMARY_KEY_COLUMN,
    update_column_values,
)

#### 2.6 Update primary_key_values
Function that updates some characters of the primary key string. Useful when more than one trait is collected using the same template.  

In [None]:
# Step 2.6: pass the UPDATE_PRIMARY_KEY_VALUES and PRIMARY_KEY_COLUMN block
# names together with the output of step 2.5.
update_primary_key_values = etl_functions.update_primary_key_values(
    config_et_file_name,
    item_UPDATE_PRIMARY_KEY_VALUES,
    item_PRIMARY_KEY_COLUMN,
    create_primary_key,
)

#### 2.7 Export the final data frame
Export the final data frame to the location specified in the _OUTPUT_FILE_NAME_ block of the **config_et.yml** file.

In [None]:
# Final step: hand the fully transformed result and the OUTPUT_FILE_NAME
# block name to the export helper.
etl_functions.export_dataframe(
    config_et_file_name,
    item_OUTPUT_FILE_NAME,
    update_primary_key_values,
)