### Load libraries

In [12]:
# pandas is not imported here; the line below was left commented out.
# import pandas as pd
# NOTE(review): errno, yaml and time are not referenced in the visible cells —
# confirm whether they are still needed.
import os
import errno
import yaml
import time

# import function files
import sys
# 'etl_functions.py' is a relative name, so os.path.abspath() resolves it
# against the current working directory. That directory is placed first on
# sys.path so that `import etl_functions` below can locate the module.
file = 'etl_functions.py'
sys.path.insert(0,os.path.dirname(os.path.abspath(file)))
import etl_functions

### Define variables

In [13]:
# YAML configuration file passed to every etl_functions call in this notebook.
config_et_file_name = 'config_extract-transform_canopyTemp.yml'

# Names of the configuration blocks, grouped by the stage that first uses them.

# 1. Extract
item_FILES_TO_PROCESS = 'FILES_TO_PROCESS'
item_ADDITIONAL_INFORMATION_FILES = 'ADDITIONAL_INFORMATION_FILES'
item_JOIN_FILES_COMMON_COLUMNS = 'JOIN_FILES_COMMON_COLUMNS'
item_UPDATE_COLUMN_NAMES = 'UPDATE_COLUMN_NAMES'  # also used in step 2.2

# 2. Transform
item_COLUMNS_TO_DROP = 'COLUMNS_TO_DROP'
item_NEW_COLUMNS = 'NEW_COLUMNS'
item_PRIMARY_KEY_COLUMN = 'PRIMARY_KEY_COLUMN'
item_UPDATE_PRIMARY_KEY_VALUES = 'UPDATE_PRIMARY_KEY_VALUES'
item_UPDATE_COLUMN_VALUES = 'UPDATE_COLUMN_VALUES'

# Export
item_OUTPUT_FILE_NAME = 'OUTPUT_FILE_NAME'

### 1. Extract

#### 1.1 Dataframe column names checking
Compares the column names of all files listed in the _FILES_TO_PROCESS_ block of the configuration file (`config_extract-transform_canopyTemp.yml`, referenced by `config_et_file_name`).

In [14]:
# Compare the column names of the files listed under FILES_TO_PROCESS;
# the returned value is shown as the cell output.
etl_functions.compare_column_names(
    config_et_file_name,
    item_FILES_TO_PROCESS,
)

Output explanation: 
 ([List columns on first file], 
 [List columns NOT on first file])


(['id_sample',
  'plot',
  'sample_name',
  'rep',
  'entry',
  'experiment',
  'season',
  'environment',
  'measurement',
  'sampling_identifier',
  'Temperature (C)',
  'date',
  'plot_picture',
  'experiment_picture',
  'Start Time (HH:MM)',
  'Start Humidity (%)',
  'Start Temperature (C)',
  'Start Water Temperature (C)',
  'Start Oil Temperature (C)',
  'End Time (HH:MM)',
  'End Humidity (%)',
  'End Temperature (C)',
  'End Water Temperature (C)',
  'End Oil Temperature (C)',
  'picture',
  'notes'],
 ['selection_history', 'notes_Experiment'])

#### 1.2 Join data files and concatenate additional information 
Concatenates all files stored in the path specified in the _FILES_TO_PROCESS_ block, and then joins the result with the additional information from the files specified in the _ADDITIONAL_INFORMATION_FILES_ block, using the columns specified in the _JOIN_FILES_COMMON_COLUMNS_ block of the configuration file (`config_et_file_name`).

In [15]:
# Build the joined data frame from the data files and the additional
# information files named in the configuration.
extract_and_join_files = etl_functions.extract_and_join_files(
    config_et_file_name,
    item_FILES_TO_PROCESS,
    item_UPDATE_COLUMN_NAMES,
    item_ADDITIONAL_INFORMATION_FILES,
    item_JOIN_FILES_COMMON_COLUMNS,
)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2940 entries, 0 to 2939
Data columns (total 30 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id_sample                 2940 non-null   object 
 1   plot                      2940 non-null   int64  
 2   sample_name               2940 non-null   object 
 3   rep                       2940 non-null   int64  
 4   entry_x                   2940 non-null   int64  
 5   experiment                2940 non-null   object 
 6   season                    2940 non-null   object 
 7   environment               2940 non-null   object 
 8   variable_name             2940 non-null   object 
 9   sampling_identifier       2940 non-null   object 
 10  temperature(c)            2940 non-null   float64
 11  date                      2940 non-null   object 
 12  plot_picture              40 non-null     object 
 13  experiment_picture        31 non-null     object 
 14  starttim

In [16]:
# Display the joined data frame produced by the previous cell
# (the notebook renders a truncated preview of rows and columns).
extract_and_join_files

Unnamed: 0,id_sample,plot,sample_name,rep,entry_x,experiment,season,environment,variable_name,sampling_identifier,...,endhumidity(%),endtemperature(c),endwatertemperature(c),endoiltemperature(c),picture,notes,selection_history,notes_experiment,entry_y,selectionhistory
0,1_B_Roots Anatomy_Drought_Winter 2021-2022_Can...,1,B,1,1,Roots Anatomy,Winter 2021-2022,Drought,Canopy temperature,Sampling 10,...,,,,,,,,,1,CGSS03B00074T-099Y-099M-099Y-099M-7WGY-0B
1,2_B_Roots Anatomy_Drought_Winter 2021-2022_Can...,2,B,1,2,Roots Anatomy,Winter 2021-2022,Drought,Canopy temperature,Sampling 10,...,,,,,,,,,2,CMSS97M02949T-040Y-030M-040SY-030M-040SY-6M-0Y...
2,3_B_Roots Anatomy_Drought_Winter 2021-2022_Can...,3,B,1,3,Roots Anatomy,Winter 2021-2022,Drought,Canopy temperature,Sampling 10,...,,,,,,,,,3,CMSS96Y04084S-0Y-1B-131TLA-0B-0Y-125B-0Y-0B
3,4_A_Roots Anatomy_Drought_Winter 2021-2022_Can...,4,A,1,4,Roots Anatomy,Winter 2021-2022,Drought,Canopy temperature,Sampling 10,...,,,,,,,,,4,CMSS06B00707T-099TOPY-099ZTM-099Y-099M-2WGY-0B
4,5_A_Roots Anatomy_Drought_Winter 2021-2022_Can...,5,A,1,5,Roots Anatomy,Winter 2021-2022,Drought,Canopy temperature,Sampling 10,...,,,,,,,,,5,CMSS06Y00946T-099TOPM-099Y-099ZTM-099Y-099M-8W...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2935,40_B_Roots Anatomy_Irrigation_Winter 2022-2023...,40,B,3,1,Roots Anatomy,Winter 2022-2023,Irrigation,Canopy temperature,Sampling 8,...,,,,,,,,,1,CGSS03B00074T-099Y-099M-099Y-099M-7WGY-0B
2936,41_A_Roots Anatomy_Irrigation_Winter 2022-2023...,41,A,3,10,Roots Anatomy,Winter 2022-2023,Irrigation,Canopy temperature,Sampling 8,...,,,,,,,,,10,CMSS96Y04084S-0Y-1B-131TLA-0B-0Y-123B-0Y
2937,41_B_Roots Anatomy_Irrigation_Winter 2022-2023...,41,B,3,10,Roots Anatomy,Winter 2022-2023,Irrigation,Canopy temperature,Sampling 8,...,,,,,,,,,10,CMSS96Y04084S-0Y-1B-131TLA-0B-0Y-123B-0Y
2938,42_A_Roots Anatomy_Irrigation_Winter 2022-2023...,42,A,3,2,Roots Anatomy,Winter 2022-2023,Irrigation,Canopy temperature,Sampling 8,...,,,,,,,,,2,CMSS97M02949T-040Y-030M-040SY-030M-040SY-6M-0Y...


### 2. Transform

#### 2.1 Drop unwanted columns from the data frame produced in the previous step
Drop the columns that you do not want to keep in your final data frame, using the column names specified in the _COLUMNS_TO_DROP_ block of the configuration file (`config_et_file_name`).

In [17]:
# Remove the columns listed under COLUMNS_TO_DROP from the joined data frame.
drop_not_used_columns = etl_functions.drop_not_used_columns(
    config_et_file_name,
    item_COLUMNS_TO_DROP,
    extract_and_join_files,
)

Columns dataframe:  Index(['id_sample', 'plot', 'sample_name', 'rep', 'entry_x', 'experiment',
       'season', 'environment', 'variable_name', 'sampling_identifier',
       'temperature(c)', 'date', 'plot_picture', 'experiment_picture',
       'starttime(hh:mm)', 'starthumidity(%)', 'starttemperature(c)',
       'startwatertemperature(c)', 'startoiltemperature(c)', 'endtime(hh:mm)',
       'endhumidity(%)', 'endtemperature(c)', 'endwatertemperature(c)',
       'endoiltemperature(c)', 'picture', 'notes', 'selection_history',
       'notes_experiment', 'entry_y', 'selectionhistory'],
      dtype='object')
Columns to drop ['plot_picture', 'experiment_picture', 'starttime(hh:mm)', 'starthumidity(%)', 'starttemperature(c)', 'startwatertemperature(c)', 'startoiltemperature(c)', 'endtime(hh:mm)', 'endhumidity(%)', 'endtemperature(c)', 'endwatertemperature(c)', 'endoiltemperature(c)', 'picture', 'notes', 'selection_history', 'units', 'notes_experiment', 'entry_y', 'notes_experiment']
Column t

#### 2.2 Update column names
Rename columns as specified in the _UPDATE_COLUMN_NAMES_ block of the configuration file (`config_et_file_name`).

In [18]:
# Rename columns according to the UPDATE_COLUMN_NAMES block.
update_column_names = etl_functions.update_column_names(
    config_et_file_name,
    item_UPDATE_COLUMN_NAMES,
    drop_not_used_columns,
)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2940 entries, 0 to 2939
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   date                 2940 non-null   object
 1   entry                2940 non-null   object
 2   environment          2940 non-null   object
 3   experiment           2940 non-null   object
 4   id_sample            2940 non-null   object
 5   plot                 2940 non-null   object
 6   rep                  2940 non-null   object
 7   sample_name          2940 non-null   object
 8   sampling_identifier  2940 non-null   object
 9   season               2940 non-null   object
 10  selectionhistory     2940 non-null   object
 11  variable_name        2940 non-null   object
 12  variable_value       2940 non-null   object
dtypes: object(13)
memory usage: 321.6+ KB
None


#### 2.3 Insert new columns

Insert new columns and fill their rows as specified in the _NEW_COLUMNS_ block of the configuration file (`config_et_file_name`).

In [19]:
# Append the columns described in the NEW_COLUMNS block.
add_new_columns = etl_functions.add_new_columns(
    config_et_file_name,
    item_NEW_COLUMNS,
    update_column_names,
)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2940 entries, 0 to 2939
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   date                 2940 non-null   object
 1   entry                2940 non-null   object
 2   environment          2940 non-null   object
 3   experiment           2940 non-null   object
 4   id_sample            2940 non-null   object
 5   plot                 2940 non-null   object
 6   rep                  2940 non-null   object
 7   sample_name          2940 non-null   object
 8   sampling_identifier  2940 non-null   object
 9   season               2940 non-null   object
 10  selectionhistory     2940 non-null   object
 11  variable_name        2940 non-null   object
 12  variable_value       2940 non-null   object
 13  variable_units       2940 non-null   object
dtypes: object(14)
memory usage: 344.5+ KB
None


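#### 2.4 Update primary key values
Updates some characters of the primary key string, using the _UPDATE_PRIMARY_KEY_VALUES_ and _PRIMARY_KEY_COLUMN_ blocks of the configuration file (`config_et_file_name`). This is useful when more than one trait is collected using the same template.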

In [20]:
# Rewrite the primary-key strings using the UPDATE_PRIMARY_KEY_VALUES and
# PRIMARY_KEY_COLUMN blocks.
update_primary_key_values = etl_functions.update_primary_key_values(
    config_et_file_name,
    item_UPDATE_PRIMARY_KEY_VALUES,
    item_PRIMARY_KEY_COLUMN,
    add_new_columns,
)

        date entry environment     experiment  \
0  3/11/2022     1     Drought  Roots Anatomy   
1  3/11/2022     2     Drought  Roots Anatomy   
2  3/11/2022     3     Drought  Roots Anatomy   
3  3/11/2022     4     Drought  Roots Anatomy   
4  3/11/2022     5     Drought  Roots Anatomy   

                                           id_sample plot rep sample_name  \
0  1_B_Roots Anatomy_Drought_Winter 2021-2022_Can...    1   1           B   
1  2_B_Roots Anatomy_Drought_Winter 2021-2022_Can...    2   1           B   
2  3_B_Roots Anatomy_Drought_Winter 2021-2022_Can...    3   1           B   
3  4_A_Roots Anatomy_Drought_Winter 2021-2022_Can...    4   1           A   
4  5_A_Roots Anatomy_Drought_Winter 2021-2022_Can...    5   1           A   

  sampling_identifier            season  \
0         Sampling 10  Winter 2021-2022   
1         Sampling 10  Winter 2021-2022   
2         Sampling 10  Winter 2021-2022   
3         Sampling 10  Winter 2021-2022   
4         Sampling 10  Wint

#### 2.5 Update column values
Update the column values as specified in the _UPDATE_COLUMN_VALUES_ block of the configuration file (`config_et_file_name`).

In [21]:
# Apply the value replacements described in the UPDATE_COLUMN_VALUES block.
update_column_values = etl_functions.update_column_values(
    config_et_file_name,
    item_UPDATE_COLUMN_VALUES,
    update_primary_key_values,
)

        date entry environment     experiment  \
0  3/11/2022     1     Drought  Roots Anatomy   
1  3/11/2022     2     Drought  Roots Anatomy   
2  3/11/2022     3     Drought  Roots Anatomy   
3  3/11/2022     4     Drought  Roots Anatomy   
4  3/11/2022     5     Drought  Roots Anatomy   

                                           id_sample plot rep sample_name  \
0  1_B_Roots Anatomy_Drought_Winter 2021-2022_Can...    1   1           B   
1  2_B_Roots Anatomy_Drought_Winter 2021-2022_Can...    2   1           B   
2  3_B_Roots Anatomy_Drought_Winter 2021-2022_Can...    3   1           B   
3  4_A_Roots Anatomy_Drought_Winter 2021-2022_Can...    4   1           A   
4  5_A_Roots Anatomy_Drought_Winter 2021-2022_Can...    5   1           A   

  sampling_identifier            season  \
0         Sampling 10  Winter 2021-2022   
1         Sampling 10  Winter 2021-2022   
2         Sampling 10  Winter 2021-2022   
3         Sampling 10  Winter 2021-2022   
4         Sampling 10  Wint

#### 2.6 Export the final data frame 
Export the final data frame to the location specified in the _OUTPUT_FILE_NAME_ block of the configuration file (`config_et_file_name`).

In [22]:
# Write the final data frame to the location given by the OUTPUT_FILE_NAME block.
etl_functions.export_dataframe(
    config_et_file_name,
    item_OUTPUT_FILE_NAME,
    update_column_values,
)

Dataframe: ./et_output/20230319-213524_Canopy temperature_CIMMYT_RootsAnatomy_clean.csv created successfully!
