Skip to content

Commit

Permalink
Merge branch 'master' into dev
Browse files Browse the repository at this point in the history
  • Loading branch information
Tom Schenk Jr committed Dec 9, 2015
2 parents 7c08216 + 8b20bb7 commit f5b05cd
Show file tree
Hide file tree
Showing 15 changed files with 141 additions and 19 deletions.
Binary file not shown.
Empty file.
2 changes: 1 addition & 1 deletion ETL/Utilities/DataSyncFTP.ktr
Expand Up @@ -112,7 +112,7 @@ words, it does not use a config.json file.</note>
<schema_name/>
</partitioning>
<formula><field_name>DataSyncBatchCommand</field_name>
<formula_string>&quot;java -jar &quot; &amp; [DataSyncDirectory] &amp; &quot;datasync.jar&quot; &amp; &quot; --fileToPublish &quot; &amp; [file] &amp; &quot; --fileToPublishHasHeaderRow &quot; &amp; &quot;false&quot; &amp; &quot; --datasetID &quot; &amp; [datasetid] &amp; &quot; --publishMethod &quot; &amp; &quot;replace&quot; &amp; &quot; --publishViaFTP true&quot; &amp; &quot; --pathToFTPControlFile &quot; &amp; [controlfile]</formula_string>
<formula_string>&quot;java -jar &quot; &amp; [DataSyncDirectory] &amp; &quot;datasync.jar&quot; &amp; &quot; --fileToPublish &quot; &amp; [file] &amp; &quot; --datasetID &quot; &amp; [datasetid] &amp; &quot; --publishMethod &quot; &amp; &quot;replace&quot; &amp; &quot; --publishViaFTP true&quot; &amp; &quot; --pathToFTPControlFile &quot; &amp; [controlfile]</formula_string>
<value_type>String</value_type>
<value_length>-1</value_length>
<value_precision>-1</value_precision>
Expand Down
4 changes: 2 additions & 2 deletions ETL/Utilities/DataSyncHTTP.ktr
Expand Up @@ -112,7 +112,7 @@ words, it does not use a config.json file.</note>
<schema_name/>
</partitioning>
<formula><field_name>DataSyncBatchCommand</field_name>
<formula_string>&quot;java -jar -Xmx1g &quot; &amp; [DataSyncDirectory] &amp; &quot;datasync.jar&quot; &amp; &quot; --fileToPublish &quot; &amp; [file] &amp; &quot; --fileToPublishHasHeaderRow &quot; &amp; &quot;false&quot; &amp; &quot; --datasetID &quot; &amp; [datasetid] &amp; &quot; --publishMethod &quot; &amp; &quot;replace&quot; &amp; &quot; --publishViaHttp true&quot; &amp; &quot; --pathToControlFile &quot; &amp; [controlfile]</formula_string>
<formula_string>&quot;java -jar -Xmx1g &quot; &amp; [DataSyncDirectory] &amp; &quot;datasync.jar&quot; &amp; &quot; --fileToPublish &quot; &amp; [file] &amp; &quot; --datasetID &quot; &amp; [datasetid] &amp; &quot; --publishMethod &quot; &amp; &quot;replace&quot; &amp; &quot; --publishViaHttp true&quot; &amp; &quot; --pathToControlFile &quot; &amp; [controlfile]</formula_string>
<value_type>String</value_type>
<value_length>-1</value_length>
<value_precision>-1</value_precision>
Expand All @@ -121,7 +121,7 @@ words, it does not use a config.json file.</note>
<cluster_schema/>
<remotesteps> <input> </input> <output> </output> </remotesteps> <GUI>
<xloc>74</xloc>
<yloc>276</yloc>
<yloc>277</yloc>
<draw>Y</draw>
</GUI>
</step>
Expand Down
3 changes: 1 addition & 2 deletions ETL/_control.json
Expand Up @@ -2,7 +2,6 @@
"action" : "Replace",
"csv" :
{
"ignoreServerLatLong" : false,
"fixedTimestampFormat" : "MM/dd/yyyy",
"separator" : ",",
"timezone" : "UTC",
Expand All @@ -19,4 +18,4 @@
"floatingTimestampFormat" : "MM/dd/yyyy",
"syntheticLocations" : {}
}
}
}
7 changes: 7 additions & 0 deletions Log/A_DatasetLogs.bat
@@ -0,0 +1,7 @@
@echo off
REM A_DatasetLogs.bat - list the files in the current directory whose names
REM contain the given string.
REM Usage: A_DatasetLogs.bat StringToFindInLogName
IF [%1] == [] GOTO USAGE
REM Bare listing (names only), filtered to the names containing %1.
dir /b | find "%1"
exit /B

:USAGE
echo Usage: A_DatasetLogs.bat StringToFindInLogName
REM Return a non-zero exit code so callers can detect the usage error.
exit /B 1
8 changes: 8 additions & 0 deletions Log/A_ETLRuntimes.bat
@@ -0,0 +1,8 @@
@echo off
REM A_ETLRuntimes.bat - print the "Processing ended after" lines found in
REM every file of the current directory whose name contains the given string.
REM Usage: A_ETLRuntimes.bat StringToFindInLogName
IF NOT [%1] == [] GOTO SHOW_RUNTIMES

REM No argument supplied: explain the usage and report failure.
echo Usage: A_ETLRuntimes.bat StringToFindInLogName
exit /B 1

:SHOW_RUNTIMES
REM Errors from type (sent to stderr) are discarded via 2>nul.
type *%1* 2>nul | find "Processing ended after"
exit /B
16 changes: 16 additions & 0 deletions Log/A_RunETL.bat
@@ -0,0 +1,16 @@
@echo off
REM A_RunETL.bat - find the scheduled task whose command contains the given
REM dataset 4x4 and run that command once.
REM Usage: A_RunETL.bat Dataset4x4
IF [%1] == [] GOTO USAGE

REM Sanity check
REM The argument must look like a 4x4: four characters, a hyphen, then four
REM more characters. The script relies on findstr leaving ERRORLEVEL non-zero
REM when the pattern does not match.
echo %1 | findstr /r "\<[a-Z0123456789][a-Z0123456789][a-Z0123456789][a-Z0123456789]-[a-Z0123456789][a-Z0123456789][a-Z0123456789][a-Z0123456789]\>"
IF %ERRORLEVEL% NEQ 0 (
echo Invalid Dataset4x4
exit /B 1
)

REM Pipeline, left to right:
REM   1. schtasks lists all scheduled tasks in verbose list format.
REM   2. The outer FOR /F keeps the "Task To Run" lines and takes the second
REM      colon-delimited token of each.
REM   3. The middle FOR /F echoes that token back with leading spaces removed.
REM   4. The inner FOR /F keeps only the values containing %1 and calls them.
REM NOTE(review): with "tokens=2 DELIMS=:" only the text between the first and
REM second colon is kept, so a command containing a drive letter (C:\...) looks
REM like it would be truncated - confirm against real schtasks output.
schtasks /Query /FO list /V | FOR /F "tokens=2 DELIMS=:" %%f in ('findstr /RC:"^Task To Run"') do @FOR /F "tokens=* delims= " %%a in ('@echo %%f') do @echo %%a | FOR /F "tokens=*" %%b in ('find "%1"') do @call %%b
exit /B

:USAGE
echo Usage: A_RunETL.bat Dataset4x4
exit /B 1
16 changes: 16 additions & 0 deletions Log/A_RunETL.sh
@@ -0,0 +1,16 @@
# A_RunETL.sh - perform a one-time run of an ETL that is normally scheduled
# through the crontab file.
#
# Usage: A_RunETL.sh Dataset4x4
#
# The crontab entry containing the given 4x4 is looked up, its scheduling
# fields are dropped, and the remaining command is executed.
# Exit status: 0 after running the command, 1 on a usage error, an invalid
# 4x4, or when no active crontab entry mentions the 4x4.
if [ -z "$1" ]; then
  echo "Usage: A_RunETL.sh Dataset4x4"
  exit 1
fi

# Sanity check: exactly four alphanumerics, a hyphen, four alphanumerics.
if ! printf '%s\n' "$1" | grep -q '^[a-zA-Z0-9]\{4\}-[a-zA-Z0-9]\{4\}$'; then
  echo "Invalid Dataset4x4"
  exit 1
fi

# Take the 6th and subsequent space-delimited fields of each matching crontab
# line, i.e. skip the five scheduling fields and keep the actual command.
# Commented-out entries are ignored so disabled jobs are never run.
COMMAND_TO_RUN=$(crontab -l \
  | grep -v '^[[:space:]]*#' \
  | grep -F -- "$1" \
  | cut -d ' ' -f 6-)

# Without this guard an empty command would be "executed" and reported as a
# success even though nothing ran.
if [ -z "$COMMAND_TO_RUN" ]; then
  echo "No crontab entry found for $1" >&2
  exit 1
fi

# eval is required because the crontab command carries its own redirections
# (e.g. ">> file.log 2>&1"). Quoting keeps multiple matches on separate lines,
# so each one runs as its own command.
eval "$COMMAND_TO_RUN"
printf 'Command executed:  %s\n' "$COMMAND_TO_RUN"
exit 0
18 changes: 18 additions & 0 deletions Log/A_TodayLogs.bat
@@ -0,0 +1,18 @@
@echo off
REM A_TodayLogs.bat - list the files in the current directory whose names
REM contain today's date.
REM Usage: A_TodayLogs.bat [-e]
REM   -e  also drop the files whose names contain one of DATASETS_TO_EXCLUDE.

REM Specify the 4x4 values to exclude, separated by space characters
REM The surrounding quotes are part of the value: findstr receives the list as
REM a single quoted argument.
set DATASETS_TO_EXCLUDE="n4j6-wkkf t2qx-9pjd qmqz-2xku 97wa-y6ff"
REM Builds a date stamp from fixed offsets into %date% (4 chars at offset 10,
REM 2 at offset 4, 2 at offset 7).
REM NOTE(review): this presumably expects a "Ddd MM/DD/YYYY" date format - the
REM offsets depend on the regional settings; confirm on the target machine.
set TODAY=%date:~10,4%%date:~4,2%%date:~7,2%

IF [%1] == [] GOTO DEFAULT
IF [%1] == [-e] GOTO EXCLUDE

REM Any other argument is a usage error.
echo Usage: A_TodayLogs.bat [-e]
echo -e Exclude a pre-specified set of datasets by 4x4, generally those that run frequently and would clutter the output.
exit /B 1

:DEFAULT
dir /b | find "%TODAY%"
exit /B

:EXCLUDE
REM findstr /V prints only the lines that match none of the excluded 4x4s.
dir /b | find "%TODAY%" | findstr /V %DATASETS_TO_EXCLUDE%
exit /B
4 changes: 2 additions & 2 deletions README.md
Expand Up @@ -19,11 +19,11 @@ The ETL framework is organized so each function can be modified in one file that
The requirements for the recommended configuration require the following pieces of software:
* [Kettle (or Pentaho) data integration](http://community.pentaho.com/projects/data-integration/) - _Note_: This framework has only been tested with Kettle 4.4.0 and lower.
* Java 1.6 or higher
* [DataSync (for use with Socrata)](http://socrata.github.io/datasync/)
* [DataSync (for use with Socrata)](http://socrata.github.io/datasync/) - _Note_: This framework is designed for the version of DataSync in the DataSync directory and will not necessarily work with earlier or later versions.
* MacOS X, Linux, or Unix (only required for full automation with included scripts)

## Kettle Compatibility
This framework has only been tested using Kettle 4.3.0 and Kettle 4.4.0. It is possible that this framework is fully compatible with Kettle 5.x, but has not been tested. If you would like to contribute, please see the issue page.

## Errors / Bugs
Experiencing issues with the included files? Report it on our [issue tracker](https://github.com/Chicago/open-data-etl-utility-kit/issues)
Experiencing issues with the included files? Report it on our [issue tracker](https://github.com/Chicago/open-data-etl-utility-kit/issues)
12 changes: 4 additions & 8 deletions docs/index.rst
@@ -1,6 +1,6 @@
===============================
Open Data ETL Toolkit |release|
===============================
=====================
Open Data ETL Toolkit
=====================

.. toctree::
:maxdepth: 1
Expand All @@ -13,10 +13,6 @@ Open Data ETL Toolkit |release|
This toolkit provides several utilities and a framework to help governments deploy automated ETLs using the open-source Pentaho data integration (Kettle) software.

Namely, this toolkit will assist with:
* Load data from a database an load it to a Socrata data portal
* Steps to integrate with an SMTP server to provide e-mail alerts on the outcome of ETL scripts
* Handles deployment issues when using multiple operating systems during development
* Utilities to allow administrators to quickly analyze the log files of ETLs for quick diagnostics

* Load data from a database and transfer it to a Socrata data portal
* Steps to integrate with an Exchange server to provide e-mail alerts
Expand Down Expand Up @@ -61,4 +57,4 @@ Errors / Bugs

Experiencing issues with the included files? Report it on our `issue
tracker
<https://github.com/Chicago/open-data-etl-utility-kit/issues>`__
<https://github.com/Chicago/open-data-etl-utility-kit/issues>`__
4 changes: 4 additions & 0 deletions docs/installation-configuration.rst
Expand Up @@ -66,6 +66,10 @@ utility is *only* compatible with Socrata portals. Fortunately, this
utility handles incremental updates and upserting without additional
logic in the ETL.

This framework will work with the version of DataSync in the DataSync
directory of this repository, which is named in a way to make the
version clear. It should be renamed to ``datasync.jar`` in an actual deployment.

You may install DataSync to any directory. Later configuration will
direct Kettle to the correct location.

Expand Down
6 changes: 3 additions & 3 deletions docs/setting-up-automation.rst
Expand Up @@ -36,7 +36,7 @@ Setting-up Timing

The timing of the automated script is managed through cron jobs. Edit the cron job manager in the terminal by typing ``crontab -e`` in the shell. The cron job contains the starting script and also instructs the logging to be directed to the appropriate log files. For example::
* * * * * /path/to/directory/open-data-etl-utility-kit/ETL/Data_Set_Name_abcd-1234/abcd-1234.sh >> /path/to/directory/ETL/Data_Set_Name_abcd-1234/abcd-1234.log
* * * * * /path/to/directory/open-data-etl-utility-kit/ETL/Data_Set_Name_abcd-1234/abcd-1234.sh >> /path/to/directory/ETL/Data_Set_Name_abcd-1234/abcd-1234.log 2>&1

The asterisks should be edited to meet the desired update schedules. A quick guide to those settings can be found on `Wikipedia <http://en.wikipedia.org/wiki/Cron#Predefined_scheduling_definitions>`_.

Expand All @@ -45,6 +45,6 @@ Testing process

A simple way to test the process is to execute the following line in the command prompt::
/path/to/directory/open-data-etl-utility-kit/ETL/Data_Set_Name_abcd-1234/abcd-1234.sh >> /path/to/directory/ETL/Data_Set_Name_abcd-1234/abcd-1234.log
/path/to/directory/open-data-etl-utility-kit/ETL/Data_Set_Name_abcd-1234/abcd-1234.sh >> /path/to/directory/ETL/Data_Set_Name_abcd-1234/abcd-1234.log 2>&1

If correctly configured, the dataset should be updated, log files should be updated, and users should receive email alerts.
If correctly configured, the dataset should be updated, log files should be updated, and users should receive email alerts.
60 changes: 59 additions & 1 deletion docs/utilities-for-administering-etls.rst
Expand Up @@ -49,10 +49,21 @@ Show all log files

**Returns:** Will list the log files associated for a user-specified ETL job. The output is displayed in the terminal.

**File:** Log/A_DatasetLogs.bat (Windows only)

**Description:** Shows all of the log files associated with a dataset.

**Usage:** Open the command prompt window and type the name of a dataset::
> cd \path\to\directory\open-data-etl-utility-kit\
> \Log\A_DatasetLogs.bat Name_of_dataset
**Returns:** Will list the log files associated for a user-specified ETL job. The output is displayed in the command prompt window.

Summarize ETL run times
-----------------------

**File:** Log/A_DatasetLogs.sh (MacOS X/Linux/Unix only)
**File:** Log/A_ETLRuntimes.sh (MacOS X/Linux/Unix only)

**Description:** Shows the runtime for ETLs with a dataset.

Expand All @@ -63,6 +74,17 @@ Summarize ETL run times

**Returns:** The output will show the total run-times recorded in log files for the user-specified ETL. The output is displayed in the terminal.

**File:** Log/A_ETLRuntimes.bat (Windows only)

**Description:** Shows the runtime for ETLs with a dataset.

**Usage:** Open the command prompt window and type the name of a dataset::

> cd \path\to\directory\open-data-etl-utility-kit\
> Log\A_ETLRuntimes.bat Name_of_dataset

**Returns:** The output will show the total run-times recorded in log files for the user-specified ETL. The output is displayed in the command prompt window.

Show today's ETL logs
---------------------

Expand All @@ -75,3 +97,39 @@ Show today's ETL logs
$ sh /path/to/directory/open-data-etl-utility-kit/Log/A_TodayLogs.sh [-e]

**Returns:** The output will show the list of log files which were generated today. With the *-e* parameter, a group of datasets specified in a parameter at the beginning of the script will be excluded (generally, those that run frequently and would clutter the output). The output is displayed in the terminal.

**File:** Log/A_TodayLogs.bat (Windows only)

**Description:** Shows log files which were created today

**Usage:** Open the command prompt window and run the command::

> \path\to\directory\open-data-etl-utility-kit\Log\A_TodayLogs.bat [-e]

**Returns:** The output will show the list of log files which were generated today. With the *-e* parameter, a group of datasets specified in a parameter at the beginning of the script will be excluded (generally, those that run frequently and would clutter the output). The output is displayed in the command prompt window.

Run a specific ETL
---------------------

**File:** Log/A_RunETL.sh (MacOS X/Linux/Unix only)

**Description:** Performs a one-time run of an ETL normally run on a scheduled basis through the crontab file. This file need not be in the Log directory to run correctly. It does not use the log files and is in the Log directory only to keep it with other scripts.

**Usage:** Open the terminal and type the name of a dataset::

$ cd /path/to/directory/open-data-etl-utility-kit/
$ sh Log/A_RunETL.sh Name_of_dataset

**Returns:** The script will find and run the ETL command for the specified dataset. The output will show the command run so the user can confirm it was the intended dataset ETL.


**File:** Log/A_RunETL.bat (Windows only)

**Description:** Performs a one-time run of an ETL normally run on a scheduled basis by the Windows Task Scheduler. This file need not be in the Log directory to run correctly. It does not use the log files and is in the Log directory only to keep it with other scripts.

**Usage:** Open the command prompt window and type the name of a dataset::

$ cd \path\to\directory\open-data-etl-utility-kit\
$ Log\A_RunETL.bat Name_of_dataset

**Returns:** The script will find and run the ETL command for the specified dataset. The output will show the command run so the user can confirm it was the intended dataset ETL.

0 comments on commit f5b05cd

Please sign in to comment.