# Publish SQL-based endpoints on NGINX log analysis

https://blog.tinybird.co/2021/01/28/nginx-log-analysis/

In [1]:
#@title Mount your Google Drive to save and use local files
from google.colab import drive
drive.mount('/content/gdrive', force_remount=False)

% cd "/content/gdrive/My Drive/Colab Notebooks/Tinybird/tb_examples"

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/My Drive/Colab Notebooks/Tinybird/tb_examples


In [2]:
#@title Install Tinybird CLI and your token
!pip install tinybird-cli -q -U

import json
import os

if not os.path.isfile('.tinyb'):
  !tb auth

if not os.path.isdir('datasources'):
  !tb init



In [3]:
#@title Helper function
def write_text_to_file(filename, text):
  """Write ``text`` to ``filename``, replacing any existing content.

  Missing parent directories are created first, and the file is always
  encoded as UTF-8 instead of the platform's default encoding.

  Args:
    filename: Destination path, absolute or relative to the working directory.
    text: String to store in the file.
  """
  parent = os.path.dirname(filename)
  if parent:  # a bare file name has no directory component to create
    os.makedirs(parent, exist_ok=True)
  with open(filename, 'w', encoding='utf-8') as f:
    f.write(text)

# Publish SQL-based endpoints on NGINX log analysis

Tinybird can be used to analyze log-like datasets at scale.

Here we use Tinybird to:
 - analyze NGINX logs
 - publish SQL queries as API endpoints to be used in other tools
 - model your data sources and endpoints to make them 10X faster.


## Build a data source from a sample NGINX log

In [4]:
# Generate a .datasource file (schema guessed from the sample CSV) under datasources/.
!tb datasource generate https://raw.githubusercontent.com/tinybirdco/log_parsing_template/main/access.log.csv

[92m** Generated datasources/access_log.datasource
** => Create it on the server running: $ tb push datasources/access_log.datasource
** => Append data using: $ tb datasource append access.log https://raw.githubusercontent.com/tinybirdco/log_parsing_template/main/access.log.csv`
[0m


In [5]:
# Create the access_log Data Source on the server from the generated file.
!tb push datasources/access_log.datasource

[0m** Processing datasources/access_log.datasource[0m
[0m** Building dependencies[0m
[0m** Running access_log [0m
[92m** 'access_log' created[0m
[0m** Not pushing fixtures[0m


In [6]:
# Print the generated schema; the columns carry auto-generated names (column_00 ... column_09).
!cat datasources/access_log.datasource

DESCRIPTION generated from https://raw.githubusercontent.com/tinybirdco/log_parsing_template/main/access.log.csv

SCHEMA >
    `column_00` String,
    `column_01` String,
    `column_02` String,
    `column_03` String,
    `column_04` String,
    `column_05` String,
    `column_06` Int32,
    `column_07` Int32,
    `column_08` String,
    `column_09` String

In [7]:
# Append the rows of the remote sample CSV to the access_log Data Source.
!tb datasource append access_log 'https://raw.githubusercontent.com/tinybirdco/log_parsing_template/main/access.log.csv'

[0m** 🥚 starting import process[0m
[92m** 🐥 done[0m
[92m** Total rows in access_log: 67351[0m
[92m** Data appended to Data Source 'access_log' successfully![0m
[0m** Data pushed to access_log[0m


In [8]:
# Look at one raw row; --stats also prints query time, rows read and bytes read.
!tb sql "select * from access_log limit 1" --stats

[0m** Query took 0.000422682 seconds
** Rows read: 1
** Bytes read: 218 bytes[0m
---------------------------------------------------------------------
[1;32mcolumn_00:[0m 10.86.160.14
[1;32mcolumn_01:[0m -
[1;32mcolumn_02:[0m -
[1;32mcolumn_03:[0m [19/Jan/2021:06:25:37
[1;32mcolumn_04:[0m +0000]
[1;32mcolumn_05:[0m GET /v0/pipes/pro__ct___v0.json?id_c=18&token=XYZ HTTP/1.1
[1;32mcolumn_06:[0m 200
[1;32mcolumn_07:[0m 455
[1;32mcolumn_08:[0m -
[1;32mcolumn_09:[0m Apache-HttpClient/4.5.10 (Java/11.0.8)
---------------------------------------------------------------------


In [23]:
# Sanity check: the row count should match the total reported by the append step.
!tb sql "SELECT count() FROM access_log"

-----------
| [1;32mcount()[0m |
-----------
|   67351 |
-----------


In [9]:
filename="pipes/access_log_transform.pipe"
text='''
DESCRIPTION extract column data from raw access log
NODE extract_column_data
SQL >
    select
        IPv4StringToNum(column_00) as ip,
        parseDateTimeBestEffort(replaceOne(substring(column_03, 2), ':', ' ')) as time,
        splitByChar(' ', column_05) as tt,
        tt[1] as method,
        tt[2] as path,
        tt[3] as protocol,
        column_06 as status_code,
        column_07 as bytes,
        column_09 as user_agent
    from access_log
'''

write_text_to_file(filename, text)

!tb push pipes/access_log_transform.pipe

[0m** Processing pipes/access_log_transform.pipe[0m
[0m** Building dependencies[0m
[0m** Running access_log_transform [0m
[92m** => Test endpoint at https://api.tinybird.co/v0/pipes/access_log_transform.json[0m
[92m** 'access_log_transform' created[0m
[0m** Not pushing fixtures[0m


In [10]:
# Inspect one transformed row to check the extracted fields.
!tb sql "select * from access_log_transform limit 1" --stats

[0m** Query took 0.000828297 seconds
** Rows read: 2,004
** Bytes read: 472.33 KB[0m
---------------------------------
[1;32mip:[0m 1382418444
[1;32mtime:[0m 2021-01-19 11:48:24
[1;32mtt:[0m ['GET', '/', 'HTTP/1.1']
[1;32mmethod:[0m GET
[1;32mpath:[0m /
[1;32mprotocol:[0m HTTP/1.1
[1;32mstatus_code:[0m 301
[1;32mbytes:[0m 194
[1;32muser_agent:[0m Mozilla/5.0 zgrab/0.x
---------------------------------


In [11]:
filename="pipes/requests_per_endpoint.pipe"
text='''
DESCRIPTION requests per endpoint
NODE grouping
SQL >
    %
    SELECT
        ip,
        count() AS request_count,
        avg(bytes) as avg_bytes
    FROM access_log_transform
    GROUP BY ip
    ORDER BY request_count DESC

NODE endpoint
SQL >
    select IPv4NumToString(ip) as ip_address,
           request_count
    from grouping
'''

write_text_to_file(filename, text)

!tb push pipes/requests_per_endpoint.pipe

[0m** Processing pipes/requests_per_endpoint.pipe[0m
[0m** Building dependencies[0m
[0m** Running requests_per_endpoint [0m
[92m** => Test endpoint at https://api.tinybird.co/v0/pipes/requests_per_endpoint.json[0m
[92m** 'requests_per_endpoint' created[0m
[0m** Not pushing fixtures[0m


In [12]:
# Materialized-view pipe: the same per-ip aggregation as requests_per_endpoint, but it
# stores intermediate aggregation states (countState / avgState) in
# requests_per_endpoint_ds rather than final values.
filename="pipes/requests_per_endpoint_mv.pipe"
text='''
DESCRIPTION materialized view
NODE matview
SQL >
    SELECT
        ip,
        countState() AS request_count,
        avgState(bytes) as avg_bytes
    FROM access_log_transform
    GROUP BY ip

TYPE Materialized
DATASOURCE requests_per_endpoint_ds
'''

write_text_to_file(filename, text)

# Target Data Source for the materialized view. The AggregateFunction column types
# mirror the states produced above (bytes comes from column_07, an Int32), and the
# table is sorted by ip, the grouping key.
filename="datasources/requests_per_endpoint_ds.datasource"
text='''
DESCRIPTION materialized view
SCHEMA >
    ip UInt32,
    request_count AggregateFunction(count),
    avg_bytes AggregateFunction(avg, Int32)

ENGINE AggregatingMergeTree
ENGINE_SORTING_KEY ip
'''

write_text_to_file(filename, text)

In [13]:
# Push the materialized-view pipe along with its dependencies (--push-deps) and start a
# populate job (--populate) for the target Data Source.
!tb push pipes/requests_per_endpoint_mv.pipe --push-deps --populate

[0m** Processing pipes/requests_per_endpoint_mv.pipe[0m
[0m** Processing pipes/access_log_transform.pipe[0m
[0m** Processing datasources/access_log.datasource[0m
[0m** Processing datasources/requests_per_endpoint_ds.datasource[0m
[0m** Building dependencies[0m
[0m** Running requests_per_endpoint_ds [0m
[92m** 'requests_per_endpoint_ds' created[0m
[0m** Running requests_per_endpoint_mv [0m
[0m** Materialized node 'matview' used the Data Source 'requests_per_endpoint_ds'[0m
[0m** Populating job url https://api.tinybird.co/v0/jobs/d0c52be0-77a6-4b71-bdd8-4ff48ebde6ba[0m
[92m** 'requests_per_endpoint_mv' created[0m
[0m** Not pushing fixtures[0m


In [14]:
# Count the rows stored in the materialized Data Source.
!tb sql "select count() from requests_per_endpoint_ds" --stats

[0m** Query took 0.000207226 seconds
** Rows read: 1
** Bytes read: 4.1 KB[0m
-----------
| [1;32mcount()[0m |
-----------
|      23 |
-----------


In [15]:
# For comparison, count the rows the transform pipe has to read.
!tb sql "select count() from access_log_transform" --stats

[0m** Query took 0.00209141 seconds
** Rows read: 67,351
** Bytes read: 1.42 MB[0m
-----------
| [1;32mcount()[0m |
-----------
|   67351 |
-----------


In [16]:
filename="pipes/requests_per_endpoint_fast.pipe"
text='''
DESCRIPTION requests per endpoint fast
NODE grouping
SQL >
    %
    SELECT
        ip,
        countMerge(request_count) AS request_count,
        avgMerge(avg_bytes) as avg_bytes
    FROM requests_per_endpoint_ds
    GROUP BY ip
    ORDER BY request_count DESC

NODE endpoint
SQL >
    select IPv4NumToString(ip) as ip_address,
           request_count
    from grouping
'''

write_text_to_file(filename, text)

!tb push pipes/requests_per_endpoint_fast.pipe 

[0m** Processing pipes/requests_per_endpoint_fast.pipe[0m
[0m** Building dependencies[0m
[0m** Running requests_per_endpoint_fast [0m
[92m** => Test endpoint at https://api.tinybird.co/v0/pipes/requests_per_endpoint_fast.json[0m
[92m** 'requests_per_endpoint_fast' created[0m
[0m** Not pushing fixtures[0m


In [20]:
!curl https://api.tinybird.co/v0/pipes/requests_per_endpoint.json\?token\=$TOKEN > tmp
with open('tmp') as f:
  data = json.load(f)
data['statistics']

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  4931  100  4931    0     0   7889      0 --:--:-- --:--:-- --:--:--  7876


{'bytes_read': 1416760, 'elapsed': 0.004044706, 'rows_read': 67351}

In [22]:
!curl https://api.tinybird.co/v0/pipes/requests_per_endpoint_fast.json\?token\=$TOKEN > tmp
with open('tmp') as f:
  data = json.load(f)
data['statistics']

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  4925  100  4925    0     0   7969      0 --:--:-- --:--:-- --:--:--  7956


{'bytes_read': 9044, 'elapsed': 0.000796761, 'rows_read': 71}