### Enable debug logging

In [1]:
import logging

# basicConfig() does nothing if the root logger already has handlers (for
# example when this cell is re-run or logging was configured earlier), so the
# root level is also set explicitly to guarantee DEBUG messages are emitted.
logging.basicConfig(level=logging.DEBUG)
logging.getLogger().setLevel(logging.DEBUG)


### Initialise the Explorer with a connection to the Aircloak server

In [2]:

from explorer.connection import AircloakConnection

# Connect to the 'GiveMeSomeCredit' database.
aircloak = AircloakConnection(dbname='GiveMeSomeCredit')

# Sanity check: print the metadata reported for the loans.MonthlyIncome column.
monthly_income_info = aircloak.column_info('loans', 'MonthlyIncome')
print(monthly_income_info)

DEBUG:root:Connecting to Aircloak: user=daniel-613C7ADF4535BB56DBCD, host=attack.aircloak.com, port=9432, dbname=GiveMeSomeCredit
DEBUG:root:Sending query: 
        SHOW TABLES
    
DEBUG:root:Sending query: 
        SHOW COLUMNS FROM "loans"
    
ColumnInfo(type='integer', isolator='false', id=None)


In [3]:
from explorer.numeric_explorer import NumericColumnExplorer

# Build the explorer for loans.MonthlyIncome on the existing connection.
# Construction already issues queries (see the log below): overall statistics,
# per-value counts, and an estimate of the initial bucket size.
e = NumericColumnExplorer(
    aircloak_connection=aircloak,
    table='loans',
    column='MonthlyIncome',
)


DEBUG:root:Sending query: 
        SELECT
            min("MonthlyIncome")
        ,   max("MonthlyIncome")
        ,   avg("MonthlyIncome")
        ,   count(*)
        ,   count_noise(*)
        FROM "loans"
    
DEBUG:root:Sending query: 
        SELECT
            "MonthlyIncome"
        ,   count(*)
        FROM "loans"
        WHERE "MonthlyIncome" IS NOT NULL
        GROUP BY 1
    
DEBUG:root:Estimating bucket size for:
                range 57538,
                count 150000,
                num buckets 100,
                min bucket count 20
DEBUG:root:Precision bound: 575.38, Size bound: 7.671733333333333
DEBUG:root:Options are: [100, 200, 10, 50, 20, 500]
DEBUG:root:Returning bucket size 500


### Explore some buckets: the following will dig three levels deep in the bucket hierarchy

In [4]:
# Called without arguments; the query logged below groups by bucket sizes 50, 100 and 500.
e.explore()

DEBUG:root:Sending query: 
    SELECT
        bucket("MonthlyIncome" by 50) as "bucket_50"
, bucket("MonthlyIncome" by 100) as "bucket_100"
, bucket("MonthlyIncome" by 500) as "bucket_500"
    ,   count(*)
    ,   count_noise(*)
    ,   min("MonthlyIncome")
    ,   max("MonthlyIncome")
    ,   avg("MonthlyIncome")
    FROM "loans"
    WHERE "MonthlyIncome" IS NOT NULL
    GROUP BY GROUPING SETS (1, 2, 3)
    
DEBUG:root:Received query results, processing...
DEBUG:root:... finished processing query results.


### Dig a little deeper: explore one more level of the bucket hierarchy

In [5]:
# NOTE(review): the argument presumably limits exploration to one further level;
# the logged query below adds only bucket size 10. Confirm against NumericColumnExplorer.explore.
e.explore(1)

DEBUG:root:Sending query: 
    SELECT
        bucket("MonthlyIncome" by 10) as "bucket_10"
    ,   count(*)
    ,   count_noise(*)
    ,   min("MonthlyIncome")
    ,   max("MonthlyIncome")
    ,   avg("MonthlyIncome")
    FROM "loans"
    WHERE "MonthlyIncome" IS NOT NULL
    GROUP BY GROUPING SETS (1)
    
DEBUG:root:Received query results, processing...
DEBUG:root:... finished processing query results.


### Extract to pandas dataframe for analysis

In [6]:
import pandas as pd

# extract_to_dataframe() supplies the keyword arguments for the DataFrame
# constructor; index the frame by bucket size so one level can be selected with .loc.
df = pd.DataFrame(**e.extract_to_dataframe()).set_index('bucket_size')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13304 entries, 500 to 10
Data columns (total 6 columns):
lower_bound    13304 non-null float64
count          13304 non-null float64
count_noise    2633 non-null float64
min            2122 non-null float64
max            2122 non-null float64
avg            2633 non-null float64
dtypes: float64(6)
memory usage: 727.6 KB


### Plot column distribution as a bar chart

In [8]:
import plotly.express as px

# Bucket size (index level of df) whose rows are plotted.
bucket_size_to_plot = 500

# Select with a list so .loc returns a DataFrame even for a single match.
# A title and readable axis labels let the figure stand on its own.
px.bar(
    df.loc[[bucket_size_to_plot]],
    x='lower_bound',
    y='count',
    hover_data=['lower_bound', 'count', 'min', 'max', 'count_noise'],
    title=f'MonthlyIncome distribution (bucket size {bucket_size_to_plot})',
    labels={'lower_bound': 'Bucket lower bound', 'count': 'Row count'},
)

### Get a sum of the counts by bucket size

In [9]:
# Total the counts per bucket size, grouping on the frame's 'bucket_size' index level.
df.groupby(level='bucket_size')['count'].sum()

bucket_size
10     118677.0
50     119646.0
100    119847.0
500    120064.0
Name: count, dtype: float64