In [22]:
import duckdb

from utils import  wrap_to_string

# Open a DuckDB connection to the local database file "dataset.db".
# The query cells in this notebook run their SQL through `conn`.
conn = duckdb.connect("dataset.db")

# Structural Investigation

Let's look at the general structure of the dataset: how many columns and rows it has, and which data types they use.

In [2]:
# Number of columns in the `dataset` table, read from the catalog view.
query = (
    "SELECT COUNT(column_name) "
    "FROM information_schema.columns "
    "WHERE table_name='dataset'"
)
conn.sql(query).show()

┌────────────────────┐
│ count(column_name) │
│       int64        │
├────────────────────┤
│                 67 │
└────────────────────┘



In [3]:
# Total number of rows in the `dataset` table.
query = (
    "SELECT COUNT(*) "
    "FROM dataset"
)
conn.sql(query).show()

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│       363243 │
└──────────────┘



Count how many columns of each data type the dataset contains.

In [4]:
# Count the columns of each data type in the `dataset` table.
# The table_name filter keeps the catalog lookup scoped to `dataset`; without
# it, columns of every other table or view in the database would be counted
# as well, which is inconsistent with the column-count query above.
query = """
SELECT data_type, COUNT(*)
FROM information_schema.columns
WHERE table_name = 'dataset'
GROUP BY data_type
"""
conn.sql(query).show()

┌───────────┬──────────────┐
│ data_type │ count_star() │
│  varchar  │    int64     │
├───────────┼──────────────┤
│ VARCHAR   │            6 │
│ BIGINT    │            6 │
│ DOUBLE    │           55 │
└───────────┴──────────────┘



## 1.1. Structure of non-numerical features

In [23]:
# List the numeric (BIGINT / DOUBLE) columns of the `dataset` table.
# The table_name filter prevents columns of other tables or views in the
# database from showing up in the list.
query = """
SELECT column_name
FROM information_schema.columns
WHERE table_name = 'dataset'
  AND data_type IN ('BIGINT', 'DOUBLE')
"""
conn.sql(query).show()

┌────────────────────────────────────┐
│            column_name             │
│              varchar               │
├────────────────────────────────────┤
│ Vehicle_Reference_df_res           │
│ Vehicle_Reference_df               │
│ Casualty_Reference                 │
│ Casualty_Class                     │
│ Casualty_Severity                  │
│ Casualty_Type                      │
│ Vehicle_Type                       │
│ Towing_and_Articulation            │
│ Vehicle_Manoeuvre                  │
│ Vehicle_Location-Restricted_Lane   │
│        ·                           │
│        ·                           │
│        ·                           │
│ Sex_of_Casualty                    │
│ Age_of_Casualty                    │
│ Age_Band_of_Casualty               │
│ Pedestrian_Location                │
│ Pedestrian_Movement                │
│ Car_Passenger                      │
│ Bus_or_Coach_Passenger             │
│ Pedestrian_Road_Maintenance_Worker │
│ Casualty_Home_Area_Type

We have all the necessary column names from the above table. Unfortunately, all my attempts to transform the above results into columns result in catastrophically bad-looking and unintuitive code. So we'll tamely switch to Python for a moment.

In [29]:
# Step 1: fetch the names of the numeric (BIGINT / DOUBLE) columns.
# The table_name filter matters here: a column name belonging to any other
# table would otherwise end up in the SELECT list below and make the
# SUMMARIZE query fail, because that column does not exist in `dataset`.
query = """
SELECT column_name
FROM information_schema.columns
WHERE table_name = 'dataset'
  AND data_type IN ('BIGINT', 'DOUBLE')
"""
df = conn.sql(query).df()
# wrap_to_string comes from the local utils module; presumably it turns the
# column_name frame into a comma-separated, SQL-safe column list -- verify
# against utils.py.
numeric_columns = wrap_to_string(df)

# Step 2: DuckDB has a nice SUMMARIZE command that reports per-column
# statistics (min, max, quantiles, count, null percentage, ...).
query = f"""
SUMMARIZE SELECT {numeric_columns} FROM dataset
"""
conn.sql(query).show()

┌──────────────────────┬─────────────┬─────────┬─────────┬───┬───────────────────┬────────┬─────────────────┐
│     column_name      │ column_type │   min   │   max   │ … │        q75        │ count  │ null_percentage │
│       varchar        │   varchar   │ varchar │ varchar │   │      varchar      │ int64  │     varchar     │
├──────────────────────┼─────────────┼─────────┼─────────┼───┼───────────────────┼────────┼─────────────────┤
│ Vehicle_Reference_…  │ BIGINT      │ 1       │ 37      │ … │ 2                 │ 363243 │ 0.0%            │
│ Vehicle_Reference_df │ BIGINT      │ 1       │ 32      │ … │ 2                 │ 363243 │ 0.0%            │
│ Casualty_Reference   │ BIGINT      │ 1       │ 38      │ … │ 2                 │ 363243 │ 0.0%            │
│ Casualty_Class       │ BIGINT      │ 1       │ 3       │ … │ 2                 │ 363243 │ 0.0%            │
│ Casualty_Severity    │ BIGINT      │ 1       │ 3       │ … │ 3                 │ 363243 │ 0.0%            │
│ Casualty

Let's do the same for the non-numeric variables.

In [30]:
# Step 1: fetch the names of the non-numeric columns (everything that is
# neither BIGINT nor DOUBLE).
# The table_name filter matters here: a column name belonging to any other
# table would otherwise end up in the SELECT list below and make the
# SUMMARIZE query fail, because that column does not exist in `dataset`.
query = """
SELECT column_name
FROM information_schema.columns
WHERE table_name = 'dataset'
  AND data_type NOT IN ('BIGINT', 'DOUBLE')
"""
df = conn.sql(query).df()
# wrap_to_string comes from the local utils module; presumably it turns the
# column_name frame into a comma-separated, SQL-safe column list -- verify
# against utils.py.
non_numeric_columns = wrap_to_string(df)

# Step 2: DuckDB has a nice SUMMARIZE command that reports per-column
# statistics (min, max, count, null percentage, ...).
query = f"""
SUMMARIZE SELECT {non_numeric_columns} FROM dataset
"""
conn.sql(query).show()

┌──────────────────────┬─────────────┬───────────────┬───────────────┬───┬───────┬───────┬────────┬─────────────────┐
│     column_name      │ column_type │      min      │      max      │ … │  q50  │  q75  │ count  │ null_percentage │
│       varchar        │   varchar   │    varchar    │    varchar    │   │ int32 │ int32 │ int64  │     varchar     │
├──────────────────────┼─────────────┼───────────────┼───────────────┼───┼───────┼───────┼────────┼─────────────────┤
│ Accident_Index       │ VARCHAR     │ 201501BS70001 │ 2015984141415 │ … │  NULL │  NULL │ 363243 │ 0.0%            │
│ Sex_of_Driver        │ VARCHAR     │ 1.0           │ 3.0           │ … │  NULL │  NULL │ 363243 │ 0.0%            │
│ Date                 │ VARCHAR     │ 01/01/2015    │ 31/12/2015    │ … │  NULL │  NULL │ 363243 │ 11.94%          │
│ Time                 │ VARCHAR     │ 00:01         │ 23:59         │ … │  NULL │  NULL │ 363243 │ 11.95%          │
│ Local_Authority_(H…  │ VARCHAR     │ E06000001     │ W