In [72]:
import pandas as pd
import duckdb
import sklearn


In [None]:
# Open a connection to the persistent, file-backed DuckDB database.
# Workflow note: the database file was/should be created outside this notebook
# and then built into the Docker container. That keeps the raw database files
# on local disk without the file size ballooning.
con = duckdb.connect(database="files/titanic.duckdb")


In [86]:
# Sanity check: list the tables visible through this connection, using the
# duckdb_tables() metadata function, to confirm the expected ones are present.
con.sql(
    "SELECT * FROM duckdb_tables()"
)

┌───────────────┬──────────────┬─────────────┬────────────┬────────────┬───────────┬─────────┬───────────────────────┬──────────┬───────────┬─────────────────┬────────────────┬──────────────┬─────────────┬────────────────────────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
│ database_name │ database_oid │ schema_name │ schema_oid │ table_name │ table_oid │ comment │         tags          │ internal │ temporary │ has_primary_key │ estimated_size │ column_count │ index_count │ check_constraint_count │                                                                               sql                                                                                │
│    varchar    │    int64     │   varchar   │   int64    │  varchar   │   int64   │ varchar │ map(varchar, varchar) │ boolean  │  boolean  │     boolean     │     int64      │    int64     │    int64    │       

In [None]:
# (Re)build train_raw from the CSV so the cleaning steps below can be replayed
# from an untouched copy of the data.
# CREATE OR REPLACE swaps the table in a single statement and, unlike a bare
# DROP TABLE, does not raise when train_raw does not exist yet (fresh database).
con.sql("CREATE OR REPLACE TABLE train_raw AS SELECT * FROM 'files/train.csv'")

In [84]:
# Profile every column of the training table (type, min/max, quartiles,
# count, null percentage).
# To profile the test table instead, run: con.sql("SUMMARIZE test_raw")
con.sql(
    "SUMMARIZE train_raw"
)

┌─────────────┬─────────────┬─────────────────────┬─────────────────────────────┬───────────────┬─────────────────────┬─────────────────────┬─────────┬───────────────────┬───────────────────┬───────┬─────────────────┐
│ column_name │ column_type │         min         │             max             │ approx_unique │         avg         │         std         │   q25   │        q50        │        q75        │ count │ null_percentage │
│   varchar   │   varchar   │       varchar       │           varchar           │     int64     │       varchar       │       varchar       │ varchar │      varchar      │      varchar      │ int64 │  decimal(9,2)   │
├─────────────┼─────────────┼─────────────────────┼─────────────────────────────┼───────────────┼─────────────────────┼─────────────────────┼─────────┼───────────────────┼───────────────────┼───────┼─────────────────┤
│ PassengerId │ BIGINT      │ 1                   │ 891                         │          1051 │ 446.0               │ 257.3538

In [None]:
# Examine the rows with NULLs in the Age, Cabin, and Embarked columns.
# A notebook cell only renders its final expression, so each relation is printed
# explicitly with .show(); without it the Age and Cabin results are never displayed.

# Age: seems to make the most sense to use the average age here.
con.sql("SELECT * FROM train_raw WHERE Age IS NULL").show()

# Cabin: seems likely cabins were not as strictly recorded for lower-class
# guests; probably unnecessary for the model.
con.sql("SELECT * FROM train_raw WHERE Cabin IS NULL").show()

# Embarked: only 2 records, and it's unclear if they made it on board in the
# first place; not a high enough percentage of 1st-class survivors to consider keeping.
con.sql("SELECT * FROM train_raw WHERE Embarked IS NULL").show()

In [None]:
# Impute missing ages in the training table: every row whose Age is NULL is set
# to the mean of the non-NULL Age values in train_raw (the scalar subquery below).
# This rewrites train_raw in place; re-run the cell that recreates train_raw from
# the CSV to get the unmodified data back.
# "train_clean" is only an alias for the UPDATE target -- no new table is created.
con.sql("""UPDATE train_raw AS train_clean
        SET Age = (
            SELECT
                avg(raw.Age) AS cleanAge
            FROM train_raw as raw
            WHERE raw.Age IS NOT NULL
        )
        WHERE Age IS NULL""")

In [78]:
# Impute missing ages in the test table: every row whose Age is NULL is set to
# the mean Age of the *training* table.
# The fill value is taken from train_raw (not test_raw) so that both tables get
# the same imputation and no statistic is derived from the test data itself.
# avg() skips NULLs, so the result is the same whether or not train_raw has
# already been imputed with its own mean.
# This rewrites test_raw in place.
con.sql("""UPDATE test_raw
        SET Age = (
            SELECT
                avg(train.Age)
            FROM train_raw AS train
        )
        WHERE Age IS NULL""")

In [None]:
# Remove the Cabin, Embarked, and Fare columns from the TRAINING table.
# This cell previously targeted test_raw, duplicating the test-table cell that
# follows it: train_raw was never trimmed and the second cell failed on columns
# that were already gone.
# IF EXISTS keeps the cell re-runnable after the columns have been dropped.
con.sql("ALTER TABLE train_raw DROP COLUMN IF EXISTS Cabin")
con.sql("ALTER TABLE train_raw DROP COLUMN IF EXISTS Embarked")
# Fare is dropped because there are nulls in the test file; removing it here
# keeps the training columns in line with the test table.
con.sql("ALTER TABLE train_raw DROP COLUMN IF EXISTS Fare")

In [79]:
# Remove the Cabin, Embarked, and Fare columns from the TEST table.
# IF EXISTS makes the cell safe to re-run: a plain DROP raises once the column
# has already been removed by an earlier execution.
con.sql("ALTER TABLE test_raw DROP COLUMN IF EXISTS Cabin")
con.sql("ALTER TABLE test_raw DROP COLUMN IF EXISTS Embarked")
# Fare is dropped because there are nulls in the test file.
con.sql("ALTER TABLE test_raw DROP COLUMN IF EXISTS Fare")

In [None]:
# Materialize both cleaned tables as pandas DataFrames; sklearn is used from
# here on and needs the training and the test set.
train, test = (
    con.sql(query).df()
    for query in ("SELECT * FROM train_raw", "SELECT * FROM test_raw")
)

Variable   Type                  Data/Info
------------------------------------------
con        DuckDBPyConnection    <duckdb.duckdb.DuckDBPyCo<...>object at 0x7fe171135cf0>
duckdb     module                <module 'duckdb' from '/u<...>ages/duckdb/__init__.py'>
pd         module                <module 'pandas' from '/u<...>ages/pandas/__init__.py'>
sklearn    module                <module 'sklearn' from '/<...>ges/sklearn/__init__.py'>
test       DataFrame                  PassengerId  Pclass <...>n\n[418 rows x 8 columns]
train      DataFrame                  PassengerId  Survive<...>n\n[891 rows x 9 columns]
