# Converting leaderboard databases to polars dataframes

In [27]:
import polars as pl
import pandas as pd
import sqlite3
import matplotlib.pyplot as plt
import numpy as np
from IPython.core.display import Markdown

In [28]:
# Display the version of the Polars library that is imported in this kernel.
pl.__version__

'0.20.10'

In [29]:
# Schema of one element of the "materials" array inside the construction JSON.
materials_struct_dtype = pl.Struct([
    pl.Field("fulfilled", pl.Int64),
    pl.Field("required", pl.Int64),
    pl.Field("tradeSymbol", pl.Utf8),
])

# Schema of the complete construction JSON document (one struct per row).
json_struct_dtype = pl.Struct([
    pl.Field("isComplete", pl.Boolean),
    pl.Field("materials", pl.List(materials_struct_dtype)),
    pl.Field("symbol", pl.Utf8),
])

In [30]:
def read_dataframe(
    reset_date,
    database_dir="/Users/florian_witteler/programming/spacetraders/flwi-spacetraders/data/database",
):
    """Load all leaderboard rows of one reset from its SQLite file.

    Parameters
    ----------
    reset_date : str
        Reset identifier as it appears in the database file name,
        e.g. ``"2023_11_18"``.
    database_dir : str, optional
        Directory containing the
        ``flwi-spacetraders-leaderboard-reset_<reset_date>.db`` files.
        Defaults to the location that used to be hard-coded, so existing
        calls keep reading the same files.
        NOTE(review): only absolute directories have been exercised here;
        confirm the URI form before passing a relative path.

    Returns
    -------
    pl.DataFrame
        The joined crawl_run / leaderboard_entry / static_agent_detail rows
        with ``ts`` parsed to a datetime, ``construction_json`` decoded into a
        struct, a literal ``reset_date`` column, and the helper columns
        ``ts_string``, ``agent_id`` and ``crawl_run_id`` removed.
    """
    # With an absolute directory (leading "/") this yields "sqlite:///abs/path/<file>.db".
    connection_string = f"sqlite://{database_dir}/flwi-spacetraders-leaderboard-reset_{reset_date}.db"
    query = """
    SELECT datetime(timestamp) as ts_string
         , agent_id
         , crawl_run_id
         , credits
         , ship_count
         , construction_json
         , agent_symbol
         , headquarters_waypoint_symbol
         , jump_gate_waypoint_symbol
    from main.crawl_run cr
             join main.leaderboard_entry le on cr.id = le.crawl_run_id
             join main.static_agent_detail sad on le.agent_id = sad.id
    """
    raw_df = pl.read_database_uri(query=query, uri=connection_string)

    return (
        raw_df
        # SQLite's datetime() returns text; parse it into a proper datetime column.
        .with_columns(pl.col("ts_string").str.to_datetime('%Y-%m-%d %H:%M:%S').alias("ts"))
        # Tag every row with the reset it came from so the frames can be concatenated.
        .with_columns(pl.lit(reset_date).alias("reset_date"))
        # Decode the JSON text into a typed struct.
        .with_columns(pl.col("construction_json").str.json_decode(json_struct_dtype))
        .drop("ts_string", "agent_id", "crawl_run_id")
    )

## create dataframe from _all_ sqlite files

In [31]:
# Load every reset's SQLite file and stack the per-reset frames vertically.
df_all = pl.concat([
    read_dataframe(reset_date)
    for reset_date in (
        "2023_11_18",
        "2023_12_02",
        "2023_12_16",
        "2023_12_30",
        "2024_01_13",
        "2024_01_28",
        "2024_02_11",
        "2024_02_25",
        "2024_03_10",
        "2024_03_24",
        "2024_04_09",
        "2024_04_28",
    )
])

### notes
A round trip via parquet is necessary, since we otherwise encounter the bug shown below. It seems to be a problem with the concatenation of different dataframes:
```text
thread '<unnamed>' panicked at /Users/runner/work/polars/polars/crates/polars-arrow/src/chunk.rs:20:31:
called `Result::unwrap()` on an `Err` value: ComputeError(ErrString("Chunk require all its arrays to have an equal number of rows"))
```

In [32]:
# First half of the parquet round trip (see the notes above): persist the concatenated frame.
df_all.write_parquet("all_resets.parquet")

In [33]:
# Second half of the round trip: replace the concatenated frame with the copy read back from disk.
df_all = pl.read_parquet("all_resets.parquet")

In [34]:
# One row per reset: ISO-formatted reset date, the original "YYYY_MM_DD" string,
# the earliest timestamp seen in that reset, and a running reset_id in date order.
resets_df = (
    df_all
    .select(
        pl.col("reset_date").str.to_date("%Y_%m_%d").dt.strftime('%Y-%m-%d').alias("reset"),
        "reset_date",
        "ts",
    )
    .group_by("reset", "reset_date")
    .agg(first_ts=pl.col("ts").min())
    .sort("reset")
    .with_columns(pl.col("reset").cum_count().alias("reset_id"))
    .rename({"reset_date": "reset_date_str"})
)
resets_df.head(5)

reset,reset_date_str,first_ts,reset_id
str,str,datetime[μs],u32
"""2023-11-18""","""2023_11_18""",2023-11-19 12:30:00,1
"""2023-12-02""","""2023_12_02""",2023-12-02 19:10:00,2
"""2023-12-16""","""2023_12_16""",2023-12-16 20:25:00,3
"""2023-12-30""","""2023_12_30""",2023-12-30 17:15:00,4
"""2024-01-13""","""2024_01_13""",2024-01-13 17:25:00,5


In [35]:
# One row per distinct (reset, jump gate waypoint) pair, enriched with the
# reset's id and a running surrogate id for the construction site.
construction_site_df = (
    df_all
    .select(
        pl.col("reset_date").str.to_date("%Y_%m_%d").dt.strftime('%Y-%m-%d').alias("reset"),
        "jump_gate_waypoint_symbol",
    )
    .unique()
    .join(resets_df, on="reset")
    .with_columns(pl.col("reset").cum_count().alias("id"))
    .drop("first_ts")
)
construction_site_df.head(5)

reset,jump_gate_waypoint_symbol,reset_date_str,reset_id,id
str,str,str,u32,u32
"""2023-11-18""","""X1-ZX25-I61""","""2023_11_18""",1,1
"""2023-11-18""","""X1-PP26-I55""","""2023_11_18""",1,2
"""2023-11-18""","""X1-TX89-I56""","""2023_11_18""",1,3
"""2023-11-18""","""X1-ND32-I57""","""2023_11_18""",1,4
"""2023-12-02""","""X1-DM78-I58""","""2023_12_02""",2,5


In [36]:
# Required construction materials per reset (one row per reset and trade symbol).
construction_requirement_df = (
    df_all
    .group_by("reset_date")
    .agg(
        # There was an error in one of the resets with the advanced circuits which
        # was fixed during the reset, so the most recent snapshot is the one we want.
        # Order by ts explicitly: the SQL query has no ORDER BY, so relying on the
        # incoming row order for "last" is not guaranteed to pick the latest entry.
        pl.col("construction_json")
        .sort_by("ts")
        .last()
        .struct.field("materials")
        .alias("materials")
    )
    # One row per material, then spread the struct fields into columns.
    .explode("materials")
    .unnest("materials")
    .rename({"tradeSymbol": "trade_symbol"})
    .join(resets_df, left_on="reset_date", right_on="reset_date_str")
    .drop("fulfilled", "first_ts")
    # Running surrogate id for the requirement rows.
    .with_columns(id=pl.col("reset").cum_count())
)

construction_requirement_df.head(5)

reset_date,required,trade_symbol,reset,reset_id,id
str,i64,str,str,u32,u32
"""2023_12_02""",2000,"""FAB_MATS""","""2023-12-02""",2,1
"""2023_12_02""",800,"""ADVANCED_CIRCU…","""2023-12-02""",2,2
"""2023_12_02""",1,"""QUANTUM_STABIL…","""2023-12-02""",2,3
"""2023_11_18""",6000,"""FAB_MATS""","""2023-11-18""",1,4
"""2023_11_18""",500,"""ADVANCED_CIRCU…","""2023-11-18""",1,5


In [37]:
# One row per (reset, agent): headquarters, jump gate, first time the agent was
# seen, the reset id, the id of the agent's construction site and a running agent id.
static_agent_info_df = (
    df_all
    .group_by("reset_date", "agent_symbol")
    .agg(
        pl.col("headquarters_waypoint_symbol").first(),
        pl.col("jump_gate_waypoint_symbol").first(),
        pl.col("ts").min(),
    )
    .join(resets_df, left_on="reset_date", right_on="reset_date_str")
    .drop("first_ts")
    .join(
        construction_site_df.rename({"id": "construction_site_id"}),
        on=["reset_id", "jump_gate_waypoint_symbol"],
    )
    .with_columns(id=pl.col("reset").cum_count())
    .rename(
        {
            "ts": "query_time",
            "headquarters_waypoint_symbol": "agent_headquarters_waypoint_symbol",
        }
    )
    # "first_ts" was already removed above; naming it again here is silently
    # ignored by Polars 0.20 but raises ColumnNotFoundError in Polars >= 1.0.
    .drop("reset_date", "reset_date_str", "reset", "reset_right")
)
static_agent_info_df.head(5)

agent_symbol,agent_headquarters_waypoint_symbol,jump_gate_waypoint_symbol,query_time,reset_id,construction_site_id,id
str,str,str,datetime[μs],u32,u32,u32
"""TVRJ""","""X1-Y4-A1""","""X1-Y4-I60""",2024-04-10 11:25:00,11,130,1
"""FUZZYTESTS""","""X1-KM71-A1""","""X1-KM71-I60""",2023-12-02 19:39:59,2,211,2
"""TEST22""","""X1-KG76-A1""","""X1-KG76-I52""",2024-01-18 22:25:00,5,108,3
"""SIRTUTORIAL""","""X1-XJ17-A1""","""X1-XJ17-I54""",2024-02-03 19:55:00,6,321,4
"""SNAKEINSPACE""","""X1-XJ57-A1""","""X1-XJ57-I59""",2024-04-09 18:00:00,11,132,5


In [42]:
# One row per distinct (reset, crawl timestamp) with a running job id and the
# number of minutes elapsed since the first timestamp of that reset.
job_run_df = (
    df_all
    .select("reset_date", pl.col("ts").alias("query_time"))
    .unique()
    .with_columns(pl.col("reset_date").cum_count().alias("id"))
    .join(resets_df, left_on="reset_date", right_on="reset_date_str")
    .with_columns(
        (pl.col("query_time") - pl.col("first_ts")).dt.total_minutes().alias("event_time_minutes")
    )
)
job_run_df.head(5)

reset_date,query_time,id,reset,first_ts,reset_id,event_time_minutes
str,datetime[μs],u32,str,datetime[μs],u32,i64
"""2023_11_18""",2023-11-19 12:35:00,1,"""2023-11-18""",2023-11-19 12:30:00,1,5
"""2023_11_18""",2023-11-19 15:00:00,2,"""2023-11-18""",2023-11-19 12:30:00,1,150
"""2023_11_18""",2023-11-19 15:20:00,3,"""2023-11-18""",2023-11-19 12:30:00,1,170
"""2023_11_18""",2023-11-19 17:15:00,4,"""2023-11-18""",2023-11-19 12:30:00,1,285
"""2023_11_18""",2023-11-19 18:00:00,5,"""2023-11-18""",2023-11-19 12:30:00,1,330


In [43]:
# Fact table of agent observations: every leaderboard row, enriched with the
# reset id, the agent's static info (as agent_id) and the crawl job (as job_id).
agent_log_df = (
    df_all
    .select("ts", "reset_date", "agent_symbol", "credits", "ship_count", "construction_json")
    .join(resets_df, left_on="reset_date", right_on="reset_date_str", suffix="_reset")
    .drop("reset_date", "reset", "first_ts")
    .join(static_agent_info_df, on=["reset_id", "agent_symbol"], suffix="_static_agent_info")
    # Only the agent's first-seen query_time has to go here; "first_ts" was already
    # dropped above, and dropping a missing column raises in Polars >= 1.0.
    .drop("query_time")
    .rename({"id": "agent_id", "ts": "query_time"})
    .join(job_run_df, on=["reset_id", "query_time"], suffix="_static_agent_info")
    .rename({"id": "job_id"})
)

agent_log_df.head(5)

query_time,agent_symbol,credits,ship_count,construction_json,reset_id,agent_headquarters_waypoint_symbol,jump_gate_waypoint_symbol,construction_site_id,agent_id,reset_date,job_id,reset,first_ts,event_time_minutes
datetime[μs],str,i64,i64,struct[3],u32,str,str,u32,u32,str,u32,str,datetime[μs],i64
2023-11-19 12:30:00,"""XOYCHTE""",175000,2,"{false,[{0,6000,""FAB_MATS""}, {500,0,""ADVANCED_CIRCUITRY""}, {1,1,""QUANTUM_STABILIZERS""}],""X1-XZ50-I54""}",1,"""X1-XZ50-A1""","""X1-XZ50-I54""",48,605,"""2023_11_18""",38692,"""2023-11-18""",2023-11-19 12:30:00,0
2023-11-19 12:30:00,"""TVTVTG4""",175000,2,"{false,[{0,6000,""FAB_MATS""}, {500,0,""ADVANCED_CIRCUITRY""}, {1,1,""QUANTUM_STABILIZERS""}],""X1-PU57-I59""}",1,"""X1-PU57-A1""","""X1-PU57-I59""",208,326,"""2023_11_18""",38692,"""2023-11-18""",2023-11-19 12:30:00,0
2023-11-19 12:30:00,"""BLABLABLA""",175749,2,"{false,[{0,6000,""FAB_MATS""}, {500,0,""ADVANCED_CIRCUITRY""}, {1,1,""QUANTUM_STABILIZERS""}],""X1-FV87-I58""}",1,"""X1-FV87-A1""","""X1-FV87-I58""",176,32,"""2023_11_18""",38692,"""2023-11-18""",2023-11-19 12:30:00,0
2023-11-19 12:30:00,"""BOOZE2""",175000,2,"{false,[{0,6000,""FAB_MATS""}, {500,0,""ADVANCED_CIRCUITRY""}, {1,1,""QUANTUM_STABILIZERS""}],""X1-SK36-I63""}",1,"""X1-SK36-A1""","""X1-SK36-I63""",136,562,"""2023_11_18""",38692,"""2023-11-18""",2023-11-19 12:30:00,0
2023-11-19 12:30:00,"""ESEMUSA""",175600,2,"{false,[{0,6000,""FAB_MATS""}, {500,0,""ADVANCED_CIRCUITRY""}, {1,1,""QUANTUM_STABILIZERS""}],""X1-XZ50-I54""}",1,"""X1-XZ50-A1""","""X1-XZ50-I54""",48,350,"""2023_11_18""",38692,"""2023-11-18""",2023-11-19 12:30:00,0


In [44]:
# Per agent observation: completion flag and material list of the agent's jump
# gate construction, together with the matching construction site row.
construction_details_df = (
    agent_log_df
    .select(
        "jump_gate_waypoint_symbol",
        pl.col("construction_json").struct.field("isComplete").alias("is_complete"),
        pl.col("construction_json").struct.field("materials"),
        "job_id",
        "reset_id",
    )
    .join(construction_site_df, on=['reset_id', 'jump_gate_waypoint_symbol'])
    .rename({"id": "construction_site_id"})
)

construction_details_df.head(5)

jump_gate_waypoint_symbol,is_complete,materials,job_id,reset_id,reset,reset_date_str,construction_site_id
str,bool,list[struct[3]],u32,u32,str,str,u32
"""X1-XZ50-I54""",False,"[{0,6000,""FAB_MATS""}, {500,0,""ADVANCED_CIRCUITRY""}, {1,1,""QUANTUM_STABILIZERS""}]",38692,1,"""2023-11-18""","""2023_11_18""",48
"""X1-PU57-I59""",False,"[{0,6000,""FAB_MATS""}, {500,0,""ADVANCED_CIRCUITRY""}, {1,1,""QUANTUM_STABILIZERS""}]",38692,1,"""2023-11-18""","""2023_11_18""",208
"""X1-FV87-I58""",False,"[{0,6000,""FAB_MATS""}, {500,0,""ADVANCED_CIRCUITRY""}, {1,1,""QUANTUM_STABILIZERS""}]",38692,1,"""2023-11-18""","""2023_11_18""",176
"""X1-SK36-I63""",False,"[{0,6000,""FAB_MATS""}, {500,0,""ADVANCED_CIRCUITRY""}, {1,1,""QUANTUM_STABILIZERS""}]",38692,1,"""2023-11-18""","""2023_11_18""",136
"""X1-XZ50-I54""",False,"[{0,6000,""FAB_MATS""}, {500,0,""ADVANCED_CIRCUITRY""}, {1,1,""QUANTUM_STABILIZERS""}]",38692,1,"""2023-11-18""","""2023_11_18""",48


In [45]:
# Distinct construction observations (materials removed) with a running id.
construction_log_df = (
    construction_details_df
    .drop("materials")
    .unique()
    .with_columns(pl.col("reset").cum_count().alias("id"))
)
construction_log_df.head(5)

jump_gate_waypoint_symbol,is_complete,job_id,reset_id,reset,reset_date_str,construction_site_id,id
str,bool,u32,u32,str,str,u32,u32
"""X1-SK36-I63""",False,38692,1,"""2023-11-18""","""2023_11_18""",136,1
"""X1-RK11-I57""",False,38692,1,"""2023-11-18""","""2023_11_18""",347,2
"""X1-PR4-I57""",False,38692,1,"""2023-11-18""","""2023_11_18""",94,3
"""X1-ZX25-I61""",False,38692,1,"""2023-11-18""","""2023_11_18""",1,4
"""X1-BA76-I65""",False,1,1,"""2023-11-18""","""2023_11_18""",348,5


In [46]:
# Flatten the materials list to one row per material and attach the ids of the
# matching construction_log and construction_requirement rows.
construction_material_log_df = (
    construction_details_df
    .explode("materials")
    .unnest("materials")
    .unique()
    .join(construction_log_df, on=["construction_site_id", "job_id"])
    .rename({"id": "construction_log_id", "tradeSymbol": "trade_symbol"})
    .join(
        construction_requirement_df,
        on=["reset_id", "trade_symbol"],
        suffix="_construction_requirement",
    )
    .rename({"id": "construction_requirement_id"})
)
construction_material_log_df.head(5)

jump_gate_waypoint_symbol,is_complete,fulfilled,required,trade_symbol,job_id,reset_id,reset,reset_date_str,construction_site_id,jump_gate_waypoint_symbol_right,is_complete_right,reset_id_right,reset_right,reset_date_str_right,construction_log_id,reset_date,required_construction_requirement,reset_construction_requirement,construction_requirement_id
str,bool,i64,i64,str,u32,u32,str,str,u32,str,bool,u32,str,str,u32,str,i64,str,u32
"""X1-XZ50-I54""",False,500,0,"""ADVANCED_CIRCU…",38692,1,"""2023-11-18""","""2023_11_18""",48,"""X1-XZ50-I54""",False,1,"""2023-11-18""","""2023_11_18""",906256,"""2023_11_18""",500,"""2023-11-18""",5
"""X1-PU57-I59""",False,0,6000,"""FAB_MATS""",38692,1,"""2023-11-18""","""2023_11_18""",208,"""X1-PU57-I59""",False,1,"""2023-11-18""","""2023_11_18""",151256,"""2023_11_18""",6000,"""2023-11-18""",4
"""X1-SK36-I63""",False,500,0,"""ADVANCED_CIRCU…",38692,1,"""2023-11-18""","""2023_11_18""",136,"""X1-SK36-I63""",False,1,"""2023-11-18""","""2023_11_18""",1,"""2023_11_18""",500,"""2023-11-18""",5
"""X1-NB26-I59""",False,500,0,"""ADVANCED_CIRCU…",38692,1,"""2023-11-18""","""2023_11_18""",137,"""X1-NB26-I59""",False,1,"""2023-11-18""","""2023_11_18""",604064,"""2023_11_18""",500,"""2023-11-18""",5
"""X1-RK11-I57""",False,0,0,"""ADVANCED_CIRCU…",38692,1,"""2023-11-18""","""2023_11_18""",347,"""X1-RK11-I57""",False,1,"""2023-11-18""","""2023_11_18""",2,"""2023_11_18""",500,"""2023-11-18""",5


In [1]:
# Target SQLite database file that the write_database cells below append to.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
db_path = "/Users/florian_witteler/programming/rust/flwi-spacetraders-leaderboard/data/flwi-leaderboard.db"
# Connection URI built from db_path, passed as `connection` to write_database.
db_connection_string = f"sqlite:///{db_path}"

In [48]:
# Append the resets (without the helper column reset_date_str) to table "reset_date".
resets_df.drop("reset_date_str").write_database(
    table_name="reset_date",
    connection=db_connection_string,
    if_table_exists='append',
)

12

In [49]:
# Append the construction sites to table "construction_site".
# construction_site_df no longer has a "first_ts" column, so it is not listed here:
# dropping a missing column raises ColumnNotFoundError in Polars >= 1.0.
construction_site_df.drop("reset", "reset_date_str").write_database(
    table_name="construction_site",
    connection=db_connection_string,
    if_table_exists='append',
)

390

In [50]:
# Append the construction requirements to table "construction_requirement".
# construction_requirement_df no longer has a "first_ts" column, so it is not listed
# here: dropping a missing column raises ColumnNotFoundError in Polars >= 1.0.
construction_requirement_df.drop("reset", "reset_date").write_database(
    table_name="construction_requirement",
    connection=db_connection_string,
    if_table_exists='append',
)

36

In [51]:
# Append the static agent info to table "static_agent_info".
# Only "jump_gate_waypoint_symbol" still exists on static_agent_info_df; the
# previously listed "reset_date_str" and "first_ts" are already gone, and dropping
# missing columns raises ColumnNotFoundError in Polars >= 1.0.
static_agent_info_df.drop(
    "jump_gate_waypoint_symbol"
).with_columns(
    # The source data carries no faction; an empty string is written instead.
    starting_faction = pl.lit("")
).write_database(
    table_name="static_agent_info", connection=db_connection_string, if_table_exists='append'
)

605

In [54]:
# Append the crawl jobs (helper columns removed) to table "job_run".
job_run_df.drop("reset_date", "reset", "first_ts").write_database(
    table_name="job_run",
    connection=db_connection_string,
    if_table_exists='append',
)

48287

In [55]:
# Append the agent observations (ids and measured values only) to table "agent_log".
agent_log_df.select("agent_id", "job_id", "credits", "ship_count").write_database(
    table_name="agent_log",
    connection=db_connection_string,
    if_table_exists='append',
)

2242894

In [56]:
# Append the construction observations (helper columns removed) to table "construction_log".
construction_log_df.drop(
    "reset_date_str", "reset", "jump_gate_waypoint_symbol", "reset_id"
).write_database(
    table_name="construction_log",
    connection=db_connection_string,
    if_table_exists='append',
)

1510469

In [57]:
# Append the per-material fulfilment rows to table "construction_material_log".
construction_material_log_df.select(
    "construction_log_id", "construction_requirement_id", "fulfilled"
).write_database(
    table_name="construction_material_log",
    connection=db_connection_string,
    if_table_exists='append',
)

4531474

## execute `ANALYZE` to make use of all the indices

In [3]:
import sqlite3

# Run ANALYZE against the target database. The connection is closed in a
# `finally` block so the file handle is released even if the statement fails.
con = sqlite3.connect(db_path)
try:
    # Connection.execute creates the cursor implicitly; no separate cursor is needed.
    con.execute("ANALYZE")
finally:
    con.close()