In [3]:
import duckdb
import altair as alt

# Open the collected CI statistics database. read_only=True means this
# notebook cannot modify the file it is analysing.
DB_PATH = "riot_ci_stats.duckdb"
con = duckdb.connect(DB_PATH, read_only=True)

In [4]:
# Show the column names and types of main.worker_stats in declaration order.
# (To list every table in the schema instead, query
#  information_schema.tables with table_schema = 'main'.)
schema_query = (
    "SELECT column_name, data_type FROM information_schema.columns "
    "WHERE table_name = 'worker_stats' AND table_schema = 'main' "
    "ORDER BY ordinal_position"
)
con.sql(schema_query).show()

┌────────────────────┬───────────┐
│    column_name     │ data_type │
│      varchar       │  varchar  │
├────────────────────┼───────────┤
│ id                 │ INTEGER   │
│ job_uid            │ VARCHAR   │
│ name               │ VARCHAR   │
│ tasks_count        │ INTEGER   │
│ tasks_failed_count │ INTEGER   │
│ tasks_passed_count │ INTEGER   │
│ runtime_avg_s      │ DOUBLE    │
│ runtime_max_s      │ DOUBLE    │
│ runtime_min_s      │ DOUBLE    │
│ total_cpu_time_s   │ DOUBLE    │
│ fetch_date         │ TIMESTAMP │
├────────────────────┴───────────┤
│ 11 rows              2 columns │
└────────────────────────────────┘



## Metadata

In [None]:
# Collect the headline figures from the database before printing anything.
# fetchone() hands back a single row tuple, hence the [0] lookups below.
job_count_sql = "SELECT COUNT(*) FROM jobs"
state_breakdown_sql = "SELECT state, COUNT(*) FROM jobs GROUP BY state"
task_total_sql = "SELECT SUM(tasks_count) FROM worker_stats"

ci_jobs_count = con.execute(job_count_sql).fetchone()
# Raw (state, count) rows; not printed in this cell, only kept as a variable.
ci_jobs_per_state = con.execute(state_breakdown_sql).fetchall()
total_tasks_executed = con.execute(task_total_sql).fetchone()

# Report: overall job count, per-state table (ascending by count), task total.
print(f"Number of ci jobs that have been collected so far: {ci_jobs_count[0]}")
print("number of jobs per state")
con.sql(
    "SELECT state, COUNT(*) as count FROM jobs GROUP BY state order by count"
).show()
print(f"total executed build and test tasks: {total_tasks_executed[0]}")

Number of ci jobs that have been collected so far: 491
┌─────────┬───────┐
│  state  │ count │
│ varchar │ int64 │
├─────────┼───────┤
│ queued  │     4 │
│ running │     4 │
│ NULL    │    22 │
│ stopped │    38 │
│ errored │   163 │
│ passed  │   260 │
└─────────┴───────┘

total executed build and test tasks: 16022991


## CI average runtime

In [4]:

# Task-weighted mean runtime per worker over all CI runs:
#     SUM(runtime_avg_s * tasks_count) / SUM(tasks_count)
# Every worker whose name starts with 'ZIH-HPC-node' is folded into a single
# 'ZIH-node' bucket.
# Rows are kept only when tasks_count is present and non-zero and
# runtime_avg_s is present. A row with a NULL runtime_avg_s would otherwise be
# skipped by the numerator's SUM while its tasks_count still inflates the
# denominator, biasing the average downwards.
query = """
SELECT
    if(name like 'ZIH-HPC-node%', 'ZIH-node', name) AS worker_name,
    SUM(runtime_avg_s * tasks_count) / SUM(tasks_count) AS "avg_runtime"
FROM worker_stats
WHERE tasks_count IS NOT NULL
  AND tasks_count != 0
  AND runtime_avg_s IS NOT NULL
GROUP BY worker_name
ORDER BY avg_runtime
"""
df_runtime = con.execute(query).fetch_df()

# Bar chart of the weighted average runtime per worker.
# sort=None makes Altair keep the DataFrame's row order (ascending avg_runtime
# from the query). Without it a discrete axis is sorted alphabetically, which
# silently discards the ORDER BY above.
chart = alt.Chart(df_runtime).mark_bar().encode(
    x=alt.X('worker_name:O', title='Worker Name', sort=None),
    y=alt.Y('avg_runtime:Q', title='Average Runtime'),
    tooltip=["avg_runtime"]
)
chart