## Exploring our parsed post and tag data (in DuckDB tables) and running SQL queries on it

In [1]:
from pathlib import Path
import duckdb
import pandas as pd
from data_pipeline.models import StackOverflowTag, StackOverflowPost

# Render floats with thousands separators and five decimal places
# (human-readable, no scientific notation)
pd.options.display.float_format = "{:,.5f}".format

In [4]:
# Open a connection to our persistent DuckDB database file.
# NOTE: the path is built from the kernel's current working directory,
# so it depends on where the notebook is launched from.
conn = duckdb.connect(
    database=Path.cwd().joinpath("../data/duckdb/stackoverflow_analysis.db")
)

In [5]:
# List the tables that are available in the connected database
tables_query = """
SHOW TABLES
"""
conn.sql(tables_query)

┌─────────────────────┐
│        name         │
│       varchar       │
├─────────────────────┤
│ stackoverflow_posts │
│ stackoverflow_tags  │
└─────────────────────┘

In [23]:
# Pull the net_votes column into a pandas DataFrame, then let pandas
# compute the summary statistics for it
net_votes_df = conn.sql(
    """
    SELECT net_votes
    FROM stackoverflow_posts
    """
).df()
net_votes_df.describe()

Unnamed: 0,net_votes
count,265000.0
mean,204.33037
std,382.2436
min,65.0
25%,82.0
50%,116.0
75%,196.0
max,34970.0


In [25]:
# Find the top 15 most popular tags.
# - Columns are listed explicitly so the result shape stays stable if the
#   table ever gains columns.
# - "count" is quoted because it is also the name of an SQL aggregate function.
# - tag_name is a secondary sort key so the ranking is deterministic when
#   two tags have the same count.
conn.sql(
    """
    SELECT tag_name, "count"
    FROM stackoverflow_tags
    ORDER BY "count" DESC, tag_name
    LIMIT 15
    """
)

┌────────────┬─────────┐
│  tag_name  │  count  │
│  varchar   │  int32  │
├────────────┼─────────┤
│ javascript │ 2528894 │
│ python     │ 2192438 │
│ java       │ 1917340 │
│ c#         │ 1615192 │
│ php        │ 1464496 │
│ android    │ 1417189 │
│ html       │ 1187348 │
│ jquery     │ 1034760 │
│ c++        │  806743 │
│ css        │  804268 │
│ ios        │  687246 │
│ sql        │  670860 │
│ mysql      │  661984 │
│ r          │  505710 │
│ reactjs    │  476740 │
├────────────┴─────────┤
│ 15 rows    2 columns │
└──────────────────────┘