## Exploring our parsed tag data (in a parquet file) and running SQL queries on it

In [4]:
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import duckdb

# Render floats with thousands separators and five fixed decimals (no scientific notation)
pd.options.display.float_format = "{:,.5f}".format

In [5]:
# Open (or create) the persistent DuckDB database file used for this analysis
db_connection = duckdb.connect("../data/duckdb/stackoverflow_analysis.db")

# Create the tags table on first run; IF NOT EXISTS keeps this cell safe to re-run.
# The column list deliberately has no comma after the last column (standard SQL).
db_connection.execute(
    """
    CREATE TABLE IF NOT EXISTS tags (
        id INTEGER PRIMARY KEY,
        tag_name VARCHAR UNIQUE NOT NULL,
        count INTEGER NOT NULL
    )
    """
)

# Add an explicit index for lookups by tag name.
# NOTE(review): DuckDB appears to build an index for UNIQUE columns on its own,
# so this one may be redundant — confirm before relying on it.
# The trailing `;` stops the notebook from echoing the connection object's repr.
db_connection.execute("CREATE INDEX IF NOT EXISTS idx_tag_name ON tags (tag_name)");

<_duckdb.DuckDBPyConnection at 0x10c19f5b0>

In [6]:
# Load the persistent `tags` table into a pandas DataFrame for inspection
select_all_tags = "SELECT * FROM tags"
db_connection.execute(select_all_tags).df()

Unnamed: 0,id,tag_name,count


In [9]:
# Read the tags parquet file straight from disk with DuckDB
parquet_file_path = "../data/parquet_data/stackoverflow_tags.parquet"

tags_relation = duckdb.sql(f"SELECT * FROM '{parquet_file_path}'")
df = tags_relation.df()

# Preview the first ten rows
df.head(10)

Unnamed: 0,id,tag_name,count
0,2684,enumeration,1904
1,51452,filecontentresult,71
2,88825,speedtracer,3
3,23343,emacsw32,15
4,80478,apache-commons-net,415
5,113013,bungeecord,46
6,137118,spill-range,42
7,93069,cgkit,5
8,101861,simple-form-for,185
9,26227,google-website-optimizer,64


In [10]:
# Summary statistics for the per-tag `count` column
tag_counts = df["count"]
tag_counts.describe()

count      65,675.00000
mean        1,092.43424
std        21,823.98768
min             0.00000
25%            11.00000
50%            41.00000
75%           174.00000
max     2,528,894.00000
Name: count, dtype: float64

In [11]:
# Find the 100 most popular tags.
# The query reads `df`, the pandas DataFrame loaded from the parquet file, by name.
# tag_name is a secondary sort key so tags with equal counts always come back in
# the same order, which keeps the LIMIT cut-off reproducible across runs.
duckdb.sql(
    """
SELECT *
FROM df
ORDER BY
    df.count DESC,
    df.tag_name ASC
LIMIT 100
"""
)

┌───────┬──────────────────┬─────────┐
│  id   │     tag_name     │  count  │
│ int64 │     varchar      │  int64  │
├───────┼──────────────────┼─────────┤
│     3 │ javascript       │ 2528894 │
│    16 │ python           │ 2192438 │
│    17 │ java             │ 1917340 │
│     9 │ c#               │ 1615192 │
│     5 │ php              │ 1464496 │
│  1386 │ android          │ 1417189 │
│     2 │ html             │ 1187348 │
│   820 │ jquery           │ 1034760 │
│    10 │ c++              │  806743 │
│     4 │ css              │  804268 │
│     · │  ·               │     ·   │
│     · │  ·               │     ·   │
│     · │  ·               │     ·   │
│    80 │ apache           │   92455 │
│  2535 │ entity-framework │   91793 │
│ 91905 │ android-studio   │   90547 │
│    73 │ csv              │   90258 │
│ 41127 │ maven            │   88646 │
│    28 │ linq             │   86482 │
│  1834 │ dictionary       │   86052 │
│  1158 │ qt               │   86010 │
│  2218 │ facebook       

Experimenting with the `xmltodict` library for working with XML data in Python.

In [13]:
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import Element
import xmltodict
from pprint import pp

# Inline sample document: a <posts> root holding eight <row> elements whose data
# lives entirely in XML attributes. Rows with PostTypeId="1" carry Title/Tags/AnswerCount;
# rows with PostTypeId="2" carry ParentId instead.
# NOTE(review): presumably copied from the Stack Overflow data dump's Posts.xml — confirm.
example_posts_xml = """
<posts>
  <row Id="38779" PostTypeId="1" AcceptedAnswerId="40472" CreationDate="2008-09-02T03:41:06.880" Score="6" ViewCount="6282" Body="&lt;p&gt;I have a wcf application hosted in a windows service running a local windows account. Do I need to set an SPN for this account? If so, what's the protocol the SPN needs to be set under? I know how to do this for services over HTTP, but have never done it for net.tcp.&lt;/p&gt;&#xA;" OwnerUserId="781" OwnerDisplayName="Esteban" LastEditorUserId="1116" LastEditorDisplayName="John Nolan" LastEditDate="2008-09-02T08:49:19.323" LastActivityDate="2013-06-24T17:03:55.833" Title="What SPN do I need to set for a net.tcp service?" Tags="|wcf|security|spn|" AnswerCount="2" CommentCount="0" FavoriteCount="0" ContentLicense="CC BY-SA 2.5" />
  <row Id="38781" PostTypeId="2" ParentId="23930" CreationDate="2008-09-02T03:44:26.013" Score="3" Body="&lt;p&gt;Agda 2: Functional, dependently typed.&lt;/p&gt;&#xA;&#xA;&lt;pre&gt;&lt;code&gt;data Nat = zero | suc (m::Nat)&#xA;&#xA;add (m::Nat) (n::Nat) :: Nat&#xA; = case m of&#xA;     (zero ) -&amp;gt; n&#xA;     (suc p) -&amp;gt; suc (add p n)&#xA;&#xA;mul (m::Nat) (n::Nat)::Nat&#xA;   = case m of&#xA;      (zero ) -&amp;gt; zero&#xA;      (suc p) -&amp;gt; add n (mul p n)&#xA;&#xA;factorial (n::Nat)::Nat &#xA; = case n of&#xA;    (zero ) -&amp;gt; suc zero&#xA;    (suc p) -&amp;gt; mul n (factorial p)&#xA;&lt;/code&gt;&lt;/pre&gt;&#xA;" OwnerUserId="3434" LastEditorUserId="3434" LastEditorDisplayName="Apocalisp" LastEditDate="2008-09-18T22:09:07.470" LastActivityDate="2008-09-18T22:09:07.470" CommentCount="0" CommunityOwnedDate="2008-09-19T07:14:59.840" ContentLicense="CC BY-SA 2.5" />
  <row Id="38784" PostTypeId="1" AcceptedAnswerId="41285" CreationDate="2008-09-02T03:49:17.920" Score="7" ViewCount="3474" Body="&lt;p&gt;I use &lt;strong&gt;Delphi&lt;/strong&gt; for many years, and although I have now moved on to Visual Studio I still fondly remember numbered bookmarks (&lt;kbd&gt;CTRL&lt;/kbd&gt;+&lt;kbd&gt;K&lt;/kbd&gt;+&lt;kbd&gt;1&lt;/kbd&gt; to set bookmark 1, &lt;kbd&gt;CTRL&lt;/kbd&gt;+&lt;kbd&gt;Q&lt;/kbd&gt;+&lt;kbd&gt;1&lt;/kbd&gt; to goto bookmark 1).&lt;/p&gt;&#xA;&#xA;&lt;p&gt;Is there a &lt;strong&gt;Visual Studio&lt;/strong&gt; equivalent? I'm find the dumb bookmarks in VS a chore after Delphi. I want to bookmark then return to a specific place in the file.&lt;/p&gt;&#xA;" OwnerUserId="4149" LastEditorUserId="4779472" LastEditDate="2015-06-27T14:21:16.453" LastActivityDate="2021-11-25T10:14:44.487" Title="Visual Studio equivalent to Delphi bookmarks" Tags="|visual-studio|delphi|brief-bookmarks|" AnswerCount="8" CommentCount="1" FavoriteCount="0" ContentLicense="CC BY-SA 3.0" />
  <row Id="38785" PostTypeId="2" ParentId="38784" CreationDate="2008-09-02T03:54:14.587" Score="4" Body="&lt;p&gt;&lt;kbd&gt;Ctrl&lt;/kbd&gt; &lt;kbd&gt;K&lt;/kbd&gt; + &lt;kbd&gt;Ctrl&lt;/kbd&gt; &lt;kbd&gt;K&lt;/kbd&gt; - Add/Remove Bookmark on Line&lt;br /&gt;&#xA;&lt;kbd&gt;Ctrl&lt;/kbd&gt; &lt;kbd&gt;K&lt;/kbd&gt; + &lt;kbd&gt;Ctrl&lt;/kbd&gt; &lt;kbd&gt;N&lt;/kbd&gt; - Go to Next Bookmark&lt;br /&gt;&#xA;&lt;kbd&gt;Ctrl&lt;/kbd&gt; &lt;kbd&gt;K&lt;/kbd&gt; + &lt;kbd&gt;Ctrl&lt;/kbd&gt; &lt;kbd&gt;P&lt;/kbd&gt; - Go to Previous Bookmark&lt;/p&gt;&#xA;&lt;p&gt;There are other options as well. Look under Edit-&amp;gt;Bookmarks menu,&lt;/p&gt;&#xA;" OwnerUserId="380" OwnerDisplayName="Vaibhav" LastEditorUserId="9454010" LastEditDate="2021-06-15T02:11:14.180" LastActivityDate="2021-06-15T02:11:14.180" CommentCount="1" ContentLicense="CC BY-SA 4.0" />
  <row Id="38787" PostTypeId="2" ParentId="38769" CreationDate="2008-09-02T03:58:41.830" Score="3" Body="&lt;p&gt;MPP does have its own object model that can be used to access data in it. The info should be available here: &lt;a href=&quot;http://msdn.microsoft.com/en-us/office/aa905469.aspx&quot; rel=&quot;nofollow noreferrer&quot;&gt;http://msdn.microsoft.com/en-us/office/aa905469.aspx&lt;/a&gt;&lt;/p&gt;&#xA;" OwnerUserId="380" OwnerDisplayName="Vaibhav" LastActivityDate="2008-09-02T03:58:41.830" CommentCount="0" ContentLicense="CC BY-SA 2.5" />
  <row Id="38789" PostTypeId="1" AcceptedAnswerId="38792" CreationDate="2008-09-02T03:59:44.417" Score="4" ViewCount="4406" Body="&lt;p&gt;I have a &lt;code&gt;web-service&lt;/code&gt; that I will be deploying to dev, staging and production. Along with this will be an &lt;strong&gt;ASP.net&lt;/strong&gt; application that will be deploying separately but also in those three stages. &lt;/p&gt;&#xA;&#xA;&lt;p&gt;What is the most pragmatic way to change the following line in the web-service to match the current environment?&lt;/p&gt;&#xA;&#xA;&lt;pre&gt;&lt;code&gt;[WebService(Namespace = &quot;http://dev.mycompany.com/MyAppsWebService&quot;)]&#xA;[WebService(Namespace = &quot;http://stage.mycompany.com/MyAppsWebService&quot;)]&#xA;[WebService(Namespace = &quot;http://mycompany.com/MyAppsWebService&quot;)]&#xA;&lt;/code&gt;&lt;/pre&gt;&#xA;" OwnerUserId="3149" OwnerDisplayName="Kevin Lamb" LastEditorUserId="5423108" LastEditDate="2017-07-04T08:48:45.030" LastActivityDate="2017-07-04T08:48:45.030" Title="Web Service Namespace Dynamic Naming" Tags="|c#|asp.net|web-services|" AnswerCount="1" CommentCount="0" ContentLicense="CC BY-SA 3.0" />
  <row Id="38791" PostTypeId="1" AcceptedAnswerId="38793" CreationDate="2008-09-02T04:01:09.827" Score="4" ViewCount="2483" Body="&lt;p&gt;Which Database table Schema is more efficient and why?&lt;/p&gt;&#xA;&#xA;&lt;pre&gt;&lt;code&gt;&quot;Users (UserID, UserName, CompamyId)&quot;&#xA;&quot;Companies (CompamyId, CompanyName)&quot;&#xA;&lt;/code&gt;&lt;/pre&gt;&#xA;&#xA;&lt;p&gt;OR&lt;/p&gt;&#xA;&#xA;&lt;pre&gt;&lt;code&gt;&quot;Users (UserID, UserName)&quot;&#xA;&quot;Companies (CompamyId, CompanyName)&quot;&#xA;&quot;UserCompanies (UserID, CompamyId)&quot;&#xA;&lt;/code&gt;&lt;/pre&gt;&#xA;&#xA;&lt;p&gt;Given the fact that user and company have one-to-one relation.&lt;/p&gt;&#xA;" OwnerUserId="191" OwnerDisplayName="Ramesh Soni" LastActivityDate="2008-09-17T13:44:01.717" Title="Which database table Schema is more efficient?" Tags="|database-design|" AnswerCount="6" CommentCount="1" ContentLicense="CC BY-SA 2.5" />
  <row Id="38792" PostTypeId="2" ParentId="38789" CreationDate="2008-09-02T04:02:40.307" Score="2" Body="&lt;p&gt;Your webservice object has a &quot;URL&quot; property on it which can be set via the web.config file. There's a config file that gets created when you add the web reference to your application that you should copy the contents of to your web.config or app.config file. You can then deploy the config file and not have to manage any code changes to accomodate the change in url.&lt;/p&gt;&#xA;" OwnerUserId="493" OwnerDisplayName="lomaxx" LastActivityDate="2008-09-02T04:02:40.307" CommentCount="0" ContentLicense="CC BY-SA 2.5" />
</posts>
"""
# Parse the sample XML into nested dicts.
# attr_prefix="" drops xmltodict's default "@" prefix, so attribute names
# (Id, Score, ...) become plain dictionary keys.
# force_list=("row",) makes `rows` a list even when <posts> holds a single <row>;
# without it xmltodict returns one bare dict and the loop below would iterate
# over its keys instead of over rows.
tree = xmltodict.parse(example_posts_xml, attr_prefix="", force_list=("row",))

# An empty <posts/> element parses to None, so fall back to "no rows".
rows = (tree["posts"] or {}).get("row", [])

# Pretty-print each parsed row (one dict of attribute name -> string value per post)
for row in rows:
    pp(row)

{'Id': '38779',
 'PostTypeId': '1',
 'AcceptedAnswerId': '40472',
 'CreationDate': '2008-09-02T03:41:06.880',
 'Score': '6',
 'ViewCount': '6282',
 'Body': '<p>I have a wcf application hosted in a windows service running a '
         'local windows account. Do I need to set an SPN for this account? If '
         "so, what's the protocol the SPN needs to be set under? I know how to "
         'do this for services over HTTP, but have never done it for '
         'net.tcp.</p>\n',
 'OwnerUserId': '781',
 'OwnerDisplayName': 'Esteban',
 'LastEditorUserId': '1116',
 'LastEditorDisplayName': 'John Nolan',
 'LastEditDate': '2008-09-02T08:49:19.323',
 'LastActivityDate': '2013-06-24T17:03:55.833',
 'Title': 'What SPN do I need to set for a net.tcp service?',
 'Tags': '|wcf|security|spn|',
 'AnswerCount': '2',
 'CommentCount': '0',
 'FavoriteCount': '0',
 'ContentLicense': 'CC BY-SA 2.5'}
{'Id': '38781',
 'PostTypeId': '2',
 'ParentId': '23930',
 'CreationDate': '2008-09-02T03:44:26.013',
 'Sc