In [1]:
import os
import sys
import math
import logging
from pathlib import Path

%load_ext autoreload
%autoreload 2

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import seaborn as sns
sns.set_context("poster")
sns.set(rc={"figure.figsize": (16, 9.)})
sns.set_style("whitegrid")

import pandas as pd
pd.set_option("display.max_rows", 120)
pd.set_option("display.max_columns", 120)

logging.basicConfig(level=logging.INFO, stream=sys.stdout)

In [5]:
# CONFIGURATION
gcp_project = "cf-model-298607"
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/Users/fwilhelm/.google-app-creds.json"

# Direct way.

In [3]:
from google.cloud import bigquery
bqclient = bigquery.Client()

# Download query results.
query_string = """
SELECT 
  COUNT(*) AS num_downloads,
  DATE_TRUNC(DATE(timestamp), MONTH) AS `month`,
FROM `bigquery-public-data.pypi.file_downloads`
WHERE file.project = 'pyscaffold'
    AND details.installer.name = 'pip'
    AND DATE(timestamp) BETWEEN DATE('2021-01-01') AND CURRENT_DATE()
GROUP BY `month`
ORDER BY `month`
"""

df = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(
        # Optionally, explicitly request to use the BigQuery Storage API. As of
        # google-cloud-bigquery version 1.26.0 and above, the BigQuery Storage
        # API is used by default.
        create_bqstorage_client=True,
    )
)
print(df.head())

   num_downloads       month
0         229342  2021-01-01
1         120081  2021-02-01
2         289039  2021-03-01
3         265421  2021-04-01
4         117702  2021-05-01


In [117]:
from sqlalchemy import *
from sqlalchemy.sql import func as F
from sqlalchemy.engine import create_engine
from sqlalchemy.schema import *

# print(select([func.count('*')], from_obj=table).scalar())

In [118]:
engine = create_engine(f'bigquery://{gcp_project}') 
table = Table('bigquery-public-data.pypi.file_downloads', MetaData(bind=engine), autoload=True)

In [119]:
query = (select([F.count("*").label("num_downloads"), 
                 F.date_trunc(func.date(column("timestamp")), text("month")).label("month")
                ], 
                from_obj=table)
         .where(F.date(column("timestamp")) == '2021-01-01'))

In [115]:
# query = (select([func.date_trunc(func.date(column("timestamp")), text("month"))], from_obj=table)
#          .where(func.date(column("timestamp")) == '2021-01-01'))

In [116]:
print(query)

SELECT count(%(count_1:STRING)s) AS `num_downloads`, date_trunc(date(`timestamp`), month) AS `month` 
FROM `bigquery-public-data.pypi.file_downloads` 
WHERE date(`timestamp`) = %(date_1:STRING)s


In [113]:
result = engine.execute(query)

DatabaseError: (google.cloud.bigquery.dbapi.exceptions.DatabaseError) 400 SELECT list expression references column timestamp which is neither grouped nor aggregated at [1:62]

(job ID: 97d1bd4a-6c55-40f9-9b95-722092bd6f7a)

                                     -----Query Job SQL Follows-----                                     

    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |
   1:SELECT count(@`count_1`) AS `num_downloads`, date_trunc(date(`timestamp`), month) AS `date_trunc_1` 
   2:FROM `bigquery-public-data.pypi.file_downloads` 
   3:WHERE date(`timestamp`) = @`date_1`
    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |
[SQL: SELECT count(%(count_1:STRING)s) AS `num_downloads`, date_trunc(date(`timestamp`), month) AS `date_trunc_1` 
FROM `bigquery-public-data.pypi.file_downloads` 
WHERE date(`timestamp`) = %(date_1:STRING)s]
[parameters: {'count_1': '*', 'date_1': '2021-01-01'}]
(Background on this error at: https://sqlalche.me/e/14/4xp6)

In [10]:
df = pd.read_sql(query, engine)

In [11]:
df

Unnamed: 0,count_1
0,176032645
