In [2]:
!pip install bigframes

Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets>=7.7.1->bigframes)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.6/1.6 MB 17.6 MB/s eta 0:00:00
Installing collected packages: jedi
Successfully installed jedi-0.19.2


In [3]:
import bigframes.pandas as bpd

# Session-wide bigframes settings, applied here before the first read.
# "partial" ordering mode: the cells below sort explicitly before calling head().
bpd.options.bigquery.ordering_mode = "partial"
# "deferred" repr mode: the cells below materialize results explicitly with
# .to_pandas(). NOTE(review): confirm the exact display behavior in the bigframes docs.
bpd.options.display.repr_mode = "deferred"

In [None]:
# Data source: a sample of the Iowa liquor sales data (10% according to the
# file name), hosted as a CSV file on GitHub.
# Alternative (disabled): read the public BigQuery table directly with
#   bpd.read_gbq_table("bigquery-public-data.iowa_liquor_sales.sales")
url = "https://raw.githubusercontent.com/baursafi/GA_Projects/refs/heads/master/2_liquor_sales_iowa/data/Iowa_Liquor_sales_sample_10pct.csv"

# Load the CSV straight from the URL into a bigframes DataFrame.
df = bpd.read_csv(url)

In [None]:
# Preview a few rows of the data; peek() serves as the head()-style preview here.
df.peek()

In [None]:
# Show the data type of every column. `dtypes` is an attribute, not a method,
# so it is read without parentheses (calling it raises a TypeError).
df.dtypes

In [None]:
# Summary statistics with include="all", materialized as a local pandas DataFrame.
df.describe(include="all").to_pandas()

In [None]:
# Total volume sold per zip code.
volume_by_zip = (
    df
    .groupby("zip_code")
    .agg({"volume_sold_litres": "sum"})
)

# Histogram of the per-zip totals, using 20 bins.
volume_by_zip.plot.hist(bins=20)

In [None]:
# Bar chart of the 25 zip codes with the largest total volume sold.
# The explicit descending sort gives head() a defined order to take rows from.
top_zips = volume_by_zip.sort_values("volume_sold_litres", ascending=False).head(25)

# Materialize locally, then plot with x tick labels rotated by 80 degrees.
top_zips.to_pandas().plot.bar(rot=80)

In [None]:
# Per zip code: summed volume sold and summed population.
volume_by_pop = df.groupby("zip_code").agg(
    {"volume_sold_litres": "sum", "population": "sum"}
)

# Scatter plot of volume sold (y) against population (x), one point per zip code.
scatter_data = volume_by_pop[["volume_sold_litres", "population"]].to_pandas()
scatter_data.plot.scatter(x="population", y="volume_sold_litres")

In [None]:
# Linear regression: model total volume sold as a function of population,
# with one observation per zip code.
from bigframes.ml.linear_model import LinearRegression

# Feature and label both come from volume_by_pop, whose columns are
# "volume_sold_litres" and "population" (it has no "total_pop" column).
feature_columns = volume_by_pop[["population"]]
label_column = volume_by_pop[["volume_sold_litres"]]

# Fit the linear model on the aggregated data.
model = LinearRegression()
model.fit(feature_columns, label_column)


In [None]:
# Score the fitted model on the same feature/label data it was trained on,
# then pull the evaluation result into a local pandas object for display.
fit_evaluation = model.score(feature_columns, label_column)
fit_evaluation.to_pandas()

In [None]:
# Number of distinct values in the category_name column.
df["category_name"].nunique()

In [None]:
# Total volume sold per category, largest first, as a local pandas DataFrame.
volume_by_category = df.groupby("category_name").agg({"volume_sold_litres": "sum"})
counts = volume_by_category.sort_values("volume_sold_litres", ascending=False).to_pandas()

# Bar chart of the 25 highest-volume categories, x tick labels rotated by 80 degrees.
counts.head(25).plot.bar(rot=80)