#  Task 3: Building Summary Tables
This task creates a relational database that contains summary files that allows a user to qucikly answer business questions. 

# Building Summary Tables

How have our sales-by-day changed over the last few months?  
What is our most popular item in each department?  
Which owners spend the most per month in each department?

#### Import Required Libraries

In [1]:
import os
import sqlite3
import pandas as pd
import numpy as np

# Do our imports for the code
from google.cloud import bigquery
from google.oauth2 import service_account

#### Configure Big Query Client

In [4]:
client = bigquery.Client(project = "wedge-project-np")

### Summary Table 1
This code block creates a table of sales by date by hour. Using calendar date (YYYY-MM-DD) and hour of the day, the total spend in the store, the number of transactions, and a count of the number of items is determined and saved to a pandas dataframe


In [3]:
query_1 = """
    SELECT ROUND(SUM(total), 2) as sales
        , EXTRACT(DATE FROM datetime) AS date
        , EXTRACT(HOUR FROM datetime) AS hour
        , COUNT(DISTINCT CONCAT(
            CAST(EXTRACT(DATE FROM datetime) AS STRING), 
            CAST(register_no AS STRING), 
            CAST(emp_no AS STRING), 
            CAST(trans_no AS STRING)
            )) AS transactioins

        ,SUM(
            CASE
            WHEN trans_status IN ('V', 'R') THEN -1
            ELSE 1
        END
        ) AS items

        FROM `umt-msba.wedge_transactions.transArchive*`
        WHERE department NOT IN (0, 15)
        AND (trans_status IS NULL 
        OR trans_status IN ('V', 'R', '', ' '))
        GROUP BY date, hour
        ORDER BY date, hour;
"""

# Execute the query
try:
    query_job = client.query(query_1)  # Start the query job
    df1 = query_job.to_dataframe()  # Convert the result to a pandas DataFrame

    # View the DataFrame
    print(df1.head())  # Show the first 5 rows of the DataFrame

except Exception as e:
    print(f"Query failed: {e}")

     sales        date  hour  transactioins  items
0  1006.28  2010-01-01     9             36    245
1  3128.55  2010-01-01    10             82    913
2  4001.66  2010-01-01    11            118   1108
3  3886.51  2010-01-01    12            124   1143
4  4654.52  2010-01-01    13            154   1365


### Summary Table 2
This code block creates a summary table of sales by owner by year by month. The output file has the following columns: card_no, year, month, sales, transactions, and itemsn and is saved into a pandas dataframe.


In [5]:
query_2 = """
    SELECT DISTINCT card_no as owner
        , EXTRACT(YEAR FROM datetime) AS year
        , EXTRACT(MONTH FROM datetime) AS month
        , ROUND(SUM(total), 2) as sales
        , COUNT(DISTINCT CONCAT(
            CAST(EXTRACT(DATE FROM datetime) AS STRING), 
            CAST(register_no AS STRING), 
            CAST(emp_no AS STRING), 
            CAST(trans_no AS STRING)
            )) AS transactions
        ,SUM(
            CASE
            WHEN trans_status IN ('V', 'R') THEN -1
            ELSE 1
        END
        ) AS items

    FROM `umt-msba.wedge_transactions.transArchive*`
    WHERE department NOT IN (0, 15)
    AND card_no != 3
    AND (trans_status IS NULL 
    OR trans_status IN ('V', 'R', '', ' '))
    GROUP BY owner, year, month
    ORDER BY owner, year, month;
"""

# Execute the query
try:
    query_job = client.query(query_2)  # Start the query job
    df2 = query_job.to_dataframe()  # Convert the result to a pandas DataFrame

    # View the DataFrame
    print(df2.head())  # Show the first 5 rows of the DataFrame

except Exception as e:
    print(f"Query failed: {e}")

     owner  year  month  sales  transactions  items
0  10000.0  2010     10  65.87             4     21
1  10000.0  2010     11  53.12             2     20
2  10000.0  2010     12  17.34             1      6
3  10000.0  2011      1  60.40             4     23
4  10000.0  2011      2  19.65             1      4


### Summary Table 3
This block of code creates a table of sales by product description by year by month. The output is a pandas datafram that has the following columns: upc, description, department number, department name, year, month, sales, transactions, and items.


In [6]:
query_3 = """
    SELECT DISTINCT upc
    , description
    , trans.department AS `department number`
    , depts.dept_name AS `department name`
    , EXTRACT(YEAR FROM datetime) AS year
    , EXTRACT(MONTH FROM datetime) AS month
    , ROUND(SUM(total), 2) as sales
    , COUNT(DISTINCT CONCAT(
        CAST(EXTRACT(DATE FROM datetime) AS STRING), 
        CAST(register_no AS STRING), 
        CAST(emp_no AS STRING), 
        CAST(trans_no AS STRING)
        )) AS transactions
    ,SUM(
        CASE
        WHEN trans_status IN ('V', 'R') THEN -1
        ELSE 1
    END
    ) AS items

    FROM `umt-msba.wedge_transactions.transArchive*` as trans
    JOIN `umt-msba.wedge_transactions.department_lookup` as depts
    ON trans.department = depts.department
    WHERE trans.department NOT IN (0, 15)
    AND card_no != 3
    AND (trans_status IS NULL 
    OR trans_status IN ('V', 'R', '', ' '))
    GROUP BY upc, description, `department number`, `department name`, year, month
    ORDER BY upc, description, `department number`, `department name`, year, month;
    """
# Execute the query
try:
    query_job = client.query(query_3)  # Start the query job
    df3 = query_job.to_dataframe()  # Convert the result to a pandas DataFrame

    # View the DataFrame
    print(df3.head())  # Show the first 5 rows of the DataFrame

except Exception as e:
    print(f"Query failed: {e}")

  upc              description  department number   department name  year  \
0   0              BULK Coupon                3.0              BULK  2010   
1   0              BULK Coupon                3.0              BULK  2010   
2   0            FROZEN Coupon                6.0            FROZEN  2010   
3   0  PACKAGED GROCERY Coupon                1.0  PACKAGED GROCERY  2010   
4   0  PACKAGED GROCERY Coupon                1.0  PACKAGED GROCERY  2010   

   month  sales  transactions  items  
0      1   2.00             1     -1  
1      2   1.00             1     -1  
2      2   2.00             1     -1  
3      1  14.48             2     -4  
4      2  15.49             5     -5  


### Create a SQLite Database and Add Summary Tables

In [9]:
# Connect to the SQLite database (or create it)
conn = sqlite3.connect('data/summary_database.db')

# Create a cursor object
cursor = conn.cursor()

# Create table summary table 1
df1.to_sql('summary_table_1', conn, if_exists='replace', index=False)

# Create table summary table 2
df2.to_sql('summary_table_2', conn, if_exists='replace', index=False)

# Create table summary table 3
df3.to_sql('summary_table_3', conn, if_exists='replace', index=False)

1130901

### Check to Make Sure Data is In Database

In [10]:
conn = sqlite3.connect('data/summary_database.db')

cursor = conn.cursor()

# Query to check the first summary table
cursor.execute("SELECT * FROM summary_table_1 LIMIT 5")
print(cursor.fetchall())

cursor.execute("SELECT * FROM summary_table_2 LIMIT 5")
print(cursor.fetchall())

cursor.execute("SELECT * FROM summary_table_3 LIMIT 5")
print(cursor.fetchall())

# Commit and close the connection when done
conn.commit()
conn.close()

[(1006.28, '2010-01-01', 9, 36, 245), (3128.55, '2010-01-01', 10, 82, 913), (4001.66, '2010-01-01', 11, 118, 1108), (3886.51, '2010-01-01', 12, 124, 1143), (4654.52, '2010-01-01', 13, 154, 1365)]
[(10000.0, 2010, 10, 65.87, 4, 21), (10000.0, 2010, 11, 53.12, 2, 20), (10000.0, 2010, 12, 17.34, 1, 6), (10000.0, 2011, 1, 60.4, 4, 23), (10000.0, 2011, 2, 19.65, 1, 4)]
[('0', 'BULK Coupon', 3.0, 'BULK', 2010, 1, 2.0, 1, -1), ('0', 'BULK Coupon', 3.0, 'BULK', 2010, 2, 1.0, 1, -1), ('0', 'FROZEN Coupon', 6.0, 'FROZEN', 2010, 2, 2.0, 1, -1), ('0', 'PACKAGED GROCERY Coupon', 1.0, 'PACKAGED GROCERY', 2010, 1, 14.48, 2, -4), ('0', 'PACKAGED GROCERY Coupon', 1.0, 'PACKAGED GROCERY', 2010, 2, 15.49, 5, -5)]


In [12]:

db_path = 'summary_database.db'

if os.path.exists(db_path):
    os.remove(db_path)
    print(f"Database at {db_path} has been deleted.")
else:
    print("The database file does not exist.")

PermissionError: [WinError 32] The process cannot access the file because it is being used by another process: 'summary_database.db'