# Task 1 - Build a Transaction Database in Google Big Query
Loads files to GBQ. I was only able to get the small/inactive files to load with code.

#### Imports

In [1]:
from google.cloud import bigquery
from google.oauth2 import service_account

import pandas as pd
from pandas.io import gbq

#### Authentication

In [2]:
# Fill in these four placeholders before running the notebook.
service_path = ' '    # directory that holds the service-account key file
service_file = ' '    # change this to your authentication information
gbq_proj_id = ' '     # change this to your project.
gbq_dataset_id = ' '  # and change this to your data set ID

# Full path to the private key: the directory followed by the file name
private_key = "".join((service_path, service_file))


#### Credentials

In [3]:
# Build the service-account credentials from the key file on disk
key_file = service_path + service_file
credentials = service_account.Credentials.from_service_account_file(key_file)

# Client object used for every request sent to GBQ in the cells below
client = bigquery.Client(project=gbq_proj_id, credentials=credentials)

### Creating Tables

In [19]:
# Names of the quarterly "inactive" archive tables handled by this notebook.
inactive_tables = [
    "transArchive_201304_201306_inactive_clean",
    "transArchive_201301_201303_inactive_clean",
    "transArchive_201210_201212_inactive_clean",
    "transArchive_201207_201209_inactive_clean",
    "transArchive_201204_201206_inactive_clean",
    "transArchive_201201_201203_inactive_clean",
    "transArchive_201307_201309_inactive_clean",
    "transArchive_201310_201312_inactive_clean",
    "transArchive_201401_201403_inactive_clean",
    "transArchive_201404_201406_inactive_clean",
    "transArchive_201407_201409_inactive_clean",
    "transArchive_201410_201412_inactive_clean",
]

# Table to work on in this run. Every assignment used to be commented out,
# which left `my_table` undefined and made the next statement raise NameError.
# Change the index and re-run the cells below to process another archive.
my_table = inactive_tables[-1]

# Fully qualified "project.dataset.table" identifier
table_full_name = ".".join([gbq_proj_id, gbq_dataset_id, my_table])

Now we'll test whether that table exists and, if it doesn't, create it as an empty table. GBQ has no native function to test for table existence, so we'll write our own, based on a StackOverflow answer found [here](https://stackoverflow.com/questions/28731102/bigquery-check-if-table-already-exists).

In [20]:
def tbl_exists(client, table_ref):
    """Report whether ``table_ref`` can be fetched through ``client``.

    Calls ``client.get_table(table_ref)`` and returns ``True`` when the call
    succeeds, or ``False`` when it raises ``NotFound``. Any other exception
    propagates to the caller.
    """
    from google.cloud.exceptions import NotFound

    try:
        client.get_table(table_ref)
    except NotFound:
        return False
    else:
        return True

In [21]:
# Reuse the table when it is already there; otherwise create it empty.
if tbl_exists(client, table_full_name):
    table_ref = client.get_table(table_full_name)
else:
    table_ref = client.create_table(table=table_full_name)

At this point our table is empty and doesn't even have a schema.

In [22]:
# Fetch the table and report how many columns its schema currently has
table = client.get_table(table_ref)
n_columns = len(table.schema)
print("Table {} contains {} columns".format(table_ref.table_id, n_columns))

Table transArchive_201410_201412_inactive_clean contains 0 columns


In [23]:
# Configuration shared by the load job that is submitted further down
job_config = bigquery.LoadJobConfig()

# This allows us to modify the table (new fields may be added by the load).
job_config.schema_update_options = [bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION]

# Append the loaded rows to the destination table
job_config.write_disposition = bigquery.WriteDisposition.WRITE_APPEND

### Adding Schema

In [24]:
# (field name, BigQuery type) pairs in declaration order.
# Every field is registered as NULLABLE below.
schema_columns = [
    ("datetime", "TIMESTAMP"),
    ("register_no", "FLOAT"),
    ("emp_no", "FLOAT"),
    ("trans_no", "FLOAT"),
    ("upc", "STRING"),
    ("description", "STRING"),
    ("trans_type", "STRING"),
    ("trans_subtype", "STRING"),
    ("trans_status", "STRING"),
    ("department", "NUMERIC"),
    ("quantity", "FLOAT"),
    ("Scale", "FLOAT"),
    ("cost", "FLOAT"),
    ("unitPrice", "FLOAT"),
    ("total", "FLOAT"),
    ("regPrice", "FLOAT"),
    ("altPrice", "FLOAT"),
    ("tax", "FLOAT"),
    ("taxexempt", "FLOAT"),
    ("foodstamp", "FLOAT"),
    ("wicable", "FLOAT"),
    ("discount", "FLOAT"),
    ("memDiscount", "FLOAT"),
    ("discountable", "FLOAT"),
    ("discounttype", "FLOAT"),
    ("voided", "FLOAT"),
    ("percentDiscount", "FLOAT"),
    ("ItemQtty", "FLOAT"),
    ("volDiscType", "FLOAT"),
    ("volume", "FLOAT"),
    ("VolSpecial", "FLOAT"),
    ("mixMatch", "FLOAT"),
    ("matched", "FLOAT"),
    ("memType", "BOOLEAN"),
    ("staff", "BOOLEAN"),
    ("numflag", "FLOAT"),
    ("itemstatus", "FLOAT"),
    ("tenderstatus", "FLOAT"),
    ("charflag", "STRING"),
    ("varflag", "FLOAT"),
    ("batchHeaderID", "BOOLEAN"),
    ("local", "FLOAT"),
    ("organic", "FLOAT"),
    ("display", "BOOLEAN"),
    ("receipt", "FLOAT"),
    ("card_no", "FLOAT"),
    ("store", "FLOAT"),
    ("branch", "FLOAT"),
    ("match_id", "FLOAT"),
    ("trans_id", "FLOAT"),
]

job_config.schema = [
    bigquery.SchemaField(field_name, field_type, mode="NULLABLE")
    for field_name, field_type in schema_columns
]

# The source files are CSV; skip their first row
job_config.skip_leading_rows = 1
job_config.source_format = bigquery.SourceFormat.CSV

### Loading tables to dataset

In [25]:
# Loads the smaller ("inactive") files.
# Each CSV in clean-files is named after its destination table, so the path is
# derived from table_ref rather than by un-commenting a matching
# `with open(...)` line by hand. With every such line commented out, the
# indented call below had no enclosing block and the cell failed to parse.
source_path = "clean-files\\{}.csv".format(table_ref.table_id)

with open(source_path, "rb") as source_file:
    job = client.load_table_from_file(
        source_file,
        table_ref,
        location="US",  # Must match the destination dataset location.
        job_config=job_config,
    )  # API request

# Wait for the load job to finish. result() raises if the job failed, so
# problems with the data are reported here instead of passing unnoticed.
job.result()
print("Loaded {} rows into {}".format(job.output_rows, table_ref.table_id))

### Fix file that wouldn't transfer

In [None]:
# Read the CSV whose columns need fixing into a DataFrame
col_fix = pd.read_csv(
    "clean-files\\transArchive_201504_201506_clean.csv",
)

In [None]:
#col_fix
#col_fix.dtypes

In [None]:
# Use .astype() to cast the two columns that the load schema declares as
# BOOLEAN to bool
for flag_col in ('memType', 'display'):
    col_fix[flag_col] = col_fix[flag_col].astype(bool)

In [None]:
#col_fix
#col_fix.dtypes

In [None]:
# Save the corrected frame as CSV, without the index column.
# NOTE: this writes to the same path the frame was read from, so the original
# file is overwritten.
# The path was previously a raw string containing "\\", which produced a
# doubled backslash; it now matches the separator used when reading the file.
col_fix.to_csv("clean-files\\transArchive_201504_201506_clean.csv", index=False)