# Task 2 - Sample Owners

#### Imports

In [2]:
from google.cloud import bigquery
from google.oauth2 import service_account

import pandas as pd
from pandas.io import gbq

import os
from pathlib import Path

#### Connect to GBQ

In [3]:
# Fill in the placeholders below before running the rest of the notebook.
service_path = ' ' # folder holding the key file; concatenated as-is below, so include the trailing path separator
service_file = ' ' # file name of your service-account key (your authentication information)
gbq_proj_id = ' ' # change this to your project. 
gbq_dataset_id = ' ' # and change this to your data set ID (uploaded tables are created in this dataset)

# Full path to the key file: plain string concatenation of the two values above
private_key =service_path + service_file


#### Credentials

In [5]:
# Load the service-account credentials from the key file path built in the
# configuration cell (private_key == service_path + service_file)
credentials = service_account.Credentials.from_service_account_file(private_key)

# BigQuery client used by the later cells to create and load tables
client = bigquery.Client(project=gbq_proj_id, credentials=credentials)

## Queries

### Build a List of Owners

In [179]:
# Every distinct card number found in the transaction archive tables.
# DISTINCT already de-duplicates the column, so the redundant GROUP BY
# on the same column has been removed (the result set is unchanged).
# NOTE(review): this wildcard is `transArchive*`, while the records query
# later in the notebook uses `transArchive_*` - confirm which is intended.
list_of_owners = """
SELECT DISTINCT card_no
FROM `wedgehp.tn.transArchive*`
"""

# Run the query against the project configured at the top of the notebook
# instead of a blank, hard-coded project ID.
owners = gbq.read_gbq(list_of_owners, project_id=gbq_proj_id)

In [184]:
# Sanity check: preview the first five rows returned by the owners query
owners.head(5)

Unnamed: 0,card_no
0,42371.0
1,44329.0
2,46430.0
3,46607.0
4,47327.0


In [181]:
# Write the owner list to a local CSV (row index omitted) so it can be uploaded
owners.to_csv('list_of_owners.csv', index=False)

#### Upload Owner List to GBQ

In [148]:
# Destination table, addressed as <project>.<dataset>.<table>
my_table = "list_of_owners"
table_full_name = f"{gbq_proj_id}.{gbq_dataset_id}.{my_table}"

In [56]:
def tbl_exists(client, table_ref):
    """Report whether `client.get_table(table_ref)` can find the table.

    Returns True when the lookup succeeds and False when it raises
    `NotFound`. Any other exception from the lookup propagates.
    """
    from google.cloud.exceptions import NotFound

    try:
        client.get_table(table_ref)
    except NotFound:
        found = False
    else:
        found = True
    return found

In [57]:
# Reuse the destination table when it is already there; otherwise create it
if tbl_exists(client, table_full_name):
    table_ref = client.get_table(table_full_name)
else:
    table_ref = client.create_table(table=table_full_name)

In [58]:
# Fetch the table's current metadata and report how many columns its schema has
table = client.get_table(table_ref)
column_count = len(table.schema)
print("Table {} contains {} columns".format(table_ref.table_id, column_count))

Table list_of_owners contains 0 columns


In [59]:
# Load settings: append rows to the destination table and let the load add
# new fields to its schema.
# NOTE(review): with WRITE_APPEND, re-running the upload adds the same rows
# again - confirm that is intended.
job_config = bigquery.LoadJobConfig(
    write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
    schema_update_options=[bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION],
)

In [60]:
# The upload is a CSV whose first row is a header, holding one nullable FLOAT column
job_config.source_format = bigquery.SourceFormat.CSV
job_config.skip_leading_rows = 1
job_config.schema = [bigquery.SchemaField("card_no", "FLOAT", mode="NULLABLE")]

In [61]:
# Send the local CSV to the destination table as a BigQuery load job.
with open("list_of_owners.csv", "rb") as source_file:
    job = client.load_table_from_file(
        source_file,
        table_ref,
        location="US",  # Must match the destination dataset location.
        job_config=job_config,
    )  # API request

# load_table_from_file only starts the job. Wait for it to finish so a failed
# load raises here, and so later cells never query the table before the rows
# have landed.
job.result()
print("Loaded {} rows into {}".format(job.output_rows, table_ref.table_id))

### Make a Sample of Owners

In [149]:
# Shuffle the uploaded owner list with rand() and keep the first 550 rows.
# NOTE(review): rand() is unseeded, so every run draws a different sample.
sample_of_owners = """
SELECT *
FROM `wedgehp.other.list_of_owners`
ORDER by rand()
LIMIT  550
"""

# pd.read_gbq is the same function that pandas.io.gbq exposes as gbq.read_gbq
sample_owners = pd.read_gbq(sample_of_owners, project_id="wedgehp")

In [183]:
# Sanity check: preview the first five rows of the sampled owners
sample_owners.head(5)

Unnamed: 0,card_no
0,18809.0
1,18060.0
2,21103.0
3,18167.0
4,13547.0


In [151]:
# Write the sampled owners to a local CSV (row index omitted) so it can be uploaded
sample_owners.to_csv('sample_owners.csv', index=False)

#### Upload Sample of Owner List to GBQ

In [152]:
# Destination table for the sample, addressed as <project>.<dataset>.<table>
my_table = "sample_owners"
table_full_name = f"{gbq_proj_id}.{gbq_dataset_id}.{my_table}"

In [153]:
def tbl_exists(client, table_ref):
    """Return True if `client.get_table(table_ref)` succeeds, False on `NotFound`.

    Exceptions other than `NotFound` are not caught.

    NOTE(review): this repeats the `tbl_exists` definition from the
    list_of_owners upload section; this later definition replaces it.
    """
    from google.cloud.exceptions import NotFound

    try:
        client.get_table(table_ref)
    except NotFound:
        return False
    else:
        return True

In [154]:
# Fetch the sample table when it already exists; create it when it does not
if tbl_exists(client, table_full_name):
    table_ref = client.get_table(table_full_name)
else:
    table_ref = client.create_table(table=table_full_name)

In [155]:
# Re-read the table metadata and print the number of columns in its schema
table = client.get_table(table_ref)
column_count = len(table.schema)
print("Table {} contains {} columns".format(table_ref.table_id, column_count))

Table sample_owners contains 0 columns


In [156]:
# Fresh load settings for the sample upload: append rows and allow the load
# to add new fields to the table schema.
# NOTE(review): with WRITE_APPEND, re-running the upload adds the same rows
# again - confirm that is intended.
job_config = bigquery.LoadJobConfig(
    write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
    schema_update_options=[bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION],
)

In [157]:
# CSV input with a single header row and one nullable FLOAT column
job_config.source_format = bigquery.SourceFormat.CSV
job_config.skip_leading_rows = 1
job_config.schema = [bigquery.SchemaField("card_no", "FLOAT", mode="NULLABLE")]

In [158]:
# Send the local sample CSV to the destination table as a BigQuery load job.
with open("sample_owners.csv", "rb") as source_file:
    job = client.load_table_from_file(
        source_file,
        table_ref,
        location="US",  # Must match the destination dataset location.
        job_config=job_config,
    )  # API request

# load_table_from_file only starts the job. Wait for it to finish so a failed
# load raises here, and so the records query below never reads the sample
# table before the rows have landed.
job.result()
print("Loaded {} rows into {}".format(job.output_rows, table_ref.table_id))

### Extract all records for sample owners

In [6]:
# Every archive row whose card_no appears in the uploaded sample_owners table
records_sample = """
SELECT *
FROM `wedgehp.tn.transArchive_*`
WHERE card_no IN
    (SELECT card_no 
     FROM `wedgehp.other.sample_owners`)
"""

# pd.read_gbq is the same function that pandas.io.gbq exposes as gbq.read_gbq
all_records_sample = pd.read_gbq(records_sample, project_id="wedgehp")

In [7]:
# Sanity check: preview the first five transaction rows for the sampled owners
all_records_sample.head(5)

Unnamed: 0,datetime,register_no,emp_no,trans_no,upc,description,trans_type,trans_subtype,trans_status,department,...,batchHeaderID,local,organic,display,receipt,card_no,store,branch,match_id,trans_id
0,2014-10-21 18:53:37+00:00,3.0,17.0,26.0,0,EBT FS,T,EF,,0.0,...,,0.0,,,0.0,48313.0,1.0,0.0,0.0,10.0
1,2014-10-04 16:53:30+00:00,7.0,31.0,48.0,TAX,Tax,A,,,0.0,...,,0.0,,,0.0,48396.0,1.0,0.0,0.0,8.0
2,2014-10-10 11:38:27+00:00,4.0,17.0,45.0,TAX,Tax,A,,,0.0,...,,0.0,,,0.0,48396.0,1.0,0.0,0.0,12.0
3,2014-10-20 09:29:00+00:00,16.0,54.0,46.0,TAX,Tax,A,,,0.0,...,,0.0,,,0.0,48339.0,1.0,0.0,0.0,10.0
4,2014-10-21 18:53:40+00:00,3.0,17.0,26.0,TAX,Tax,A,,,0.0,...,,0.0,,,0.0,48313.0,1.0,0.0,0.0,14.0


In [None]:
# Write the sampled records to a local CSV (row index omitted); it is only
# uploaded if the file turns out to be small enough
all_records_sample.to_csv('all_records_sample.csv', index=False)

##### Check file size to see if it will upload

In [162]:
# Function to determine file size - https://amiradata.com/python-get-file-size-in-kb-mb-or-gb/

def get_file_size(file_path):
    """Return the size of the file at `file_path` in bytes, via `os.path.getsize`."""
    return os.path.getsize(file_path)
 
# Report the size of the exported records CSV in bytes
file_path = 'all_records_sample.csv'
size = get_file_size(file_path)
print('File size: ', size, ' bytes', sep='')

File size: 316442780 bytes


In [163]:
# Fun little function to translate bytes into something readable - https://amiradata.com/python-get-file-size-in-kb-mb-or-gb/
def get_file_size(file_path):
    """Method 1: file size in bytes, read with `os.path.getsize`."""
    return os.path.getsize(file_path)
 
 
def get_file_size_2(file):
    """Method 2: file size in bytes, read from the `st_size` field of `os.stat`."""
    return os.stat(file).st_size
 
 
def get_file_size_3(file):
    """Method 3: file size in bytes, read from `pathlib.Path(file).stat()`."""
    return Path(file).stat().st_size
 
 
def convert_bytes(size, unit=None):
    """Print `size` (a byte count) in the requested unit.

    `unit` may be "KB", "MB" or "GB"; the value is divided by the matching
    power of 1024 and rounded to 3 decimals. Any other `unit` (including the
    default None) prints the raw byte count. Nothing is returned - the
    result of `print` (None) is passed back.
    """
    scales = (
        ("KB", 1024, ' Kilobytes'),
        ("MB", 1024 * 1024, ' Megabytes'),
        ("GB", 1024 * 1024 * 1024, ' Gigabytes'),
    )
    for code, divisor, label in scales:
        if unit == code:
            return print('File size: ' + str(round(size / divisor, 3)) + label)
    return print('File size: ' + str(size) + ' bytes')
 
 
file = 'all_records_sample.csv'

print("Using 1st method : ")
size = get_file_size(file)

# Show the same size as raw bytes, then in KB, MB and GB
# (unit=None is convert_bytes' default and prints the raw byte count)
for unit in (None, "KB", "MB", "GB"):
    convert_bytes(size, unit)

Using 1st method : 
File size: 316442780 bytes
File size: 309026.152 Kilobytes
File size: 301.783 Megabytes
File size: 0.295 Gigabytes
