In [12]:
import csv
import os
import zipfile
from io import StringIO

import pandas as pd
import pandas_gbq
from google.cloud import bigquery
from google.cloud.exceptions import NotFound

### Testing on home server

#### Setting Directory for Mac and Windows 

In [13]:
# Default to the macOS locations of the small sample and the full zip-of-zips folders.
directory_small, directory = (
    '/Users/aaronportra/Documents/ADA/wedge_project/WedgeZipOfZips_small/',
    '/Users/aaronportra/Documents/ADA/wedge_project/WedgeZipOfZips/',
)
try:
    # Listing the folder is only an existence probe; the result is discarded.
    os.listdir(directory)
except FileNotFoundError:
    # macOS folder is missing: switch both paths to the Windows locations.
    directory_small, directory = (
        'C:\\Users\\aport\\OneDrive\\Documents\\School\\Fall Semester 2024\\Applied Data Analytics\\wedge_project\\WedgeZipOfZips_small\\',
        'C:\\Users\\aport\\OneDrive\\Documents\\School\\Fall Semester 2024\\Applied Data Analytics\\wedge_project\\WedgeZipOfZips\\',
    )
    # Probe again so a missing Windows folder fails here rather than later.
    os.listdir(directory)

## Cleaning Data and Uploading to Google BigQuery

In [14]:
def sniff(file):
    """Guess the delimiter of delimited text and whether it starts with a header row.

    Parameters
    ----------
    file : str
        Full text of the delimited file (not a path or file handle).

    Returns
    -------
    tuple
        ``(delimiter, has_header)``. ``delimiter`` is the character detected by
        ``csv.Sniffer`` from the first 10000 characters. ``has_header`` is the
        sniffer's header guess, forced to ``False`` whenever the first row has
        fewer than 50 fields.

    Raises
    ------
    csv.Error
        If ``csv.Sniffer`` cannot determine a delimiter from the sample.
    """
    buffer = StringIO(file)

    # Only the leading part of the text is given to the sniffer.
    head = buffer.read(10000)
    buffer.seek(0)

    detector = csv.Sniffer()
    sep = detector.sniff(head).delimiter

    # Split the first row with the detected delimiter to count its fields.
    leading_fields = next(csv.reader(buffer, delimiter=sep))

    header_guess = detector.has_header(head)

    # A first row narrower than 50 fields is never accepted as a header.
    return sep, header_guess and len(leading_fields) >= 50

In [15]:
# BigQuery destination used by the upload code in this notebook.
project_id = 'wedge-project-0'

dataset_id = 'transactions'

client = bigquery.Client(project = project_id)

# Client.dataset() is deprecated; build the same project/dataset reference directly.
dataset_ref = bigquery.DatasetReference(project_id, dataset_id)

# Create the dataset only when the lookup reports it as missing. Any other
# failure (credentials, permissions, network) is allowed to surface instead of
# being mistaken for "dataset does not exist".
try:
    client.get_dataset(dataset_ref)
except NotFound:
    client.create_dataset(dataset_ref)


In [16]:
def upload_gbq(data,name):
    """Upload ``data`` as table ``name`` unless that table already exists.

    The destination is ``{project_id}.{dataset_id}.{name}``, built from the
    module-level ``project_id`` and ``dataset_id``. Existence is checked with
    the module-level BigQuery ``client``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame handed to ``pandas_gbq.to_gbq``.
    name : str
        Table name inside the dataset.

    Returns
    -------
    None
        Prints whether the table was skipped or is being uploaded.
    """
    table_id = f'{project_id}.{dataset_id}.{name}'

    # Only a NotFound answer means "safe to upload"; other lookup errors
    # (auth, bad table id, network) propagate instead of triggering an upload.
    try:
        client.get_table(table_id)
    except NotFound:
        table_exists = False
    else:
        table_exists = True

    if table_exists:
        print(f'{name} already exists')
        return

    # Announce before the upload starts so the message reflects work in progress.
    print(f'Uploading {name}')
    pandas_gbq.to_gbq(data,project_id = project_id,destination_table = table_id)

In [21]:
#Read in files and load to pandas

def create_db(directory):
    """Read every member of every ``.zip`` in ``directory``, clean it, and upload it.

    For each archive member the text is decoded as UTF-8 (undecodable bytes are
    replaced), its delimiter and header presence are taken from ``sniff``, and
    it is loaded with pandas. A member without a header row reuses the column
    names of the most recent member that had one. The cleaned frame is passed
    to ``upload_gbq`` under the member name with a trailing ``.csv`` removed.

    Parameters
    ----------
    directory : str
        Folder whose ``.zip`` files are processed; other entries are ignored.

    Raises
    ------
    ValueError
        If a headerless member is encountered before any member with a header,
        so there are no column names to assign.
    """
    # Column names from the latest member that had a header row.
    columns = None

    for file in os.listdir(directory):
        if not file.endswith('.zip'):
            continue

        with zipfile.ZipFile(os.path.join(directory, file),'r') as zip_file:
            for info in zip_file.namelist():

                file_content = zip_file.read(info).decode('utf-8',errors = 'replace')

                delimiter, has_header = sniff(file_content)

                csv_file = StringIO(file_content)

                if has_header:
                    data = pd.read_csv(csv_file, delimiter=delimiter)
                    columns = data.columns
                else:
                    if columns is None:
                        raise ValueError(
                            f'{info} in {file} has no header row and no earlier '
                            'file has supplied column names'
                        )
                    data = pd.read_csv(csv_file, delimiter = delimiter, header = None)
                    data.columns = columns

                data = data.convert_dtypes()

                data['datetime'] = pd.to_datetime(data['datetime'])
                data['description'] = data['description'].str.replace('"','')

                for col in data:
                    col_lower = col.lower()
                    # One float conversion covers every trigger: a literal \N
                    # marker in the values, a price/percent/organic column
                    # name, or an Int64/object dtype.
                    # NOTE(review): a text column containing \N is coerced as
                    # well, which turns its non-numeric values into NaN.
                    # Confirm that is intended.
                    needs_float = (
                        r'\N' in data[col].values
                        or 'price' in col_lower
                        or 'percent' in col_lower
                        or 'organic' in col_lower
                        or data[col].dtypes == 'Int64'
                        or data[col].dtypes == 'object'
                    )
                    if needs_float:
                        data[col] = pd.to_numeric(data[col],errors = 'coerce').astype('float64')

                bools = ['memType','staff','batchHeaderID','display']

                for col in bools:
                    if col in data and data[col].dtypes != bool:
                        # Unparseable and missing values count as 0, i.e. False;
                        # every other number becomes True.
                        data[col] = pd.to_numeric(data[col], errors='coerce')
                        data[col] = data[col].fillna(0)
                        data[col] = (data[col] != 0).astype(bool)

                # str.rstrip('.csv') would strip any trailing '.', 'c', 's' or
                # 'v' characters (e.g. 'items.csv' -> 'item'); remove the exact
                # suffix instead.
                if info.endswith('.csv'):
                    table_name = info[:-len('.csv')]
                else:
                    table_name = info

                upload_gbq(data,table_name)


In [18]:
# Extra local folder path; nothing else in the visible cells reads `test_dir`.
test_dir = '/Users/aaronportra/Documents/ADA/wedge_project/untitled folder/'

In [None]:
# Process and upload every archive under `directory`.
# To run against `directory_small` instead, use the commented call below.
#create_db(directory_small)

create_db(directory)

In [20]:
#Saving to CSV

# try:
#     save_path = '/Users/aaronportra/Documents/ADA/wedge_project/clean_transactions/'
#     os.listdir(save_path)
# except FileNotFoundError:
#     save_path = 'C:\\Users\\aport\\OneDrive\\Documents\\School\\Fall Semester 2024\\Applied Data Analytics\\wedge_project\\clean_transactions\\'
#     os.listdir(save_path)

# for trans in full_zips:
#     full_zips[trans].to_csv(save_path + trans + '.csv', encoding = 'utf-8', index = False)
