In [11]:
from google.cloud import storage, bigquery
import pandas as pd
from google.oauth2 import service_account
import json

# Project and bucket identifiers
project_id = 'data-finance-final'
bucket_name = 'data_finance_final'

# Authenticate with the service-account key file
project_credentials = service_account.Credentials.from_service_account_file(
    'data-finance-final-92d8049c252f.json'
)

# Connect to Cloud Storage and look up the bucket
storage_client = storage.Client(project=project_id, credentials=project_credentials)
bucket = storage_client.get_bucket(bucket_name)

# Materialise the complete blob listing so it can be scanned afterwards
blobs = [*bucket.list_blobs()]

# Find the most recent price file for each stock.
# Blob names are expected to look like 'price/Price_<STOCK>_<YYYY-MM-DD>.json';
# latest_files maps each stock symbol to {'date': Timestamp, 'blob': Blob}.
latest_files = {}
for blob in blobs:
    parts = blob.name.split('_')
    # Three components (prefix, stock, date) are required; a name with only
    # two would otherwise fail with IndexError on parts[2].
    if len(parts) < 3 or parts[0] != 'price/Price':
        continue

    # Extract the stock symbol and the date (dropping the file extension)
    stock, date_str = parts[1], parts[2].split('.')[0]
    date = pd.to_datetime(date_str, format='%Y-%m-%d', errors='coerce')
    if pd.isna(date):
        # One badly named file should not abort the scan of the whole bucket
        print(f"Skipping {blob.name}: unparseable date")
        continue

    # Keep this file only if it is newer than what we have for the stock
    if stock not in latest_files or date > latest_files[stock]['date']:
        latest_files[stock] = {'date': date, 'blob': blob}

# Initialize BigQuery client
bigquery_client = bigquery.Client(credentials=project_credentials, project=project_id)
dataset_id = 'stockMetaData'
table_id = 'stock_prices_cleaned'

# Schema of the destination table: daily price fields plus the stock symbol
schema = [
    bigquery.SchemaField('date', 'DATE'),
    bigquery.SchemaField('open', 'FLOAT'),
    bigquery.SchemaField('high', 'FLOAT'),
    bigquery.SchemaField('low', 'FLOAT'),
    bigquery.SchemaField('close', 'FLOAT'),
    bigquery.SchemaField('volume', 'INTEGER'),
    bigquery.SchemaField('stock', 'STRING'),
]

# Create the dataset and table if they do not exist yet; exists_ok=True makes
# both calls succeed when the resource is already present.
# The reference is built with DatasetReference directly because
# Client.dataset() is deprecated.
dataset_ref = bigquery.DatasetReference(project_id, dataset_id)
dataset = bigquery.Dataset(dataset_ref)
bigquery_client.create_dataset(dataset, exists_ok=True)
table_ref = dataset_ref.table(table_id)
table = bigquery.Table(table_ref, schema=schema)
bigquery_client.create_table(table, exists_ok=True)

# Build one cleaned DataFrame per stock from its most recent price file and
# combine them into df_combined (columns: date, open, high, low, close,
# volume, stock).
price_columns = ['open', 'high', 'low', 'close']
frames = []
for stock, file_info in latest_files.items():
    print(stock)

    # Download the blob and parse it as JSON
    data_json = json.loads(file_info['blob'].download_as_string())

    try:
        # Only this lookup can raise KeyError, so the try covers nothing else
        daily_series = data_json['Time Series (Daily)']
    except KeyError:
        print("Unable to Format")
        continue

    # One row per date; the date strings form the index
    df = pd.DataFrame.from_dict(daily_series, orient='index')
    df.index = pd.to_datetime(df.index)

    # Sort chronologically, then move the dates into a regular column
    df = df.sort_index().reset_index()
    df.columns = ['date', 'open', 'high', 'low', 'close', 'volume']
    df['stock'] = stock

    # Values arrive as strings; anything unparseable becomes NaN
    for column in price_columns:
        df[column] = pd.to_numeric(df[column], errors='coerce')
    df['volume'] = pd.to_numeric(df['volume'], downcast='integer', errors='coerce')

    # Check for and handle any possible duplicates in the 'date' column
    df = df.drop_duplicates(subset='date')

    frames.append(df)

# Concatenate once at the end instead of growing a DataFrame inside the loop;
# pd.concat rejects an empty list, so fall back to an empty frame.
df_combined = pd.concat(frames, axis=0) if frames else pd.DataFrame()
print(df_combined)
if df_combined.empty:
    # if_exists='replace' would overwrite the existing table with nothing
    print("No data to upload; skipping BigQuery load")
else:
    try:
        # if_exists='replace' overwrites the destination table on every run
        # (it does not append)
        df_combined.to_gbq(
            destination_table=f'{dataset_id}.{table_id}',
            project_id=project_id,
            if_exists='replace',
            credentials=project_credentials,
        )
    # NOTE(review): upload failures most likely surface as pandas-gbq or
    # Google API errors rather than KeyError — confirm what this handler
    # is meant to catch.
    except KeyError:
        print("Unable to Format")



AAPL
AMZN
FB
Unable to Format
GOOG
IBM
INTC
MSFT
SPX
Unable to Format
TSLA
V
         date    open     high      low   close    volume stock
0  2023-07-25  193.33  194.440  192.915  193.62  37283201  AAPL
1  2023-07-26  193.67  195.640  193.320  194.50  47471868  AAPL
2  2023-07-27  196.02  197.200  192.550  193.22  47460180  AAPL
3  2023-07-28  194.67  196.626  194.140  195.83  48291443  AAPL
4  2023-07-31  196.06  196.490  195.260  196.45  38824113  AAPL
..        ...     ...      ...      ...     ...       ...   ...
95 2023-12-07  254.89  256.140  253.500  255.82   3589256     V
96 2023-12-08  255.00  256.040  253.870  255.74   3732515     V
97 2023-12-11  255.00  257.630  255.000  256.52   6479312     V
98 2023-12-12  257.30  259.720  256.395  259.56   5946564     V
99 2023-12-13  259.15  262.480  258.690  262.38   4566474     V

[800 rows x 7 columns]


100%|████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<?, ?it/s]
