In [13]:
from google.cloud import storage, bigquery
import pandas as pd
from google.oauth2 import service_account
import json

# Set up Google Cloud credentials from the service-account key file
project_credentials = service_account.Credentials.from_service_account_file('data-finance-final-92d8049c252f.json')
project_id = 'data-finance-final'

# Initialize the Cloud Storage client
storage_client = storage.Client(credentials=project_credentials, project=project_id)
bucket_name = 'data_finance_final'

# Get the bucket
bucket = storage_client.get_bucket(bucket_name)

# Only the sentiment files are needed, so let the service filter by prefix
# instead of listing every object in the bucket and filtering locally.
sentiment_prefix = 'sentiment/Sentiment'
blobs = list(bucket.list_blobs(prefix=sentiment_prefix))

# Kept as a separate list because the code below iterates `filtered_blobs`.
filtered_blobs = [blob for blob in blobs if blob.name.startswith(sentiment_prefix)]


# Maps stock symbol -> {'date': Timestamp, 'blob': Blob} for the newest
# sentiment file seen for that symbol.
latest_files = {}

for blob in filtered_blobs:
    # Assuming file names are in 'sentiment/Sentiment_STOCK_DATE.json' format
    parts = blob.name.split('_')

    # Three pieces are required (prefix, stock, date); parts[2] is read below,
    # so anything shorter would raise IndexError.
    if len(parts) < 3 or parts[0] != 'sentiment/Sentiment':
        continue

    # Extract the stock symbol and the date (file extension stripped)
    stock, date_str = parts[1], parts[2].split('.')[0]
    try:
        date = pd.to_datetime(date_str, format='%Y-%m-%d')
    except ValueError:
        # One badly named file should not abort the scan of the others.
        print(f"Skipping {blob.name}: cannot parse date {date_str!r}")
        continue

    # Keep this file only if it is the most recent one for the stock
    if stock not in latest_files or date > latest_files[stock]['date']:
        latest_files[stock] = {'date': date, 'blob': blob}

# Initialize BigQuery client
bigquery_client = bigquery.Client(credentials=project_credentials, project=project_id)
dataset_id = 'stockMetaData'
table_id = 'stock_sentiments_cleaned'

# Define the schema of the BigQuery table (canonical upper-case type names)
schema = [
    bigquery.SchemaField('date', 'DATE'),
    bigquery.SchemaField('stock', 'STRING'),
    bigquery.SchemaField('sentiment', 'FLOAT'),
]

# Create the dataset and table if they do not exist yet; exists_ok=True makes
# both calls safe to repeat on every run.
# DatasetReference is built explicitly because Client.dataset() is deprecated.
dataset_ref = bigquery.DatasetReference(project_id, dataset_id)
dataset = bigquery.Dataset(dataset_ref)
bigquery_client.create_dataset(dataset, exists_ok=True)
table_ref = dataset_ref.table(table_id)
table = bigquery.Table(table_ref, schema=schema)
bigquery_client.create_table(table, exists_ok=True)

# Build one row per stock: (file date, stock symbol, mean ticker sentiment).
# Rows are collected in a list and turned into a frame once, which avoids
# re-copying the frame on every iteration and gives a clean 0..n-1 index.
columns = ['date', 'stock', 'sentiment']
rows = []

for stock, file_info in latest_files.items():
    print(stock)

    try:
        # Download the blob contents; json.loads accepts bytes directly
        data_json = json.loads(file_info['blob'].download_as_bytes())

        # Collect every sentiment score reported for this stock's ticker
        scores = []
        for article in data_json["feed"]:
            for ticker_info in article["ticker_sentiment"]:
                if ticker_info["ticker"] == stock:
                    scores.append(float(ticker_info["ticker_sentiment_score"]))
    except KeyError:
        # The file does not have the expected feed/ticker_sentiment layout
        print("Unable to Format")
        continue

    if not scores:
        # No article mentions this ticker: there is nothing to average, and
        # dividing by zero would abort the whole run.
        print(f"No sentiment entries found for {stock}")
        continue

    average_sentiment = sum(scores) / len(scores)
    rows.append([file_info['date'], stock, average_sentiment])

df_combined = pd.DataFrame(rows, columns=columns)

print(df_combined)

# Write the combined frame to BigQuery. if_exists='replace' overwrites the
# destination table on every run (it does not append), so skip the upload
# when there are no rows rather than replacing existing data with nothing.
if df_combined.empty:
    print("No sentiment rows to upload")
else:
    try:
        df_combined.to_gbq(destination_table=f'{dataset_id}.{table_id}', project_id=project_id, if_exists='replace', credentials=project_credentials)
    except KeyError:
        print("Unable to Format")

AAPL
AMZN
FB
Unable to Format
GOOG
IBM
INTC
MSFT
SPX
Unable to Format
TSLA
V
        date stock  sentiment
0 2023-12-14  AAPL   0.136018
0 2023-12-14  AMZN   0.159763
0 2023-12-14  GOOG   0.105519
0 2023-12-14   IBM   0.079879
0 2023-12-14  INTC   0.179523
0 2023-12-14  MSFT   0.109018
0 2023-12-14  TSLA   0.088962
0 2023-12-14     V   0.182790


100%|███████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 975.87it/s]
