In [149]:
import pyspark
import pandas as pd
from pyspark.sql import SparkSession
import pyspark.sql.functions as sf      # sf = spark functions
import pyspark.sql.types as st          # st = spark types


sparkql = pyspark.sql.SparkSession.builder.master('local').getOrCreate()

import os
data_dir = '../data'

file_names = ['ADBE','AMZN', 'CRM', 'CSCO', 'GOOGL', 'IBM','INTC','META','MSFT','NFLX','NVDA','ORCL','TSLA']

columns = 'stock_name string, date date, open float, high float, low float, close float, adj_close float, volume int'

df = sparkql.read.csv(os.path.join(data_dir,'AAPL.csv'), header=True)
df = df.toDF('date', 'open', 'high', 'low', 'close', 'adj_close', 'volume')
df = df.withColumn('stock_name', sf.lit('AAPL'))
df = df.select('stock_name', 'date', 'open', 'high', 'low', 'close', 'adj_close', 'volume')

for csv in file_names:
    idf = sparkql.read.csv(os.path.join(data_dir,csv+'.csv'), header=True)
    idf = idf.toDF('date', 'open', 'high', 'low', 'close', 'adj_close', 'volume')
    idf = idf.withColumn('stock_name', sf.lit(csv))
    idf = idf.select('stock_name', 'date', 'open', 'high', 'low', 'close', 'adj_close', 'volume')
    idf.createOrReplaceGlobalTempView("comp_key")
    idf = sparkql.sql("SELECT CONCAT(stock_name, CAST(date AS string)) AS sd_id, stock_name, date, open, high, low, close, adj_close, volume FROM comp_key")
    
    #idf = idf.toPandas()
    
    df = df.union(idf)

df.show(5)

#df.coalesce(1).write.format("parquet").save(os.path.join(data_dir,'all_tech_stocks.parquet'))


AnalysisException: Table or view not found: comp_key; line 1 pos 123;
'Project ['CONCAT('stock_name, cast('date as string)) AS sd_id#56410, 'stock_name, 'date, 'open, 'high, 'low, 'close, 'adj_close, 'volume]
+- 'UnresolvedRelation [comp_key], [], false


In [142]:
from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud.exceptions import NotFound

PROJECT_NAME = 'team-week-3'
DATASET_NAME = 'tech_stocks_world_events'

key_path = "/Users/Ruben/Desktop/google_cred/.cred/team_project_3/team-week-3-2f1d10dceea4.json"

credentials = service_account.Credentials.from_service_account_file(
    key_path, scopes=["https://www.googleapis.com/auth/cloud-platform"],
)

client = bigquery.Client(credentials=credentials, project=credentials.project_id)

dataset_id = f"{PROJECT_NAME}.{DATASET_NAME}"
table_id = f"{PROJECT_NAME}.{DATASET_NAME}.stocks"

data_dir = '../data'
DATA_FILE = os.path.join(data_dir,'all_tech_stocks.parquet/part-00000-59898083-310c-4f1e-a416-5593ab0d19c9-c000.snappy.parquet')

TABLE_SCHEMA = [
    bigquery.SchemaField('stock_name', 'STRING', mode='REQUIRED'),
    bigquery.SchemaField('date', 'DATE', mode='NULLABLE'),
    bigquery.SchemaField('open', 'FLOAT', mode='NULLABLE'),
    bigquery.SchemaField('high', 'FLOAT', mode='NULLABLE'),
    bigquery.SchemaField('low', 'FLOAT', mode='NULLABLE'),
    bigquery.SchemaField('close', 'FLOAT', mode='NULLABLE'),
    bigquery.SchemaField('adj_close', 'STRING', mode='NULLABLE'),
    bigquery.SchemaField('volume', 'INTEGER', mode='NULLABLE'),
    ]

def create_dataset():
    if client.get_dataset(dataset_id) == NotFound:
        dataset = bigquery.Dataset(dataset_id)
        dataset.location = "US"
        dataset = client.create_dataset(dataset, exists_ok=True)
    else:
        pass


def create_stocks_table():
    job_config = bigquery.LoadJobConfig(
            source_format=bigquery.SourceFormat.PARQUET,
            autodetect=True,
            create_disposition='CREATE_NEVER',
            write_disposition='WRITE_TRUNCATE',
            ignore_unknown_values=True,
        )
    table = bigquery.Table(table_id, schema=TABLE_SCHEMA)
    table = client.create_table(table, exists_ok=True)

    with open(DATA_FILE, "rb") as source_file:
        job = client.load_table_from_file(source_file, table_id, job_config=job_config)

    job.result()

create_dataset()
create_stocks_table()

In [132]:
import pyspark
import pandas as pd
from pyspark.sql import SparkSession
import pyspark.sql.functions as sf      # sf = spark functions
import pyspark.sql.types as st          # st = spark types


import os
data_dir = '../data'

file_names = ['AAPL', 'ADBE','AMZN', 'CRM', 'CSCO', 'GOOGL', 'IBM','INTC','META','MSFT','NFLX','NVDA','ORCL','TSLA']

#columns = 'stock_name string, date date, open float, high float, low float, close float, adj_close float, volume int'

df_list = []

def composite_key(row):
    return f"{row.stock_name}-{row.date}"

for csv in file_names:
    idf = pd.read_csv(os.path.join(data_dir,csv+'.csv'))
    idf = idf.rename(columns={'Date':'date', 'Open':'open', 'High':'high', 'Low':'low', 'Close':'close', 'Adj Close':'adj_close', 'Volume':'volume'})
    idf.insert(0,"stock_name", csv)
    idf.insert(0,"sd_id", idf['sd_id'].apply(composite_key))
    df_list.append(idf)

df = pd.concat(df_list,axis=0)
df.set_index('sd_id', inplace=True)


df.to_csv(os.path.join(data_dir,'all_tech_stocks.csv'), header=True)
#df = sparkql.read.csv(os.path.join(data_dir,'all_tech_stocks.csv'), header=True)

KeyError: 'sd_id'