In [108]:
import pyspark
import pandas as pd
from pyspark.sql import SparkSession
import pyspark.sql.functions as sf      # sf = spark functions
import pyspark.sql.types as st          # st = spark types


sparkql = pyspark.sql.SparkSession.builder.master('local').getOrCreate()

import os
data_dir = '../data'

file_names = ['ADBE','AMZN', 'CRM', 'CSCO', 'GOOGL', 'IBM','INTC','META','MSFT','NFLX','NVDA','ORCL','TSLA']

columns = 'stock_name string, date date, open float, high float, low float, close float, adj_close float, volume int'

df = sparkql.read.csv(os.path.join(data_dir,'AAPL.csv'), header=True)
df = df.toDF('date', 'open', 'high', 'low', 'close', 'adj_close', 'volume')
df = df.withColumn('stock_name', sf.lit('AAPL'))
df = df.select('stock_name', 'date', 'open', 'high', 'low', 'close', 'adj_close', 'volume')

for csv in file_names:
    idf = sparkql.read.csv(os.path.join(data_dir,csv+'.csv'), header=True)
    idf = idf.toDF('date', 'open', 'high', 'low', 'close', 'adj_close', 'volume')
    idf = idf.withColumn('stock_name', sf.lit(csv))
    idf = idf.select('stock_name', 'date', 'open', 'high', 'low', 'close', 'adj_close', 'volume')
    #idf = idf.toPandas()
    
    df = df.union(idf)


df.write.csv(os.path.join(data_dir,'all_tech_stocks'), header=True)
#df = sparkql.read.csv(os.path.join(data_dir,'all_tech_stocks.csv'), header=True)

                                                                                

In [105]:
df.tail(10)

[Row(stock_name='TSLA', date='2022-12-15', open='153.440002', high='160.929993', low='153.279999', close='157.669998', adj_close='157.669998', volume='122334500'),
 Row(stock_name='TSLA', date='2022-12-16', open='159.639999', high='160.990005', low='150.039993', close='150.229996', adj_close='150.229996', volume='139032200'),
 Row(stock_name='TSLA', date='2022-12-19', open='154.000000', high='155.250000', low='145.820007', close='149.869995', adj_close='149.869995', volume='139390600'),
 Row(stock_name='TSLA', date='2022-12-20', open='146.050003', high='148.470001', low='137.660004', close='137.800003', adj_close='137.800003', volume='159563300'),
 Row(stock_name='TSLA', date='2022-12-21', open='139.339996', high='141.259995', low='135.889999', close='137.570007', adj_close='137.570007', volume='145417400'),
 Row(stock_name='TSLA', date='2022-12-22', open='136.000000', high='136.630005', low='122.260002', close='125.349998', adj_close='125.349998', volume='210090300'),
 Row(stock_name=

In [118]:
from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud.exceptions import NotFound

PROJECT_NAME = 'team-week-3'
DATASET_NAME = 'tech_stocks_world_events'

key_path = "/Users/Ruben/Desktop/google_cred/.cred/team_project_3/team-week-3-2f1d10dceea4.json"

credentials = service_account.Credentials.from_service_account_file(
    key_path, scopes=["https://www.googleapis.com/auth/cloud-platform"],
)

client = bigquery.Client(credentials=credentials, project=credentials.project_id)

dataset_id = f"{PROJECT_NAME}.{DATASET_NAME}"
table_id = f"{PROJECT_NAME}.{DATASET_NAME}.stocks"

data_dir = '../data'
DATA_FILE = os.path.join(data_dir,'all_tech_stocks/part-000*c000.csv')

TABLE_SCHEMA = [
    bigquery.SchemaField('stock_name', 'STRING', mode='REQUIRED'),
    bigquery.SchemaField('date', 'DATE', mode='NULLABLE'),
    bigquery.SchemaField('open', 'FLOAT', mode='NULLABLE'),
    bigquery.SchemaField('high', 'FLOAT', mode='NULLABLE'),
    bigquery.SchemaField('low', 'FLOAT', mode='NULLABLE'),
    bigquery.SchemaField('close', 'FLOAT', mode='NULLABLE'),
    bigquery.SchemaField('adj_close', 'STRING', mode='NULLABLE'),
    bigquery.SchemaField('volume', 'INTEGER', mode='NULLABLE'),
    ]

def create_dataset():
    if client.get_dataset(dataset_id) == NotFound:
        dataset = bigquery.Dataset(dataset_id)
        dataset.location = "US"
        dataset = client.create_dataset(dataset, exists_ok=True)
    else:
        pass


def create_stocks_table():
    job_config = bigquery.LoadJobConfig(
            source_format=bigquery.SourceFormat.CSV,
            autodetect=True,
            create_disposition='CREATE_NEVER',
            write_disposition='WRITE_TRUNCATE',
            ignore_unknown_values=True,
        )
    table = bigquery.Table(table_id, schema=TABLE_SCHEMA)
    table = client.create_table(table, exists_ok=True)

    with open(DATA_FILE, "rb") as source_file:
        job = client.load_table_from_file(source_file, table_id, job_config=job_config)

    job.result()

create_dataset()
create_stocks_table()

FileNotFoundError: [Errno 2] No such file or directory: '../data/all_tech_stocks/part-000*c000.csv'

In [124]:
import pyspark
import pandas as pd
from pyspark.sql import SparkSession
import pyspark.sql.functions as sf      # sf = spark functions
import pyspark.sql.types as st          # st = spark types


import os
data_dir = '../data'

file_names = ['AAPL', 'ADBE','AMZN', 'CRM', 'CSCO', 'GOOGL', 'IBM','INTC','META','MSFT','NFLX','NVDA','ORCL','TSLA']

#columns = 'stock_name string, date date, open float, high float, low float, close float, adj_close float, volume int'

df_list = []

for csv in file_names:
    idf = pd.read_csv(os.path.join(data_dir,csv+'.csv'))
    idf = idf.rename(columns={'Date':'date', 'Open':'open', 'High':'high', 'Low':'low', 'Close':'close', 'Adj Close':'adj_close', 'Volume':'volume'})
    idf.insert(0,"stock_name", csv)
    df_list.append(idf)

df = pd.concat(df_list,axis=0)


df.to_csv(os.path.join(data_dir,'all_tech_stocks.csv'), 'w', header=True)
#df = sparkql.read.csv(os.path.join(data_dir,'all_tech_stocks.csv'), header=True)