In [219]:
import pyspark
from pyspark.sql import SparkSession
import yfinance as yahooFinance
import pandas as pd

In [220]:
spark = SparkSession.builder \
    .master("local[*]") \
    .appName('test') \
    .getOrCreate()

In [221]:
# Import packages
import yfinance as yf
import pandas as pd

# Read and print the stock tickers that make up S&P500
tickers = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]
tickers.to_csv("list_500_companies.csv")


In [124]:
!wc -l list_500_companies.csv

504 list_500_companies.csv


In [77]:
df = spark.read \
    .option("header", "true") \
    .csv('list_500_companies.csv')

In [86]:
df.show()

24/04/12 12:21:24 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , Symbol, Security, GICS Sector, GICS Sub-Industry, Headquarters Location, Date added, CIK, Founded
 Schema: _c0, Symbol, Security, GICS Sector, GICS Sub-Industry, Headquarters Location, Date added, CIK, Founded
Expected: _c0 but found: 
CSV file: file:///workspaces/RKZoomCamp2024/list_500_companies.csv
+---+------+--------------------+--------------------+--------------------+---------------------+----------+-------+-----------+
|_c0|Symbol|            Security|         GICS Sector|   GICS Sub-Industry|Headquarters Location|Date added|    CIK|    Founded|
+---+------+--------------------+--------------------+--------------------+---------------------+----------+-------+-----------+
|  0|   MMM|                  3M|         Industrials|Industrial Conglo...| Saint Paul, Minne...|1957-03-04|  66740|       1902|
|  1|   AOS|         A. O. Smith|         Industrials|   Building Products| Milwaukee,

In [131]:
# in order to specify start date and 
# end date we need datetime package
import datetime
 
# startDate , as per our convenience we can modify
startDate = datetime.datetime(2017, 1, 1)
 
# endDate , as per our convenience we can modify
endDate = datetime.datetime(2023, 12, 31)


In [176]:
#files = ['META', 'AMZN', 'GOOGL', 'AMD']
files = ['META', 'AMZN']
print(files)

['META', 'AMZN']


In [177]:
for f in files:
    stock = yahooFinance.Ticker(f)
    # pass the parameters as the taken dates for start and end
    hist = stock.history(start=startDate,end=endDate)
    hist.to_csv(f'{f}_hist.csv')
    file_name = f'{f}_hist.csv'
    print(file_name)

META_hist.csv
AMZN_hist.csv


In [178]:
!wc -l {file_name}

1761 AMZN_hist.csv


In [179]:
df = spark.read \
    .option("header", "true") \
    .csv(file_name)

In [180]:
df.show()

+--------------------+------------------+------------------+------------------+------------------+---------+---------+------------+
|                Date|              Open|              High|               Low|             Close|   Volume|Dividends|Stock Splits|
+--------------------+------------------+------------------+------------------+------------------+---------+---------+------------+
|2017-01-03 00:00:...|37.895999908447266|  37.9379997253418|  37.3849983215332| 37.68349838256836| 70422000|      0.0|         0.0|
|2017-01-04 00:00:...|37.919498443603516| 37.98400115966797|37.709999084472656| 37.85900115966797| 50210000|      0.0|         0.0|
|2017-01-05 00:00:...| 38.07749938964844|39.119998931884766| 38.01300048828125|39.022499084472656|116602000|      0.0|         0.0|
|2017-01-06 00:00:...| 39.11800003051758| 39.97200012207031| 38.92399978637695| 39.79949951171875|119724000|      0.0|         0.0|
|2017-01-09 00:00:...|39.900001525878906|  40.0885009765625|  39.58850097656

In [181]:
df.schema

StructType([StructField('Date', StringType(), True), StructField('Open', StringType(), True), StructField('High', StringType(), True), StructField('Low', StringType(), True), StructField('Close', StringType(), True), StructField('Volume', StringType(), True), StructField('Dividends', StringType(), True), StructField('Stock Splits', StringType(), True)])

In [182]:
!head -n 1001 {file_name} > head.csv

In [183]:
import pandas as pd

In [184]:
df_pandas = pd.read_csv('head.csv')

In [185]:
df_pandas.dtypes

Date             object
Open            float64
High            float64
Low             float64
Close           float64
Volume            int64
Dividends       float64
Stock Splits    float64
dtype: object

In [186]:
#spark.createDataFrame(df_pandas).schema

In [187]:
from pyspark.sql import types

In [188]:
#schema = types.StructType([
#    types.StructField('_c0', types.IntegerType(), True),
#    types.StructField('Symbol', types.StringType(), True),
#    types.StructField('Security', types.StringType(), True),
#    types.StructField('GICS Sector', types.StringType(), True),
#    types.StructField('GICS Sub-Industry', types.StringType(), True),
#    types.StructField('Date added', types.DateType(), True),
#    types.StructField('CIK', types.StringType(), True),
#    types.StructField('Founded', types.StringType(), True)
#])
schema = types.StructType([
    types.StructField('Date', types.DateType(), True),
    types.StructField('Open', types.FloatType(), True),
    types.StructField('High', types.FloatType(), True),
    types.StructField('Low', types.FloatType(), True),
    types.StructField('Close', types.FloatType(), True),
    types.StructField('Volume', types.IntegerType(), True),
    types.StructField('Dividends', types.FloatType(), True),
    types.StructField('Stock Splits', types.FloatType(), True)
])

In [189]:
df = spark.read \
    .option("header", "true") \
    .schema(schema) \
    .csv(file_name)

In [190]:
df = df.repartition(24)

In [195]:
path = f'yf/{files[1]}/2023/'

In [196]:
print(path)

yf/AMZN/2023/


In [201]:
df.write.parquet(path, mode='overwrite')

                                                                                

In [203]:
df = spark.read.parquet(path)

In [204]:
df.printSchema()

root
 |-- Date: date (nullable = true)
 |-- Open: float (nullable = true)
 |-- High: float (nullable = true)
 |-- Low: float (nullable = true)
 |-- Close: float (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Dividends: float (nullable = true)
 |-- Stock Splits: float (nullable = true)



In [205]:
df.registerTempTable('yf_amzn_2023')



In [216]:
df_result = spark.sql("""
SELECT
    Date, Volume
FROM
    yf_amzn_2023
WHERE 
    High > 40.0
""")

In [217]:
df_result.show()

+----------+---------+
|      Date|   Volume|
+----------+---------+
|2022-12-07| 68086900|
|2017-07-27|219834000|
|2017-09-18| 68226000|
|2020-01-14| 68818000|
|2020-11-18| 58336000|
|2019-03-20|125312000|
|2021-12-21| 55956000|
|2022-11-22| 62192000|
|2019-04-29| 80426000|
|2023-09-05| 40636700|
|2022-02-28| 57684000|
|2018-07-02| 63714000|
|2021-04-09| 86830000|
|2018-07-23| 77770000|
|2023-12-01| 39924600|
|2019-09-30| 52894000|
|2021-11-23| 73804000|
|2018-10-24|138568000|
|2017-07-18| 80152000|
|2018-12-07|151522000|
+----------+---------+
only showing top 20 rows



In [218]:
df_result.coalesce(1).write.parquet('yf/out/', mode='overwrite')