# 

In [1]:
import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession
import yfinance as yf
from pyspark.sql.functions import format_number, mean, max, min
from pyspark.sql.functions import (dayofmonth, hour, dayofyear, month, year, weekofyear, date_format)

In [2]:
pyspark.__version__

'3.0.3'

In [3]:
sc = SparkContext.getOrCreate()

22/07/04 16:00:56 WARN Utils: Your hostname, computador resolves to a loopback address: 127.0.1.1; using 10.0.0.135 instead (on interface wlp2s0)
22/07/04 16:00:56 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/07/04 16:00:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
spark = SparkSession(sc)

In [5]:
pyspark

<module 'pyspark' from '/home/edson/spark/spark-3.0.3-bin-hadoop3.2/python/pyspark/__init__.py'>

In [6]:
start_date = '2011-12-31'
end_date = '2017-12-31'

In [7]:
data = yf.download('WMT', start=start_date, end=end_date)

[*********************100%***********************]  1 of 1 completed


In [9]:
data.to_csv('./data/walmart_data.csv')

In [10]:
df = spark.read.csv('./data/walmart_data.csv', inferSchema=True, header=True)

In [11]:
df.columns

['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']

In [12]:
df.printSchema()

root
 |-- Date: string (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Adj Close: double (nullable = true)
 |-- Volume: integer (nullable = true)



In [13]:
for line in df.head(5):
    print(line, '\n')

Row(Date='2012-01-03', Open=59.970001220703125, High=61.060001373291016, Low=59.869998931884766, Close=60.33000183105469, Adj Close=47.53925704956055, Volume=12668800) 

Row(Date='2012-01-04', Open=60.209999084472656, High=60.349998474121094, Low=59.470001220703125, Close=59.709999084472656, Adj Close=47.05069351196289, Volume=9593300) 

Row(Date='2012-01-05', Open=59.349998474121094, High=59.619998931884766, Low=58.369998931884766, Close=59.41999816894531, Adj Close=46.82217788696289, Volume=12768200) 

Row(Date='2012-01-06', Open=59.41999816894531, High=59.45000076293945, Low=58.869998931884766, Close=59.0, Adj Close=46.491233825683594, Volume=8069400) 

Row(Date='2012-01-09', Open=59.029998779296875, High=59.54999923706055, Low=58.91999816894531, Close=59.18000030517578, Adj Close=46.63306427001953, Volume=6679300) 



In [14]:
summary = df.describe()
summary.select(summary['summary'],
                  format_number(summary['Open'].cast('float'), 2).alias('Open'),
                  format_number(summary['High'].cast('float'), 2).alias('High'),
                  format_number(summary['Low'].cast('float'), 2).alias('LOw'),
                  format_number(summary['Close'].cast('float'), 2).alias('Close'),
                  format_number(summary['Volume'].cast('int'), 0).alias('Volume'),
                  ).show()

                                                                                

+-------+--------+--------+--------+--------+----------+
|summary|    Open|    High|     LOw|   Close|    Volume|
+-------+--------+--------+--------+--------+----------+
|  count|1,509.00|1,509.00|1,509.00|1,509.00|     1,509|
|   mean|   73.44|   73.94|   73.01|   73.48| 8,335,863|
| stddev|    7.58|    7.60|    7.55|    7.58| 4,573,647|
|    min|   56.39|   57.06|   56.30|   56.42| 2,094,900|
|    max|   99.91|  100.13|   99.12|   99.62|80,898,100|
+-------+--------+--------+--------+--------+----------+



In [15]:
df_hv = df.withColumn('HV Ratio', df['High']/df['Volume']).select(['HV Ratio'])

In [16]:
df_hv.show()

+--------------------+
|            HV Ratio|
+--------------------+
|4.819714682786927E-6|
|6.290848662516662E-6|
| 4.66941298944916E-6|
| 7.36733843444859E-6|
|8.915604814435727E-6|
|8.644477449144044E-6|
|9.351828386844425E-6|
| 8.29141562102703E-6|
|7.712212051589609E-6|
|7.071764777688419...|
|1.015495462653464...|
|  6.5763540967921E-6|
| 5.90145296180676E-6|
|8.547679390846264E-6|
|8.420709512685392E-6|
|1.041448335142357...|
|8.316075435382035E-6|
|9.721183804158345E-6|
|8.029435987746889E-6|
|6.307432228123159E-6|
+--------------------+
only showing top 20 rows



In [17]:
df.orderBy(df['High'].desc()).select(['Date']).head(1)[0]['Date']

'2017-11-17'

In [18]:
df.select(mean('Close')).show()

+-----------------+
|       avg(Close)|
+-----------------+
|73.48192850243105|
+-----------------+



In [19]:
df.select(max('Volume'), min('Volume')).show()

+-----------+-----------+
|max(Volume)|min(Volume)|
+-----------+-----------+
|   80898100|    2094900|
+-----------+-----------+



In [20]:
df.filter(df['Close'] < 60).count()

81

In [21]:
df.filter('High > 80').count() * 100/df.count()

13.585155732273028

In [22]:
df.corr('High', 'Volume')

-0.21686100845970427

In [23]:
year_df = df.withColumn('Year', year(df['Date']))

In [24]:
year_df.groupBy('Year').max()['Year', 'max(High)'].show()



+----+------------------+
|Year|         max(High)|
+----+------------------+
|2015| 90.97000122070312|
|2013| 81.37000274658203|
|2014| 88.08999633789062|
|2012|  77.5999984741211|
|2016| 75.19000244140625|
|2017|100.12999725341797|
+----+------------------+



In [25]:
month_df = df.withColumn('Month', month(df['Date']))
month_df = month_df.groupBy('Month').mean()
month_df = month_df.orderBy('Month')
month_df['Month', 'avg(Close)'].show()



+-----+-----------------+
|Month|       avg(Close)|
+-----+-----------------+
|    1|70.82504124286746|
|    2|70.92482777299553|
|    3| 71.5119999225323|
|    4|73.08225816295993|
|    5|73.09976547956467|
|    6|73.36492192745209|
|    7|74.74850379763626|
|    8|74.23105280022872|
|    9|73.39057400187508|
|   10|73.78681824424051|
|   11|75.76213136266489|
|   12|76.81428603520469|
+-----+-----------------+



                                                                                