In [33]:
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType, DoubleType, BooleanType
from pyspark.sql.functions import col, array_contains, avg, kurtosis, mean, skewness

# Get or instantiate a SparkContext and register it as a singleton object
sc = SparkSession.builder.appName('adq').getOrCreate()

In [6]:
delimiter = ","
# A DataFrame is created by reading a file
df = sc.read.format("csv").option("inferSchema", True).option("header", True).option("sep", delimiter).load("TSLA.csv")

df.printSchema()
df.show()

root
 |-- Date: string (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Adj Close: double (nullable = true)
 |-- Volume: integer (nullable = true)

+----------+---------+---------+---------+---------+---------+--------+
|      Date|     Open|     High|      Low|    Close|Adj Close|  Volume|
+----------+---------+---------+---------+---------+---------+--------+
|2010-06-29|     19.0|     25.0|17.540001|23.889999|23.889999|18766300|
|2010-06-30|25.790001|    30.42|23.299999|    23.83|    23.83|17187100|
|2010-07-01|     25.0|    25.92|    20.27|21.959999|21.959999| 8218800|
|2010-07-02|     23.0|     23.1|18.709999|19.200001|19.200001| 5139800|
|2010-07-06|     20.0|     20.0|    15.83|16.110001|16.110001| 6866900|
|2010-07-07|     16.4|16.629999|    14.98|     15.8|     15.8| 6921700|
|2010-07-08|16.139999|    17.52|    15.57|17.459999|17.459999| 7711400|
|2010-07-09|  

# Data Science PySpark Notebook
---


In [9]:
""" 
Column instance: df.colName; almost all functions from the pyspark.sql.functions module take one or more column instances as arguments. These functions are important for data manipulation
"""

df.show(n=5, truncate=False)

# pandas DataFrame -> spark DataFrame
# pdf = pd.read_csv('TSLA')
# df = spark.createDataFrame(pdf)

+----------+---------+-----+---------+---------+---------+--------+
|Date      |Open     |High |Low      |Close    |Adj Close|Volume  |
+----------+---------+-----+---------+---------+---------+--------+
|2010-06-29|19.0     |25.0 |17.540001|23.889999|23.889999|18766300|
|2010-06-30|25.790001|30.42|23.299999|23.83    |23.83    |17187100|
|2010-07-01|25.0     |25.92|20.27    |21.959999|21.959999|8218800 |
|2010-07-02|23.0     |23.1 |18.709999|19.200001|19.200001|5139800 |
|2010-07-06|20.0     |20.0 |15.83    |16.110001|16.110001|6866900 |
+----------+---------+-----+---------+---------+---------+--------+
only showing top 5 rows



In [10]:
df.dtypes

[('Date', 'string'),
 ('Open', 'double'),
 ('High', 'double'),
 ('Low', 'double'),
 ('Close', 'double'),
 ('Adj Close', 'double'),
 ('Volume', 'int')]

In [27]:
df(avg(col("High"))).show()

TypeError: 'DataFrame' object is not callable

In [34]:
df.agg(kurtosis(col("High"))).show()
df.agg(skewness(col("High"))).show()

+-------------------+
|     kurtosis(High)|
+-------------------+
|-0.5854562653993631|
+-------------------+

+--------------------+
|      skewness(High)|
+--------------------+
|0.011915565591975215|
+--------------------+



In [32]:
df.filter(df['High'] > 0).agg(mean(col("High"))).show()

+------------------+
|         avg(High)|
+------------------+
|189.57822425620878|
+------------------+



In [35]:
df.describe().show()

+-------+----------+------------------+------------------+------------------+------------------+------------------+-----------------+
|summary|      Date|              Open|              High|               Low|             Close|         Adj Close|           Volume|
+-------+----------+------------------+------------------+------------------+------------------+------------------+-----------------+
|  count|      2416|              2416|              2416|              2416|              2416|              2416|             2416|
|   mean|      null| 186.2711466001655|189.57822425620878|182.91663908236822|186.40365078187054|186.40365078187054|5572721.688741722|
| stddev|      null|118.74016318157156|120.89232871387047|116.85759099326675|119.13601997634154|119.13601997634154|4987809.151888422|
|    min|2010-06-29|         16.139999|         16.629999|             14.98|              15.8|              15.8|           118500|
|    max|2020-02-03|        673.690002|        786.140015|    

In [None]:
# Make the describe function except for the other measures