# Spark Pipeline with Kaggle Data

### 1. Set Up the Data

In [3]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as sf
import pyspark.sql.types as st

# Creating a spark session
spark = SparkSession.builder.master('local').getOrCreate()
coffee_csv = './data/coffee.csv'
# All the columns should be floats except for the 'Date' and 'Currency' columns.
schema = 'date date,open float,high float,low float,close float,volume float,currency string'
# Read the coffee data CSV file into a Spark DataFrame.
df = spark.read.csv(coffee_csv, schema=schema, header=True, enforceSchema=True)
df.describe

<bound method DataFrame.describe of DataFrame[date: date, open: float, high: float, low: float, close: float, volume: float, currency: string]>

### Columns from Aggregate Functions

In [11]:
# Add a column to the DataFrame where the values are the difference between 'Open' and 'Close'.
df = df.withColumn('daily_price_change', df.open - df.close)
# Add a column to the DataFrame where the values are the difference between 'High' and 'Low'.
df = df.withColumn('daily_fluctuation', df.high - df.low)
# Add a column to the DataFrame where the values are 'True' if the volume for that day was 100 or above, and otherwise 'False'.
df = df.withColumn('vol_over_100', df.volume > 100)

df.show(3)
df.columns

+----------+------+-----+------+------+------+--------+------------------+-----------------+------------+
|      date|  open| high|   low| close|volume|currency|daily_price_change|daily_fluctuation|vol_over_100|
+----------+------+-----+------+------+------+--------+------------------+-----------------+------------+
|2000-01-03|122.25|124.0| 116.1| 116.5|6640.0|     USD|              5.75|        7.9000015|        true|
|2000-01-04|116.25|120.5|115.75|116.25|5492.0|     USD|               0.0|             4.75|        true|
|2000-01-05| 115.0|121.0| 115.0| 118.6|6165.0|     USD|        -3.5999985|              6.0|        true|
+----------+------+-----+------+------+------+--------+------------------+-----------------+------------+
only showing top 3 rows



['date',
 'open',
 'high',
 'low',
 'close',
 'volume',
 'currency',
 'daily_price_change',
 'daily_fluctuation',
 'vol_over_100']