In [None]:
import findspark
# init() looks up the Spark installation on its own when no path is given, so
# the earlier findspark.find() call (whose result was discarded) was redundant.
findspark.init()

In [None]:
import pyspark
from pyspark.sql import SparkSession

In [None]:
# Create (or reuse) the notebook's Spark session on a local master.
spark = (
    SparkSession.builder
    .master("local")
    .appName("StockMarket")
    .config("spark.executor.memory", "6gb")
    .getOrCreate()
)

In [None]:
# Load the AAPL price history with a header row and inferred column types.
# Uses Spark's built-in CSV reader rather than the legacy
# 'com.databricks.spark.csv' format name from the external spark-csv package.
df = spark.read.csv('AAPL.csv', header=True, inferSchema=True)

In [None]:
# Preview the freshly loaded rows to sanity-check the parsed columns.
df.show()

In [None]:
import pyspark.sql.functions as f
# Parse the 'Date' column into a date-typed column named 'date'.
parsed_date = f.to_date('Date')
df = df.withColumn('date', parsed_date)

In [None]:
# Check the first five rows after the date conversion.
df.show(5)

In [None]:
# Add 'year', 'month' and 'day' columns by splitting 'date' on '-' and taking
# the piece at the matching position.
date_breakdown = ['year', 'month', 'day']
date_pieces = f.split('date', '-')
for position, part_name in enumerate(date_breakdown):
    df = df.withColumn(part_name, date_pieces[position])

df.show(n=10)

In [None]:
# Pull only the two columns needed for plotting into a pandas DataFrame.
plot_columns = ['year', 'Adj Close']
df_plot = df.select(plot_columns).toPandas()

In [None]:
from matplotlib import pyplot as plt
%matplotlib inline
df_plot.set_index('year', inplace=True)
df_plot.plot(figsize=(16, 6), grid=True)
plt.title('Apple stock')
plt.ylabel('Stock Quote ($)')
plt.show()

In [None]:
# (rows, columns) of the full dataset, computed in Spark instead of collecting
# every row to the driver just to read the shape.
(df.count(), len(df.columns))

In [None]:
# Number of rows that contain no null values.
df.na.drop().count()

In [None]:
# Summary statistics for the price columns.
price_columns = ['Open', 'High', 'Low', 'Close', 'Adj Close']
df.select(price_columns).describe().show()

In [None]:
# Count rows per year, most recent year first.
rows_per_year = df.groupBy('year').agg({'Adj Close': 'count'})
rows_per_year = rows_per_year.withColumnRenamed('count(Adj Close)', 'Row Count')
rows_per_year.orderBy('year', ascending=False).show()

In [None]:
# Chronological split: years before 2017 for training, later years for testing.
trainDF = df.filter(df.year < 2017)
testDF = df.filter(df.year > 2016)

In [None]:
# (rows, columns) of the training split, computed in Spark instead of
# collecting every row to the driver just to read the shape.
(trainDF.count(), len(trainDF.columns))

In [None]:
# (rows, columns) of the test split, computed in Spark instead of collecting
# every row to the driver just to read the shape.
(testDF.count(), len(testDF.columns))

In [None]:
# Adjusted close for the training split, indexed by year.
trainDF_plot = trainDF.select('year', 'Adj Close').toPandas().set_index('year')
trainDF_plot.plot(figsize=(16, 6), grid=True)
plt.title('Apple Stock 2000-2016')
plt.ylabel('Stock Quote ($)')
plt.show()

In [None]:
# Adjusted close for the test split, indexed by year.
testDF_plot = testDF.select('year', 'Adj Close').toPandas().set_index('year')
testDF_plot.plot(figsize=(16, 6), grid=True)
plt.title('Apple Stock 2017-2018')
plt.ylabel('Stock Quote ($)')
plt.show()

In [None]:
import numpy as np
# Collect the model columns into NumPy arrays; 'Adj Close' is kept last so it
# can be sliced off as the final column.
model_columns = ['Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']
trainArray = np.array(trainDF.select(model_columns).collect())
testArray = np.array(testDF.select(model_columns).collect())

In [None]:
# Show the first unscaled row of each split, separated by a rule.
print(trainArray[0], '---------------', testArray[0], sep='\n')

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Scaler used to normalise every column of the train/test arrays.
minMaxScale = MinMaxScaler()

In [None]:
# Fit the scaler on the training rows only.
minMaxScale.fit(trainArray)

In [None]:
# Apply the already-fitted scaler to both splits.
trainingArray = minMaxScale.transform(trainArray)
testingArray = minMaxScale.transform(testArray)

In [None]:
# Show the first scaled row of each split, separated by a rule.
print(testingArray[0], '---------------', trainingArray[0], sep='\n')

In [None]:
# Features are every column but the last; the target is the last column,
# sliced with -1: so it stays two-dimensional.
xtrain, ytrain = trainingArray[:, :-1], trainingArray[:, -1:]
xtest, ytest = testingArray[:, :-1], testingArray[:, -1:]

In [None]:
# First scaled training row, features and target together.
trainingArray[0]

In [None]:
# Feature part of the first training row.
xtrain[0]

In [None]:
# Target part of the first training row.
ytrain[0]

In [None]:
# Report the shape of each feature/target array.
shape_reports = (
    ('xtrain shape = {}', xtrain),
    ('xtest shape = {}', xtest),
    ('ytrain shape = {}', ytrain),
    ('ytest shape = {}', ytest),
)
for template, array in shape_reports:
    print(template.format(array.shape))

In [None]:
# Plot the four scaled price features of the training set on one figure.
plt.figure(figsize=(16,6))
price_series = (('red', 'open'), ('blue', 'high'), ('green', 'low'), ('purple', 'close'))
for column, (line_color, series_label) in enumerate(price_series):
    plt.plot(xtrain[:, column], color=line_color, label=series_label)
plt.legend(loc = 'upper left')
plt.title('Open, High, Low, and Close by Day')
plt.xlabel('Days')
plt.ylabel('Scaled Quotes')
plt.show()

In [None]:
# Plot the scaled volume feature (column 4) of the training set.
plt.figure(figsize=(16,6))
scaled_volume = xtrain[:, 4]
plt.plot(scaled_volume, color='black', label='volume')
plt.title('Volume by Day')
plt.xlabel('Days')
plt.ylabel('Scaled Volume')
plt.legend(loc = 'upper right')
plt.show()

In [None]:
from keras import models, layers

In [None]:
# Single-unit LSTM over one timestep of five features, followed by a
# one-unit dense output; trained with MSE loss and the Adam optimizer.
model = models.Sequential([
    layers.LSTM(1, input_shape=(1, 5)),
    layers.Dense(1),
])
model.compile(optimizer='adam', loss='mean_squared_error')

In [None]:
# Insert a length-1 timestep axis so the features become
# (samples, 1, features). The ndim guard keeps the cell re-runnable: without
# it, a second run would reshape the already 3-D arrays and raise ValueError.
if xtrain.ndim == 2:
    xtrain = xtrain.reshape((xtrain.shape[0], 1, xtrain.shape[1]))
if xtest.ndim == 2:
    xtest = xtest.reshape((xtest.shape[0], 1, xtest.shape[1]))

In [None]:
# Confirm the reshaped input dimensions.
reshaped_reports = (
    ('The shape of xtrain is {}: ', xtrain),
    ('The shape of xtest is {}: ', xtest),
)
for template, array in reshaped_reports:
    print(template.format(array.shape))

In [None]:
# Train the model; `loss` keeps the object returned by fit() so its recorded
# history can be inspected afterwards.
loss = model.fit(xtrain, ytrain, batch_size=10, epochs=10)

In [None]:
# Training loss recorded for each epoch.
epoch_losses = loss.history['loss']
plt.plot(epoch_losses, label = 'loss')
plt.legend()
plt.title('mean squared error by epoch')
plt.show()

In [None]:
# Run the trained model on the test features.
predicted = model.predict(xtest)

In [None]:
# Place actual and predicted targets side by side: column 0 is actual,
# column 1 is predicted.
combined_array = np.hstack((ytest, predicted))

In [None]:
# Compare actual (column 0) and predicted (column 1) scaled targets on the
# test split. The title now uses the correct ticker (AAPL, matching the input
# file) and the same 2017-2018 range used for the test-split plot.
plt.figure(figsize=(16,6))
plt.plot(combined_array[:,0],color='red', label='actual')
plt.plot(combined_array[:,1],color='blue', label='predicted')
plt.legend(loc = 'lower right')
plt.title('2017-2018 Actual vs. Predicted AAPL Stock')
plt.xlabel('Days')
plt.ylabel('Scaled Quotes')
plt.show()

In [None]:
import sklearn.metrics as metrics
# Root-mean-squared error between actual and predicted scaled targets.
test_mse = metrics.mean_squared_error(ytest, predicted)
np.sqrt(test_mse)

In [None]:
# Shut down the Spark session now that the analysis is finished.
spark.stop()