In [1]:
from pyspark import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.streaming import StreamingContext
from pyspark.sql.types import *
from pyspark.ml import Pipeline
from pyspark.ml.feature import *
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import *
from pyspark.sql import Row
from pyspark.sql.functions import *
import numpy as np

In [2]:
path = 'C://Users/beile.yaaqob.aisin/Downloads/The_Reddit_Ethereum_Dataset/'

In [3]:
# initializing spark session
sc = SparkContext(appName="PySparkShell")
spark = SparkSession(sc)

In [5]:
my_schema = StructType().add("date", "string") \
             .add("body", "string") \
             .add("btc_price", "float") \
             .add("eth_price", "float")

In [20]:
# read the dataset  
my_data = spark.read.format("csv").option("header","true").option("multiline","true").option("quote", "\"").option("escape", "\"").schema(my_schema).load(path+"data.csv")
my_data = my_data.dropna()

train = my_data[my_data['date'] <= '2019-10-07'] # arbitrary end of the last market cycle

test = my_data[my_data['date'] > '2019-10-07']
test = test[test['date'] <= '2021-07-12'] # the most recent 'bottom' of eth

wild = my_data[my_data['date'] > '2021-07-12']

In [7]:
# define stage 1: tokenize the text    
stage_1 = RegexTokenizer(inputCol= 'body' , outputCol= 'tokens', pattern= '\\W')
# define stage 2: remove the stop words
stage_2 = StopWordsRemover(inputCol= 'tokens', outputCol= 'filtered_words')
# define stage 3: create a word vector of the size 100
stage_3 = Word2Vec(inputCol= 'filtered_words', outputCol= 'word_vector', vectorSize= 100)
# define stage 4: assembling the word vector and the vector of btc price 
stage_4 = VectorAssembler(inputCols=['word_vector','btc_price'], outputCol='vector')


# setup the pipeline
pipeline = Pipeline(stages= [stage_1, stage_2, stage_3, stage_4])# setup the pipeline

In [8]:
pipeline_model = pipeline.fit(my_data) # training the pipeline

In [21]:
X_train = pipeline_model.transform(train)

X_test = pipeline_model.transform(test)

X_wild = pipeline_model.transform(wild) # to generate price predictions and record as results

In [10]:
dtr = DecisionTreeRegressor(featuresCol = 'vector', labelCol='eth_price') #model #1
gbt = GBTRegressor(featuresCol = 'vector', labelCol='eth_price') # model #2

In [11]:
dtr_fit = dtr.fit(X_train) #decision tree regressor
gbt_fit = gbt.fit(X_train) #Gradient-Boosted Trees regressor

In [70]:
yhat_dtr = dtr_fit.transform(X_test)
yhat_dtr = yhat_dtr[['eth_price','prediction']] # predictions by the decision tree model

yhat_gbt = gbt_fit.transform(X_test)
yhat_gbt = yhat_gbt[['eth_price','prediction']] # predictions by the gbt model

In [73]:
eval_ = RegressionEvaluator(labelCol= 'eth_price', predictionCol= 'prediction') # for evaluation

In [85]:
# evaluating the decision tree model

rmse = eval_.evaluate(yhat_dtr, {eval_.metricName: "rmse"})

print('rmse_dtr is %.2f' %rmse)

mae = eval_.evaluate(yhat_dtr, {eval_.metricName: "mae"})
print('mae_dtr is %.2f' %mae)

r2 = eval_.evaluate(yhat_dtr, {eval_.metricName: "r2"})
print('r2_dtr is %.2f' %r2)

rmse_dtr is 918.71
mae_dtr is 546.65
r2_dtr is -0.01


In [86]:
# evaluating the gbt model
rmse_gbt = eval_.evaluate(yhat_gbt, {eval_.metricName: "rmse"})

print('rmse_gbt is %.2f' %rmse_gbt)

mae_gbt = eval_.evaluate(yhat_gbt, {eval_.metricName: "mae"})
print('mae_gbt is %.2f' %mae_gbt)

r2_gbt = eval_.evaluate(yhat_gbt, {eval_.metricName: "r2"})
print('r2_gbt is %.2f' %r2_gbt)

rmse_gbt is 924.47
mae_gbt is 544.41
r2_gbt is -0.03


In [88]:
# writing the evalutions in txt files

f1 = open(path+'dtr_eval.txt', 'w')

f1.write('rmse_dtr is %.2f' %rmse)
f1.write('\n')
f1.write('mae_dtr is %.2f' %mae)
f1.write('\n')
f1.write('r2_dtr is %.2f' %r2)
f1.close()

f2 = open(path+'gbt_eval.txt', 'w')

f2.write('rmse_gbt is %.2f' %rmse_gbt)
f2.write('\n')
f2.write('mae_gbt is %.2f' %mae_gbt)
f2.write('\n')
f2.write('r2_gbt is %.2f' %r2_gbt)
f2.close()

In [80]:
# predictions of two models
yhat_dtr_wild = dtr_fit.transform(X_wild)
yhat_dtr_wild = yhat_dtr_wild[['eth_price','prediction']]

yhat_gbt_wild = gbt_fit.transform(X_wild)
yhat_gbt_wild = yhat_gbt_wild[['eth_price','prediction']]

In [81]:
import pandas as pd

In [82]:
yhat_dtr_wild = yhat_dtr_wild.toPandas()
yhat_gbt_wild = yhat_gbt_wild.toPandas()

In [84]:
yhat_dtr_wild.to_csv(path+'dtr_output.csv',index=False)
yhat_gbt_wild.to_csv(path+'gbt_output.csv',index=False)