In [None]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, SparkSession
from pyspark.sql.functions import *
from datetime import datetime

In [None]:
# Show the SparkConf of the already-running context `sc`.
# NOTE(review): `sc` is not created in any visible cell before this one —
# presumably it is injected by the PySpark kernel/shell; confirm.
sc.getConf()#.get("spark.executor.instances")

In [None]:
# Executor/driver sizing applied to the replacement SparkContext created below.
# NOTE(review): the Spark configuration docs say `spark.driver.memory` set through
# SparkConf in client mode may not take effect because the driver JVM is already
# running — confirm it is honoured in this environment.
spark_settings = [
    ('spark.executor.cores', '2'),
    ('spark.executor.memory', '4g'),
    ('spark.driver.memory', '4g'),
    ('spark.submit.deployMode', 'client'),
]
config = SparkConf().setAll(spark_settings)

# Stop the existing context first, then start a new one with the custom
# configuration.
sc.stop()
sc = SparkContext(conf=config)

# Session and legacy SQL entry points bound to the new context.
spark = SparkSession(sc)
sqlContext = SQLContext(sc)

In [None]:
# Load the e-commerce CSV (header row, inferred schema), drop the CustomerID and
# UnitPrice columns, and convert `Date` with to_date().
df = (
    sqlContext.read
    .csv('user_ecommerce-data.csv', header=True, inferSchema=True)
    .drop('CustomerID', 'UnitPrice')
    .withColumn("Date", to_date(col("Date")))
)

# The row count is the cell's displayed output.
df.count()

In [None]:
def get_outliers(df, column):
    """Display the rows of `df` whose `column` value lies outside the fences.

    The fences are derived from the 5th and 95th percentiles of `column`
    (requested with relative error 0.0): anything below
    ``p5 - 1.5 * (p95 - p5)`` or above ``p95 + 1.5 * (p95 - p5)`` is shown.

    Args:
        df: DataFrame exposing ``stat.approxQuantile``, ``filter`` and
            column access by name.
        column: Name of the column to inspect.

    Returns:
        None. The matching rows are printed via ``show()``.
    """
    bounds = df.stat.approxQuantile(column, [0.05, 0.95], 0.0)
    low, high = bounds[0], bounds[1]
    margin = 1.5 * (high - low)
    lower_fence = low - margin
    upper_fence = high + margin

    values = df[column]
    is_outlier = (values < lower_fence) | (values > upper_fence)
    df.filter(is_outlier).show()

def remove_outliers(df, column):
    """Return `df` restricted to rows whose `column` value lies inside the fences.

    The fences are derived from the 5th and 95th percentiles of `column`
    (requested with relative error 0.0):
    ``[p5 - 1.5 * spread, p95 + 1.5 * spread]`` with ``spread = p95 - p5``.

    Args:
        df: Spark DataFrame to filter.
        column: Name of the numeric column to test.

    Returns:
        A new DataFrame holding only the rows inside the fences (inclusive).
        If fewer than two quantiles come back, `df` is returned unchanged.
    """
    quantiles = df.stat.approxQuantile(column, [0.05, 0.95], 0.0)
    if len(quantiles) < 2:
        # Nothing to base the fences on; leave the data untouched.
        return df

    Q1 = quantiles[0]
    Q3 = quantiles[1]
    IQR = Q3 - Q1
    lowerRange = Q1 - 1.5 * IQR
    upperRange = Q3 + 1.5 * IQR

    # Both bounds must hold at once, hence `&`: with `|` the predicate is true
    # for every non-null value and nothing is removed.
    return df.filter((df[column] >= lowerRange) & (df[column] <= upperRange))

# filter() returns a new DataFrame, so the result has to be assigned back.
df = remove_outliers(df, 'Quantity')
df = remove_outliers(df, 'TotalPrice')

In [None]:
# Daily totals: one row per Date with the summed Quantity and TotalPrice.
agg = df.select('Date', 'Quantity', 'TotalPrice').groupby('Date').sum()
agg = agg.withColumnRenamed('sum(Quantity)', 'Quantity')
agg = agg.withColumnRenamed('sum(TotalPrice)', 'TotalPrice')

# Calendar features derived from Date.
agg = agg.withColumn('Month', month('Date'))
agg = agg.withColumn('DayOfMonth', dayofmonth('Date'))
agg = agg.withColumn('DayOfWeek', dayofweek('Date'))
agg = agg.withColumn('DayOfYear', dayofyear('Date'))
# Spark's dayofweek() numbers the days 1 = Sunday ... 7 = Saturday, so the
# weekend is {1, 7}; (6, 7) would flag Friday and Saturday instead.
agg = agg.withColumn('Weekend', dayofweek('Date').isin(1, 7).cast('int'))

# Number of distinct days; displayed as the cell output.
agg.count()

In [None]:
from pyspark.sql.functions import desc, asc
# Sort the daily aggregate chronologically and collect it as a pandas frame.
agg = agg.sort(asc("Date"))
pd_agg = agg.toPandas()

# Row counts that a 90% / 10% split would give (displayed as the cell output).
n_dates = pd_agg['Date'].count()
(n_dates * 0.9, n_dates * 0.1)

In [None]:
# Scratch arithmetic — presumably the intended train (275) + test (30) day
# counts; TODO confirm against the number of rows in `agg`.
275+30

In [None]:
from pyspark.sql.functions import desc, asc
# Chronological hold-out split: the most recent ~10% of days form the test set
# and every earlier day goes to training. Sizes are derived from the actual
# row count so the two sets can neither overlap nor leave a gap; with 305
# daily rows this yields 275 training rows and 30 test rows.
TEST_FRACTION = 0.1

agg = agg.orderBy(asc("Date"))
n_days = agg.count()
n_test = int(n_days * TEST_FRACTION)
n_train = n_days - n_test

# Oldest n_train days.
agg_train = agg.limit(n_train)
# Newest n_test days, re-sorted into ascending date order.
agg_test = agg.orderBy(desc("Date")).limit(n_test).orderBy("Date")

In [None]:
"""
from pyspark.ml.feature import Bucketizer

bucketizer = Bucketizer(splits=[ 0, 2, 5, 8, 11, 14, 15, 5000], inputCol="Quantity", outputCol="QuantityRange")
df = bucketizer.setHandleInvalid("keep").transform(df)

bucketizer = Bucketizer(splits=[ 0, 1, 2, 3, 4, 20], inputCol="UnitPrice", outputCol="PriceRange")
df = bucketizer.setHandleInvalid("keep").transform(df)

from pyspark.ml.feature import QuantileDiscretizer
discretizer = QuantileDiscretizer(numBuckets=3, inputCol="Month", outputCol="DateRange")
result = discretizer.fit(df).transform(df)

from pyspark.ml.feature import StringIndexer

si = StringIndexer(inputCol='StockCode', outputCol='StockCodeIndex')
df = si.fit(df).transform(df)

si = StringIndexer(inputCol='Country', outputCol='CountryIndex')
df = si.fit(df).transform(df)

df = df.withColumn("CountryIndex", df["CountryIndex"].cast('int'))
df = df.withColumn("StockCodeIndex", df["StockCodeIndex"].cast('int'))

df = df.withColumn("QuantityRange", df["QuantityRange"].cast('int'))
df = df.withColumn("PriceRange", df["PriceRange"].cast('int'))

df = df.withColumn("Cluster", df["Cluster"].cast('int'))
#df.show()
"""

In [None]:
from pyspark.ml.feature import VectorAssembler

# Assemble the model inputs into a single `features` vector.
# TotalPrice is the regression label, so it must not also be a feature:
# including it leaks the target and makes every score meaningless.
assembler = VectorAssembler(
    inputCols=['Quantity', 'Month', 'DayOfMonth', 'DayOfWeek', 'DayOfYear', 'Weekend'],
    outputCol="features")

# Keep only what the estimators need: the feature vector and the label.
train = assembler.transform(agg_train).select(['features', 'TotalPrice'])
test = assembler.transform(agg_test).select(['features', 'TotalPrice'])

In [None]:
def evaluate(prediction):
    """Print RMSE, MSE, MAE and R2 for a prediction DataFrame.

    Each metric is computed by a ``RegressionEvaluator`` comparing the
    ``prediction`` column against the ``TotalPrice`` label column.

    Args:
        prediction: DataFrame containing ``TotalPrice`` and ``prediction``.

    Returns:
        None. One line per metric is written to stdout.
    """
    # (metricName, printed prefix) in the order they are reported.
    report_lines = (
        ("rmse", "Root Mean Squared Error (RMSE) on test data = "),
        ("mse", "Mean Squared Error (MSE) on test data = "),
        ("mae", "MAE on test data = "),
        ("r2", "R2 on test data = "),
    )
    for metric, prefix in report_lines:
        scorer = RegressionEvaluator(labelCol="TotalPrice", predictionCol="prediction", metricName=metric)
        print(f"{prefix}{scorer.evaluate(prediction)}")


In [None]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import LinearRegression

# Linear regression on TotalPrice; the regularisation settings are left to the
# grid search below.
lr = LinearRegression(labelCol='TotalPrice')

# Search space: 6 regParam x 5 elasticNetParam x 5 maxIter values.
lr_grid = ParamGridBuilder()
lr_grid = lr_grid.addGrid(lr.regParam, [0.001, 0.01, 0.1, 0.5, 1.0, 2.0])
lr_grid = lr_grid.addGrid(lr.elasticNetParam, [0.0, 0.25, 0.5, 0.75, 1.0])
lr_grid = lr_grid.addGrid(lr.maxIter, [1, 5, 10, 20, 50])
lrparamGrid = lr_grid.build()

# Candidates are ranked by RMSE against the TotalPrice label.
lrevaluator = RegressionEvaluator(labelCol="TotalPrice", predictionCol="prediction", metricName="rmse")

# 5-fold cross-validation over the whole grid.
lrcv = CrossValidator(
    estimator=lr,
    estimatorParamMaps=lrparamGrid,
    evaluator=lrevaluator,
    numFolds=5,
)
lrcvModel = lrcv.fit(train)

# The fitted cross-validator model predicts with the best model it found
# (also reachable as lrcvModel.bestModel); score it on the hold-out set.
lrpredictions = lrcvModel.transform(test)
evaluate(lrpredictions)

In [None]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import DecisionTreeRegressor

# Decision-tree regressor on TotalPrice, tuned by cross-validation.
dt = DecisionTreeRegressor(labelCol='TotalPrice')

# Search space: 3 maxDepth x 2 maxBins values.
dt_grid = ParamGridBuilder()
dt_grid = dt_grid.addGrid(dt.maxDepth, [2, 5, 10])
dt_grid = dt_grid.addGrid(dt.maxBins, [10, 20])
dtparamGrid = dt_grid.build()

# Candidates are ranked by RMSE against the TotalPrice label.
dtevaluator = RegressionEvaluator(labelCol="TotalPrice", predictionCol="prediction", metricName="rmse")

# 3-fold cross-validation over the grid.
dtcv = CrossValidator(
    estimator=dt,
    estimatorParamMaps=dtparamGrid,
    evaluator=dtevaluator,
    numFolds=3,
)
dtcvModel = dtcv.fit(train)

# Predict with the best model found by cross-validation and score it on the
# hold-out set.
dtpredictions = dtcvModel.transform(test)
evaluate(dtpredictions)

In [None]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import GBTRegressor

# Gradient-boosted trees on TotalPrice, run through the cross-validator.
gbt = GBTRegressor(labelCol='TotalPrice')

# The grid is a single candidate (maxDepth=2, maxBins=10).
gbt_grid = ParamGridBuilder()
gbt_grid = gbt_grid.addGrid(gbt.maxDepth, [2])
gbt_grid = gbt_grid.addGrid(gbt.maxBins, [10])
gbtparamGrid = gbt_grid.build()

# This cell scores candidates by R2 against the TotalPrice label.
gbtevaluator = RegressionEvaluator(labelCol="TotalPrice", predictionCol="prediction", metricName="r2")

# 3-fold cross-validation over the grid.
gbtcv = CrossValidator(
    estimator=gbt,
    estimatorParamMaps=gbtparamGrid,
    evaluator=gbtevaluator,
    numFolds=3,
)
gbtcvModel = gbtcv.fit(train)

# Predict with the best model found by cross-validation and score it on the
# hold-out set.
gbtpredictions = gbtcvModel.transform(test)
evaluate(gbtpredictions)

In [None]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import RandomForestRegressor

# Random-forest regressor on TotalPrice, tuned by cross-validation.
rf = RandomForestRegressor(labelCol='TotalPrice')

# Search space: 3 maxDepth x 3 maxBins x 3 numTrees values.
rf_grid = ParamGridBuilder()
rf_grid = rf_grid.addGrid(rf.maxDepth, [2, 5, 10])
rf_grid = rf_grid.addGrid(rf.maxBins, [5, 10, 20])
rf_grid = rf_grid.addGrid(rf.numTrees, [5, 20, 50])
rfparamGrid = rf_grid.build()

# Candidates are ranked by RMSE against the TotalPrice label.
rfevaluator = RegressionEvaluator(labelCol="TotalPrice", predictionCol="prediction", metricName="rmse")

# 3-fold cross-validation with parallelism=10.
rfcv = CrossValidator(
    estimator=rf,
    estimatorParamMaps=rfparamGrid,
    evaluator=rfevaluator,
    numFolds=3,
    parallelism=10,
)

# Wall-clock time spent in fit().
start = datetime.now()
rfcvModel = rfcv.fit(train)
print(f'fitting: {datetime.now() - start}')

# Wall-clock time spent in the transform() call.
start = datetime.now()
rfpredictions = rfcvModel.transform(test)
print(f'predicting: {datetime.now() - start}')

# Predictions come from the best model found by cross-validation; score them
# on the hold-out set.
evaluate(rfpredictions)

In [None]:
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator

# Manual GBT sweep over (maxDepth, maxBins); each list currently holds a
# single value, so exactly one model is trained.
for md in [30]:
    for mb in [200]:
        print(f'maxDepth={md},maxBins={mb}')
        gbt = GBTRegressor(labelCol='TotalPrice', maxDepth=md, maxBins=mb)

        start = datetime.now()
        gbtModel = gbt.fit(train)
        print(f'fitting: {datetime.now() - start}')

        # NOTE(review): transform() presumably only builds a lazy plan, so this
        # timing likely excludes the actual prediction work — confirm.
        start = datetime.now()
        prediction = gbtModel.transform(test)
        print(f'predicting: {datetime.now() - start}')

        # Score the predictions computed above instead of transforming again.
        evaluate(prediction)

In [None]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

# Manual random-forest sweep over (maxDepth, maxBins, numTrees); each list
# currently holds a single value, so exactly one model is trained.
for md in [30]:
    for mb in [540]:
        for nt in [25]:
            print(f'maxDepth={md},maxBins={mb},numTrees={nt}')
            rf = RandomForestRegressor(labelCol='TotalPrice', maxDepth=md, maxBins=mb, numTrees=nt)

            start = datetime.now()
            rfModel = rf.fit(train)
            print(f'fitting: {datetime.now() - start}')

            # NOTE(review): transform() presumably only builds a lazy plan, so
            # this timing likely excludes the actual prediction work — confirm.
            start = datetime.now()
            prediction = rfModel.transform(test)
            print(f'predicting: {datetime.now() - start}')

            print(rfModel.featureImportances)
            # Score the predictions computed above instead of transforming again.
            evaluate(prediction)
            print('')

In [None]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Manual linear-regression sweep over (maxIter, regParam, elasticNetParam):
# 2 x 1 x 2 = 4 models are trained and scored on the hold-out set.
for mi in [5, 10]:
    for rp in [1]:
        for en in [0.5, 1]:
            print(f'maxIter={mi},regParam={rp},elasticNetParam={en}')
            lr = LinearRegression(labelCol='TotalPrice', maxIter=mi, regParam=rp, elasticNetParam=en)

            start = datetime.now()
            lrModel = lr.fit(train)
            print(f'fitting: {datetime.now() - start}')

            # NOTE(review): transform() presumably only builds a lazy plan, so
            # this timing likely excludes the actual prediction work — confirm.
            start = datetime.now()
            prediction = lrModel.transform(test)
            print(f'predicting: {datetime.now() - start}')

            # Score the predictions computed above instead of transforming again.
            evaluate(prediction)
            print('')

In [None]:
from pyspark.ml.regression import LinearRegression
# Single elastic-net fit with fixed hyper-parameters, used to inspect the
# fitted coefficients and the training summary.
lr = LinearRegression(labelCol='TotalPrice', maxIter=10, regParam=0.3, elasticNetParam=0.8)
lrModel = lr.fit(train)

print(f"Coefficients: {str(lrModel.coefficients)}")
print(f"Intercept: {str(lrModel.intercept)}")

# Training-set diagnostics reported by the model summary.
trainingSummary = lrModel.summary
print(f"numIterations: {trainingSummary.totalIterations:d}")
print(f"objectiveHistory: {str(trainingSummary.objectiveHistory)}")
print(f"RMSE: {trainingSummary.rootMeanSquaredError:f}")
print(f"r2: {trainingSummary.r2:f}")

# Hold-out metrics for the same model.
prediction = lrModel.transform(test)
evaluate(prediction)