Takes Ecommerce data from Kaggle and uses it to try to identify cancellations using Logistic Regression
    1. Cleanses data
            Removes all NAs
            amends dates to String with correct format
            
    2. Calculates additional features: 
            Month of Year
            Day of Week
    3. Calculates label 
            Invoice Code like 'C%'
    3a. Filters data set down to only the CustomerID with highest number of Cancellations
    (this step was added after the initial model predicted no cancellations whatsoever)
    4. Indexes String features (inc. error handling)
    5. One-hot encodes categorical features, inc. those from 4.
    6. Creates 'pipeline'
            indexing
            encoding
            vector assembly
            logistic regression model
    7. Fits model to training data
    8. Applies model to test data
    9. Basic evaluation: model predicts no cancellations whatsoever even after filtering to customer
    with the most cancellations

In [24]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, IntegerType, StructType, StringType, FloatType
from pyspark.sql import functions as F
#import (format_number,dayofmonth,hour,dayofyear,month,
 #                                  year,weekofyear,date_format,concat, lit, from_unixtime
  #                                 , unix_timestamp, to_date, sum as sm, format_number as fn
   #                               ,bround, avg as av, udf, desc)
#from pyspark.sql.functions import DayofYear as dow
import numpy as np
spark = SparkSession.builder.appName('abc').getOrCreate()
sc = spark.sparkContext
from pyspark.ml.classification import LogisticRegression

# Load the e-commerce CSV; the header row supplies column names and Spark infers the types.
df = spark.read.csv('EcommerceData.csv',inferSchema=True,header=True)

# Drop every row that has a null in any column.
df = df.na.drop()
# Parse the InvoiceDate string with the 'MM/d/yyyy HH:mm' pattern and keep only the date part.
# The concat appends an empty literal, so the text itself is not altered.
df = df.withColumn('Date', F.to_date(F.from_unixtime(F.unix_timestamp(
    F.concat(df['InvoiceDate'],F.lit('')),format='MM/d/yyyy HH:mm'))))
#df = df.withColumn('DayOfWeek', F.date_format(F.from_unixtime(F.unix_timestamp(df.InvoiceDate)),'EEEE'))
# Day-of-week number taken from the 'u' date pattern, stored as an integer.
df = df.withColumn('DayOfWeek', F.date_format(df.Date,'u').cast('integer'))
# Calendar month number derived from Date.
df = df.withColumn('Month', F.month(df['Date']))
#df = df.withColumn('Unix', F.unix_timestamp(df['Date'],format='yyyy/MM/dd'))
# gred: one row per (CustomerID, Date) pair with its row count.
gred = df.groupBy('CustomerID', 'Date').count()
# gg: customers that appear on more than 39 distinct dates; the key is renamed
# to CustID so it does not clash with CustomerID in the join below.
gg = gred.groupBy('CustomerID').count().filter('count>39').drop('count').withColumnRenamed('CustomerID','CustID')
# Keep only the rows of those customers (both CustomerID and CustID remain in the result).
df = df.join(gg,gg.CustID==df.CustomerID)
# Line value = Quantity * unit price, rounded to 2 decimals with bround.
# NOTE(review): 'Unitprice' differs in case from the 'UnitPrice' column in the
# printed schema; presumably this relies on case-insensitive column resolution - confirm.
df = df.withColumn('Val', F.bround(df['Quantity']* df['Unitprice'],2))

# Label is 1 when the invoice number contains the letter 'C' anywhere, else 0.
uErr = F.udf(lambda col: 1 if 'C' in col else 0, IntegerType())
df = df.withColumn('Label',uErr('InvoiceNo'))
df.printSchema()
df.columns
#df.groupBy('Label').count().show()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Country: string (nullable = true)
 |-- Date: date (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- CustID: integer (nullable = true)
 |-- Val: double (nullable = true)
 |-- Label: integer (nullable = true)



['InvoiceNo',
 'StockCode',
 'Description',
 'Quantity',
 'InvoiceDate',
 'UnitPrice',
 'CustomerID',
 'Country',
 'Date',
 'DayOfWeek',
 'Month',
 'CustID',
 'Val',
 'Label']

In [25]:
# Row counts per (CustomerID, Label), sorted with Label descending first and
# then by count descending, so the customers with the most Label=1 rows lead.
df.groupBy('CustomerID', 'Label').count().orderBy(F.desc('Label'),F.desc('count')).show(5)

+----------+-----+-----+
|CustomerID|Label|count|
+----------+-----+-----+
|     14911|    1|  226|
|     17841|    1|  136|
|     15311|    1|  112|
|     13798|    1|   90|
|     14606|    1|   82|
|     12748|    1|   46|
|     13089|    1|   39|
|     14527|    1|   39|
|     16029|    1|   32|
|     13767|    1|   31|
|     16422|    1|   30|
|     16133|    1|   26|
|     13408|    1|   23|
|     13078|    1|   22|
|     17811|    1|   21|
|     14156|    1|   20|
|     15189|    1|   20|
|     13694|    1|   17|
|     15039|    1|    6|
|     14646|    1|    5|
+----------+-----+-----+
only showing top 20 rows



In [29]:
# Restrict the data to customer 14911, the customer with the most Label=1 rows
# in the listing above. CustomerID is an integer column, so compare against an
# int literal rather than relying on an implicit string-to-int cast.
df = df.filter(df['CustomerID'] == 14911)

In [30]:
from pyspark.ml.feature import (VectorAssembler,VectorIndexer,
                                OneHotEncoder,StringIndexer)
#features: StockCode, Day of Week, Month
# Model inputs: Month, StockCode and DayOfWeek; the target column is Label.
my_final_data = df.select('Month','StockCode','DayOfWeek','Label')
# Map StockCode strings to numeric indices. handleInvalid="skip" drops rows
# whose code was not seen while fitting instead of failing on an unseen label.
Stock_indexer = StringIndexer(inputCol='StockCode',outputCol='StockIndex').setHandleInvalid("skip")
Stock_encoder = OneHotEncoder(inputCol='StockIndex',outputCol='StockVec')
#Mon_indexer = StringIndexer(inputCol='Month',outputCol='MonIndex').setHandleInvalid("skip")
# Month and DayOfWeek are already integer columns, so they are encoded directly.
Mon_encoder = OneHotEncoder(inputCol='Month',outputCol='MonVec')
WD_encoder = OneHotEncoder(inputCol='DayOfWeek',outputCol='WDVec')

# Concatenate the three encoded vectors into the single 'features' column.
assembler = VectorAssembler(inputCols=['StockVec',
 'MonVec',
 'WDVec'],outputCol='features')

from pyspark.ml.classification import LogisticRegression as lr
from pyspark.ml import Pipeline

lr_err = lr(maxIter=10,regParam=0.3,featuresCol='features',labelCol='Label')

# Stage order matters: index -> encode -> assemble -> classify.
pipeline = Pipeline(stages=[Stock_indexer,
                           Stock_encoder,Mon_encoder,WD_encoder,
                           assembler,lr_err])
# Random 80/20 split (unseeded, so it differs between runs).
train_data, test_data = my_final_data.randomSplit([0.8,.2])
fit_model = pipeline.fit(train_data)
results = fit_model.transform(test_data)

from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Evaluate on the continuous 'rawPrediction' scores. Pointing the evaluator at
# the hard 0/1 'prediction' column collapses the ROC curve to a single
# threshold, which is uninformative when every row is predicted 0.
my_eval = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',
                                       labelCol='Label')
results

DataFrame[Month: int, StockCode: string, DayOfWeek: int, Label: int, StockIndex: double, StockVec: vector, MonVec: vector, WDVec: vector, features: vector, rawPrediction: vector, probability: vector, prediction: double]

In [4]:
# Print the column names and types of the scored DataFrame.
results.printSchema()

root
 |-- CustomerID: integer (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- Label: integer (nullable = true)
 |-- StockIndex: double (nullable = true)
 |-- CustIndex: double (nullable = true)
 |-- StockVec: vector (nullable = true)
 |-- CustVec: vector (nullable = true)
 |-- WDVec: vector (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = true)



In [31]:
#results = results.withColumn('rprediction',F.bround('prediction',1))
#results.show()
# Number of scored rows per predicted class.
results.groupby('prediction').count().show()
# Display the first rows of the scored DataFrame.
results.show()

+----------+-----+
|prediction|count|
+----------+-----+
|       0.0| 1017|
+----------+-----+

+-----+---------+---------+-----+----------+-------------------+--------------+-------------+--------------------+--------------------+--------------------+----------+
|Month|StockCode|DayOfWeek|Label|StockIndex|           StockVec|        MonVec|        WDVec|            features|       rawPrediction|         probability|prediction|
+-----+---------+---------+-----+----------+-------------------+--------------+-------------+--------------------+--------------------+--------------------+----------+
|    1|   18097C|        2|    0|     827.0| (1634,[827],[1.0])|(12,[1],[1.0])|(7,[2],[1.0])|(1653,[827,1635,1...|[3.24808755191550...|[0.96260433068897...|       0.0|
|    1|    20914|        3|    1|      30.0|  (1634,[30],[1.0])|(12,[1],[1.0])|(7,[3],[1.0])|(1653,[30,1635,16...|[2.44479750392560...|[0.92018016896627...|       0.0|
|    1|    21035|        7|    0|     756.0| (1634,[756],[1.0])|

In [15]:
from pyspark.sql import functions as F
from pyspark.sql import Window
from pyspark.sql.types import FloatType, BooleanType
def absn(n):
    """Return the absolute value of ``n`` as a float.

    ``None`` is passed through unchanged so that a null input yields a null
    result instead of raising ``TypeError``. The result is coerced to
    ``float`` because this function is wrapped as a ``FloatType`` UDF, and
    an ``int`` result would not match that declared type.
    """
    if n is None:
        return None
    return float(abs(n))
# Wrap absn as a Spark UDF producing a float column.
uAbs = F.udf(absn,FloatType())
df = df.withColumn('AbsVal',uAbs('Val') )
# `desc` is never imported as a bare name in this notebook, so it is reached
# through the `F` alias; the bare call raised NameError.
df = df.orderBy('CustomerID','StockCode','AbsVal',F.desc('Val'))
# Within each (CustomerID, StockCode) group, order by absolute value and then
# by Val descending, so a positive line sorts directly before a negative line
# of the same magnitude. The frame covers the previous row and the current row.
win1 = Window.partitionBy('CustomerID','StockCode').orderBy('AbsVal',F.desc('Val')).rowsBetween(-1,0)
# RevTot = Val of the previous row + Val of the current row; it is zero when
# the two lines cancel each other out.
df = df.withColumn('RevTot',F.sum('Val').over(win1).cast('Float'))
def revcheck(a,b):
    """Return 1 when ``a`` is negative and ``b`` equals zero, otherwise 0."""
    is_reversal = (a < 0) & (b == 0)
    return 1 if is_reversal else 0
# Declare the return type explicitly: F.udf defaults to StringType, which
# would turn the 0/1 flag returned by revcheck into a string column.
uRevCheck = F.udf(revcheck, IntegerType())
#df.show(1)
# RevFlag = 1 for a negative line whose two-row running total is zero.
df = df.withColumn('RevFlag',uRevCheck('Val','RevTot'))
#creates a LIKE filter function
# True when the value contains the letter 'C' anywhere.
likef = F.udf(lambda col: 'C' in col, BooleanType())
# Show three rows whose invoice number contains 'C', with the reversal working columns.
df.filter(likef('InvoiceNo')).select('CustomerID','InvoiceNo','StockCode','Description','Val','AbsVal','RevTot').show(3)

Py4JJavaError: An error occurred while calling o251.collectToPython.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 38.0 failed 1 times, most recent failure: Lost task 0.0 in stage 38.0 (TID 251, localhost, executor driver): org.apache.spark.SparkException: Failed to execute user defined function($anonfun$4: (string) => double)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:377)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:231)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:225)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:826)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:826)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:99)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Unseen label: 84614A.
	at org.apache.spark.ml.feature.StringIndexerModel$$anonfun$4.apply(StringIndexer.scala:170)
	at org.apache.spark.ml.feature.StringIndexerModel$$anonfun$4.apply(StringIndexer.scala:166)
	... 16 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1435)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1423)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1422)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1422)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:802)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1650)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1605)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1594)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:628)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1918)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1931)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1944)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1958)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:935)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:934)
	at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:275)
	at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply$mcI$sp(Dataset.scala:2745)
	at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:2742)
	at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:2742)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:57)
	at org.apache.spark.sql.Dataset.withNewExecutionId(Dataset.scala:2765)
	at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:2742)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Failed to execute user defined function($anonfun$4: (string) => double)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:377)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:231)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:225)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:826)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:826)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:99)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
Caused by: org.apache.spark.SparkException: Unseen label: 84614A.
	at org.apache.spark.ml.feature.StringIndexerModel$$anonfun$4.apply(StringIndexer.scala:170)
	at org.apache.spark.ml.feature.StringIndexerModel$$anonfun$4.apply(StringIndexer.scala:166)
	... 16 more


In [22]:
#add Year and Day of Year column in order to see transaction volume over 10 days

from pyspark.sql.types import IntegerType
#import sys
from pyspark.sql import Window

# Import VectorAssembler for use later on
from pyspark.ml.feature import MinMaxScaler, VectorAssembler

#flag manual entries (StockCode 'M'); refunds are not removed here
def isErr(s):
    """Return 1 when ``s`` is exactly the string 'M', otherwise 0."""
    return 1 if s == 'M' else 0
# Wrap isErr as a Spark UDF. The Spark functions are reached through the `F`
# alias because the bare names (udf, bround, dayofyear, ...) are never imported.
# The explicit IntegerType keeps 'Manual' numeric; F.udf defaults to StringType.
uIsErr = F.udf(isErr, IntegerType())
#fdf = df.filter("Val is not null and Val >= 0 and StockCode <> 'M'")
# Manual = 1 for rows whose StockCode is 'M', else 0.
fdf = df.withColumn('Manual', uIsErr('StockCode'))

#fdf = fdf.groupBy(['CustomerID','Date', 'Unix','Label']).agg({'Val':'sum'})
#fdf = fdf.withColumnRenamed('sum(Val)','Val')
#fdf = fdf.withColumn('Val', bround(fdf['Val'],scale=2))

fdf = fdf.withColumn('DayOfYear', F.dayofyear(fdf['Date']))
fdf = fdf.withColumn('Month', F.month(fdf['Date']))

# The range windows below order by epoch seconds. Derive 'Unix' from 'Date'
# when it is not already present (its only other definition is commented out).
if 'Unix' not in fdf.columns:
    fdf = fdf.withColumn('Unix', F.unix_timestamp(fdf['Date'].cast('timestamp')))


def days(x):
    """Convert a number of days into seconds (86400 seconds per day)."""
    return x * 86400


# Look-back windows per customer, expressed in seconds relative to the current
# row's 'Unix' value. Every window ends one day before the current row.
by_customer = Window.partitionBy(fdf['CustomerID']).orderBy(fdf['Unix'])
w = by_customer.rangeBetween(-days(28),-days(1))     # short period 1: previous 28 days
w2 = by_customer.rangeBetween(-days(126),-days(1))   # long period 1: previous 126 days

# Total spend over short-1 and long-1 periods; an empty window yields null,
# which is replaced by 0.
fdf = fdf.withColumn('S1Tot',F.bround(F.sum(fdf['Val']).over(w),scale=2))
fdf = fdf.withColumn('L1Tot',F.bround(F.sum(fdf['Val']).over(w2),scale=2))
fdf = fdf.na.fill(0,subset='S1Tot')
fdf = fdf.na.fill(0,subset='L1Tot')
fdf = fdf.withColumn('L1-S1Tot', F.bround(fdf['L1Tot']-fdf['S1Tot'],2))

# Average line value over short-1 and long-1 periods.
fdf = fdf.withColumn('S1Avg',F.bround(F.avg(fdf['Val']).over(w),scale=2))
fdf = fdf.withColumn('L1Avg',F.bround(F.avg(fdf['Val']).over(w2),scale=2))
fdf = fdf.na.fill(0,subset='S1Avg')
fdf = fdf.na.fill(0,subset='L1Avg')
fdf = fdf.withColumn('L1-S1Avg', F.bround(fdf['L1Avg']-fdf['S1Avg'],2))

w3 = by_customer.rangeBetween(-days(7),-days(1))     # short period 2: previous 7 days
# NOTE(review): w4 has the same bounds as w, so L2Avg duplicates S1Avg.
w4 = by_customer.rangeBetween(-days(28),-days(1))    # long period 2: previous 28 days

# Average line value over short-2 and long-2 periods.
fdf = fdf.withColumn('S2Avg',F.bround(F.avg(fdf['Val']).over(w3),scale=2))
fdf = fdf.withColumn('L2Avg',F.bround(F.avg(fdf['Val']).over(w4),scale=2))
fdf = fdf.na.fill(0,subset='S2Avg')
fdf = fdf.na.fill(0,subset='L2Avg')
fdf = fdf.withColumn('L2-S2Avg', F.bround(fdf['L2Avg']-fdf['S2Avg'],2))

# gg: customers that appear on more than 39 distinct dates
# (gred holds one row per CustomerID/Date pair).
gred = fdf.groupBy('CustomerID', 'Date').count()
gg = gred.groupBy('CustomerID').count().filter('count>39').drop('count')

# Collects the four engineered columns into one 'features' vector.
assembler = VectorAssembler(
    inputCols=["L1-S1Tot", "Month","L1-S1Avg","L2-S2Avg"],
    outputCol="features")
#gg.count()B


In [6]:
# Number of rows in fdf whose Label is 0.
fdf.filter(fdf.Label==0).count()


19202

In [35]:
from pyspark.sql.types import *

#create list to add columns for results to the 'gg' DF
# Names of the metric columns stored for each customer; CustomerID is put in front.
metricCols =['r2', 'RMSE', 'pVal_L1-S1Tot','Month','pVal_L1-S1Avg', 'pVal_L2-S2Avg']
columnlist = ['CustomerID'] + metricCols

# Placeholder row (CustomerID 1, every metric 2.01) used to seed the DataFrame.
datal = [1] + [2.01] * len(metricCols)

# CustomerID is an integer field; every metric is a nullable float field.
schema1 = [StructField(columnlist[0], IntegerType(), True)]
schema1 += [StructField(metric, FloatType(), True) for metric in columnlist[1:]]

schema2 = StructType(fields=schema1)
dataa = sc.parallelize([datal])
resultdf = spark.createDataFrame(dataa,schema2)
resultdf.show()

+----------+----+----+-------------+-----+-------------+-------------+
|CustomerID|  r2|RMSE|pVal_L1-S1Tot|Month|pVal_L1-S1Avg|pVal_L2-S2Avg|
+----------+----+----+-------------+-----+-------------+-------------+
|         1|2.01|2.01|         2.01| 2.01|         2.01|         2.01|
+----------+----+----+-------------+-----+-------------+-------------+



In [37]:
# LinearRegression is not imported anywhere else in the notebook.
from pyspark.ml.regression import LinearRegression

# Customer ids taken from gg (customers seen on more than 39 distinct dates).
custList = [row[0] for row in gg.select('CustomerID').collect()]

# Both estimators are configuration only, so they are built once outside the
# loop. The regression estimator is deliberately not called `lr`: that name is
# the LogisticRegression class alias used by the classification cell.
scaler = MinMaxScaler(inputCol='features',outputCol='sFeatures')
lin_reg = LinearRegression(labelCol='Label')

# Fit one linear regression per customer and collect its metrics.
metric_rows = []
for c in custList:
    # Select the predictors and expose Val as the regression target. Aliasing
    # inside select avoids a duplicate 'Label' column when fdf already has one.
    final = fdf.filter(fdf['CustomerID']==c).select(
        'L1-S1Tot','Month','L1-S1Avg','L2-S2Avg',F.col('Val').alias('Label'))
    # Assemble the predictors into a vector, then min-max scale that vector.
    output = assembler.transform(final)
    scalerModel = scaler.fit(output)
    output = scalerModel.transform(output)
    # Keep only the scaled vector (renamed to 'features') and the label.
    final_data = output.select(['sFeatures', 'Label']).withColumnRenamed('sFeatures','features')
    train_data,test_data = final_data.randomSplit([0.7,0.3])
    lrModel = lin_reg.fit(train_data)
    summary = lrModel.summary
    # Training-set metrics: R squared, RMSE and the p-value of each of the
    # four predictors, in assembler input order.
    r2 = summary.r2
    RMSE = summary.rootMeanSquaredError
    pvals = summary.pValues
    metric_rows.append((c,r2,RMSE,pvals[0],pvals[1],pvals[2],pvals[3]))

# Append all customers with a single union instead of one union per iteration.
if metric_rows:
    resultdf = resultdf.union(spark.createDataFrame(metric_rows,schema2))

#filters out the placeholder row used to create the initial df and sorts by R Squared
resultdf = resultdf.filter('CustomerID<>1').orderBy('r2',ascending=False)
#Formats to 4 decimal places
for metric_col in resultdf.columns[1:]:
    resultdf = resultdf.withColumn(metric_col,F.bround(metric_col,4))
resultdf.show()


In [38]:
# Drop the placeholder row (CustomerID 1) and sort by R squared, best first.
resultdf = resultdf.filter('CustomerID<>1').orderBy('r2',ascending=False)
# Round every metric column to 4 decimals. bround is reached through the `F`
# alias because the bare name is never imported.
for c in resultdf.columns[1:]:
    resultdf = resultdf.withColumn(c,F.bround(c,4))
resultdf.show()


+----------+------+---------+---------------+----------+---------------+---------------+
|CustomerID|    r2|     RMSE|pVal_L1-S1_Tot3|pVal_Month|pVal_L1-S1_Tot5|pVal_L1-S1_Tot6|
+----------+------+---------+---------------+----------+---------------+---------------+
|     17841|0.5075| 144.6799|         0.1024|    1.0E-4|         0.3395|         0.3949|
|     14527|0.4239|  56.4599|         0.4134|    0.0183|         0.6819|          0.442|
|     15039|0.2872| 195.5123|         0.5953|    0.0229|         0.2834|         0.7516|
|     13798|0.2496| 605.7141|         0.0229|    0.4618|         0.4169|         0.5994|
|     14646|0.1997|5504.3335|         0.8977|    0.7417|         0.0115|         0.8673|
|     14156|0.1454|3813.3564|         0.7905|    0.2849|         0.3052|           0.23|
|     16422|0.1442| 517.9576|         0.6189|    0.5622|         0.3058|         0.0637|
|     14606|0.1313|  49.8456|         0.0212|    0.0216|         0.3328|         0.7299|
|     12748|0.1232| 3

In [30]:
# Round r2 to 3 decimals and list the rows with the highest r2 first.
# The previous `for c in dftemplate = ...` line was not valid Python; only the
# assignment is kept, with bround reached through the `F` alias.
dftemplate = dftemplate.withColumn('r2',F.bround('r2',3))
dftemplate.orderBy('r2',ascending=False).show()
# Print the coefficients and intercept for linear regression
#print("Coefficients: {} Intercept: {}".format(lrModel.coefficients,lrModel.intercept))

+----------+---+------------------+--------------------+--------------------+--------------------+-------------------+
|CustomerID| r2|              RMSE|     pVal_L1-S1_Tot3|          pVal_Month|     pVal_L1-S1_Tot5|    pVal_L1-S1_Tot6|
+----------+---+------------------+--------------------+--------------------+--------------------+-------------------+
|         1|2.0|              2.01|                2.01|                2.01|                2.01|               2.01|
|     14606|0.0| 51.21707534790039|0.004383583553135395| 0.24977006018161774|  0.6457734107971191| 0.6916141510009766|
|     15311|0.0|    514.8115234375|0.021972138434648514|0.008193759247660637| 0.15857551991939545|  0.935386598110199|
|     14156|0.0| 2388.231689453125|  0.7416616082191467|  0.8343241810798645|   0.752126157283783|0.09927629679441452|
|     14646|0.0|  6070.65380859375| 0.05948681756854057| 0.08887233585119247|0.004989327397197485| 0.7663782238960266|
|     15039|0.0|183.70236206054688|  0.711657762

In [9]:
# Report the training metrics of the current lrModel, one line per metric.
summary = lrModel.summary
report_lines = [
    ("R squared:        {:.3f}", summary.r2),
    ("RMSE:             {:,.0f}", lrModel.summary.rootMeanSquaredError),
    ("pValue L1-S1Tot:  {:.4f}", lrModel.summary.pValues[0]),
    ("pValue Month:     {:.4f}", lrModel.summary.pValues[1]),
    ("pValue L1-S1Avg:  {:.4f}", lrModel.summary.pValues[2]),
    ("pValue L2-S2Avg:  {:.4f}", lrModel.summary.pValues[3]),
]
for template, value in report_lines:
    print(template.format(value))
# Residual = actual label minus the model's prediction on the training rows.
rdf = summary.predictions
rdf = rdf.withColumn('Residuals', rdf['Label'] - rdf['prediction'])
rdf.show(10)

R squared:        0.199
RMSE:             51
pValue L1-S1Tot:  0.0019
pValue Month:     0.0137
pValue L1-S1Avg:  0.9158
pValue L2-S2Avg:  0.3117
+--------------------+------+------------------+------------------+
|            features| Label|        prediction|         Residuals|
+--------------------+------+------------------+------------------+
|[0.0,1.0,0.450558...|316.79|174.87918741927427|141.91081258072575|
|[0.0,1.0,0.450558...|109.88|173.41625074663162|-63.53625074663162|
|[0.0,1.0,0.450558...|192.68|172.56201849372488| 20.11798150627513|
|[0.0,1.0,0.450558...|215.88|172.56201849372488| 43.31798150627512|
|[0.0,1.0,0.450558...|108.63| 158.4970247727116| -49.8670247727116|
|[0.09275777647036...|339.68| 193.7762902693422| 145.9037097306578|
|[0.26997732060503...|116.06|202.13398147581205|-86.07398147581205|
|[0.29574785067658...| 166.3| 198.1464756127939|-31.84647561279388|
|[0.29574785067658...|265.18|194.64351470472775| 70.53648529527226|
|[0.29574785067658...|164.87|189.665552

In [10]:
# Display the fitted model object itself.
lrModel

LinearRegression_43fcae7f4e9e7953d8a7

In [11]:
# Prints the summary object itself, not its metrics.
print(summary)

<pyspark.ml.regression.LinearRegressionTrainingSummary object at 0x7fb840105eb8>
