In [1]:
from pyspark.sql import SparkSession 

In [2]:
# Reuse the active SparkSession if one exists; otherwise start one with the
# application name 'lr_example'.
spark = SparkSession.builder.appName('lr_example').getOrCreate()

In [3]:
from pyspark.ml.regression import LinearRegression

## Getting to know the data

In [4]:
# Load the customer table; the first line of the file is the header and
# column types are inferred from the values rather than read as strings.
data = spark.read.csv(
    'Ecommerce_Customers.csv',
    header=True,
    inferSchema=True,
)

In [5]:
# Show the column names and the types inferred while reading the CSV.
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [30]:
# One row per customer, so the row count is the number of users.
total_users = data.count()
print('Total users: %d' % total_users)

Total users: 500


In [31]:
# Preview the first 5 rows of the raw data.
data.show(5)

+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|               Email|             Address|          Avatar|Avg Session Length|       Time on App|   Time on Website|Length of Membership|Yearly Amount Spent|
+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|mstephenson@ferna...|835 Frank TunnelW...|          Violet| 34.49726772511229| 12.65565114916675| 39.57766801952616|  4.0826206329529615|  587.9510539684005|
|   hduke@hotmail.com|4547 Archer Commo...|       DarkGreen| 31.92627202636016|11.109460728682564|37.268958868297744|    2.66403418213262|  392.2049334443264|
|    pallen@yahoo.com|24645 Valerie Uni...|          Bisque|33.000914755642675|11.330278057777512|37.110597442120856|   4.104543202376424| 487.54750486747207|
|riverarebecca@gma...|1414 David Throug...|   

In [32]:
# head(1) returns a list; its single element is a pyspark Row object.
type(data.head(1)[0])

pyspark.sql.types.Row

In [33]:
# data from the first user
for header, item in zip(data.columns, data.head(1)[0]):
    print('{}: {}'.format(header,item))

Email: mstephenson@fernandez.com
Address: 835 Frank TunnelWrightmouth, MI 82180-9605
Avatar: Violet
Avg Session Length: 34.49726772511229
Time on App: 12.65565114916675
Time on Website: 39.57766801952616
Length of Membership: 4.0826206329529615
Yearly Amount Spent: 587.9510539684005


# Setting up data for machine learning

In [35]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [47]:
# select only numeric columns for features
cols = ['Avg Session Length','Time on App','Time on Website','Length of Membership']
assembler = VectorAssembler(inputCols=cols,outputCol='features')

In [48]:
# Append the assembled 'features' vector column to the DataFrame.
output = assembler.transform(data)

In [49]:
# Confirm the schema now ends with the new 'features' vector column.
output.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)
 |-- features: vector (nullable = true)



In [52]:
# Inspect the assembled feature vector of the first row.
output.select('features').take(1)

[Row(features=DenseVector([34.4973, 12.6557, 39.5777, 4.0826]))]

## Select only features and target columns

In [58]:
# Keep just what the regression needs: the feature vector and the target.
final_data = output.select(
    'features',
    'Yearly Amount Spent',
)

In [59]:
# Preview the two-column frame that will be fed to the model.
final_data.show(5)

+--------------------+-------------------+
|            features|Yearly Amount Spent|
+--------------------+-------------------+
|[34.4972677251122...|  587.9510539684005|
|[31.9262720263601...|  392.2049334443264|
|[33.0009147556426...| 487.54750486747207|
|[34.3055566297555...|  581.8523440352177|
|[33.3306725236463...|  599.4060920457634|
+--------------------+-------------------+
only showing top 5 rows



## Splitting train and test data

In [92]:
# Split roughly 70% / 30% into training and test sets.
# randomSplit assigns rows at random, so the resulting sizes are only
# approximately proportional to the weights. A fixed seed makes the split
# (and therefore every metric computed below) repeatable across re-runs.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [93]:
# Summary statistics of the training split (the output covers the label
# column; the 'features' vector column is not summarised).
train_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                346|
|   mean| 499.85830624674566|
| stddev|  79.07187878482645|
|    min|  275.9184206503857|
|    max|  725.5848140556806|
+-------+-------------------+



In [94]:
# Same summary for the test split, to check it resembles the training split.
test_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                154|
|   mean|  498.0912023891001|
| stddev|  80.10323328412343|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



## Building a linear regression model

In [99]:
# Configure (but do not yet fit) the estimator: inputs come from 'features',
# and the value to predict is 'Yearly Amount Spent'.
lr = LinearRegression(
    featuresCol='features',
    labelCol='Yearly Amount Spent',
)

In [100]:
# Fit on the training split only; the test split is held back for evaluation.
lr_model = lr.fit(train_data)

In [115]:
# Compare label vs. fitted prediction statistics on the training data
# (lr_model.summary holds the results for the data the model was fit on).
lr_model.summary.predictions.describe().show()

+-------+-------------------+------------------+
|summary|Yearly Amount Spent|        prediction|
+-------+-------------------+------------------+
|  count|                346|               346|
|   mean| 499.85830624674566|499.85830624674554|
| stddev|  79.07187878482645|  78.4586947002367|
|    min|  275.9184206503857|282.05796235977596|
|    max|  725.5848140556806|  724.628898310506|
+-------+-------------------+------------------+



## Evaluating the model

In [105]:
# Score the fitted model on the held-out test split.
test_results = lr_model.evaluate(test_data)

In [108]:
# First 5 test residuals; each is the label minus the prediction.
test_results.residuals.show(5)

+-------------------+
|          residuals|
+-------------------+
|-13.065280065772527|
| -6.753823210773419|
| -19.11928963148364|
| -4.868418741349103|
|-13.029808613919101|
+-------------------+
only showing top 5 rows



In [114]:
# Compare the distribution of the test labels with that of the predictions.
test_results.predictions.describe().show()

+-------+-------------------+------------------+
|summary|Yearly Amount Spent|        prediction|
+-------+-------------------+------------------+
|  count|                154|               154|
|   mean|  498.0912023891001| 498.6425575706747|
| stddev|  80.10323328412343| 78.16529738663755|
|    min| 256.67058229005585|255.93176535343946|
|    max|  765.5184619388373| 765.8652624198089|
+-------+-------------------+------------------+



In [117]:
# Report the test-set error metrics.
# Comparing the RMSE with the mean and stddev of the data --> small error.
rmse = test_results.rootMeanSquaredError
print('RMSE: {}'.format(rmse))
r2 = test_results.r2
print('R2: {}'.format(r2))

RMSE: 10.27991269696053
R2: 0.9834229193558983


## Quite good agreement between summaries of test labels and predictions

## Next: Apply the model to data that has only features (no labels) and make predictions

In [118]:
# Drop the label column to mimic new data for which only features are known.
unlabeled_data = test_data.select('features')

In [119]:
# Preview the feature-only frame.
unlabeled_data.show(5)

+--------------------+
|            features|
+--------------------+
|[30.3931845423455...|
|[30.4925366965402...|
|[30.8162006488763...|
|[30.8794843441274...|
|[31.0662181616375...|
+--------------------+
only showing top 5 rows



In [120]:
# transform() appends a 'prediction' column computed from 'features'.
predictions = lr_model.transform(unlabeled_data)

In [121]:
# Preview the predictions for the first 5 rows.
predictions.show(5)

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[30.3931845423455...|332.99414986896613|
|[30.4925366965402...|289.22506893068794|
|[30.8162006488763...|285.20563057995264|
|[30.8794843441274...| 495.0750187262038|
|[31.0662181616375...|461.96310182159345|
+--------------------+------------------+
only showing top 5 rows



In [125]:
# This should agree with the predictions made on test_data, since
# unlabeled_data is test_data with the label column dropped.
test_results.predictions.show(5)

+--------------------+-------------------+------------------+
|            features|Yearly Amount Spent|        prediction|
+--------------------+-------------------+------------------+
|[30.3931845423455...|  319.9288698031936|332.99414986896613|
|[30.4925366965402...|  282.4712457199145|289.22506893068794|
|[30.8162006488763...|   266.086340948469|285.20563057995264|
|[30.8794843441274...|  490.2065999848547| 495.0750187262038|
|[31.0662181616375...| 448.93329320767435|461.96310182159345|
+--------------------+-------------------+------------------+
only showing top 5 rows

