In [1]:
from pyspark import SparkContext
from pyspark.sql import SQLContext

In [2]:
sc = SparkContext()
sqlContext = SQLContext(sc)

In [28]:
df = sqlContext.read.csv('d_small_ecommerce-data.csv', header=True)

In [29]:
df.show()

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|   17850.0|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   539725|    22988|   SOLDIERS EGG CUP |       6|2010-12-21 13:58:00|     1.25|   17211.0|United Kingdom|
|   539725|    84947|ANTIQUE

In [5]:
df.dtypes

[('InvoiceNo', 'string'),
 ('StockCode', 'string'),
 ('Description', 'string'),
 ('Quantity', 'string'),
 ('InvoiceDate', 'string'),
 ('UnitPrice', 'string'),
 ('CustomerID', 'string'),
 ('Country', 'string')]

In [6]:
df = df.drop_duplicates(['CustomerID'])

In [7]:
# df.dropna(subset=['Description']).show()
df = df.na.drop()

In [30]:
from pyspark.ml.feature import Bucketizer

df = df.withColumn('label', df['Quantity'] * df['UnitPrice'])

df = df.filter((df['Quantity'] > 0) & (df['UnitPrice'] > 0))
df = df.filter((df['label'] < 25000) & (df['UnitPrice'] < 15))

df = df.withColumn("Quantity", df["Quantity"].cast('int'))
df = df.withColumn("UnitPrice", df["UnitPrice"].cast('float'))

bucketizer = Bucketizer(splits=[ 0, 2, 5, 8, 11, 14, 15, 5000], inputCol="Quantity", outputCol="QuantityRange")
df = bucketizer.setHandleInvalid("keep").transform(df)

bucketizer = Bucketizer(splits=[ 0, 1, 2, 3, 4, 20], inputCol="UnitPrice", outputCol="PriceRange")
df = bucketizer.setHandleInvalid("keep").transform(df)

In [31]:
from pyspark.ml.feature import QuantileDiscretizer
from pyspark.sql.functions import month

df = df.withColumn('Month', month(df['InvoiceDate']))

discretizer = QuantileDiscretizer(numBuckets=3, inputCol="Month", outputCol="DateRange")

result = discretizer.fit(df).transform(df)

In [32]:
from pyspark.ml.feature import StringIndexer

si = StringIndexer(inputCol='StockCode', outputCol='StockCodeIndex')
df = si.fit(df).transform(df)

si = StringIndexer(inputCol='Country', outputCol='CountryIndex')
df = si.fit(df).transform(df)

In [33]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
    inputCols=['Quantity', 'UnitPrice', 'QuantityRange', 'PriceRange', 'Month', 'CountryIndex', 'StockCodeIndex'],
    outputCol="features")

df = assembler.transform(df)

In [34]:
df_fr = df.select(['features', 'label'])
df_fr.show()

+--------------------+------------------+
|            features|             label|
+--------------------+------------------+
|[6.0,2.5499999523...|15.299999999999999|
|[6.0,3.3900001049...|             20.34|
|[8.0,2.75,3.0,2.0...|              22.0|
|[6.0,3.3900001049...|             20.34|
|[6.0,3.3900001049...|             20.34|
|[6.0,1.25,2.0,1.0...|               7.5|
|[6.0,1.25,2.0,1.0...|               7.5|
|[1.0,7.9499998092...|              7.95|
|[1.0,7.9499998092...|              7.95|
|[1.0,7.9499998092...|              7.95|
|[6.0,2.9500000476...|17.700000000000003|
|[6.0,2.9500000476...|17.700000000000003|
|[12.0,1.490000009...|             17.88|
|[24.0,1.25,6.0,1....|              30.0|
|[24.0,1.25,6.0,1....|              30.0|
|[2.0,9.9499998092...|              19.9|
|[4.0,3.75,1.0,3.0...|              15.0|
|[8.0,1.9500000476...|              15.6|
|[12.0,1.950000047...|              23.4|
|[4.0,4.1500000953...|              16.6|
+--------------------+------------

In [35]:
train, test = df_fr.randomSplit([0.8, 0.2], seed=12345)

In [36]:
train.show()

+--------------------+------------------+
|            features|             label|
+--------------------+------------------+
|[1.0,7.9499998092...|              7.95|
|[1.0,7.9499998092...|              7.95|
|[1.0,7.9499998092...|              7.95|
|[2.0,9.9499998092...|              19.9|
|[3.0,4.9499998092...|14.850000000000001|
|[4.0,3.75,1.0,3.0...|              15.0|
|[4.0,4.1500000953...|              16.6|
|[4.0,4.1500000953...|              16.6|
|[6.0,1.25,2.0,1.0...|               7.5|
|[6.0,1.25,2.0,1.0...|               7.5|
|[6.0,2.0999999046...|12.600000000000001|
|[6.0,2.5499999523...|15.299999999999999|
|[6.0,2.9500000476...|17.700000000000003|
|[6.0,2.9500000476...|17.700000000000003|
|[6.0,3.3900001049...|             20.34|
|[6.0,3.3900001049...|             20.34|
|[6.0,3.3900001049...|             20.34|
|[8.0,1.9500000476...|              15.6|
|[8.0,2.75,3.0,2.0...|              22.0|
|[12.0,1.490000009...|             17.88|
+--------------------+------------

In [37]:
test.show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[4.0,4.1500000953...| 16.6|
+--------------------+-----+



In [38]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import LinearRegression

lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
paramGrid = ParamGridBuilder().addGrid(lr.maxIter, [5, 10]).build()

crossval = CrossValidator(estimator=lr,
                          estimatorParamMaps=paramGrid,
                          evaluator=RegressionEvaluator(),
                          numFolds=2)

cvModel = crossval.fit(train)

In [39]:
prediction = cvModel.transform(test)
#selected = prediction.select("id", "text", "probability", "prediction")
for row in prediction.collect():
    print(row)

Row(features=DenseVector([4.0, 4.15, 1.0, 4.0, 12.0, 1.0, 12.0]), label=16.6, prediction=16.31566984210318)
