# Prediction Model for BNPL Revenue

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import pandas as pd

from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import VectorAssembler 
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import *

In [2]:
# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("MAST30034 Project 2")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "2g")
    .config("spark.executer.memory", "4g")
    .getOrCreate()
)

22/09/26 11:31:56 WARN Utils: Your hostname, Luo resolves to a loopback address: 127.0.1.1; using 172.27.92.55 instead (on interface eth0)
22/09/26 11:31:56 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/26 11:31:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Data Preprocessing

In [3]:
full = spark.read.parquet('../data/curated/train_data/')
full = full.withColumnRenamed('y_total_num_consumer', 'next_total_num_consumer').withColumnRenamed('y_total_revenue', 'next_total_revenue')\
    .withColumnRenamed('y_total_num_transaction', 'next_total_num_transaction')
full.printSchema()

                                                                                

root
 |-- merchant_abn: long (nullable = true)
 |-- total_num_consumer: long (nullable = true)
 |-- avg_dollar_value: double (nullable = true)
 |-- total_num_transaction: long (nullable = true)
 |-- mean_income: double (nullable = true)
 |-- revenue_level: string (nullable = true)
 |-- total_revenue: double (nullable = true)
 |-- total_num_postcode: long (nullable = true)
 |-- tag: string (nullable = true)
 |-- next_total_num_consumer: long (nullable = true)
 |-- next_total_revenue: double (nullable = true)
 |-- next_total_num_transaction: long (nullable = true)



### Clustering

In [4]:
tag_mean = full.groupBy('tag')\
      .agg(
         F.mean("total_revenue").alias("mean_revenue_of_tags")
      )

In [5]:
tag_mean = tag_mean.toPandas()

In [6]:
from sklearn.cluster import KMeans
import numpy as np

kmeans = KMeans(n_clusters=4, random_state=0).fit(np.array(tag_mean['mean_revenue_of_tags']).reshape(-1, 1))
kmeans.labels_

array([2, 0, 0, 1, 2, 0, 3, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 2, 2, 1, 1, 2,
       1, 0], dtype=int32)

In [7]:
tag_mean['tag_labels'] = kmeans.labels_
tag_mean

Unnamed: 0,tag,mean_revenue_of_tags,tag_labels
0,jewelry,301542.063851,2
1,watch,565603.630545,0
2,cable,545919.891131,0
3,garden supply,732549.584612,1
4,antique,431668.237726,2
5,shoe,534436.300789,0
6,tent,995172.851759,3
7,stationery,518011.067448,0
8,artist supply,719151.356147,1
9,florists,637021.720045,1


In [8]:
tag_mean_sdf = spark.createDataFrame(tag_mean[['tag', 'tag_labels']])
tag_mean_sdf

tag,tag_labels
jewelry,2
watch,0
cable,0
garden supply,1
antique,2
shoe,0
tent,3
stationery,0
artist supply,1
florists,1


In [9]:
full = full.join(tag_mean_sdf, ["tag"], how="left") 
# use left join here since if no historical data is provided, we cannot predict the future value of a merchant
full

tag,merchant_abn,total_num_consumer,avg_dollar_value,total_num_transaction,mean_income,revenue_level,total_revenue,total_num_postcode,next_total_num_consumer,next_total_revenue,next_total_num_transaction,tag_labels
jewelry,10596295795,8,10439.40181102842,8,61840.875,a,571244.0798428855,8,,,,2
watch,10187291046,87,111.08408713922158,87,61060.0459770115,b,31795.597893195016,87,99.0,41683.21121325837,100.0,0
watch,10264435225,1238,114.10783402533237,1272,62006.31132075472,c,346896.9592900661,1018,1519.0,435003.6795629895,1566.0,0
watch,10922217544,18,163.5626661571798,18,63804.22222222222,c,4946.134870167458,18,19.0,5880.61894060871,19.0,0
shoe,10955677986,196,224.31663343377568,197,62081.20304568528,a,249233.7191755476,191,232.0,311007.83981679846,235.0,0
tent,10651113986,17,537.592173774402,17,57981.94117647059,b,29701.967601035707,17,24.0,40997.39033296765,24.0,3
stationery,10618089367,903,382.6779937277748,919,63214.549510337325,b,1410241.1961990686,783,994.0,1427868.1901742313,1011.0,0
artist supply,10463252268,22,464.0964976850653,22,60070.77272727273,a,67488.91405656068,22,26.0,78474.65405470507,26.0,1
florists,10545955006,108,475.27264003873785,108,63127.56481481482,a,316189.37413271976,106,133.0,359500.80479674053,133.0,1
music,10364012396,4,276.08689369891994,4,81123.75,b,4008.7818228908673,4,16.0,19636.79081402693,16.0,0


## Model for BNPL Revenue
Features:
    total number of consumer, average dollar value, total number of transaction, mean income, total number of postcode, tags

label:
    next year revenue

In [10]:
# drop the columns not needed
revenue_df = full.drop('merchant_abn', 'revenue_level', 'total_revenue', 'next_total_num_consumer', 'next_total_num_transaction')
revenue_df

tag,total_num_consumer,avg_dollar_value,total_num_transaction,mean_income,total_num_postcode,next_total_revenue,tag_labels
jewelry,8,10439.40181102842,8,61840.875,8,,2
watch,87,111.08408713922158,87,61060.0459770115,87,41683.21121325837,0
watch,1238,114.10783402533237,1272,62006.31132075472,1018,435003.6795629895,0
watch,18,163.5626661571798,18,63804.22222222222,18,5880.61894060871,0
shoe,196,224.31663343377568,197,62081.20304568528,191,311007.83981679846,0
tent,17,537.592173774402,17,57981.94117647059,17,40997.39033296765,3
stationery,903,382.6779937277748,919,63214.549510337325,783,1427868.1901742313,0
artist supply,22,464.0964976850653,22,60070.77272727273,22,78474.65405470507,1
florists,108,475.27264003873785,108,63127.56481481482,106,359500.80479674053,1
music,4,276.08689369891994,4,81123.75,4,19636.79081402693,0


### Indexing and One-hot Encoding

In [11]:
# change tags into numeric feature by one hot encoding
indexer = StringIndexer(inputCol="tag", outputCol="tagIndex")
revenue_df = indexer.fit(revenue_df).transform(revenue_df)
ohe = OneHotEncoder(inputCol="tagIndex", outputCol="tagOHE")
revenue_df = ohe.fit(revenue_df).transform(revenue_df)
revenue_df

tag,total_num_consumer,avg_dollar_value,total_num_transaction,mean_income,total_num_postcode,next_total_revenue,tag_labels,tagIndex,tagOHE
jewelry,8,10439.40181102842,8,61840.875,8,,2,23.0,"(23,[],[])"
jewelry,1,6987.246435378608,1,48235.0,1,,2,23.0,"(23,[],[])"
jewelry,2,1396.3251261623384,2,54250.5,2,,2,23.0,"(23,[],[])"
jewelry,1,4798.332815388768,1,80991.0,1,,2,23.0,"(23,[],[])"
jewelry,33,9848.725593936158,33,60891.90909090909,33,6996.210950909105,2,23.0,"(23,[],[])"
jewelry,29,14897.926207832394,29,62719.72413793104,28,,2,23.0,"(23,[],[])"
jewelry,3,3622.567091022215,3,66842.66666666667,3,5551.664760915629,2,23.0,"(23,[],[])"
jewelry,1,19486.76358643924,1,70738.0,1,,2,23.0,"(23,[],[])"
jewelry,5,4844.117193121709,5,64499.4,5,2973.525203961843,2,23.0,"(23,[],[])"
jewelry,3,15354.649596808333,3,55943.66666666666,3,,2,23.0,"(23,[],[])"


In [12]:
import six
for i in revenue_df.columns[:-1]:
    if not( isinstance(revenue_df.select(i).take(1)[0][0], six.string_types)):
        print( "Correlation to next_total_revenue for ", i, revenue_df.stat.corr('next_total_revenue',i))

Correlation to next_total_revenue for  total_num_consumer 0.7410793330888668
Correlation to next_total_revenue for  avg_dollar_value -0.09721738918447717
Correlation to next_total_revenue for  total_num_transaction 0.6509905314160019
Correlation to next_total_revenue for  mean_income 0.014726025333045787
Correlation to next_total_revenue for  total_num_postcode 0.6656045508317902
Correlation to next_total_revenue for  next_total_revenue 1.0
Correlation to next_total_revenue for  tag_labels -0.028666428658009408
Correlation to next_total_revenue for  tagIndex -0.10513625584784485


### Vectorization

In [13]:
features = ['total_num_consumer', 'avg_dollar_value', 'total_num_transaction', 'mean_income', 'total_num_postcode', 'tagOHE']
assembler = VectorAssembler(inputCols=features, outputCol='features')
final_revenue_df = assembler.transform(revenue_df)
final_revenue_df = final_revenue_df.select('features','next_total_revenue')

### Model fitting

In [14]:
# missing values will not be included
train_df, test_df = (final_revenue_df.drop('tag', 'tagIndex').filter(F.col('next_total_revenue').isNotNull())).randomSplit([0.7, 0.3])

In [15]:
train_df, test_df = (final_revenue_df.filter((F.col('next_total_revenue').isNotNull())&(F.col('tag_labels')==1))).drop('tag', 'tagIndex', 'tag_labels').randomSplit([0.7, 0.3])

In [16]:
train_df.count(), test_df.count()

(1137, 421)

#### Linear Regression

In [17]:
lr = LinearRegression(labelCol='next_total_revenue', maxIter=10, regParam=0.3, elasticNetParam=0.8)
fitted_model = lr.fit(train_df)
fitted_model.setFeaturesCol("features")
fitted_model.setPredictionCol("prediction")

22/09/26 11:32:17 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
22/09/26 11:32:17 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS


LinearRegressionModel: uid=LinearRegression_ecde2c4f87c2, numFeatures=28

In [18]:
print("Coefficients: %s" % str(fitted_model.coefficients))
print("Intercept: %s" % str(fitted_model.intercept))

Coefficients: [329.94873459809827,199.00613634951037,-0.0,3.4546764682089095,1406.9778957621609,135277.2373765769,0.0,302266.6058912068,0.0,-661996.2502317633,-127142.11324265848,167648.23980101466,0.0,0.0,0.0,0.0,0.0,275300.37130694755,0.0,0.0,-52079.85753377457,0.0,26219.694735976373,0.0,0.0,0.0,0.0,0.0]
Intercept: -431457.878446186


In [19]:
trainingSummary = fitted_model.summary
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show()
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

numIterations: 10
objectiveHistory: [0.5, 0.4433406452936922, 0.2722512330319915, 0.22420136470078894, 0.20786499882706577, 0.20106384372916292, 0.19784373338733197, 0.19476822947033842, 0.1928758980062302, 0.19193522388591372, 0.19171004424360535]
+-------------------+
|          residuals|
+-------------------+
|-459834.30010480137|
|  29987.71717017758|
| 14735.770490550465|
|-22233.438865994627|
| -18797.59805945043|
| -6738.614387888592|
|-15217.969016764873|
|   49440.5916303634|
| 21608.614265913835|
| 42670.005155233106|
| 56342.395593939145|
| 22382.010636478866|
|  17123.44564349079|
|  24444.29233231898|
|   36042.2340155858|
| -69592.91749208097|
| -54158.07764345864|
|  25652.95962790656|
|-14768.541019344164|
|  70611.47292887203|
+-------------------+
only showing top 20 rows

RMSE: 1113264.945888
r2: 0.616580


In [20]:
train_df.describe().show()

+-------+--------------------+
|summary|  next_total_revenue|
+-------+--------------------+
|  count|                1137|
|   mean|    771279.959894413|
| stddev|  1798673.6248159038|
|    min|  326.68328215436907|
|    max|1.8112946393009316E7|
+-------+--------------------+



##### Linear Regression Evaluation

In [21]:
lr_predictions = fitted_model.transform(test_df)
lr_predictions.select("prediction","next_total_revenue","features").show(10)
lr_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="next_total_revenue",metricName="r2")
print("R Squared (R2) on test data = %g" % lr_evaluator.evaluate(lr_predictions))

+-------------------+------------------+--------------------+
|         prediction|next_total_revenue|            features|
+-------------------+------------------+--------------------+
| 469286.43929568346|2624.8015545185413|(28,[0,1,2,3,4,5]...|
| -46095.90399259748| 17257.12863751407|(28,[0,1,2,3,4,5]...|
|   8263.25219461153|24217.897520994058|(28,[0,1,2,3,4,5]...|
|-12503.398190982814| 7662.348508455224|(28,[0,1,2,3,4,5]...|
|-18581.036177817325|12184.883889297475|(28,[0,1,2,3,4,5]...|
|   8952.68873853347|31713.574800623595|(28,[0,1,2,3,4,5]...|
| 65489.376260855875|28967.859022303237|(28,[0,1,2,3,4,5]...|
|   77404.4310631436|50985.798178586294|(28,[0,1,2,3,4,5]...|
|   78845.7016533133| 66897.44807969415|(28,[0,1,2,3,4,5]...|
|-12884.868294099288|32339.166396653898|(28,[0,1,2,3,4,5]...|
+-------------------+------------------+--------------------+
only showing top 10 rows

R Squared (R2) on test data = 0.625172


In [22]:
from pyspark.ml.regression import GeneralizedLinearRegression
glr = GeneralizedLinearRegression(labelCol='next_total_revenue', family="gamma", link="inverse", maxIter=10, regParam=0.3)
model = glr.fit(train_df)
summary = model.summary

22/09/26 11:32:20 WARN InstanceBuilder$NativeLAPACK: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
22/09/26 11:32:20 WARN Instrumentation: [803360cb] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/09/26 11:32:20 WARN Instrumentation: [803360cb] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/09/26 11:32:20 WARN Instrumentation: [803360cb] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/09/26 11:32:20 WARN Instrumentation: [803360cb] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/09/26 11:32:20 WARN Instrumentation: [803360cb] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/09/26 11:32:20 WARN Instrumentation: [803360cb] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/09/26 11:

In [23]:
print("Coefficients: " + str(model.coefficients))
print("Intercept: " + str(model.intercept))

Coefficients: [-7.29921789176163e+35,-1.514818879549416e+35,1.5951885476432797e+35,-3.3873051879364135e+33,-4.731608488764943e+35,-7.046855017481054e+36,0.0,-1.0273188703163688e+37,0.0,2.315676577215516e+37,6.470201884501187e+36,-5.418302311592143e+36,0.0,0.0,0.0,0.0,0.0,-8.049257366324759e+36,0.0,0.0,1.9054737432836764e+36,0.0,-7.448379131271373e+35,0.0,0.0,0.0,0.0,0.0]
Intercept: 2.5986924360779387e+38


In [24]:
print("Coefficient Standard Errors: " + str(summary.coefficientStandardErrors))
print("T Values: " + str(summary.tValues))
print("P Values: " + str(summary.pValues))
print("Dispersion: " + str(summary.dispersion))
print("Null Deviance: " + str(summary.nullDeviance))
print("Residual Degree Of Freedom Null: " + str(summary.residualDegreeOfFreedomNull))
print("Deviance: " + str(summary.deviance))
print("Residual Degree Of Freedom: " + str(summary.residualDegreeOfFreedom))
print("AIC: " + str(summary.aic))
print("Deviance Residuals: ")
summary.residuals().show()

Py4JJavaError: An error occurred while calling o539.coefficientStandardErrors.
: java.lang.UnsupportedOperationException: No Std. Error of coefficients available for this GeneralizedLinearRegressionModel
	at org.apache.spark.ml.regression.GeneralizedLinearRegressionTrainingSummary.coefficientStandardErrors$lzycompute(GeneralizedLinearRegression.scala:1464)
	at org.apache.spark.ml.regression.GeneralizedLinearRegressionTrainingSummary.coefficientStandardErrors(GeneralizedLinearRegression.scala:1459)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:748)


#### Random Forest Regressor

In [25]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

In [26]:
rf = RandomForestRegressor(numTrees = 10, maxDepth = 10, labelCol='next_total_revenue')

In [27]:
model = rf.fit(train_df)
result = model.transform(test_df)

In [28]:
rf_evaluator = RegressionEvaluator(labelCol='next_total_revenue', metricName="mae", predictionCol='prediction')
mae = rf_evaluator.evaluate(result)
rf_evaluator = RegressionEvaluator(labelCol='next_total_revenue', metricName="r2", predictionCol='prediction')
r2 = rf_evaluator.evaluate(result)
print('+++++++++++++++++++++++++++++++++++++++++++')
print(f'Using Categorical feature: {features}')
print('mae:{}'.format(mae))
print('r2: {}'.format(r2))

+++++++++++++++++++++++++++++++++++++++++++
Using Categorical feature: ['total_num_consumer', 'avg_dollar_value', 'total_num_transaction', 'mean_income', 'total_num_postcode', 'tagOHE']
mae:362693.0951681056
r2: 0.6669955988610947


In [29]:
result.select("prediction","next_total_revenue","features").show(20)

+------------------+------------------+--------------------+
|        prediction|next_total_revenue|            features|
+------------------+------------------+--------------------+
| 9191.476797416268|2624.8015545185413|(28,[0,1,2,3,4,5]...|
|  25584.6871792982| 17257.12863751407|(28,[0,1,2,3,4,5]...|
|19555.864367857117|24217.897520994058|(28,[0,1,2,3,4,5]...|
|29422.267071773724| 7662.348508455224|(28,[0,1,2,3,4,5]...|
|29568.587410052955|12184.883889297475|(28,[0,1,2,3,4,5]...|
| 39426.51326838844|31713.574800623595|(28,[0,1,2,3,4,5]...|
| 41029.14680574974|28967.859022303237|(28,[0,1,2,3,4,5]...|
| 34442.81005783456|50985.798178586294|(28,[0,1,2,3,4,5]...|
|44237.873582189546| 66897.44807969415|(28,[0,1,2,3,4,5]...|
| 33325.86763179635|32339.166396653898|(28,[0,1,2,3,4,5]...|
|41610.822838702836|47301.027783661135|(28,[0,1,2,3,4,5]...|
| 31465.97720661906|22537.278176372223|(28,[0,1,2,3,4,5]...|
| 39702.63569027528|14782.670203792048|(28,[0,1,2,3,4,5]...|
| 43267.68758263775| 335

#### Gradient Boosting Tree

In [None]:
from pyspark.ml.regression import GBTRegressor
gbt = GBTRegressor(featuresCol = 'features', labelCol = 'next_total_revenue', maxIter=10)
gbt_model = gbt.fit(train_df)
gbt_predictions = gbt_model.transform(test_df)
gbt_predictions.select('prediction', 'next_total_revenue', 'features').show(5)

+------------------+------------------+--------------------+
|        prediction|next_total_revenue|            features|
+------------------+------------------+--------------------+
| 35290.34412488112| 9662.469562619945|(28,[0,1,2,3,4,5]...|
| 35290.34412488112| 7335.947872319652|(28,[0,1,2,3,4,5]...|
| 17460.04096239942| 18686.29029154868|(28,[0,1,2,3,4,5]...|
| 17460.04096239942|22465.136709436512|(28,[0,1,2,3,4,5]...|
|15821.954153259354|6669.5712197070525|(28,[0,1,2,3,4,5]...|
+------------------+------------------+--------------------+
only showing top 5 rows



In [None]:
gbt_evaluator = RegressionEvaluator(
    labelCol="next_total_revenue", predictionCol="prediction", metricName="rmse")
rmse = gbt_evaluator.evaluate(gbt_predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 1.31818e+06


#### Multilayer Perceptron Regressor

In [32]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import sklearn.metrics as metrics
from sklearn.model_selection import GridSearchCV

In [33]:
revenue_pd = revenue_df.toPandas().dropna()
revenue_pd

                                                                                

Unnamed: 0,tag,total_num_consumer,avg_dollar_value,total_num_transaction,mean_income,total_num_postcode,next_total_revenue,tag_labels,tagIndex,tagOHE
4,jewelry,33,9848.725594,33,60891.909091,33,6.996211e+03,2,23.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
6,jewelry,3,3622.567091,3,66842.666667,3,5.551665e+03,2,23.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
8,jewelry,5,4844.117193,5,64499.400000,5,2.973525e+03,2,23.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
10,jewelry,17,2647.341101,17,65339.823529,17,1.995391e+04,2,23.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
11,jewelry,25,5595.325452,25,66485.200000,25,2.011502e+04,2,23.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...,...,...,...,...
3948,bicycle,167,147.146114,169,63679.236686,163,6.416802e+04,0,10.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3949,bicycle,240,51.738894,241,61790.132780,230,5.320625e+04,0,10.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3950,bicycle,356,980.503019,358,63258.500000,336,1.381719e+06,0,10.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3951,bicycle,236,1540.157629,238,61031.621849,232,6.890149e+05,0,10.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [34]:
features_pd = ['total_num_consumer', 'avg_dollar_value', 'total_num_transaction', 'mean_income', 'total_num_postcode']
X = revenue_pd[features_pd]
y = revenue_pd['next_total_revenue'].to_numpy()

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=30034, test_size=0.3)

In [36]:
# scale train and test dataset in order to be standard normally distributed with zero mean
sc_X = StandardScaler()
X_trainscaled=sc_X.fit_transform(X_train)
X_testscaled=sc_X.transform(X_test)
X_trainscaled.shape, X_testscaled.shape

((2648, 5), (1136, 5))

##### Hyperparameters

In [42]:
mlp_reg = MLPRegressor(hidden_layer_sizes=(128,128,128),activation="relu" ,solver = 'adam', random_state=30034, max_iter=20000)\
    .fit(X_trainscaled, y_train)
y_pred=mlp_reg.predict(X_testscaled)
print("The Score with ", (metrics.r2_score(y_pred, y_test)))

The Score with  0.6755919993467134


In [48]:
mlp_reg = MLPRegressor(hidden_layer_sizes=(128,128,128,128),activation="relu" ,solver = 'adam', random_state=30034, max_iter=20000)\
    .fit(X_trainscaled, y_train)
y_pred=mlp_reg.predict(X_testscaled)
print("The Score with ", (metrics.r2_score(y_pred, y_test)))

The Score with  0.7147332388747438


In [49]:
mlp_reg = MLPRegressor(hidden_layer_sizes=(128,128,128,128),activation="tanh" ,solver = 'adam', random_state=30034, max_iter=20000)\
    .fit(X_trainscaled, y_train)
y_pred=mlp_reg.predict(X_testscaled)
print("The Score with ", (metrics.r2_score(y_pred, y_test)))

The Score with  -7.809592698162063e+36




In [50]:
mlp_reg = MLPRegressor(hidden_layer_sizes=(128,128,128,128),activation="logistic" ,solver = 'adam', random_state=30034, max_iter=20000)\
    .fit(X_trainscaled, y_train)
y_pred=mlp_reg.predict(X_testscaled)
print("The Score with ", (metrics.r2_score(y_pred, y_test)))

The Score with  -8.710900745153732e+36




In [None]:
mlp_reg = MLPRegressor(hidden_layer_sizes=(128,128,128,128),activation="relu" ,solver = 'lbfgs', random_state=30034, max_iter=20000)\
    .fit(X_trainscaled, y_train)
y_pred=mlp_reg.predict(X_testscaled)
print("The Score with ", (metrics.r2_score(y_pred, y_test)))

In [51]:
mlp_reg = MLPRegressor(hidden_layer_sizes=(256,256,256,256),activation="relu" ,solver = 'adam', random_state=30034, max_iter=20000)\
    .fit(X_trainscaled, y_train)
y_pred=mlp_reg.predict(X_testscaled)
print("The Score with ", (metrics.r2_score(y_pred, y_test)))

The Score with  0.7193549243896455


In [54]:
mlp_reg = MLPRegressor(hidden_layer_sizes=(128,128,128,128),activation="relu" ,solver = 'adam', random_state=30034, max_iter=20000)\
    .fit(X_trainscaled, y_train)
y_pred=mlp_reg.predict(X_testscaled)
print("The Score with ", (metrics.r2_score(y_pred, y_test)))

The Score with  0.7147332388747438


In [38]:
# using grid search for best parameter combinations
param_grid = {
    'hidden_layer_sizes': [(150,100,50), (120,80,40), (100,50,30)],
    'activation': ['identity', 'logistic', 'tanh', 'relu'],
    'solver': ['sgd', 'adam', 'lbfgs'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}

grid = GridSearchCV(mlp_reg, param_grid, n_jobs= -1, cv=5)
grid.fit(X_trainscaled, y_train)
print(grid.best_params_)

  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b


22/09/26 10:58:45 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 221434 ms exceeds timeout 120000 ms
22/09/26 10:58:45 WARN SparkContext: Killing executors is not supported by current scheduler.


  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b


KeyboardInterrupt: 

In [None]:
grid_predictions = grid.predict(X_testscaled)
grid_result = pd.DataFrame({'Actual': y_test, 'Predicted': grid_predictions})
grid_result.head()

For solver: 

The default solver ‘adam’ works pretty well on relatively large datasets (with thousands of training samples or more) in terms of both training time and validation score. For small datasets, however, ‘lbfgs’ can converge faster and perform better.

For activition:

‘identity’, no-op activation, useful to implement linear bottleneck, returns f(x) = x

‘logistic’, the logistic sigmoid function, returns f(x) = 1 / (1 + exp(-x)).

‘tanh’, the hyperbolic tan function, returns f(x) = tanh(x).

‘relu’, the rectified linear unit function, returns f(x) = max(0, x)

(from Sklearn)

##### Evaluation

In [None]:
df_result = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df_result.head()

In [None]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

In [None]:
print('Mean Absolute Error for grid search:', metrics.mean_absolute_error(y_test, grid_predictions))  
print('Mean Squared Error for grid search:', metrics.mean_squared_error(y_test, grid_predictions))  
print('Root Mean Squared Error for grid search:', np.sqrt(metrics.mean_squared_error(y_test, grid_predictions)))


In [30]:
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from scikeras.wrappers import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

2022-09-26 11:33:40.375243: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-09-26 11:33:40.535022: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-09-26 11:33:40.535044: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-09-26 11:33:40.558965: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-09-26 11:33:41.349896: W tensorflow/stream_executor/platform/de

In [41]:
def baseline_model():
	# create model
	model = Sequential()
	model.add(Dense(20, input_shape=(5,), kernel_initializer='normal', activation='relu'))
	model.add(Dense(1, kernel_initializer='normal'))
	# Compile model
	model.compile(loss='mean_squared_error', optimizer='adam')
	return model
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasRegressor(model=baseline_model, epochs=50, batch_size=5, verbose=0)))
pipeline = Pipeline(estimators)
kfold = KFold(n_splits=10)
results = cross_val_score(pipeline, X_trainscaled, y_train, cv=kfold, scoring='neg_mean_squared_error')
print("Baseline: %.2f (%.2f) MSE" % (results.mean(), results.std()))

22/09/26 13:25:19 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 3606424 ms exceeds timeout 120000 ms
22/09/26 13:25:19 WARN SparkContext: Killing executors is not supported by current scheduler.


KeyboardInterrupt: 