## Predict the customer's total amount expenditure (continuous money Value )

## Convert realstic data into a format that are most adapted to Spark MLib

In [1]:
# requierments

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("lregexample").getOrCreate()

In [4]:
from pyspark.ml.regression import LinearRegression

In [5]:
data = spark.read.csv("../../Data/Ecommerce_Customers.csv", inferSchema=True, header = True)

In [7]:
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



# we want to predict the Yearly amount Spent

In [38]:
# Analysing the data
for item in data.head(1) [0]:
    print(item)

mstephenson@fernandez.com
835 Frank TunnelWrightmouth, MI 82180-9605
Violet
34.49726772511229
12.65565114916675
39.57766801952616
4.0826206329529615
587.9510539684005


In [39]:
# set up the dataframe fo mlib libraries by using vectors 

from pyspark.ml.linalg import Vectors 
from pyspark.ml.feature import  VectorAssembler

In [40]:
# Grab the features and labels 
# Vector ASsembler take inputCols as columns we want to take together and output par that indicates what they mean


## get the numeric data
assembler = VectorAssembler(inputCols= ["Avg Session Length","Time on App","Time on App","Time on App",
                                    "Length of Membership"], outputCol="features") 

In [41]:
# now lets Transform the data 

output = assembler.transform(data)

In [42]:
output.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)
 |-- features: vector (nullable = true)



In [48]:
# now let extract out Final dataset 
final_data = output.select(["features","Yearly Amount Spent"])

In [49]:
# now lets prepare the data for split to make our model 

In [50]:
train_data, test_data = final_data.randomSplit([0.7,0.3])

In [51]:
train_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                342|
|   mean|  497.2534091575866|
| stddev|   76.4334906389623|
|    min|   266.086340948469|
|    max|  712.3963268096637|
+-------+-------------------+



In [52]:
test_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                158|
|   mean| 503.77438732532113|
| stddev|  85.30134382288338|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



# Lets create our Model Linear Regression to predict our Yearly amount Spent

In [55]:
lr = LinearRegression(labelCol="Yearly Amount Spent", featuresCol="features")

In [56]:
lr_model = lr.fit(train_data)

In [57]:
test_result = lr_model.evaluate(test_data)

# Evalution the models 

In [58]:
test_result.residuals.show()

+-------------------+
|          residuals|
+-------------------+
| -13.03957714553951|
|   9.50878015895762|
|-7.0726759951549525|
|-13.696731684005897|
|-22.625715998367582|
|-5.2126568309602135|
|-4.8280105547132735|
| -2.290710539914812|
| -5.392433334243663|
|-26.813461264174407|
| -6.999509480135373|
|-18.376956538288198|
|-3.1787307687838506|
|-2.2943665220087723|
| 11.129858228008175|
|-2.7960467455949924|
|  4.428168824318618|
|0.09838367496899991|
|  5.176354702675667|
| 15.548968769366127|
+-------------------+
only showing top 20 rows



In [60]:
test_result.rootMeanSquaredError

9.768562785434582

In [62]:
final_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                500|
|   mean|  499.3140382585909|
| stddev|   79.3147815497068|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



if we compare the RMS root mean squareError that give us the error in the unit of Yearly amount spent we can say 
that out model is good because if we take the mean is 499 , std = 79 min = 256 and max = 765  and the error prediction is just about 10 dolars so we can say that out model is good enough to predict the yearly amout spent with error of 10 dolars 

In [61]:
test_result.r2

0.986802033159588

about the rSquared error that say that out model expalin 98 percent of the variance in the data that also os good 

# let us deploy this in real data set for some customer to predict the yearly amount spent

In [63]:
unlabeled_data = test_data.select("features")

In [64]:
prediction = lr_model.transform(unlabeled_data)

In [66]:
prediction.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[30.3931845423455...| 332.9684469487331|
|[30.7377203726281...|452.27196203727226|
|[31.0613251567161...|494.62813405305656|
|[31.0662181616375...|462.63002489168025|
|[31.1239743499119...|509.57276983813335|
|[31.2681042107507...|428.68319000478414|
|[31.5147378578019...| 494.6404985511747|
|[31.5761319713222...| 543.5172945292431|
|[31.6253601348306...|381.72933409116786|
|[31.6739155032749...| 502.5385291740556|
|[31.7242025238451...|510.38739676809587|
|[31.8164283341993...| 519.4994480419446|
|[31.8186165667690...| 449.5974041389195|
|[31.8627411090001...| 558.5925076960555|
|[31.9096268275227...|  552.316177445231|
|[31.9120759292006...| 390.3307630513027|
|[31.9764800614612...| 326.1662772097816|
|[32.0444861274404...|  448.131445511581|
|[32.0478009788678...| 508.2742164834208|
|[32.0478146331398...| 481.8405889894773|
+--------------------+------------