# Import the Required Libraries

In [1]:
import numpy as np
import pandas as pd
import findspark

# findspark.init() has to run before `import pyspark`: it locates the local
# Spark installation and makes the pyspark package importable.
# The bare findspark.find() calls that used to sit around the pyspark import
# only returned the Spark home path and discarded it, so they were removed.
findspark.init()

import pyspark
from pyspark.sql import SparkSession

# getOrCreate() reuses an already-running session if there is one; otherwise
# it starts a new session registered under the application name "Customers".
spark = SparkSession.builder.appName("Customers").getOrCreate()

In [2]:
from pyspark.ml.regression import LinearRegression

# Load The Dataset

In [3]:
# Location of the e-commerce customers CSV.
# NOTE(review): absolute, machine-specific path -- adjust it for your environment.
customers_csv_path = "D:/DataSets/Ecommerce_Customers.csv"

# header=True takes the column names from the first row of the file;
# inferSchema=True asks Spark to infer each column's type from the data.
dataset = spark.read.csv(
    customers_csv_path,
    inferSchema=True,
    header=True,
)

In [4]:
# Bare expression: the notebook echoes the DataFrame's repr, which lists the
# column names and types only (no rows are shown).
dataset

DataFrame[Email: string, Address: string, Avg Session Length: double, Time on App: double, Time on Website: double, Length of Membership: double, Yearly Amount Spent: double]

In [5]:
# Print the first 20 rows as a text table; long string values are truncated
# with "..." in the rendered output.
dataset.show(20)

+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+
|               Email|             Address|Avg Session Length|Time on App|Time on Website|Length of Membership|Yearly Amount Spent|
+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+
|mstephenson@ferna...|835 Frank TunnelW...|       34.49726773|12.65565115|    39.57766802|         4.082620633|         587.951054|
|   hduke@hotmail.com|4547 Archer Commo...|       31.92627203|11.10946073|    37.26895887|         2.664034182|        392.2049334|
|    pallen@yahoo.com|24645 Valerie Uni...|       33.00091476|11.33027806|    37.11059744|         4.104543202|        487.5475049|
|riverarebecca@gma...|1414 David Throug...|       34.30555663|13.71751367|    36.72128268|         3.120178783|         581.852344|
|mstephens@davidso...|14023 Rodriguez P...|       33.33067252|12.79518855|  

In [6]:
# Print the inferred schema: two string columns (Email, Address) and five
# double columns, all nullable.
dataset.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [21]:
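# scikit-learn style: each feature is supplied as its own column
#   x1, x2, x3, x4, x5   v1 ----> model --> prediction
#
# Spark ML style (used below): the features are first packed into ONE vector column
#   [x1, x2, x3, x4, x5]  v1 ----> model --> prediction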

# Feature Engineering

In [8]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [9]:
# Numeric predictor columns that get packed into a single vector column.
feature_columns = [
    "Avg Session Length",
    "Time on App",
    "Time on Website",
    "Length of Membership",
]

# The assembler writes the combined vector to a new "IndependentFeatures" column.
featureassembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol="IndependentFeatures",
)

In [10]:
# Apply the assembler: the result keeps every original column and adds the
# "IndependentFeatures" vector column.
output = featureassembler.transform(dataset)

In [11]:
# Preview the transformed rows, including the new IndependentFeatures column.
output.show()

+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+--------------------+
|               Email|             Address|Avg Session Length|Time on App|Time on Website|Length of Membership|Yearly Amount Spent| IndependentFeatures|
+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+--------------------+
|mstephenson@ferna...|835 Frank TunnelW...|       34.49726773|12.65565115|    39.57766802|         4.082620633|         587.951054|[34.49726773,12.6...|
|   hduke@hotmail.com|4547 Archer Commo...|       31.92627203|11.10946073|    37.26895887|         2.664034182|        392.2049334|[31.92627203,11.1...|
|    pallen@yahoo.com|24645 Valerie Uni...|       33.00091476|11.33027806|    37.11059744|         4.104543202|        487.5475049|[33.00091476,11.3...|
|riverarebecca@gma...|1414 David Throug...|       34.30555663|13.71751367|    36.7

In [12]:
# Look at the assembled feature vectors on their own.
features_only = output.select("IndependentFeatures")
features_only.show()

+--------------------+
| IndependentFeatures|
+--------------------+
|[34.49726773,12.6...|
|[31.92627203,11.1...|
|[33.00091476,11.3...|
|[34.30555663,13.7...|
|[33.33067252,12.7...|
|[33.87103788,12.0...|
|[32.0215955,11.36...|
|[32.73914294,12.3...|
|[33.9877729,13.38...|
|[31.93654862,11.8...|
|[33.99257277,13.3...|
|[33.87936082,11.5...|
|[29.53242897,10.9...|
|[33.19033404,12.9...|
|[32.38797585,13.1...|
|[30.73772037,12.6...|
|[32.1253869,11.73...|
|[32.33889932,12.0...|
|[32.18781205,14.7...|
|[32.61785606,13.9...|
+--------------------+
only showing top 20 rows



In [13]:
# List the column names to confirm that "IndependentFeatures" was appended.
output.columns

['Email',
 'Address',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent',
 'IndependentFeatures']

In [14]:
# Keep only what the regression needs: the feature vector and the label column.
model_columns = ["IndependentFeatures", "Yearly Amount Spent"]
finalized_data = output.select(*model_columns)

In [15]:
# Preview the two-column modelling table (feature vector + yearly amount spent).
finalized_data.show()

+--------------------+-------------------+
| IndependentFeatures|Yearly Amount Spent|
+--------------------+-------------------+
|[34.49726773,12.6...|         587.951054|
|[31.92627203,11.1...|        392.2049334|
|[33.00091476,11.3...|        487.5475049|
|[34.30555663,13.7...|         581.852344|
|[33.33067252,12.7...|         599.406092|
|[33.87103788,12.0...|        637.1024479|
|[32.0215955,11.36...|        521.5721748|
|[32.73914294,12.3...|        549.9041461|
|[33.9877729,13.38...|         570.200409|
|[31.93654862,11.8...|        427.1993849|
|[33.99257277,13.3...|        492.6060127|
|[33.87936082,11.5...|        522.3374046|
|[29.53242897,10.9...|        408.6403511|
|[33.19033404,12.9...|        573.4158673|
|[32.38797585,13.1...|        470.4527333|
|[30.73772037,12.6...|        461.7807422|
|[32.1253869,11.73...|        457.8476959|
|[32.33889932,12.0...|        407.7045475|
|[32.18781205,14.7...|        452.3156755|
|[32.61785606,13.9...|        605.0610388|
+----------

# Model Building

In [16]:
# Fix the seed of the random 75/25 split. Without it, every run assigns rows
# to train/test differently, so the fitted coefficients and the predictions
# shown below could not be reproduced with Restart & Run All.
split_seed = 42
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=split_seed)

In [17]:
# Keep the unfitted estimator and the fitted model under separate names
# instead of rebinding one variable from one kind of object to the other.
# `regressor` is the fitted model, which the following cells read
# (.coefficients, .intercept, .evaluate).
lr_estimator = LinearRegression(
    featuresCol="IndependentFeatures",
    labelCol="Yearly Amount Spent",
)
regressor = lr_estimator.fit(train_data)

In [18]:
# Fitted weights of the linear model: one value per assembled input feature.
regressor.coefficients

DenseVector([26.3515, 38.6811, 0.4457, 61.5469])

In [19]:
# Fitted intercept (constant term) of the linear model.
regressor.intercept

-1072.0627151628055

In [20]:
# Evaluate the fitted model on the held-out split. The returned summary exposes
# a predictions DataFrame that pairs each label with the model's "prediction".
pred_result = regressor.evaluate(test_data)
test_predictions = pred_result.predictions
test_predictions.show()

+--------------------+-------------------+------------------+
| IndependentFeatures|Yearly Amount Spent|        prediction|
+--------------------+-------------------+------------------+
|[31.26064687,13.2...|        421.3266313|420.89501229849475|
|[31.28344748,12.7...|        591.7810894| 568.3252494457638|
|[31.36621217,11.1...|        430.5888826| 425.6386958303342|
|[31.38958548,10.9...|        410.0696111| 408.3735718881785|
|[31.44597248,12.8...|        484.8769649|480.87999237431654|
|[31.5171218,10.74...|        275.9184207|279.32477811127774|
|[31.53160448,13.3...|        436.5156057|431.78502639348653|
|[31.57020083,13.3...|        545.9454921| 562.4540033092487|
|[31.66104982,11.3...|        416.3583536| 416.3043507569407|
|[31.73663569,10.7...|        496.9334463| 493.5504614638569|
|[31.76561882,12.4...|        496.5540816|500.27068930540713|
|[31.81861657,11.2...|        446.4186734| 447.7204835239388|
|[31.82797906,12.4...|        440.0027475| 448.4306420601067|
|[31.829