In [1]:
from pyspark.sql import SparkSession

In [2]:
# Start (or reuse, if one is already running) the Spark session that backs
# every DataFrame operation in this notebook.
spark = (
    SparkSession.builder
    .appName('lr_project')
    .getOrCreate()
)

In [3]:
from pyspark.ml.regression import LinearRegression

In [4]:
# Load the customer CSV. The first line holds the column names, and Spark is
# asked to infer each column's type instead of reading everything as strings.
data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('/FileStore/tables/Ecommerce_Customers.csv')
)

In [5]:
# Print the column names and the types Spark inferred for them, to confirm
# the numeric columns were not loaded as strings.
data.printSchema()

In [6]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [7]:
# List the exact column names; the feature columns handed to the
# VectorAssembler below must match these strings character for character.
data.columns

In [8]:
# Numeric predictor columns that get packed into a single vector column.
feature_columns = [
    'Avg Session Length',
    'Time on App',
    'Time on Website',
    'Length of Membership',
]

# Spark ML estimators read their inputs from one vector-typed column, so the
# individual predictors are combined into a column named 'features'.
assembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol='features',
)

In [9]:
# Apply the assembler: returns a new DataFrame with the original columns plus
# the assembled 'features' vector column.
output = assembler.transform(data)

In [10]:
# Confirm the schema now contains the new 'features' column.
output.printSchema()

In [11]:
# Inspect the first row of the assembled DataFrame to see what the
# 'features' vector looks like next to the original columns.
output.head(1)

In [12]:
# Keep only what the model needs: the assembled feature vector and the
# regression target.
final_data = output.select(
    'features',
    'Yearly Amount Spent',
)

# Preview the modelling dataset.
final_data.show()

In [13]:
# Hold out roughly 30% of the rows for evaluation and train on the rest.
# The seed is fixed so that a fresh "Restart & Run All" yields the same split
# and therefore the same fitted model and metrics; without it, randomSplit
# draws a new random seed on every run and the results are not reproducible.
# The weights are proportions, so the split sizes are approximate.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [14]:
# Summary statistics of the training split; the row count shows how many
# examples the model is fitted on.
train_data.describe().show()

In [15]:
# Summary statistics of the held-out test split; compare with the training
# split above to check that the target is distributed similarly in both.
test_data.describe().show()

In [16]:
# Configure the (still unfitted) linear regression estimator: inputs come from
# the 'features' vector column and the target is 'Yearly Amount Spent'.
lr = LinearRegression(
    featuresCol='features',
    labelCol='Yearly Amount Spent',
)

In [17]:
# Fit on the training split only, so the test split stays unseen and can be
# used for an honest evaluation.
lr_model = lr.fit(train_data)

In [18]:
# Score the fitted model on the held-out test split; the returned summary
# object exposes the residuals and metrics inspected in the next cells.
test_results = lr_model.evaluate(test_data)

In [19]:
# Residuals on the test split: for each row, the actual label minus the
# model's predicted value.
test_results.residuals.show()

In [20]:
# Root mean squared error on the test split, expressed in the same units as
# 'Yearly Amount Spent'; compare it with the target's mean/stddev shown by
# describe() above to judge whether the error is large or small.
test_results.rootMeanSquaredError

In [21]:
# R^2 on the test split: the share of the target's variance that the model
# explains (1.0 would be a perfect fit).
test_results.r2

In [22]:
# Drop the label column to mimic unlabeled data, i.e. what the model would
# receive when used to predict spend for customers with no known outcome.
real_data = test_data.select('features')

In [23]:
# Run the fitted model over the unlabeled rows; transform returns a new
# DataFrame with the model's predictions added as an extra column
# (named 'prediction' by default, since no predictionCol was set on `lr`).
predictions = lr_model.transform(real_data)

In [24]:
# Display each feature vector next to its predicted yearly spend.
predictions.show()