In [None]:
from pyspark.sql import SparkSession

# Start the Spark session used by the rest of the notebook.
# appName()     - names the application; the name is shown in the Spark web UI
# getOrCreate() - returns the already-running SparkSession if there is one,
#                 otherwise creates a new one
spark = (
    SparkSession.builder
    .appName("Customers")
    .getOrCreate()
)

In [None]:
# Load the customer CSV into a Spark DataFrame.
# header=True      - use the first line of the file as the column names
# inferSchema=True - let Spark work out each column's type (string, double, ...);
#                    this costs one extra pass over the data
dataset = spark.read.csv(
    "Ecommerce_Customers.csv",
    header=True,
    inferSchema=True,
)

In [None]:
# Display the DataFrame object itself. By default (eager evaluation off) this shows
# only the column names and their types, not the row values; use show() to see rows.
dataset

In [None]:
# Print the first rows of the dataset (show() defaults to 20 rows when no count is given)
dataset.show()

In [None]:
# Print the schema (as inferred when the CSV was read) in tree format
dataset.printSchema()

In [None]:
# from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler

In [None]:
# VectorAssembler is a feature transformer that merges several columns into a
# single vector column, which is what the model is trained on.
# The output column name is arbitrary; later cells refer to it by this exact name.
feature_columns = [
    "Avg Session Length",
    "Time on App",
    "Time on Website",
    "Length of Membership",
]
featureassemble = VectorAssembler(
    inputCols=feature_columns,
    outputCol="Independent Features",
)

In [None]:
# Apply the assembler to the dataset: the result is a new DataFrame that carries
# the assembled "Independent Features" vector column alongside the original columns
output = featureassemble.transform(dataset)

In [None]:
# Print the first rows of the assembled DataFrame (show() defaults to 20 rows)
output.show()

In [None]:
# Print only the assembled feature-vector column
output.select("Independent Features").show()

In [None]:
# List the column names of the assembled DataFrame
output.columns

In [None]:
# Keep only what the model needs: the feature vector and the target column
model_columns = ["Independent Features", "Yearly Amount Spent"]
finalized_data = output.select(*model_columns)

In [None]:
# Print the first rows of the modelling DataFrame (show() defaults to 20 rows)
finalized_data.show()

In [None]:
# Split the data 75% / 25% into training and test sets.
# A fixed seed makes the random split repeatable, so the fitted coefficients and
# the evaluation results stay the same across "Restart & Run All".
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=42)

In [None]:
# Linear regression from Spark ML.
# Supported loss functions: "squaredError" and "huber".
# Supported regularization: none (ordinary least squares), L2 (ridge regression),
# L1 (lasso), L1 + L2 (elastic net).
# No loss or regularization arguments are passed here, so the library defaults apply.
from pyspark.ml.regression import LinearRegression

# Configure the estimator: which column holds the feature vector and which the label
lr_estimator = LinearRegression(
    labelCol='Yearly Amount Spent',
    featuresCol='Independent Features',
)

# Fit on the training split; `regressor` is the fitted model used by the cells below
regressor = lr_estimator.fit(train_data)

In [None]:
# Coefficients of the fitted model (one weight per entry of the feature vector)
regressor.coefficients

In [None]:
# Intercept (constant term) of the fitted model
regressor.intercept

In [None]:
# Evaluate the fitted model on the held-out test split; the result exposes the
# per-row predictions through its `predictions` attribute
pred_results = regressor.evaluate(test_data)

In [None]:
# Print the first 40 rows of the predictions DataFrame from the test-set evaluation
pred_results.predictions.show(40)