In [None]:
!pip install pyspark

In [2]:
from pyspark.sql import SparkSession
# Get (or create) the SparkSession used by every later cell, registered
# under the application name 'Regression'.
spark = (
    SparkSession.builder
    .appName('Regression')
    .getOrCreate()
)

In [3]:
from pyspark.ml.regression import LinearRegression

In [13]:
# Load the Ecommerce Customers CSV into a DataFrame: the first line of the
# file supplies the column names and the column types are inferred.
data = spark.read.csv(
    "/content/Ecommerce_Customers.csv",
    header=True,
    inferSchema=True,
)

In [None]:

# Print the schema of the DataFrame to check the column names and the
# types that inferSchema=True produced when the CSV was read.
data.printSchema()

In [None]:

# Preview the loaded rows as a table.
data.show()

In [16]:

# Fetch the first record as a Row so each field is displayed with its name.
data.head()

Row(Email='mstephenson@fernandez.com', Address='835 Frank TunnelWrightmouth, MI 82180-9605', Avatar='Violet', Avg Session Length=34.49726772511229, Time on App=12.65565114916675, Time on Website=39.57766801952616, Length of Membership=4.0826206329529615, Yearly Amount Spent=587.9510539684005)

In [17]:
# Print each field value of the first record on its own line.
first_row = data.head()
for value in first_row:
    print(value)

mstephenson@fernandez.com
835 Frank TunnelWrightmouth, MI 82180-9605
Violet
34.49726772511229
12.65565114916675
39.57766801952616
4.0826206329529615
587.9510539684005


In [18]:
# Setting up the DataFrame for machine learning

In [19]:
# Spark ML cannot fit a model on the raw columns directly.
# The data must be reduced to two columns: a label (the value to predict)
# and "features" (all input columns packed into a single vector).

# Import VectorAssembler and Vectors
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [None]:
# List the column names available in the loaded DataFrame.
data.columns

In [22]:
# Configure the assembler that packs the four numeric input columns into a
# single vector column named "features".
assembler = VectorAssembler(
    inputCols=[
        "Avg Session Length",
        "Time on App",
        "Time on Website",
        "Length of Membership",
    ],
    outputCol="features",
)

In [23]:
# Run the assembler over the loaded data; the result carries the assembled
# "features" vector column.
output = assembler.transform(data)

In [None]:

# Inspect only the assembled vector column.
output.select("features").show()

In [None]:
# Preview the assembler output as a table.
output.show()

In [26]:
# Keep only what the regression needs: the feature vector and the column
# that will be used as the label.
final_data = output.select(
    "features",
    "Yearly Amount Spent",
)

In [27]:
# Split the modelling data into roughly 70% training and 30% test rows.
# randomSplit assigns rows at random, so a fixed seed is supplied to keep the
# split -- and therefore the fitted coefficients and test metrics -- the same
# each time the notebook is re-run on the same input.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)


In [None]:

# Summary statistics for the training split.
train_data.describe().show()

In [None]:
# Summary statistics for the test split, for comparison with the training split.
test_data.describe().show()

In [30]:
# Build the (unfitted) linear regression estimator. The label is the yearly
# spend column; the features column is spelled out with its default name.
lr = LinearRegression(
    featuresCol="features",
    labelCol="Yearly Amount Spent",
)

In [31]:
# Train the estimator on the training split; the fitted model is lrModel.
lrModel = lr.fit(train_data)

# Report the learned coefficients and the intercept of the fitted model.
print(
    "Coefficients: {} Intercept: {}".format(
        lrModel.coefficients,
        lrModel.intercept,
    )
)

Coefficients: [25.90177916851113,38.55695649579878,0.27645234356577036,61.992223797186426] Intercept: -1050.695388788772


In [None]:
# Evaluate the fitted model on the held-out test split; the returned summary
# is reused below for the error metrics.
test_results = lrModel.evaluate(test_data)

# Show the residuals reported by the evaluation summary.
test_results.residuals.show()

In [None]:
# Drop the label column so the model is given only the feature vectors.
unlabeled_data = test_data.select('features')

# Apply the fitted model to the unlabeled feature vectors.
predictions = lrModel.transform(unlabeled_data)

# Preview the model output.
predictions.show()

In [None]:
# Report the test-set error metrics from the evaluation summary, one per line.
print(
    "RMSE: {}".format(test_results.rootMeanSquaredError),
    "MSE: {}".format(test_results.meanSquaredError),
    "R _squared: {}".format(test_results.r2),
    sep="\n",
)