In [1]:
from pyspark.sql import SparkSession

In [2]:
# Start (or reuse) the Spark session for this notebook under the app name 'lr'.
spark = (
    SparkSession.builder
    .appName('lr')
    .getOrCreate()
)

In [3]:
%fs ls /FileStore/tables

path,name,size
dbfs:/FileStore/tables/ContainsNull.csv,ContainsNull.csv,61
dbfs:/FileStore/tables/Ecommerce_Customers.csv,Ecommerce_Customers.csv,86871
dbfs:/FileStore/tables/Spark_Essentials-5d27c.dbc,Spark_Essentials-5d27c.dbc,1414841
dbfs:/FileStore/tables/appl_stock.csv,appl_stock.csv,143130
dbfs:/FileStore/tables/cogsley_clients.csv,cogsley_clients.csv,384219
dbfs:/FileStore/tables/cogsley_sales.csv,cogsley_sales.csv,2176442
dbfs:/FileStore/tables/people.json,people.json,73
dbfs:/FileStore/tables/sales_info.csv,sales_info.csv,196
dbfs:/FileStore/tables/sample_linear_regression_data.txt,sample_linear_regression_data.txt,119069
dbfs:/FileStore/tables/state_info.csv,state_info.csv,2778


In [4]:
# Load the e-commerce customers CSV: the first row supplies the column names
# and Spark infers each column's type from the data.
path = '/FileStore/tables/Ecommerce_Customers.csv'
df = spark.read.csv(
    path,
    header=True,
    inferSchema=True,
)

In [5]:
# Preview the first five rows of the loaded data.
df.show(5)

In [6]:
# Print summary statistics (count, mean, stddev, min, max) for the columns.
df.describe().show()

In [7]:
# Print the column names together with the types inferred while reading the CSV.
df.printSchema()

In [8]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [9]:
# List the column names of the loaded DataFrame.
df.columns

In [10]:
# Combine the predictor columns of interest into a single vector column named
# 'features'. This 'features' column holds the independent variables, which is
# the input format Spark ML expects.
assembler = VectorAssembler(
    inputCols=[
        'Avg Session Length',
        'Time on App',
        'Time on Website',
        'Length of Membership',
    ],
    outputCol='features',
)

In [11]:
# Apply the assembler: the result is df plus the assembled 'features' vector column.
output = assembler.transform(df)

In [12]:
# Print the schema of the assembled DataFrame to check for the 'features' column.
output.printSchema()

In [13]:
# Preview the first two rows of the assembled DataFrame.
output.show(2)

In [14]:
# Keep only the two columns the model needs: the feature vector and the label.
final_data = output.select('features', 'Yearly Amount Spent')

In [15]:
# 'features' is the vector of independent variables and 'Yearly Amount Spent'
# is the dependent variable.
final_data.show(5)

In [16]:
# Split the rows roughly 70% / 30% into training and test sets.
# randomSplit samples randomly, so a fixed seed is passed to keep the split,
# and every metric computed from it, repeatable when the notebook is re-run
# on the same data.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [17]:
# Print summary statistics for the training split.
train_data.describe().show()

In [18]:
from pyspark.ml.regression import LinearRegression

In [19]:
# Configure the estimator: read inputs from 'features', learn 'Yearly Amount Spent',
# and write the model output to a 'prediction' column.
lr = LinearRegression(
    featuresCol='features',
    labelCol='Yearly Amount Spent',
    predictionCol='prediction',
)

In [20]:
# Fit the linear regression on the training split.
lr_model = lr.fit(train_data)

In [21]:
# Training summary of the fitted model; the metrics read from it in the
# following cells are computed on the training data.
model_summary = lr_model.summary

In [22]:
# Two-sided p-values of the estimated coefficients (and intercept).
# NOTE(review): Spark only provides these with the "normal" solver — confirm
# that the default solver settings used above select it for this data.
model_summary.pValues

In [23]:
# Root mean squared error on the training data.
model_summary.rootMeanSquaredError

In [24]:
# R^2 (coefficient of determination) on the training data.
model_summary.r2

In [25]:
# Residuals (label minus prediction) on the training data; show() prints the
# first 20 rows by default.
model_summary.residuals.show()

In [26]:
# Evaluate the fitted model on the held-out test split.
test_result = lr_model.evaluate(test_data)

In [27]:
# R^2 (coefficient of determination) on the test data.
test_result.r2

In [28]:
# Root mean squared error on the test data.
test_result.rootMeanSquaredError

In [29]:
final_data.describe().show()
# In the recorded run, the mean of y ('Yearly Amount Spent') is 499.31 and its
# standard deviation is 79.31, while the root mean squared error on the test
# data is 10.5. An RMSE this small relative to the spread of y means the model
# fits well.

In [30]:
# Keep only the feature vectors from the test split (the label column is
# dropped) to stand in for unlabeled data.
unlabeled_data = test_data.select('features')

In [31]:
# Score the unlabeled rows; the model writes its output to the 'prediction' column.
pred = lr_model.transform(unlabeled_data)

In [32]:
# Preview the first five rows of features with their predictions.
pred.show(5)

In [33]:
# NOTE(review): this repeats the previous cell's preview of `pred`; it looks
# redundant and can probably be removed.
pred.show(5)