### Spark for Machine Learning & AI
### 05 Regression

In [2]:
# findspark.init() has to run before the pyspark imports below: it locates the
# local Spark installation and makes the pyspark package importable.
import findspark
findspark.init()
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

# Reuse the SparkContext that is already running, or start one if none exists.
sc = SparkContext.getOrCreate()
# Obtain the session through the builder rather than calling the SparkSession
# constructor directly: getOrCreate() returns the existing session (bound to
# the context above) when this cell is re-run instead of building a new one.
spark = SparkSession.builder.getOrCreate()

### Linear Regression

In [3]:
from pyspark.ml.regression import LinearRegression
# First attempt: load the CSV with the reader's default options. As the output
# below shows, every column comes back as a string named _c0.._c4, which is
# why the next cell re-reads the file with header/inferSchema enabled.
pp_df = spark.read.format("csv").load("./power_plant.csv")
pp_df

DataFrame[_c0: string, _c1: string, _c2: string, _c3: string, _c4: string]

In [4]:
# Re-read the file, this time taking column names from the first row and
# letting Spark infer the column types from the data.
pp_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("./power_plant.csv")
)
pp_df

DataFrame[AT: double, V: double, AP: double, RH: double, PE: double]

In [5]:
from pyspark.ml.feature import VectorAssembler

In [6]:
# Spark ML estimators expect all predictors packed into one vector column.
# Combine the four input columns into "features"; PE stays as its own column
# and is used as the label further down.
vectorAssembler = VectorAssembler(
    inputCols=["AT", "V", "AP", "RH"],
    outputCol="features",
)
vpp_df = vectorAssembler.transform(pp_df)

# Peek at a single row to confirm the new vector column is present.
vpp_df.take(1)

[Row(AT=14.96, V=41.76, AP=1024.07, RH=73.17, PE=463.26, features=DenseVector([14.96, 41.76, 1024.07, 73.17]))]

In [7]:
# Configure a linear regression that predicts PE from the assembled feature
# vector, then fit it on the complete dataset (no train/test split here).
lr = LinearRegression(
    featuresCol="features",
    labelCol="PE",
)
lr_model = lr.fit(vpp_df)

In [8]:
# Fitted weights, one per assembled input column in the order AT, V, AP, RH.
lr_model.coefficients

DenseVector([-1.9775, -0.2339, 0.0621, -0.1581])

In [9]:
# Constant term of the fitted linear model.
lr_model.intercept

454.6092744523414

In [12]:
# RMSE taken from the model's training summary. The model was fitted on all of
# vpp_df, so this is an in-sample error and is not directly comparable with
# the held-out test RMSE reported for the tree models later in the notebook.
lr_model.summary.rootMeanSquaredError

4.557126016749488

In [13]:
# Persist the fitted model. A plain save() raises if "lr1.model" already
# exists, which breaks this cell on every run after the first; going through
# the writer with overwrite() keeps the notebook re-runnable.
lr_model.write().overwrite().save("lr1.model")

### Decision tree regression

In [14]:
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler

In [15]:
# Load the power-plant data afresh for the tree-based models: column names
# come from the header row and column types are inferred.
pp_df = spark.read.csv(
    path="./power_plant.csv",
    header=True,
    inferSchema=True,
)

# Show one row as a quick sanity check of names and types.
pp_df.take(1)

[Row(AT=14.96, V=41.76, AP=1024.07, RH=73.17, PE=463.26)]

In [16]:
# Pack the four predictor columns into a single "features" vector column,
# which is the input format the tree regressors below are configured to read.
vectorAssembler = VectorAssembler(
    inputCols=["AT", "V", "AP", "RH"],
    outputCol="features",
)
vpp_df = vectorAssembler.transform(pp_df)

# Inspect the first assembled row.
vpp_df.take(1)

[Row(AT=14.96, V=41.76, AP=1024.07, RH=73.17, PE=463.26, features=DenseVector([14.96, 41.76, 1024.07, 73.17]))]

In [17]:
# Split the assembled data roughly 70/30 into training and test sets.
# A fixed seed makes the partition repeatable, so the row counts and the RMSE
# values computed below stay the same across kernel restarts; without it each
# run evaluates the models on a different random test set.
splits = vpp_df.randomSplit([0.7, 0.3], seed=42)
train_df, test_df = splits

# Number of rows that ended up in the training set.
train_df.count()

6663

In [18]:
# Number of rows held out for evaluation.
test_df.count()

2905

In [19]:
# Total rows in the assembled dataset; the training and test counts above
# should add up to this number.
vpp_df.count()

9568

In [20]:
# Evaluator that scores predictions against the PE label using RMSE.
dt_evaluator = RegressionEvaluator(
    labelCol="PE",
    predictionCol="prediction",
    metricName="rmse",
)

# Fit a decision tree on the training rows only.
dt = DecisionTreeRegressor(
    featuresCol="features",
    labelCol="PE",
)
dt_model = dt.fit(train_df)

# Predict on the held-out rows and measure the test-set RMSE.
dt_predictions = dt_model.transform(test_df)
rmse = dt_evaluator.evaluate(dt_predictions)
rmse

4.650148321835953

### Gradient-boosted tree regression

In [21]:
from pyspark.ml.regression import GBTRegressor

In [23]:
# Evaluator that scores predictions against the PE label using RMSE.
gbt_evaluator = RegressionEvaluator(
    labelCol="PE",
    predictionCol="prediction",
    metricName="rmse",
)

# Fit a gradient-boosted tree ensemble on the same training rows as the
# single decision tree above.
gbt = GBTRegressor(
    featuresCol="features",
    labelCol="PE",
)
gbt_model = gbt.fit(train_df)

# Predict on the held-out rows and measure the test-set RMSE.
gbt_predictions = gbt_model.transform(test_df)
gbt_rmse = gbt_evaluator.evaluate(gbt_predictions)
gbt_rmse

4.245838297443614