In [1]:
from pyspark.sql import SparkSession

# Obtain the SparkSession used by every later cell; getOrCreate() reuses an
# already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("TransformerExample")
    .getOrCreate()
)

In [35]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vectors

In [3]:
# Read the sample regression file with the "libsvm" data source; the path is
# relative to the notebook's working directory.
training = (
    spark.read
    .format("libsvm")
    .load("../../data/sample_linear_regression_data.txt")
)

In [28]:
# Preview the first five rows (label + sparse feature vector) of the loaded data.
training.head(5)

[Row(label=-9.490009878824548, features=SparseVector(10, {0: 0.4551, 1: 0.3664, 2: -0.3826, 3: -0.4458, 4: 0.3311, 5: 0.8067, 6: -0.2624, 7: -0.4485, 8: -0.0727, 9: 0.5658})),
 Row(label=0.2577820163584905, features=SparseVector(10, {0: 0.8387, 1: -0.127, 2: 0.4998, 3: -0.2269, 4: -0.6452, 5: 0.1887, 6: -0.5805, 7: 0.6519, 8: -0.6556, 9: 0.1749})),
 Row(label=-4.438869807456516, features=SparseVector(10, {0: 0.5026, 1: 0.1421, 2: 0.16, 3: 0.505, 4: -0.9372, 5: -0.2842, 6: 0.6356, 7: -0.1646, 8: 0.9481, 9: 0.4268})),
 Row(label=-19.782762789614537, features=SparseVector(10, {0: -0.0389, 1: -0.4167, 2: 0.8997, 3: 0.641, 4: 0.2733, 5: -0.2618, 6: -0.2795, 7: -0.1307, 8: -0.0854, 9: -0.0546})),
 Row(label=-7.966593841555266, features=SparseVector(10, {0: -0.062, 1: 0.6546, 2: -0.6979, 3: 0.6677, 4: -0.0794, 5: -0.4389, 6: -0.6081, 7: -0.6415, 8: 0.7314, 9: -0.0268}))]

In [36]:
# Hand-built toy dataset for a one-feature regression.
# DataFrame creation reference:
# https://spark.apache.org/docs/latest/api/python/getting_started/quickstart.html#DataFrame-Creation
lineData = spark.createDataFrame(
    data=[
        # [y (label), x (single-element dense feature vector)]
        [-1.0, Vectors.dense(-0.6)],
        [0.1, Vectors.dense(0.2)],
        [1.2, Vectors.dense(0.9)],
        [2.4, Vectors.dense(2.1)],
        [3.3, Vectors.dense(2.9)],
    ],
    schema=['y', 'x'],
)

In [37]:
# Bring every row of the toy dataset back as a list of Row objects so the
# y / x columns can be checked by eye.
lineData.collect()

[Row(y=-1.0, x=DenseVector([-0.6])),
 Row(y=0.1, x=DenseVector([0.2])),
 Row(y=1.2, x=DenseVector([0.9])),
 Row(y=2.4, x=DenseVector([2.1])),
 Row(y=3.3, x=DenseVector([2.9]))]

In [38]:
# Configure (but do not yet fit) the linear-regression estimator.
lr = LinearRegression(
    # Columns of lineData to read the label and the feature vector from.
    labelCol="y",
    featuresCol="x",
    # Optimiser iteration cap and regularisation settings.
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)

In [39]:
# Fit the configured estimator on the toy dataset; the result is the trained model.
lrModel = lr.fit(dataset=lineData)

In [40]:
# Display the fitted parameters of the linear model: the coefficient vector
# followed by the intercept.
print("Coefficients: %s" % (str(lrModel.coefficients),))
print("Intercept: %s" % (str(lrModel.intercept),))

Coefficients: [0.988377505144876]
Intercept: 0.11278474434063622


In [41]:
# Training-set diagnostics taken from the fitted model's summary object.
trainingSummary = lrModel.summary

# Optimiser trace: iteration count and the objective value history.
print("numIterations: %d" % (trainingSummary.totalIterations,))
print("objectiveHistory: %s" % (str(trainingSummary.objectiveHistory),))

# Residuals for the training rows, rendered as a table.
trainingSummary.residuals.show()

# Overall fit-quality metrics.
print("RMSE: %f" % (trainingSummary.rootMeanSquaredError,))
print("r2: %f" % (trainingSummary.r2,))

numIterations: 2
objectiveHistory: [0.49999999999999994, 0.42957286533750577, 0.1597155255093012]
+--------------------+
|           residuals|
+--------------------+
| -0.5197582412537106|
|-0.21046024536961142|
| 0.19767550102897546|
|  0.2116224948551242|
| 0.32092049073922313|
+--------------------+

RMSE: 0.316636
r2: 0.957875


In [15]:
from pyspark.ml.linalg import Vectors

In [16]:
# Two-row example frame mixing a dense and an (empty) sparse feature vector.
df = spark.createDataFrame(
    data=[
        (1.0, 2.0, Vectors.dense(1.0)),
        (0.0, 2.0, Vectors.sparse(1, [], [])),
    ],
    schema=["label", "weight", "features"],
)

In [17]:
# Return all rows as Row objects to show how the dense and sparse feature
# vectors are represented.
df.collect()

[Row(label=1.0, weight=2.0, features=DenseVector([1.0])),
 Row(label=0.0, weight=2.0, features=SparseVector(1, {}))]