In [1]:
# Create (or reuse) the SparkSession used by every DataFrame operation below.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('lin_reg')
    .getOrCreate()
)

In [2]:
# Import the LinearRegression estimator
from pyspark.ml.regression import LinearRegression

In [7]:
# Load the dataset from CSV.
# header=True takes the column names from the first row; inferSchema=True lets
# Spark infer each column's data type behind the scenes.
df = spark.read.csv(
    'Linear_regression_dataset.csv',
    header=True,
    inferSchema=True,
)

In [4]:
# Preview the first five rows of the raw data.
df.show(n=5)

+-----+-----+-----+-----+-----+------+
|var_1|var_2|var_3|var_4|var_5|output|
+-----+-----+-----+-----+-----+------+
|  734|  688|   81|0.328|0.259| 0.418|
|  700|  600|   94| 0.32|0.247| 0.389|
|  712|  705|   93|0.311|0.247| 0.417|
|  734|  806|   69|0.315| 0.26| 0.415|
|  613|  759|   61|0.302| 0.24| 0.378|
+-----+-----+-----+-----+-----+------+
only showing top 5 rows



首先查看数据结构

In [8]:
# Dataset shape as (row count, column count).
row_count, column_count = df.count(), len(df.columns)
print((row_count, column_count))

(1232, 6)


In [9]:
# Print the column names and the data types Spark inferred for them.
df.printSchema()

root
 |-- var_1: integer (nullable = true)
 |-- var_2: integer (nullable = true)
 |-- var_3: integer (nullable = true)
 |-- var_4: double (nullable = true)
 |-- var_5: double (nullable = true)
 |-- output: double (nullable = true)



In [13]:
# Summary statistics (count, mean, stddev, min, max) for every column,
# printed without truncating the long decimal values.
df.describe().show(n=5, truncate=False)

+-------+-----------------+-----------------+------------------+--------------------+--------------------+-------------------+
|summary|var_1            |var_2            |var_3             |var_4               |var_5               |output             |
+-------+-----------------+-----------------+------------------+--------------------+--------------------+-------------------+
|count  |1232             |1232             |1232              |1232                |1232                |1232               |
|mean   |715.0819805194806|715.0819805194806|80.90422077922078 |0.3263311688311693  |0.25927272727272715 |0.39734172077922014|
|stddev |91.5342940441652 |93.07993263118064|11.458139049993724|0.015012772334166148|0.012907228928000298|0.03326689862173776|
|min    |463              |472              |40                |0.277               |0.214               |0.301              |
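|max    |1009             |1103             |116               |0.373               |0.294               |0.491              |
+-------+-----------------+-----------------+------------------+--------------------+--------------------+-------------------+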

In [15]:
# First five rows returned to the driver as a list of Row objects.
df.head(n=5)

[Row(var_1=734, var_2=688, var_3=81, var_4=0.328, var_5=0.259, output=0.418),
 Row(var_1=700, var_2=600, var_3=94, var_4=0.32, var_5=0.247, output=0.389),
 Row(var_1=712, var_2=705, var_3=93, var_4=0.311, var_5=0.247, output=0.417),
 Row(var_1=734, var_2=806, var_3=69, var_4=0.315, var_5=0.26, output=0.415),
 Row(var_1=613, var_2=759, var_3=61, var_4=0.302, var_5=0.24, output=0.378)]

查看相关性

In [17]:
from pyspark.sql.functions import corr

In [20]:
# Correlation coefficient between the first input variable and the label.
df.select(
    corr('var_1', 'output')
).show()

+-------------------+
|corr(var_1, output)|
+-------------------+
| 0.9187399607627283|
+-------------------+



特征工程化

In [21]:
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler

VectorAssembler 会把所有输入特征合并成一个向量。它只创建单个特征列，该列保存每一行的全部输入值；因此模型并不会分别使用五个输入列，实际上是将所有输入列合并成单个特征向量列。

我们输入5个自变量来创建单个特征向量列

In [22]:
# Assemble the five input columns into one vector column named 'features'.
# (The variable name keeps its original spelling because later cells use it.)
vec_assmebler = VectorAssembler(
    inputCols=['var_1', 'var_2', 'var_3', 'var_4', 'var_5'],
    outputCol='features',
)

In [23]:
# Append the assembled 'features' vector column to the original DataFrame.
features_df = vec_assmebler.transform(dataset=df)

In [24]:
# Confirm that the new 'features' vector column was added to the schema.
features_df.printSchema()

root
 |-- var_1: integer (nullable = true)
 |-- var_2: integer (nullable = true)
 |-- var_3: integer (nullable = true)
 |-- var_4: double (nullable = true)
 |-- var_5: double (nullable = true)
 |-- output: double (nullable = true)
 |-- features: vector (nullable = true)



查看创建好的向量列

In [25]:
# Show the assembled feature vectors in full (no truncation).
features_df.select('features').show(n=5, truncate=False)

+------------------------------+
|features                      |
+------------------------------+
|[734.0,688.0,81.0,0.328,0.259]|
|[700.0,600.0,94.0,0.32,0.247] |
|[712.0,705.0,93.0,0.311,0.247]|
|[734.0,806.0,69.0,0.315,0.26] |
|[613.0,759.0,61.0,0.302,0.24] |
+------------------------------+
only showing top 5 rows



我们使用该 features 列以及输出列（即标签列 output）来构建线性回归模型

In [26]:
# Keep only what the model needs: the feature vector and the label.
model_columns = ['features', 'output']
model_df = features_df.select(model_columns)

In [27]:
# Preview the modelling DataFrame without truncating the vectors.
model_df.show(n=5, truncate=False)

+------------------------------+------+
|features                      |output|
+------------------------------+------+
|[734.0,688.0,81.0,0.328,0.259]|0.418 |
|[700.0,600.0,94.0,0.32,0.247] |0.389 |
|[712.0,705.0,93.0,0.311,0.247]|0.417 |
|[734.0,806.0,69.0,0.315,0.26] |0.415 |
|[613.0,759.0,61.0,0.302,0.24] |0.378 |
+------------------------------+------+
only showing top 5 rows



In [28]:
# Shape of model_df as (row count, column count).
model_row_count, model_column_count = model_df.count(), len(model_df.columns)
print((model_row_count, model_column_count))

(1232, 2)


划分数据集

In [29]:
# Split the data into roughly 70% training and 30% test rows.
# A fixed seed makes the split, and every metric computed from it, reproducible
# across kernel restarts; without it each run trains on a different sample.
# NOTE: outputs recorded from an earlier unseeded run will differ slightly
# from what this seeded split produces when the notebook is re-executed.
train_df,test_df=model_df.randomSplit([0.7,0.3],seed=42)

In [30]:
# Shape of the training split as (row count, column count).
train_row_count, train_column_count = train_df.count(), len(train_df.columns)
print((train_row_count, train_column_count))

(872, 2)


In [31]:
# Shape of the test split as (row count, column count).
test_row_count, test_column_count = test_df.count(), len(test_df.columns)
print((test_row_count, test_column_count))

(360, 2)


In [32]:
# Summary statistics of the training split. Only the 'output' column appears
# in the result; the 'features' vector column is not summarised.
train_df.describe().show()

+-------+--------------------+
|summary|              output|
+-------+--------------------+
|  count|                 872|
|   mean|  0.3981146788990823|
| stddev|0.033376131707401745|
|    min|               0.311|
|    max|               0.491|
+-------+--------------------+



构建和训练线性回归模型

In [34]:
# Configure the estimator: 'features' is the assembled input vector (the
# estimator's default column name, spelled out here) and 'output' is the label.
liner_reg = LinearRegression(featuresCol='features', labelCol='output')
# Fit on the training split only; the test split stays held out for evaluation.
lr_model = liner_reg.fit(train_df)

查看模型参数与训练集上的评估结果

In [36]:
# Coefficients of the fitted model, one per input feature
print(lr_model.coefficients)

[0.00034642151800646956,5.047411976464547e-05,0.0001630335852657896,-0.7004540132692385,0.5237957869785653]


In [37]:
# Intercept term of the fitted model
lr_model.intercept

0.19299641631015274

In [38]:
# Evaluate the fitted model on the training split. Despite the variable name,
# the result is used below as a metrics summary (meanSquaredError, r2),
# not as a DataFrame of predictions.
training_predictions=lr_model.evaluate(train_df)

In [39]:
# Mean squared error on the training split.
training_predictions.meanSquaredError

0.000142895075293416

In [40]:
# R-squared on the training split.
training_predictions.r2

0.8715767694972083

测试集

In [41]:
# Evaluate the fitted model on the held-out test split; the returned summary
# exposes the residuals and metrics inspected in the following cells.
test_results=lr_model.evaluate(test_df)

In [42]:
# Show the first ten residuals on the test split.
test_results.residuals.show(n=10)

+--------------------+
|           residuals|
+--------------------+
|-0.01280624132215974|
|-0.00153969351663...|
|0.013097071702937335|
|-0.01401716194435...|
|0.007339799512507017|
|0.002015221570141...|
|-0.00857162167250...|
|-0.00204245532745...|
| 0.01581893143385027|
|-0.00593573051215...|
+--------------------+
only showing top 10 rows



In [43]:
# R-squared on the test split (compare with the training value to spot overfitting).
test_results.r2

0.8626252784700689

In [44]:
# Root mean squared error on the test split.
test_results.rootMeanSquaredError

0.012203723207422518

In [45]:
# Mean squared error on the test split.
test_results.meanSquaredError

0.00014893086012338297