### 简单的spark ml使用

In [1]:
import math
import time
import datetime
import warnings
import chinese_calendar
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import LongType, IntegerType, StructType, Row, StructField, StringType, TimestampType, FloatType, \
    ArrayType
import pyspark.sql.functions as F
from pyspark.sql.types import *

warnings.filterwarnings("ignore")
from helper.spark_helper import *
from helper.presto_helper import query_presto
from helper.csv_helper import *
from helper.pandas_helper import *
from helper.presto_helper import query_hive_wy

In [5]:
# Stop any session left over from a previous run so the next cell can build a
# fresh one. No earlier cell in this notebook defines `spark` (it may only come
# from the star-import of helper.spark_helper), so guard against a fresh kernel.
if "spark" in globals():
    spark.stop()

In [6]:
def open_spark_session(app_name="ai-train"):
    """Build (or reuse) a SparkSession with this notebook's settings.

    Args:
        app_name: Spark application name.

    Returns:
        The session returned by ``SparkSession.builder.getOrCreate()``.
    """
    settings = {
        "spark.yarn.queue": "offline",
        "spark.sql.crossJoin.enabled": "true",
        "hive.exec.dynamic.partition.mode": "nonstrict",
    }
    conf = SparkConf()
    conf.setMaster("local")
    conf.setAppName(app_name)
    for key, value in settings.items():
        conf.set(key, value)
    return SparkSession.builder.config(conf=conf).getOrCreate()

# Create the session used by the cells below.
spark = open_spark_session(app_name='dws_ai_dispatch_station_st_info_da_v1')

In [2]:
# Run date (yyyy-MM-dd); the SQL below derives its trailing order windows from it.
today = datetime.datetime.now().strftime('%Y-%m-%d')
# Extra SQL predicate interpolated into every CTE; restricts the run to city 161.
# NOTE(review): `test` is rebound to the held-out DataFrame by the later
# train/test split, so the SQL cell cannot be re-run after that cell without
# re-running this one first.
test = 'and city_id in (161)'

### 数据准备

In [7]:
# Trailing order-count windows, in days. Each window ends yesterday relative
# to `today` and yields one `cnt<days>` feature column.
ORDER_WINDOWS = (3, 7, 14, 21, 30)


def _event_day_ago(days, run_date):
    """Return a SQL expression for `run_date - days` as a yyyyMMdd string."""
    return (
        f"replace(to_date(cast('{run_date}' as TIMESTAMP) "
        f"- interval '{days}' day), '-', '')"
    )


def _order_count_cte(days, run_date, extra_filter):
    """Return the `order<days>` CTE: per-bike order count over the last `days` days.

    Args:
        days: Window length in days; the window ends one day before `run_date`.
        run_date: Reference date as a 'yyyy-MM-dd' string.
        extra_filter: SQL fragment appended to the WHERE clause (may be empty).
    """
    return f"""order{days} as (
  select
    city_id,
    bike_sn,
    count(1) cnt{days}
  from
    dwt.dwt_bd_order_detail_da
  where
    event_day between {_event_day_ago(days, run_date)}
    and {_event_day_ago(1, run_date)}
     {extra_filter}
  group by
    city_id,
    bike_sn
)"""


# One CTE, one COALESCE column and one left join per window, generated from
# ORDER_WINDOWS instead of being copy-pasted five times.
order_ctes = ",\n".join(_order_count_cte(d, today, test) for d in ORDER_WINDOWS)
count_columns = ",\n  ".join(f"COALESCE(cnt{d},0) cnt{d}" for d in ORDER_WINDOWS)
order_joins = "\n  ".join(
    f"left join order{d} o{d} on a.city_id = o{d}.city_id and a.bike_sn = o{d}.bike_sn"
    for d in ORDER_WINDOWS
)

# NOTE(review): the bike snapshot is pinned to event_day '20220710' while the
# order windows follow `today`, so results change from day to day.
# NOTE(review): the last predicate of bike_cond keeps only the two "useable"
# labels, which makes the four-label IN filter redundant and is_useable always 1.
df = spark.sql(f"""
with bike_cond as (
select
  city_id,
  bike_sn,
  if(bike_analysis_label in ('使用车辆', '可用状态未用车辆'), 1, 0) is_useable,
  if(bike_analysis_label in ('使用车辆'), 1, 0) is_used
from
  dwt.dwt_se_es_bike_info_df
where

  event_day = '20220710'
  and bike_analysis_label in ('使用车辆', '可用状态未用车辆', '不可用状态车辆', '投放状态且在库维保车辆')
   {test}
  and if(bike_analysis_label in ('使用车辆', '可用状态未用车辆'), 1, 0) =1 
),
{order_ctes}

select
  a.*,
  {count_columns}
from
  bike_cond a
  {order_joins}
  """)

In [8]:
# Mark the frame for caching, then fetch one row to trigger the query.
df.cache().first()

Row(city_id=161, bike_sn='815731925', is_useable=1, is_used=1, cnt3=17, cnt7=45, cnt14=69, cnt21=90, cnt30=114)

In [9]:
# Number of rows in the cached feature frame.
df.count()

1264

In [19]:
# Synthetic regression target: every trailing order-count window is weighted
# equally at 0.2, so the label is an exact linear function of the count
# columns used as features further down.
label_expr = None
for count_col in ('cnt3', 'cnt7', 'cnt14', 'cnt21', 'cnt30'):
    weighted = 0.2 * F.col(count_col)
    label_expr = weighted if label_expr is None else label_expr + weighted
df = df.withColumn('label', label_expr)


In [21]:
# Spot-check the computed label on the first 10 rows.
df.head(10)

[Row(city_id=161, bike_sn='815731925', is_useable=1, is_used=1, cnt3=17, cnt7=45, cnt14=69, cnt21=90, cnt30=114, label=67.0),
 Row(city_id=161, bike_sn='815733102', is_useable=1, is_used=1, cnt3=15, cnt7=34, cnt14=65, cnt21=83, cnt30=117, label=62.80000000000001),
 Row(city_id=161, bike_sn='815954020', is_useable=1, is_used=1, cnt3=16, cnt7=29, cnt14=63, cnt21=95, cnt30=119, label=64.4),
 Row(city_id=161, bike_sn='816216920', is_useable=1, is_used=1, cnt3=19, cnt7=19, cnt14=46, cnt21=68, cnt30=92, label=48.800000000000004),
 Row(city_id=161, bike_sn='816221560', is_useable=1, is_used=1, cnt3=17, cnt7=36, cnt14=55, cnt21=99, cnt30=118, label=65.0),
 Row(city_id=161, bike_sn='816223142', is_useable=1, is_used=1, cnt3=14, cnt7=21, cnt14=42, cnt21=70, cnt30=97, label=48.8),
 Row(city_id=161, bike_sn='816232662', is_useable=1, is_used=0, cnt3=4, cnt7=20, cnt14=44, cnt21=62, cnt30=104, label=46.8),
 Row(city_id=161, bike_sn='816233959', is_useable=1, is_used=1, cnt3=5, cnt7=22, cnt14=56, cnt

### 手动最大最小归一化

In [23]:
# Per-city label range, needed for the manual min-max scaling below.
label_bounds = [F.max('label').alias('max_label'), F.min('label').alias('min_label')]
df_sd = df.groupBy('city_id').agg(*label_bounds)

In [24]:
# Manual min-max scaling: label2 = (label - min) / (max - min) within each city.
# NOTE(review): a city whose labels are all equal has a zero range — confirm
# how the division should behave in that case.
label_range = F.col('max_label') - F.col('min_label')
df2 = (
    df.join(df_sd, on=['city_id'], how='inner')
      .withColumn('label2', (F.col('label') - F.col('min_label')) / label_range)
)

In [25]:
# Preview only the first rows; collect() would pull the entire DataFrame to
# the driver and flood the cell output.
df2.head(10)

[Row(city_id=161, bike_sn='815731925', is_useable=1, is_used=1, cnt3=17, cnt7=45, cnt14=69, cnt21=90, cnt30=114, label=67.0, max_label=94.6, min_label=0.0, label2=0.7082452431289641),
 Row(city_id=161, bike_sn='815733102', is_useable=1, is_used=1, cnt3=15, cnt7=34, cnt14=65, cnt21=83, cnt30=117, label=62.80000000000001, max_label=94.6, min_label=0.0, label2=0.6638477801268501),
 Row(city_id=161, bike_sn='815954020', is_useable=1, is_used=1, cnt3=16, cnt7=29, cnt14=63, cnt21=95, cnt30=119, label=64.4, max_label=94.6, min_label=0.0, label2=0.6807610993657506),
 Row(city_id=161, bike_sn='816216920', is_useable=1, is_used=1, cnt3=19, cnt7=19, cnt14=46, cnt21=68, cnt30=92, label=48.800000000000004, max_label=94.6, min_label=0.0, label2=0.5158562367864694),
 Row(city_id=161, bike_sn='816221560', is_useable=1, is_used=1, cnt3=17, cnt7=36, cnt14=55, cnt21=99, cnt30=118, label=65.0, max_label=94.6, min_label=0.0, label2=0.6871035940803383),
 Row(city_id=161, bike_sn='816223142', is_useable=1, i

In [26]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

In [28]:
 # Assemble the five order-count columns into a single "features" vector.
feature_cols = ["cnt3", "cnt7", "cnt14", "cnt21", "cnt30"]
asembler = VectorAssembler(outputCol="features", inputCols=feature_cols)
output = asembler.transform(df2)

In [33]:
# Preview only the first rows; collect() would pull the entire DataFrame to
# the driver and flood the cell output.
output.head(10)

[Row(city_id=161, bike_sn='815731925', is_useable=1, is_used=1, cnt3=17, cnt7=45, cnt14=69, cnt21=90, cnt30=114, label=67.0, max_label=94.6, min_label=0.0, label2=0.7082452431289641, features=DenseVector([17.0, 45.0, 69.0, 90.0, 114.0])),
 Row(city_id=161, bike_sn='815733102', is_useable=1, is_used=1, cnt3=15, cnt7=34, cnt14=65, cnt21=83, cnt30=117, label=62.80000000000001, max_label=94.6, min_label=0.0, label2=0.6638477801268501, features=DenseVector([15.0, 34.0, 65.0, 83.0, 117.0])),
 Row(city_id=161, bike_sn='815954020', is_useable=1, is_used=1, cnt3=16, cnt7=29, cnt14=63, cnt21=95, cnt30=119, label=64.4, max_label=94.6, min_label=0.0, label2=0.6807610993657506, features=DenseVector([16.0, 29.0, 63.0, 95.0, 119.0])),
 Row(city_id=161, bike_sn='816216920', is_useable=1, is_used=1, cnt3=19, cnt7=19, cnt14=46, cnt21=68, cnt30=92, label=48.800000000000004, max_label=94.6, min_label=0.0, label2=0.5158562367864694, features=DenseVector([19.0, 19.0, 46.0, 68.0, 92.0])),
 Row(city_id=161, b

### df可以这样改列名

In [34]:
# toDF() renames columns positionally: "label2" becomes "label".
selected = output.select("features", "label2")
features_label = selected.toDF('features', 'label')

In [35]:
# Preview only the first rows; collect() would pull the entire DataFrame to
# the driver and flood the cell output.
features_label.head(10)

[Row(features=DenseVector([17.0, 45.0, 69.0, 90.0, 114.0]), label=0.7082452431289641),
 Row(features=DenseVector([15.0, 34.0, 65.0, 83.0, 117.0]), label=0.6638477801268501),
 Row(features=DenseVector([16.0, 29.0, 63.0, 95.0, 119.0]), label=0.6807610993657506),
 Row(features=DenseVector([19.0, 19.0, 46.0, 68.0, 92.0]), label=0.5158562367864694),
 Row(features=DenseVector([17.0, 36.0, 55.0, 99.0, 118.0]), label=0.6871035940803383),
 Row(features=DenseVector([14.0, 21.0, 42.0, 70.0, 97.0]), label=0.5158562367864693),
 Row(features=DenseVector([4.0, 20.0, 44.0, 62.0, 104.0]), label=0.49471458773784355),
 Row(features=DenseVector([5.0, 22.0, 56.0, 86.0, 121.0]), label=0.613107822410148),
 Row(features=DenseVector([8.0, 21.0, 51.0, 78.0, 130.0]), label=0.6088794926004228),
 Row(features=DenseVector([23.0, 33.0, 62.0, 93.0, 134.0]), label=0.7293868921775899),
 Row(features=DenseVector([0.0, 0.0, 20.0, 53.0, 89.0]), label=0.34249471458773795),
 Row(features=DenseVector([5.0, 17.0, 43.0, 67.0, 

In [36]:
# Split into training (70%) and test (30%) sets with a fixed seed.
# NOTE(review): this rebinds `test`, which earlier held the SQL city filter.
splits = features_label.randomSplit([0.7, 0.3], seed=100)
train, test = splits

In [37]:
# Fit an ordinary linear regression on the training split.
print("################step4开始模型训练")
lin_reg = LinearRegression(labelCol='label', featuresCol='features')
linear_model = lin_reg.fit(train)
# Report the learned parameters.
print("################step5输出模型结果")
print(f"Coefficients: {linear_model.coefficients}")
print(f" \n Intercept: {linear_model.intercept}")

################step4开始模型训练
################step5输出模型结果
Coefficients: [0.002114164904862555,0.002114164904862622,0.00211416490486247,0.002114164904862818,0.002114164904862429]
 
 Intercept: 2.927618905308604e-15


In [38]:
# Goodness of fit on the training split.
trainSummary = linear_model.summary
print("################step6输出训练效果")
print("RMSE: {:f} ".format(trainSummary.rootMeanSquaredError))
print(" \n r2: {:f} ".format(trainSummary.r2))

################step6输出训练效果
RMSE: 0.000000 
 
 r2: 1.000000 


In [40]:
# Score the held-out split.
predictions = linear_model.transform(test)
# Preview only the first rows; collect() would pull every prediction to the
# driver and flood the cell output.
predictions.head(10)

[Row(features=DenseVector([16.0, 29.0, 63.0, 95.0, 119.0]), label=0.6807610993657506, prediction=0.6807610993657521),
 Row(features=DenseVector([19.0, 19.0, 46.0, 68.0, 92.0]), label=0.5158562367864694, prediction=0.51585623678647),
 Row(features=DenseVector([5.0, 22.0, 60.0, 84.0, 124.0]), label=0.623678646934461, prediction=0.6236786469344595),
 Row(features=DenseVector([17.0, 21.0, 27.0, 39.0, 79.0]), label=0.38689217758985206, prediction=0.3868921775898499),
 Row(features=DenseVector([10.0, 21.0, 40.0, 76.0, 99.0]), label=0.5200845665961946, prediction=0.5200845665961968),
 Row(features=DenseVector([12.0, 30.0, 52.0, 73.0, 96.0]), label=0.5560253699788584, prediction=0.5560253699788595),
 Row(features=DenseVector([14.0, 43.0, 95.0, 122.0, 162.0]), label=0.9217758985200845, prediction=0.9217758985200832),
 Row(features=DenseVector([3.0, 6.0, 6.0, 29.0, 60.0]), label=0.21987315010570826, prediction=0.21987315010570857),
 Row(features=DenseVector([8.0, 33.0, 78.0, 103.0, 126.0]), labe

### 指标评估r2,rmse,mae

In [50]:
print("################step7输出测试效果")
# Coefficient of determination on the held-out split.
pred_evaluator = RegressionEvaluator(metricName="r2", labelCol="label",
                                     predictionCol="prediction")
evaluate_result = pred_evaluator.evaluate(predictions)
print("R Squared (R2) on test data = {:g} ".format(evaluate_result))


################step7输出测试效果
R Squared (R2) on test data = 1 


In [130]:
# Root-mean-squared error on the held-out split.
pred_evaluator = RegressionEvaluator(metricName="rmse", labelCol="label",
                                     predictionCol="prediction")
evaluate_result = pred_evaluator.evaluate(predictions)
print("rmse on test data = {:g} ".format(evaluate_result))

rmse on test data = 1.70797e-15 


In [131]:
# Mean absolute error on the held-out split.
pred_evaluator = RegressionEvaluator(metricName="mae", labelCol="label",
                                     predictionCol="prediction")
evaluate_result = pred_evaluator.evaluate(predictions)
print("mae on test data = {:g} ".format(evaluate_result))

mae on test data = 1.37964e-15 


### 测试MinMaxScaler归一化

In [133]:
# Preview only the first rows; collect() would pull the entire DataFrame to
# the driver and flood the cell output.
df2.head(10)

[Row(city_id=161, bike_sn='815731925', is_useable=1, is_used=1, cnt3=17, cnt7=45, cnt14=69, cnt21=90, cnt30=114, label=67, max_label=94.6, min_label=0.0, label2=0.7082452431289641),
 Row(city_id=161, bike_sn='815733102', is_useable=1, is_used=1, cnt3=15, cnt7=34, cnt14=65, cnt21=83, cnt30=117, label=62, max_label=94.6, min_label=0.0, label2=0.6638477801268501),
 Row(city_id=161, bike_sn='815954020', is_useable=1, is_used=1, cnt3=16, cnt7=29, cnt14=63, cnt21=95, cnt30=119, label=64, max_label=94.6, min_label=0.0, label2=0.6807610993657506),
 Row(city_id=161, bike_sn='816216920', is_useable=1, is_used=1, cnt3=19, cnt7=19, cnt14=46, cnt21=68, cnt30=92, label=48, max_label=94.6, min_label=0.0, label2=0.5158562367864694),
 Row(city_id=161, bike_sn='816221560', is_useable=1, is_used=1, cnt3=17, cnt7=36, cnt14=55, cnt21=99, cnt30=118, label=65, max_label=94.6, min_label=0.0, label2=0.6871035940803383),
 Row(city_id=161, bike_sn='816223142', is_useable=1, is_used=1, cnt3=14, cnt7=21, cnt14=42,

In [55]:
from pyspark.ml.feature import MinMaxScaler

### 这个函数必须将需要归一化的列转成vector

In [135]:
# Counter-example: MinMaxScaler only accepts a vector-typed input column, so
# pointing it at the raw numeric `label` column is rejected when fitting.
# The failure is caught and printed so "Run All" can continue past this cell.
ttt1 = MinMaxScaler(inputCol='label', outputCol='ttt1')
try:
    ttt2 = ttt1.fit(df2)
except Exception as err:  # recorded run raised IllegalArgumentException
    print(f"{type(err).__name__}: {err}")

IllegalArgumentException: 'requirement failed: Column label must be of type struct<type:tinyint,size:int,indices:array<int>,values:array<double>> but was actually int.'

In [60]:
# Correct approach: wrap the numeric column in a one-element vector first.
asembler_mms = VectorAssembler(inputCols=["label"], outputCol="l2")
output_mms = asembler_mms.transform(df2)
# Preview only the first rows instead of collecting the whole frame.
output_mms.head(10)

[Row(city_id=161, bike_sn='815731925', is_useable=1, is_used=1, cnt3=17, cnt7=45, cnt14=69, cnt21=90, cnt30=114, label=67, max_label=94.6, min_label=0.0, label2=0.7082452431289641, l2=DenseVector([67.0])),
 Row(city_id=161, bike_sn='815733102', is_useable=1, is_used=1, cnt3=15, cnt7=34, cnt14=65, cnt21=83, cnt30=117, label=62, max_label=94.6, min_label=0.0, label2=0.6638477801268501, l2=DenseVector([62.0])),
 Row(city_id=161, bike_sn='815954020', is_useable=1, is_used=1, cnt3=16, cnt7=29, cnt14=63, cnt21=95, cnt30=119, label=64, max_label=94.6, min_label=0.0, label2=0.6807610993657506, l2=DenseVector([64.0])),
 Row(city_id=161, bike_sn='816216920', is_useable=1, is_used=1, cnt3=19, cnt7=19, cnt14=46, cnt21=68, cnt30=92, label=48, max_label=94.6, min_label=0.0, label2=0.5158562367864694, l2=DenseVector([48.0])),
 Row(city_id=161, bike_sn='816221560', is_useable=1, is_used=1, cnt3=17, cnt7=36, cnt14=55, cnt21=99, cnt30=118, label=65, max_label=94.6, min_label=0.0, label2=0.68710359408033

### MinMaxScaler 是 Estimator：需要先 fit 得到 MinMaxScalerModel，再调用模型的 transform 才能得到新的 DataFrame

In [64]:
# Scale into [0, 1]; fit() turns the estimator into a MinMaxScalerModel.
mms = MinMaxScaler(inputCol='l2', outputCol='l2_output', min=0.0, max=1.0)
mms_model = mms.fit(output_mms)

In [137]:
# The unfitted object is the estimator class.
type(mms)

pyspark.ml.feature.MinMaxScaler

In [136]:
# fit() returned a separate model class, which is what provides transform().
type(mms_model)

pyspark.ml.feature.MinMaxScalerModel

In [63]:
# Preview only the first scaled rows; collect() would pull the entire
# DataFrame to the driver and flood the cell output.
mms_model.transform(output_mms).head(10)

[Row(city_id=161, bike_sn='815731925', is_useable=1, is_used=1, cnt3=17, cnt7=45, cnt14=69, cnt21=90, cnt30=114, label=67, max_label=94.6, min_label=0.0, label2=0.7082452431289641, l2=DenseVector([67.0]), l2_output=DenseVector([0.7128])),
 Row(city_id=161, bike_sn='815733102', is_useable=1, is_used=1, cnt3=15, cnt7=34, cnt14=65, cnt21=83, cnt30=117, label=62, max_label=94.6, min_label=0.0, label2=0.6638477801268501, l2=DenseVector([62.0]), l2_output=DenseVector([0.6596])),
 Row(city_id=161, bike_sn='815954020', is_useable=1, is_used=1, cnt3=16, cnt7=29, cnt14=63, cnt21=95, cnt30=119, label=64, max_label=94.6, min_label=0.0, label2=0.6807610993657506, l2=DenseVector([64.0]), l2_output=DenseVector([0.6809])),
 Row(city_id=161, bike_sn='816216920', is_useable=1, is_used=1, cnt3=19, cnt7=19, cnt14=46, cnt21=68, cnt30=92, label=48, max_label=94.6, min_label=0.0, label2=0.5158562367864694, l2=DenseVector([48.0]), l2_output=DenseVector([0.5106])),
 Row(city_id=161, bike_sn='816221560', is_use

In [165]:
# Tabular preview of the scaled output (first 5 rows).
# NOTE(review): the recorded output shows l2_output values around 67 rather
# than within [0, 1], so it appears to come from a scaler fitted with different
# min/max settings than the cell that defines `mms` — re-run to confirm.
mms_model.transform(output_mms).show(5)

+-------+---------+----------+-------+----+----+-----+-----+-----+-----+---------+---------+------------------+------+-------------------+
|city_id|  bike_sn|is_useable|is_used|cnt3|cnt7|cnt14|cnt21|cnt30|label|max_label|min_label|            label2|    l2|          l2_output|
+-------+---------+----------+-------+----+----+-----+-----+-----+-----+---------+---------+------------------+------+-------------------+
|    161|815731925|         1|      1|  17|  45|   69|   90|  114|   67|     94.6|      0.0|0.7082452431289641|[67.0]|[67.42765957446808]|
|    161|815733102|         1|      1|  15|  34|   65|   83|  117|   62|     94.6|      0.0|0.6638477801268501|[62.0]|[62.39574468085105]|
|    161|815954020|         1|      1|  16|  29|   63|   95|  119|   64|     94.6|      0.0|0.6807610993657506|[64.0]|[64.40851063829787]|
|    161|816216920|         1|      1|  19|  19|   46|   68|   92|   48|     94.6|      0.0|0.5158562367864694|[48.0]| [48.3063829787234]|
|    161|816221560|        

### 可以利用此操作得到某列最大最小值，等同于原数据groupby

In [72]:
# Maximum of the input column as recorded by the fitted model (a vector).
mms_model.originalMax

DenseVector([94.0])

In [138]:
# Index into the vector to get the scalar maximum of the single input dimension.
mms_model.originalMax[0]

94.0

In [139]:
# Minimum of the input column as recorded by the fitted model (a vector).
mms_model.originalMin

DenseVector([0.0])

In [140]:
# Attach the fitted scaler's recorded minimum as a constant column.
observed_min = mms_model.originalMin[0]
df2 = df2.withColumn('min2', F.lit(observed_min))
df2.head()

Row(city_id=161, bike_sn='815731925', is_useable=1, is_used=1, cnt3=17, cnt7=45, cnt14=69, cnt21=90, cnt30=114, label=67, max_label=94.6, min_label=0.0, label2=0.7082452431289641, min2=0.0)

### 独热编码是否需要 fit 取决于 Spark 版本：2.x 中 OneHotEncoder 是 Transformer，可直接 transform；3.0 及以上是 Estimator，需要先 fit

In [80]:
from pyspark.ml.feature import OneHotEncoder
# Toy single-column frame with three distinct category indices.
category_rows = [(value,) for value in (0.0, 1.0, 2.0, 1.0, 2.0, 2.0)]
df_t = spark.createDataFrame(category_rows, ["input"])
df_t.show()

+-----+
|input|
+-----+
|  0.0|
|  1.0|
|  2.0|
|  1.0|
|  2.0|
|  2.0|
+-----+



In [82]:
# dropLast=True omits the last category index from the encoding, so three
# categories produce a length-2 vector and the last category (2.0 here) is
# encoded as all zeros — presumably to keep the encoded columns linearly
# independent. The result is a sparse vector whose active index is set to 1.0.
oh = OneHotEncoder(dropLast=True, inputCol='input', outputCol='ouput')
# Spark 2.x exposes OneHotEncoder as a Transformer (no fit); in Spark 3.x it
# is an Estimator that must be fitted first. Support both.
oh_model = oh.fit(df_t) if hasattr(oh, 'fit') else oh
oh_model.transform(df_t).collect()

[Row(input=0.0, ouput=SparseVector(2, {0: 1.0})),
 Row(input=1.0, ouput=SparseVector(2, {1: 1.0})),
 Row(input=2.0, ouput=SparseVector(2, {})),
 Row(input=1.0, ouput=SparseVector(2, {1: 1.0})),
 Row(input=2.0, ouput=SparseVector(2, {})),
 Row(input=2.0, ouput=SparseVector(2, {}))]

### 标准化也需要转vector了

In [141]:
from pyspark.ml.feature import StandardScaler

In [144]:
from pyspark.ml.linalg import Vectors

In [175]:
# Inspect the toy frame before scaling.
# NOTE(review): the recorded output contains a struct column `input2` that no
# visible cell creates — it appears to be left over from a removed cell.
df_t.collect()

[Row(input=0.0, input2=Row(col1='values', col2=[0.0])),
 Row(input=1.0, input2=Row(col1='values', col2=[1.0])),
 Row(input=2.0, input2=Row(col1='values', col2=[2.0])),
 Row(input=1.0, input2=Row(col1='values', col2=[1.0])),
 Row(input=2.0, input2=Row(col1='values', col2=[2.0])),
 Row(input=2.0, input2=Row(col1='values', col2=[2.0]))]

In [176]:
# Discard the stale `input2` column before rebuilding it as a vector.
df_t = df_t.drop('input2')

In [177]:
# Confirm that only the original "input" column remains.
df_t.collect()

[Row(input=0.0),
 Row(input=1.0),
 Row(input=2.0),
 Row(input=1.0),
 Row(input=2.0),
 Row(input=2.0)]

In [183]:
# Wrap the scalar `input` column into a one-element vector column.
# Any existing `input2` is dropped first so the cell can be re-run: the
# assembler refuses to write to an output column that already exists.
va = VectorAssembler(inputCols=['input'], outputCol='input2')
df_t = va.transform(df_t.drop('input2'))

In [184]:
# Confirm that "input2" now holds one-element dense vectors.
df_t.collect()

[Row(input=0.0, input2=DenseVector([0.0])),
 Row(input=1.0, input2=DenseVector([1.0])),
 Row(input=2.0, input2=DenseVector([2.0])),
 Row(input=1.0, input2=DenseVector([1.0])),
 Row(input=2.0, input2=DenseVector([2.0])),
 Row(input=2.0, input2=DenseVector([2.0]))]

In [185]:
# StandardScaler is used the same way: fit on the vector column, then transform.
sd = StandardScaler(outputCol='ouput', inputCol='input2')
sd_model = sd.fit(df_t)
scaled = sd_model.transform(df_t)
scaled.collect()

[Row(input=0.0, input2=DenseVector([0.0]), ouput=DenseVector([0.0])),
 Row(input=1.0, input2=DenseVector([1.0]), ouput=DenseVector([1.2247])),
 Row(input=2.0, input2=DenseVector([2.0]), ouput=DenseVector([2.4495])),
 Row(input=1.0, input2=DenseVector([1.0]), ouput=DenseVector([1.2247])),
 Row(input=2.0, input2=DenseVector([2.0]), ouput=DenseVector([2.4495])),
 Row(input=2.0, input2=DenseVector([2.0]), ouput=DenseVector([2.4495]))]

In [77]:
# Display the active SparkSession object.
spark

### 测试个gbt

In [102]:
from pyspark.ml.regression import GBTRegressor

In [103]:
# Fit a gradient-boosted tree regressor on the same training split.
gbt = GBTRegressor(labelCol='label', featuresCol='features')
gbtmodel = gbt.fit(train)

In [104]:
# Score the held-out split with the boosted-tree model and report R2.
gbt_predictions = gbtmodel.transform(test)
print("################step7输出测试效果")
gbt_evaluator = RegressionEvaluator(metricName="r2", labelCol="label",
                                    predictionCol="prediction")
gbt_result = gbt_evaluator.evaluate(gbt_predictions)
print("R Squared (R2) on test data = {:g} ".format(gbt_result))


################step7输出测试效果
R Squared (R2) on test data = 0.980116 
