In [4]:
from pyspark.sql import SparkSession

In [5]:
# Reuse the already-running Spark session if there is one; otherwise start a
# new session with the default configuration.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

22/01/10 21:41:01 WARN Utils: Your hostname, daniyal resolves to a loopback address: 127.0.1.1; using 192.168.228.38 instead (on interface wlp1s0)
22/01/10 21:41:01 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/01/10 21:41:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/01/10 21:41:03 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/01/10 21:41:03 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
22/01/10 21:41:03 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
22/01/10 21:41:03 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.
22/01/10 21:41:0

In [7]:
# Load the dataset: the first CSV row supplies the column names and Spark
# infers each column's type from the data instead of reading everything as text.
df = spark.read.csv(
    path='data.csv',
    header=True,
    inferSchema=True,
)

In [10]:
# Peek at the regression target column.
(
    df
    .select('cnt')
    .show()
)

+------+
|   cnt|
+------+
| 985.0|
| 801.0|
|1349.0|
|1562.0|
|1600.0|
|1606.0|
|1510.0|
| 959.0|
| 822.0|
|1321.0|
|1263.0|
|1162.0|
|1406.0|
|1421.0|
|1248.0|
|1204.0|
|1000.0|
| 683.0|
|1650.0|
|1927.0|
+------+
only showing top 20 rows



In [45]:
# Number of rows for each distinct value of `yr`.
(
    df
    .groupBy('yr')
    .count()
    .show()
)

+---+-----+
| yr|count|
+---+-----+
|0.0|  365|
|1.0|  366|
+---+-----+



In [7]:
from pyspark.ml.feature import VectorAssembler,VectorIndexer

In [8]:
# Feature columns: every column of the DataFrame except the label `cnt`,
# kept in their original order.
cols = list(filter(lambda name: name != 'cnt', df.columns))

In [9]:
# Pack all feature columns into a single vector column named `vafeatures`.
va = VectorAssembler(
    outputCol='vafeatures',
    inputCols=cols,
)

In [10]:
# Index the assembled vector into `features`: vector slots with at most 4
# distinct values are treated as categorical, the rest stay continuous.
vi = VectorIndexer(
    maxCategories=4,
    inputCol='vafeatures',
    outputCol='features',
)

In [11]:
from pyspark.ml.regression import GBTRegressor

In [12]:
# Gradient-boosted tree regressor that predicts `cnt` from the indexed
# `features` vector; all other hyper-parameters keep their defaults here.
gbt = GBTRegressor(
    labelCol='cnt',
    featuresCol='features',
)

In [13]:
from pyspark.ml.evaluation import RegressionEvaluator 

In [14]:
# RMSE evaluator wired to the same label/prediction column names the GBT
# regressor uses, so the two cannot drift apart.
# NOTE(review): the name `eval` shadows Python's builtin eval() for the rest of
# the notebook; it is kept because later cells refer to it by this name.
eval = RegressionEvaluator(
    metricName='rmse',
    labelCol=gbt.getLabelCol(),
    predictionCol=gbt.getPredictionCol(),
)

In [15]:
# Print the documentation and current/default value of every GBT parameter.
gbt_param_docs = gbt.explainParams()
print(gbt_param_docs)

cacheNodeIds: If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval. (default: False)
checkpointInterval: set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext. (default: 10)
featureSubsetStrategy: The number of features to consider for splits at each tree node. Supported options: 'auto' (choose automatically for task: If numTrees == 1, set to 'all'. If numTrees > 1 (forest), set to 'sqrt' for classification and to 'onethird' for regression), 'all' (use all features), 'onethird' (use 1/3 of the features), 'sqrt' (use sqrt(number of features)), 'log2' (use log2(number of features)), 

In [16]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.pipeline import Pipeline

In [17]:
# Hyper-parameter grid for the GBT regressor: 2 x 2 x 2 = 8 candidate settings.
param = (
    ParamGridBuilder()
    .addGrid(gbt.maxIter, [10, 15])
    .addGrid(gbt.stepSize, [0.05, 0.08])
    .addGrid(gbt.maxDepth, [3, 10])
    .build()
)

In [18]:
# Grid-search the GBT regressor with 3-fold cross-validation, scoring each
# candidate in `param` with `eval`; up to 8 candidate models are fitted
# concurrently.
# The fold assignment is random, so a fixed seed is passed: without it the
# selected hyper-parameters (and every metric computed downstream) can differ
# from one kernel session to the next.
cv = CrossValidator(estimator=gbt,
    estimatorParamMaps=param,
    evaluator=eval,
    numFolds=3,
    parallelism=8,
    seed=42)

In [19]:
# End-to-end pipeline: assemble the feature vector, index its categorical
# slots, then run the cross-validated GBT search on the result.
pipeline_stages = [va, vi, cv]
pipe = Pipeline(stages=pipeline_stages)

In [20]:
# Print the column names and the types Spark inferred while reading the CSV.
df.printSchema()

root
 |-- season: double (nullable = true)
 |-- yr: double (nullable = true)
 |-- mnth: double (nullable = true)
 |-- holiday: double (nullable = true)
 |-- weekday: double (nullable = true)
 |-- workingday: double (nullable = true)
 |-- weathersit: double (nullable = true)
 |-- temp: double (nullable = true)
 |-- atemp: double (nullable = true)
 |-- hum: double (nullable = true)
 |-- windspeed: double (nullable = true)
 |-- cnt: double (nullable = true)



In [21]:
# Hold out roughly 30% of the rows for the final evaluation (randomSplit
# assigns rows at random, so the split sizes are approximate).
# The seed is pinned so the same rows land in train/test on every re-run;
# otherwise the reported score changes each time the notebook is executed.
train, test = df.randomSplit([0.7,0.3], seed=42)

In [22]:
# Fit all pipeline stages on the training split. This is the expensive cell:
# the cross-validation stage trains each of the 8 grid candidates on each of
# the 3 folds.
pModel = pipe.fit(train)

In [23]:
# The last fitted pipeline stage is the cross-validation model; capture the
# textual description of its parameters.
cv_stage = pModel.stages[-1]
explainPrm = cv_stage.explainParams()

In [24]:
# Echo the parameter description stored in explainPrm as the cell output.
explainPrm

"estimator: estimator to be cross-validated (current: GBTRegressor_9dd10e034409)\nestimatorParamMaps: estimator param maps (current: [{Param(parent='GBTRegressor_9dd10e034409', name='maxIter', doc='max number of iterations (>= 0).'): 10, Param(parent='GBTRegressor_9dd10e034409', name='stepSize', doc='Step size (a.k.a. learning rate) in interval (0, 1] for shrinking the contribution of each estimator.'): 0.05, Param(parent='GBTRegressor_9dd10e034409', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. Must be in range [0, 30].'): 3}, {Param(parent='GBTRegressor_9dd10e034409', name='maxIter', doc='max number of iterations (>= 0).'): 10, Param(parent='GBTRegressor_9dd10e034409', name='stepSize', doc='Step size (a.k.a. learning rate) in interval (0, 1] for shrinking the contribution of each estimator.'): 0.05, Param(parent='GBTRegressor_9dd10e034409', name='maxDepth', doc='Maximum depth of the tree. (>= 0) 

In [25]:
# Pull the winning model out of the fitted cross-validation stage (the last
# stage of the fitted pipeline).
fitted_cv = pModel.stages[-1]
bestModel = fitted_cv.bestModel

In [26]:
# Echo the selected model's summary as the cell output.
bestModel

GBTRegressionModel: uid=GBTRegressor_9dd10e034409, numTrees=15, numFeatures=11

In [27]:
# Run the fitted pipeline on the held-out split. The result keeps the input
# columns and adds `vafeatures`, `features` and `prediction` (see the schema
# echoed below).
evalu = pModel.transform(test)
# Echo the resulting DataFrame's schema as the cell output.
evalu

DataFrame[season: double, yr: double, mnth: double, holiday: double, weekday: double, workingday: double, weathersit: double, temp: double, atemp: double, hum: double, windspeed: double, cnt: double, vafeatures: vector, features: vector, prediction: double]

In [31]:
# Score the held-out predictions with R^2.
# The metric is overridden for this call only (evaluate() applies the extra
# param map to a copy of the evaluator) rather than via eval.setMetricName('r2'):
# that setter mutates the evaluator shared with the CrossValidator, so
# re-running the fit cell afterwards would silently tune on R^2 instead of RMSE.
finalRes = eval.evaluate(evalu, {eval.metricName: 'r2'})

In [32]:
# Echo the held-out R^2 score as the cell output.
finalRes

0.8724726592297816