In [1]:
import findspark
# Make the local Spark installation importable; this has to run before the
# pyspark imports in the following cell.
findspark.init()

In [2]:
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.regression import GBTRegressor
from pyspark.sql import SparkSession
from pyspark.sql.functions import count, isnan, isnull, when

# Reuse the active Spark session if there is one, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

In [3]:
# Load the temperature CSV: the first row supplies column names and the
# column types are inferred from the data.
df = spark.read.options(header=True, inferSchema=True).csv('../Dataset/paktemp.csv')



In [55]:
# Count missing values (NULL, plus NaN where applicable) for every column.
from pyspark.sql.functions import col, isnan, when, count

# NaN is only a valid value for floating-point columns. Calling isnan() on any
# other type relies on an implicit cast to double and fails outright for types
# that cannot be cast (e.g. date/timestamp), so restrict it to float/double.
float_cols = {name for name, dtype in df.dtypes if dtype in ('float', 'double')}


def _missing_count(name):
    """Return an aggregate column counting the missing entries of column `name`."""
    is_missing = col(name).isNull()
    if name in float_cols:
        is_missing = is_missing | isnan(col(name))
    # when() yields a non-null literal only for missing rows; count() skips nulls.
    return count(when(is_missing, name)).alias(name)


df.select([_missing_count(c) for c in df.columns]).show()



+----+----+-----+
|temp|year|month|
+----+----+-----+
|   0|   0|    0|
+----+----+-----+



In [45]:


# Row count of the loaded data. Printed explicitly: a bare expression that is
# not the last line of a cell is evaluated (running a Spark job) but its value
# is never displayed.
print(df.count())

from pyspark.ml.feature import StringIndexer
# Map each distinct 'month' value to a numeric index stored in 'month_o'.
si = StringIndexer(inputCol='month', outputCol='month_o')
df = si.fit(df).transform(df)

In [34]:
# Drop the original 'month' column; the OneHotEncoder cell that follows writes
# its output to a column with the same name.
df=df.drop('month')

In [38]:
from pyspark.ml.feature import OneHotEncoder

# One-hot encode the numeric month index ('month_o') into a sparse vector
# column named 'month'.
encoder = OneHotEncoder(
    outputCol='month',
    inputCol="month_o",
)

# Fit the encoder on the data, append the encoded column and preview the result.
model = encoder.fit(df)
df = model.transform(df)
df.show()

+-------+----+-------+---------------+
|   temp|year|month_o|          month|
+-------+----+-------+---------------+
|7.72768|1901|    4.0| (11,[4],[1.0])|
|  8.936|1901|    3.0| (11,[3],[1.0])|
|16.9632|1901|    7.0| (11,[7],[1.0])|
|21.2741|1901|    0.0| (11,[0],[1.0])|
|26.0497|1901|    8.0| (11,[8],[1.0])|
|29.3811|1901|    6.0| (11,[6],[1.0])|
|29.2391|1901|    5.0| (11,[5],[1.0])|
|27.9718|1901|    1.0| (11,[1],[1.0])|
|25.2887|1901|   11.0|     (11,[],[])|
|21.2279|1901|   10.0|(11,[10],[1.0])|
|15.7539|1901|    9.0| (11,[9],[1.0])|
| 10.917|1901|    2.0| (11,[2],[1.0])|
|9.90868|1902|    4.0| (11,[4],[1.0])|
|11.2238|1902|    3.0| (11,[3],[1.0])|
|17.3237|1902|    7.0| (11,[7],[1.0])|
|22.1011|1902|    0.0| (11,[0],[1.0])|
|26.7454|1902|    8.0| (11,[8],[1.0])|
|28.6774|1902|    6.0| (11,[6],[1.0])|
|29.3654|1902|    5.0| (11,[5],[1.0])|
|28.2741|1902|    1.0| (11,[1],[1.0])|
+-------+----+-------+---------------+
only showing top 20 rows



In [None]:


# from pyspark.ml.feature import RFormula

# formula = RFormula(
#     formula="temp ~ month+ year",
#     featuresCol="features",
#     labelCol="label")

# output = formula.fit(df).transform(df)
# # 
# output.select("features", "label").show()


In [39]:
# Drop the intermediate 'month_o' index column now that the one-hot encoded
# 'month' vector has been added.
df=df.drop("month_o")

In [40]:
# Preview the remaining columns (show() prints only the first 20 rows).
df.show()

+-------+----+---------------+
|   temp|year|          month|
+-------+----+---------------+
|7.72768|1901| (11,[4],[1.0])|
|  8.936|1901| (11,[3],[1.0])|
|16.9632|1901| (11,[7],[1.0])|
|21.2741|1901| (11,[0],[1.0])|
|26.0497|1901| (11,[8],[1.0])|
|29.3811|1901| (11,[6],[1.0])|
|29.2391|1901| (11,[5],[1.0])|
|27.9718|1901| (11,[1],[1.0])|
|25.2887|1901|     (11,[],[])|
|21.2279|1901|(11,[10],[1.0])|
|15.7539|1901| (11,[9],[1.0])|
| 10.917|1901| (11,[2],[1.0])|
|9.90868|1902| (11,[4],[1.0])|
|11.2238|1902| (11,[3],[1.0])|
|17.3237|1902| (11,[7],[1.0])|
|22.1011|1902| (11,[0],[1.0])|
|26.7454|1902| (11,[8],[1.0])|
|28.6774|1902| (11,[6],[1.0])|
|29.3654|1902| (11,[5],[1.0])|
|28.2741|1902| (11,[1],[1.0])|
+-------+----+---------------+
only showing top 20 rows



In [41]:
from pyspark.ml.feature import VectorAssembler,VectorIndexer

In [56]:
# Every column except 'temp' (the label used by the regressor) is a model input.
cols = list(filter(lambda name: name != 'temp', df.columns))

In [58]:
# Pack the input columns listed in `cols` into a single vector column 'vafeatures'.
va = VectorAssembler(
    outputCol='vafeatures',
    inputCols=cols,
)

In [59]:
# Vector indexer stage: reads the assembled 'vafeatures' vector and writes 'features'.
vi = VectorIndexer(
    outputCol='features',
    inputCol='vafeatures',
)

In [61]:

from pyspark.ml.regression import GBTRegressor

# Gradient-boosted tree regressor: label column 'temp', input vector column 'features'.
gbt = GBTRegressor(
    labelCol='temp',
    featuresCol='features',
)

In [62]:
# RMSE evaluator wired to the regressor's own label / prediction column names.
# NOTE(review): `eval` shadows Python's built-in eval(); the name is kept
# because the CrossValidator cell passes `eval` as its evaluator.
eval = RegressionEvaluator(
    metricName='rmse',
    labelCol=gbt.getLabelCol(),
    predictionCol=gbt.getPredictionCol(),
)

In [63]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.pipeline import Pipeline

In [64]:
# Hyper-parameter grid for the GBT regressor: 2 x 2 x 2 = 8 combinations of
# maxIter, stepSize and maxDepth.
param = (
    ParamGridBuilder()
    .addGrid(gbt.maxIter, [10, 15])
    .addGrid(gbt.stepSize, [0.05, 0.08])
    .addGrid(gbt.maxDepth, [3, 10])
    .build()
)

In [65]:
# 3-fold cross-validation of `gbt` over the `param` grid, scored by `eval` (RMSE).
# An explicit seed pins the random fold assignment; without one the split is
# not guaranteed to repeat from run to run, so scores and the selected
# hyper-parameters could change between kernel restarts.
cv = CrossValidator(
    estimator=gbt,
    estimatorParamMaps=param,
    evaluator=eval,
    numFolds=3,
    parallelism=8,  # number of models evaluated concurrently
    seed=42,
)