## Notebook for pickup density for an average day (Approach 1) 

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [None]:

# ---------------------------------------------------------------------------
# Load the prepared taxi rides (CSV with a header row; column types inferred).
# ---------------------------------------------------------------------------
ridesPath = "hdfs:///Projects/ID2223nyctaxi/prepared_taxirides/yellow_and_green_data_finally2"
data = spark.read.load(
    ridesPath,
    format='com.databricks.spark.csv',
    header='true',
    inferSchema='true'
)
# Handy while debugging: data.show() / print(data.take(5))

# ---------------------------------------------------------------------------
# Encode the two string-valued category columns as numeric indices.
# The unfitted indexers are also used as stages of the training Pipeline
# further down, which fits them again on the training data.
# NOTE: indexing pickup_location / dropoff_location the same way is currently
# disabled; add StringIndexers for those columns here to re-enable it.
# ---------------------------------------------------------------------------
timecatIndexer = StringIndexer(inputCol="time_cat", outputCol="time_cat_in")
daycatIndexer = StringIndexer(inputCol="day_cat", outputCol="day_cat_in")

timecatModel = timecatIndexer.fit(data)
daycatModel = daycatIndexer.fit(data)
indexedData = daycatModel.transform(timecatModel.transform(data))

# ---------------------------------------------------------------------------
# Pack all model inputs into a single "features" vector column.
# The order of this list is the order of the entries in the vector.
# ---------------------------------------------------------------------------
featureCols = [
    "year", "month", "day",
    "time_cat_in", "time_num", "time_cos", "time_sin",
    "day_cat_in", "day_num", "day_cos", "day_sin",
    "weekend",
    "pickup_latitude", "pickup_longitude"
]
dataprepper = VectorAssembler(inputCols=featureCols, outputCol="features")

assembled = dataprepper.transform(indexedData).select("features")

# Quick visual sanity check of the assembled vectors.
print(assembled.take(5))
assembled.show(4)

# ---------------------------------------------------------------------------
# Automatically identify categorical features and index them.
# With maxCategories=4, features with more than 4 distinct values are
# treated as continuous. The indexer is fitted here on the assembled
# features of the full dataset and used as a ready-made transformer stage
# of the training Pipeline further down.
# ---------------------------------------------------------------------------
vectorIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4)
featureIndexer = vectorIndexer.fit(assembled)

# Seed shared by the split, the forest and the cross-validation folds so that
# repeated notebook runs produce the same model and the same test RMSE.
SEED = 42

# Split the data into training and test sets (30% held out for testing).
# Weights are given as floats because the JVM side expects doubles; they are
# normalized by Spark, so this is the same 70/30 split as before.
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=SEED)
print(trainingData.take(5))

# Random forest on the indexed feature vector. The label column is left at
# its default ("label"). maxBins=48 is the number of bins used to discretize
# continuous features; it must also be at least the number of categories of
# any categorical feature. maxDepth and numTrees are set through the grid.
rf = RandomForestRegressor(featuresCol="indexedFeatures", maxBins=48, seed=SEED)

# Chain indexers, assembler, feature indexer and forest in a Pipeline.
# timecatIndexer / daycatIndexer / rf are estimators and are fitted on the
# data handed to pipeline.fit(); featureIndexer is an already fitted model
# and is applied as-is.
pipeline = Pipeline(stages=[timecatIndexer, daycatIndexer, dataprepper, featureIndexer, rf])

# Hyper-parameter grid explored by the cross-validator
# (2 depths x 1 forest size = 2 candidate models).
paramGrid = ParamGridBuilder() \
    .addGrid(rf.maxDepth, [10, 20]) \
    .addGrid(rf.numTrees, [15]) \
    .build()

# 3-fold cross-validation, selecting the candidate with the lowest RMSE.
# The evaluator settings are spelled out so they visibly match the
# evaluator used on the held-out test set (these are also the defaults).
cvEvaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=paramGrid,
                    evaluator=cvEvaluator,
                    numFolds=3,
                    seed=SEED)

# Train model. This also runs the indexers.
# Without cross-validation: model = pipeline.fit(trainingData)
model = cv.fit(trainingData)

# Make predictions on the held-out test set (uses the best model found).
predictions = model.transform(testData)

# Optional: export (prediction, label) pairs, largest labels first, as CSV.
#predictions.sort("label", ascending = False).select("prediction", "label").write.format("com.databricks.spark.csv").option("header", "true").save("hdfs:///Projects/ID2223nyctaxi/predictions/tiny_green_2013-10_no_geohash_res2")

# Select (prediction, true label) and compute the error on the test set.
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

# `model` is the CrossValidatorModel; the fitted forest is the last stage of
# its best pipeline:
#rfModel = model.bestModel.stages[-1]
print(model)
#print(rfModel)  # summary only

# Persist the best pipeline found by cross-validation. overwrite() is used
# because a plain save() raises if the target path already exists, which on
# a notebook re-run would discard the freshly trained model after the whole
# cross-validation has completed. Change modelPath to keep older models.
modelPath = "hdfs:///Projects/ID2223nyctaxi/models/regressionforest1-small-d1-t1"
model.bestModel.write().overwrite().save(modelPath)

#print(predictions.select("features").take(5))
