## Notebook for predicting pickup density at a specific future date and time (Approach 2)

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
import geohash2 as gh

In [None]:

# Read the prepared taxi-ride CSV from HDFS into a DataFrame; the header row
# supplies the column names and the column types are inferred from the data.
alldata = (
    spark.read.format("com.databricks.spark.csv")
    .options(header="true", inferSchema="true")
    .load("hdfs:///Projects/ID2223nyctaxi/prepared_taxirides/yellow_and_green_data_no_dropoff")
)

# Rows from years before 2015 form the training set.
trainingdata = alldata.where(alldata["year"] < 2015)

# Report the row counts of the full data set and of the training subset.
print(alldata.count())
print(trainingdata.count())

# Index the two string-valued categorical columns so they can be assembled as
# numeric features.  The indexers are left unfitted here: they are fitted as
# stages of the pipeline when the pipeline itself is fitted, so fitting them
# eagerly at this point would only repeat that work.
timecatIndexer = StringIndexer(inputCol="time_cat", outputCol="time_cat_in")
daycatIndexer = StringIndexer(inputCol="day_cat", outputCol="day_cat_in")

# Input columns of the model, in the order they appear in the feature vector.
cols = [
    "PRCP", "SNWD", "SNOW", "TMAX", "TMIN", "AWND",
    "year", "month", "day",
    "time_cat_in", "time_num", "time_cos", "time_sin",
    "day_cat_in", "day_num", "day_cos", "day_sin",
    "weekend",
    "pickup_latitude", "pickup_longitude",
]

# Pack the columns above into the single "features" vector column.
dataprepper = VectorAssembler(inputCols=cols, outputCol="features")

# NOTE(review): no VectorIndexer is applied.  The regressor below reads the
# assembled "features" column directly, so a fitted VectorIndexer producing
# "indexedFeatures" would never be consumed.  If categorical handling of
# low-cardinality features is wanted, add a VectorIndexer as a pipeline stage
# and point the regressor's featuresCol at its output column.

# Random forest regressor reading the assembled "features" column:
# 30 trees, maximum depth 5, at most 48 bins.
rf = RandomForestRegressor(
    featuresCol="features",
    maxBins=48,
    maxDepth=5,
    numTrees=30,
)

# Stage order: both string indexers, then the vector assembler, then the forest.
pipeline = Pipeline(
    stages=[
        timecatIndexer,
        daycatIndexer,
        dataprepper,
        rf,
    ]
)

# Fitting the pipeline fits the string indexers and trains the forest on the
# training rows.
model = pipeline.fit(trainingdata)

# Test data: the 2015 rows of the data set already loaded into `alldata`.
# The test rows live in the same HDFS file as the training rows, so the
# existing DataFrame is reused instead of reading the CSV and inferring its
# schema a second time.  `alltestDataRaw` is kept as an alias for any code
# that still refers to it.
alltestDataRaw = alldata
testDataRaw = alltestDataRaw.filter(alltestDataRaw.year == 2015)
print(testDataRaw.count())

# Apply the fitted pipeline to the test rows to obtain predictions.
predictions = model.transform(testDataRaw)


# Score the "prediction" column against the "label" column using RMSE.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="label",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

# The fitted forest is the last stage of the pipeline model; printing it
# gives a summary only.
rfModel = model.stages[-1]
print(rfModel)