In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, TrainValidationSplit

conf = (
    SparkConf()
    .setMaster("local[*]")
    .setAppName("model_cab_prices")
    .set("spark.executor.memory", "4g")
    .set("spark.executor.cores", "5")
)
sc = SparkContext(conf=conf)
spark = SparkSession(sc)

23/03/19 12:29:23 WARN Utils: Your hostname, Davis-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 192.168.0.102 instead (on interface en0)
23/03/19 12:29:23 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/19 12:29:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/03/19 12:29:24 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
df = spark.read.parquet("../Uber_Lyft_Cab_prices/df_final.parquet")

                                                                                

In [3]:
df_final = (
    df.select(
    "distance", "cab_type", "destination", "source", "price",
    "surge_multiplier", "product_id", "hour", "day_of_week",
    "month", "temp", "clouds", "pressure", "rain", "humidity", "wind"
    )
)

In [4]:
(trainData, testData) = df_final.randomSplit([0.8, 0.2], seed=123)

In [5]:
cols_to_transform = ["cab_type", "destination", "source", "product_id", "day_of_week"]
tokenizer = [Tokenizer(inputCol=col, outputCol=col+"_token") for col in cols_to_transform]
hashingTF = [HashingTF(inputCol=col+"_token", outputCol=col+"_hash") for col in cols_to_transform]
pipeline = [Pipeline(stages=[tokenizer[i], hashingTF[i]]) for i in range(len(cols_to_transform))]
for i in range(len(cols_to_transform)):
    trainData = pipeline[i].fit(trainData).transform(trainData)
    testData = pipeline[i].fit(testData).transform(testData)

In [6]:
feature_list = [
    "cab_type_hash", "destination_hash", "source_hash",
    "product_id_hash", "day_of_week_hash", "distance",
    "surge_multiplier", "hour", "month", "temp", "clouds",
    "pressure", "rain", "humidity", "wind"
    ]
assembler = VectorAssembler(inputCols=feature_list, outputCol="features")

In [7]:
rf = RandomForestRegressor(featuresCol="features", labelCol="price")
pipeline = Pipeline(stages=[assembler, rf])

In [9]:
param_grid = (
    ParamGridBuilder()
    .addGrid(rf.numTrees, [100, 200, 300])
    .addGrid(rf.maxDepth, [5, 10, 15])
    .build()
)

In [10]:
# crossval = CrossValidator(
#     estimator=pipeline,
#     estimatorParamMaps=param_grid,
#     evaluator=RegressionEvaluator(),
#     numFolds=5
#     )

In [10]:
train_valid_split = TrainValidationSplit(
    estimator=pipeline,
    estimatorParamMaps=param_grid,
    evaluator=RegressionEvaluator(),
    trainRatio=0.8
    )

In [11]:
model = train_valid_split.fit(trainData)

23/03/19 12:30:21 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


[Stage 1:>                                                          (0 + 0) / 1]

23/03/19 12:30:37 WARN DAGScheduler: Broadcasting large task binary with size 81.2 MiB


[Stage 1:>                                                          (0 + 1) / 1]

23/03/19 12:30:47 ERROR Executor: Exception in task 0.0 in stage 1.0 (TID 1)
java.lang.OutOfMemoryError: Java heap space
	at java.io.ObjectInputStream$HandleTable$HandleList.add(ObjectInputStream.java:4041)
	at java.io.ObjectInputStream$HandleTable.markDependency(ObjectInputStream.java:3860)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2456)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2378)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2236)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1692)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2454)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2378)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2236)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1692)
	at java.io.ObjectInputStream.readArray(ObjectInputStream.java:2142)
	at java.io.ObjectInputStream.readObje

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/Users/daviyokogawa/.pyenv/versions/3.10.6/envs/spark_studies/lib/python3.10/site-packages/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/daviyokogawa/.pyenv/versions/3.10.6/envs/spark_studies/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/Users/daviyokogawa/.pyenv/versions/3.10.6/envs/spark_studies/lib/python3.10/site-packages/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/Users/daviyokogawa

ConnectionRefusedError: [Errno 61] Connection refused

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/Users/daviyokogawa/.pyenv/versions/3.10.6/envs/spark_studies/lib/python3.10/site-packages/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/daviyokogawa/.pyenv/versions/3.10.6/envs/spark_studies/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/Users/daviyokogawa/.pyenv/versions/3.10.6/envs/spark_studies/lib/python3.10/site-packages/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
