In [20]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from pyspark.mllib.evaluation import MulticlassMetrics


In [2]:
# Build (or reuse) the Spark session for this notebook. The settings below are
# handed to the session builder one by one, in the order they are listed.
spark_settings = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.executor.memory": "6G",
    "spark.driver.memory": "4G",
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    'spark.driver.maxResultSize': '2048m',
}

session_builder = SparkSession.builder.appName("project 1 RF")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/08/17 11:41:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Read the parquet dataset at the relative path below into a Spark DataFrame.
data = spark.read.format("parquet").load('data/merged_sdf.parquet')

                                                                                

In [7]:
# Fraction of all rows whose tip_amount is at least 3; the bare last
# expression is what the cell displays.
high_tip_rows = data.filter(F.col('tip_amount') >= 3).count()
high_tip_rows / data.count()

0.5631643629905457

In [8]:
# Names of the five predictor columns, followed by 'tip'.
column = [
    'PULocationID',
    'DOLocationID',
    'Airport',
    'Congestion',
    'Weekend',
    'tip',
]

In [9]:
def discretization(sdf, threshold=3):
    """Turn the continuous tip amount into a binary 0/1 label.

    Args:
        sdf: Spark DataFrame that contains 'tip_amount' together with the
            feature columns 'PULocationID', 'DOLocationID', 'Airport',
            'Congestion' and 'Weekend'.
        threshold: Smallest tip_amount that counts as the positive class.
            Defaults to 3, the cut-off that used to be hard-coded.

    Returns:
        A DataFrame holding exactly the five feature columns plus
        'tip_numeric', the integer cast of ``tip_amount >= threshold``
        (1 when the comparison is true, 0 when it is false).
    """
    # The output columns are spelled out here so the function does not depend
    # on a notebook-level global being defined (and unchanged) when it runs.
    feature_cols = ['PULocationID', 'DOLocationID', 'Airport', 'Congestion', 'Weekend']

    # Casting the comparison straight to int gives the same values as going
    # through an intermediate boolean 'tip' column first.
    tip_numeric = (F.col('tip_amount') >= threshold).cast('int').alias('tip_numeric')
    return sdf.select(*feature_cols, tip_numeric)

In [10]:
# `data` is rebound to its own transformed version, and discretization()
# drops 'tip_amount'. Without this guard a second run of the cell would fail
# because the column it needs is gone; with it, re-running is a no-op.
if 'tip_numeric' not in data.columns:
    data = discretization(data)

In [11]:
# Sanity check on the new label: fraction of rows whose tip_numeric is
# non-zero; the bare last expression is what the cell displays.
positive_rows = data.filter(F.col('tip_numeric') != 0).count()
positive_rows / data.count()

0.5631643629905457

In [13]:
# Pack the five predictor columns into one vector column named "features".
# The target column 'tip_numeric' is not part of the inputs.
# NOTE(review): PULocationID / DOLocationID look like categorical identifiers
# but are passed through as plain numbers here — confirm this is intended.
assembler = VectorAssembler(
    inputCols=['PULocationID', 'DOLocationID', 'Airport', 'Congestion', 'Weekend'],
    outputCol="features",
)

In [14]:
# Random forest on the assembled "features" vector, predicting 'tip_numeric'.
rf = RandomForestClassifier(
    featuresCol="features",
    labelCol="tip_numeric",
    numTrees=100,
    maxDepth=5,
    # Pin the forest's randomness so a re-run can reproduce the same model.
    # NOTE(review): without an explicit seed PySpark appears to fall back to a
    # default derived from hash() of the class name, which can vary between
    # Python processes — confirm against the PySpark docs.
    seed=42,
)

# Feature assembly and the classifier are fitted and applied as one unit.
pipeline = Pipeline(stages=[assembler, rf])

# 80/20 train/test split with a fixed seed.
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

model = pipeline.fit(train_data)


23/08/17 11:43:31 WARN MemoryStore: Not enough space to cache rdd_100_5 in memory! (computed 108.7 MiB so far)
23/08/17 11:43:31 WARN BlockManager: Persisting block rdd_100_5 to disk instead.
23/08/17 11:43:31 WARN MemoryStore: Not enough space to cache rdd_100_4 in memory! (computed 108.7 MiB so far)
23/08/17 11:43:31 WARN BlockManager: Persisting block rdd_100_4 to disk instead.
23/08/17 11:43:31 WARN MemoryStore: Not enough space to cache rdd_100_1 in memory! (computed 108.7 MiB so far)
23/08/17 11:43:31 WARN BlockManager: Persisting block rdd_100_1 to disk instead.
23/08/17 11:43:32 WARN MemoryStore: Not enough space to cache rdd_100_0 in memory! (computed 108.7 MiB so far)
23/08/17 11:43:32 WARN BlockManager: Persisting block rdd_100_0 to disk instead.
23/08/17 11:43:32 WARN MemoryStore: Not enough space to cache rdd_100_2 in memory! (computed 108.7 MiB so far)
23/08/17 11:43:32 WARN BlockManager: Persisting block rdd_100_2 to disk instead.
23/08/17 11:43:47 WARN MemoryStore: Not 

In [15]:
# Apply the fitted pipeline to the held-out split.
predictions = model.transform(test_data)

# Compare the "prediction" column against the 'tip_numeric' label using the
# evaluator's "accuracy" metric.
evaluator = MulticlassClassificationEvaluator(
    labelCol="tip_numeric",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
print("Accuracy:", accuracy)




Accuracy: 0.5967512605147532


                                                                                

In [21]:
# MulticlassMetrics works on an RDD, so turn each row into a
# (prediction, label) pair of floats.
prediction_and_label = (
    predictions
    .select("prediction", "tip_numeric")
    .rdd
    .map(lambda row: (float(row["prediction"]), float(row["tip_numeric"])))
)

# Metrics helper built from the (prediction, label) pairs.
metrics = MulticlassMetrics(prediction_and_label)

# NOTE(review): this assignment rebinds `confusion_matrix`, hiding the
# sklearn.metrics function of the same name imported at the top of the
# notebook. The name is kept because the following cell prints it.
confusion_matrix = metrics.confusionMatrix()


                                                                                

In [22]:
# Show the matrix computed by MulticlassMetrics.confusionMatrix(). Per the
# MulticlassMetrics documentation, predicted classes are in the columns and
# classes are ordered by ascending label (here 0, then 1); rows are then the
# actual 'tip_numeric' classes.
print(confusion_matrix)

DenseMatrix([[ 226574., 1095011.],
             [ 125358., 1579400.]])
