In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree
from pyspark.mllib.evaluation import RegressionMetrics
from pyspark.mllib.linalg import Vectors

Matplotlib is building the font cache; this may take a moment.


In [5]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("BD_Lab03_2.1_MLlib") \
    .master("local[1]") \
    .config("spark.driver.memory", "1g") \
    .config("spark.driver.host", "localhost") \
    .getOrCreate()
sc = spark.sparkContext

25/04/11 09:37:43 WARN SparkContext: Another SparkContext is being constructed (or threw an exception in its constructor). This may indicate an error, since only one SparkContext should be running in this JVM (see SPARK-2243). The other SparkContext was created at:
org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:490)
py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
py4j.Gateway.invoke(Gateway.java:238)
py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
py4j.commands.Con

In [6]:
train_path = "/Users/macos/Desktop/hcmus/big_data/Lab-3-Big-Data/train.csv"
test_path = "/Users/macos/Desktop/hcmus/big_data/Lab-3-Big-Data/test.csv"
train_data = spark.read.csv(train_path, header=True, inferSchema=True)
test_data = spark.read.csv(test_path, header=True, inferSchema=True)

                                                                                

In [7]:


# Calculate Q1 and Q3 for outlier removal
quantiles = train_data.approxQuantile("trip_duration", [0.25, 0.75], 0.01)
Q1, Q3 = quantiles
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Filter out outliers
clean_data = train_data.filter((col("trip_duration") >= lower_bound) & (col("trip_duration") <= upper_bound))
train_data = clean_data
train_data.select("trip_duration").describe().show()

# Prepare features - MLlib uses RDDs of LabeledPoint
def parse_data(row):
    features = [
        row.passenger_count,
        row.pickup_longitude,
        row.pickup_latitude,
        row.dropoff_longitude,
        row.dropoff_latitude
    ]
    return LabeledPoint(row.trip_duration, features)

# Convert DataFrames to RDDs
train_rdd = train_data.rdd.map(parse_data)
test_rdd = test_data.rdd.map(lambda row: (
    row.id,
    [row.passenger_count, row.pickup_longitude, row.pickup_latitude, 
     row.dropoff_longitude, row.dropoff_latitude]
))



+-------+-----------------+
|summary|    trip_duration|
+-------+-----------------+
|  count|          1382668|
|   mean|729.9852343440363|
| stddev|445.7921009969074|
|    min|                1|
|    max|             2075|
+-------+-----------------+



                                                                                

In [8]:
# Split training data
train_train_rdd, val_train_rdd = train_rdd.randomSplit([0.8, 0.2], seed=42)

# Train Decision Tree model
model = DecisionTree.trainRegressor(
    train_train_rdd,
    categoricalFeaturesInfo={},  # Empty dict = all features are continuous
    impurity='variance',
    maxDepth=20
)

25/04/11 09:45:46 WARN DAGScheduler: Broadcasting large task binary with size 1496.7 KiB
25/04/11 09:45:47 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB
25/04/11 09:45:48 WARN DAGScheduler: Broadcasting large task binary with size 3.7 MiB
25/04/11 09:45:49 WARN DAGScheduler: Broadcasting large task binary with size 5.6 MiB
25/04/11 09:45:50 WARN DAGScheduler: Broadcasting large task binary with size 1069.0 KiB
25/04/11 09:45:50 WARN DAGScheduler: Broadcasting large task binary with size 8.0 MiB
25/04/11 09:45:51 WARN DAGScheduler: Broadcasting large task binary with size 1379.4 KiB
25/04/11 09:45:52 WARN DAGScheduler: Broadcasting large task binary with size 11.1 MiB
25/04/11 09:45:53 WARN DAGScheduler: Broadcasting large task binary with size 1652.8 KiB
25/04/11 09:45:54 WARN DAGScheduler: Broadcasting large task binary with size 14.6 MiB
25/04/11 09:45:55 WARN DAGScheduler: Broadcasting large task binary with size 1889.6 KiB
                                     

In [13]:
# Rest of the code remains the same for predictions and evaluation
predictions = model.predict(val_train_rdd.map(lambda x: x.features))
labels_and_preds = val_train_rdd.map(lambda lp: lp.label).zip(predictions)

# Calculate metrics
metrics = RegressionMetrics(labels_and_preds)
rmse = metrics.rootMeanSquaredError
r2 = metrics.r2

print(f"RMSE: {rmse:6f}")
print(f"R²: {r2:6f}")


25/04/11 10:01:13 WARN DAGScheduler: Broadcasting large task binary with size 16.1 MiB
25/04/11 10:01:53 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 55 (TID 101): Attempting to kill Python Worker
25/04/11 10:01:53 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 55 (TID 101): Attempting to kill Python Worker
25/04/11 10:02:05 WARN DAGScheduler: Broadcasting large task binary with size 16.1 MiB

RMSE: 289.315835
R²: 0.366705


                                                                                

In [15]:
from pyspark.mllib.tree import DecisionTree
from pyspark.mllib.regression import LabeledPoint

# Assuming train_rdd and test_rdd are already prepared
model = DecisionTree.trainRegressor(train_rdd, categoricalFeaturesInfo={}, impurity='variance', maxDepth=20)

# Make predictions on test set
test_features = test_rdd.map(lambda x: x[1])
predictions = model.predict(test_features)

# Get IDs and combine with predictions
test_ids = test_rdd.map(lambda x: x[0])
output = test_ids.zip(predictions.map(float))  # Convert to native float

# Save results
output_df = output.toDF(["id", "prediction"])
output_df.toPandas().to_csv("results_mllib.csv", index=False)

25/04/11 10:43:29 WARN DAGScheduler: Broadcasting large task binary with size 1517.5 KiB
25/04/11 10:43:30 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB
25/04/11 10:43:31 WARN DAGScheduler: Broadcasting large task binary with size 3.8 MiB
25/04/11 10:43:33 WARN DAGScheduler: Broadcasting large task binary with size 5.8 MiB
25/04/11 10:43:34 WARN DAGScheduler: Broadcasting large task binary with size 1142.0 KiB
25/04/11 10:43:34 WARN DAGScheduler: Broadcasting large task binary with size 8.4 MiB
25/04/11 10:43:36 WARN DAGScheduler: Broadcasting large task binary with size 1460.9 KiB
25/04/11 10:43:37 WARN DAGScheduler: Broadcasting large task binary with size 11.6 MiB
25/04/11 10:43:38 WARN DAGScheduler: Broadcasting large task binary with size 1799.7 KiB
25/04/11 10:43:40 WARN DAGScheduler: Broadcasting large task binary with size 15.5 MiB
25/04/11 10:43:41 WARN DAGScheduler: Broadcasting large task binary with size 2.0 MiB
25/04/11 10:43:43 WARN DAGScheduler: Bro

ImportError: Pandas >= 1.0.5 must be installed; however, it was not found.

In [16]:
output_df.toPandas().to_csv("results_mllib.csv", index=False)

25/04/11 10:48:38 WARN DAGScheduler: Broadcasting large task binary with size 17.2 MiB
                                                                                