### Import libraries

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree
from pyspark.mllib.evaluation import RegressionMetrics
from pyspark.mllib.linalg import Vectors
from pyspark.sql.types import StructType, StructField, StringType, DoubleType
from collections import defaultdict
import re

### Create session

In [2]:
spark = SparkSession.builder \
    .appName("MLlib") \
    .master("local[1]") \
    .config("spark.driver.memory", "1g") \
    .config("spark.driver.host", "localhost") \
    .getOrCreate()
sc = spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/11 23:37:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Prepare data and preprocessing

In [41]:
train_path = "/Users/macos/Desktop/hcmus/big_data/Lab-3-Big-Data/train.csv"
test_path = "/Users/macos/Desktop/hcmus/big_data/Lab-3-Big-Data/test.csv"
train_df = spark.read.csv(train_path, header=True, inferSchema=True)
test_df = spark.read.csv(test_path, header=True, inferSchema=True)

                                                                                

#### Remove outliers

In [4]:
quantiles = train_df.approxQuantile("trip_duration", [0.25, 0.75], 0.01)
Q1, Q3 = quantiles
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
train_df = train_df.filter((col("trip_duration") >= lower_bound) & (col("trip_duration") <= upper_bound))


                                                                                

In [None]:
feature_cols = ["passenger_count", "pickup_longitude", "pickup_latitude", "dropoff_longitude", "dropoff_latitude"]

def to_labeled_point(row):
    features = [row[c] for c in feature_cols]
    return LabeledPoint(row["trip_duration"], Vectors.dense(features))

In [6]:
train_rdd = train_df.rdd.map(to_labeled_point)

### Split data to train and test

In [7]:
train_data, val_data = train_rdd.randomSplit([0.8, 0.2], seed=42)

### Training model

In [47]:
model = DecisionTree.trainRegressor(
    train_data,
    categoricalFeaturesInfo={},
    impurity="variance",
    maxDepth=25
)

25/04/12 03:44:35 WARN DAGScheduler: Broadcasting large task binary with size 1496.8 KiB
25/04/12 03:44:35 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB
25/04/12 03:44:36 WARN DAGScheduler: Broadcasting large task binary with size 3.7 MiB
25/04/12 03:44:37 WARN DAGScheduler: Broadcasting large task binary with size 5.6 MiB
25/04/12 03:44:38 WARN DAGScheduler: Broadcasting large task binary with size 1069.0 KiB
25/04/12 03:44:39 WARN DAGScheduler: Broadcasting large task binary with size 8.0 MiB
25/04/12 03:44:39 WARN DAGScheduler: Broadcasting large task binary with size 1379.4 KiB
25/04/12 03:44:40 WARN DAGScheduler: Broadcasting large task binary with size 11.1 MiB
25/04/12 03:44:41 WARN DAGScheduler: Broadcasting large task binary with size 1652.8 KiB
25/04/12 03:44:42 WARN DAGScheduler: Broadcasting large task binary with size 14.6 MiB
25/04/12 03:44:44 WARN DAGScheduler: Broadcasting large task binary with size 1889.6 KiB
25/04/12 03:44:45 WARN DAGScheduler: 

### Testing model

In [None]:
def get_feature_importance_from_debug_string(debug_string, choosen_header):
    feature_counts = defaultdict(int)
    
    pattern = re.compile(r"If \(feature (\d+) <=.*\)")
    
    for line in debug_string.split("\n"):
        match = pattern.search(line)
        if match:
            feature_index = int(match.group(1))
            feature_counts[feature_index] += 1
    
    total_importance = sum(feature_counts.values())
    feature_importances = {choosen_header[i]: (count / total_importance if total_importance > 0 else 0.0)
                          for i, count in feature_counts.items()}
    
    print("\nFeature Importances:")
    for feature, importance in feature_importances.items():
        print(f"{feature}: {importance:.4f}")
    
    return feature_importances

In [None]:
debug_str = model.toDebugString()
get_feature_importance_from_debug_string(debug_str, feature_cols)


Feature Importances:
dropoff_longitude: 0.1956
dropoff_latitude: 0.2399
passenger_count: 0.1215
pickup_longitude: 0.1471
pickup_latitude: 0.2959


{'dropoff_longitude': 0.19557646397977813,
 'dropoff_latitude': 0.23992416795393906,
 'passenger_count': 0.12151383232692038,
 'pickup_longitude': 0.1470720404437579,
 'pickup_latitude': 0.29591349529560457}

In [48]:
predictions = model.predict(val_data.map(lambda x: x.features))
labels_and_preds = val_data.map(lambda lp: lp.label).zip(predictions)

metrics = RegressionMetrics(labels_and_preds)
print(f"RMSE: {metrics.rootMeanSquaredError:.6f}")
print(f"R²: {metrics.r2:.6f}")

25/04/12 04:02:04 WARN DAGScheduler: Broadcasting large task binary with size 38.9 MiB
25/04/12 04:02:45 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 235 (TID 446): Attempting to kill Python Worker
25/04/12 04:02:45 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 235 (TID 446): Attempting to kill Python Worker
25/04/12 04:03:01 WARN DAGScheduler: Broadcasting large task binary with size 38.9 MiB

RMSE: 294.903451
R²: 0.416117


                                                                                

### Run on test file

In [None]:
feature_cols_local = ["passenger_count", "pickup_longitude", "pickup_latitude", "dropoff_longitude", "dropoff_latitude"]

test_rows = test_df.select("id", *feature_cols_local).collect()

results = []
for row in test_rows:
    id_val = row["id"]
    features = Vectors.dense([row[c] for c in feature_cols_local])
    prediction = float(model.predict(features))
    results.append((id_val, prediction))

schema = StructType([
    StructField("id", StringType(), True),
    StructField("prediction", DoubleType(), True)
])
predicted_df = spark.createDataFrame(results, schema=schema)

In [None]:
predicted_df.toPandas().to_csv("results.csv", index=False)

25/04/11 10:47:39 WARN TaskSetManager: Stage 53 contains a task of very large size (14066 KiB). The maximum recommended task size is 1000 KiB.
