# Bài làm chi tiết

## 1. Chuẩn bị

- Thiết lập biến môi trường cho Spark

In [4]:
import findspark
findspark.init("/home/cuong/Downloads/spark-3.5.5-bin-hadoop3")

- Import các thư viện cần thiết

In [5]:
from pyspark.ml.feature import VectorAssembler, MinMaxScaler
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col, sum, when
from pyspark.sql.functions import round


- Khởi tạo park session

In [6]:
spark = SparkSession.builder \
    .appName("NYC Taxi Duration Prediction") \
    .getOrCreate()

## 2. Đọc dữ liệu training và tiền xử lý

- Đọc dữ liệu training từ file

In [7]:
df = spark.read.csv("file:///home/cuong/Downloads/Big_Data/nyc-taxi-trip-duration/train.csv", header = True, inferSchema = True)

                                                                                

- In Schema của dữ liệu

In [8]:
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- vendor_id: integer (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- pickup_longitude: double (nullable = true)
 |-- pickup_latitude: double (nullable = true)
 |-- dropoff_longitude: double (nullable = true)
 |-- dropoff_latitude: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- trip_duration: integer (nullable = true)



In [24]:
df.count()

                                                                                

1458644

### Nhận xét: Dữ liệu có 11 features và 1458644 samples

- Xem một số mẫu dữ liệu 

In [9]:
df.show(5)

+---------+---------+-------------------+-------------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------+
|       id|vendor_id|    pickup_datetime|   dropoff_datetime|passenger_count|  pickup_longitude|   pickup_latitude| dropoff_longitude|  dropoff_latitude|store_and_fwd_flag|trip_duration|
+---------+---------+-------------------+-------------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------+
|id2875421|        2|2016-03-14 17:24:55|2016-03-14 17:32:30|              1| -73.9821548461914| 40.76793670654297|-73.96463012695312|40.765602111816406|                 N|          455|
|id2377394|        1|2016-06-12 00:43:35|2016-06-12 00:54:38|              1|-73.98041534423828|40.738563537597656|-73.99948120117188| 40.73115158081055|                 N|          663|
|id3858529|        2|2016-01-19 11:35:24|2016-01-19 12:10:48|    

- Kiểm tra giá trị null của dữ liệu

In [10]:
null_counts = df.select([sum(when(col(c).isNull(), 1).otherwise(0)).alias(c) for c in df.columns])
null_counts.show()



+---+---------+---------------+----------------+---------------+----------------+---------------+-----------------+----------------+------------------+-------------+
| id|vendor_id|pickup_datetime|dropoff_datetime|passenger_count|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|store_and_fwd_flag|trip_duration|
+---+---------+---------------+----------------+---------------+----------------+---------------+-----------------+----------------+------------------+-------------+
|  0|        0|              0|               0|              0|               0|              0|                0|               0|                 0|            0|
+---+---------+---------------+----------------+---------------+----------------+---------------+-----------------+----------------+------------------+-------------+



                                                                                

### Nhận xét: Không có bất kỳ giá trị null nào trong tập dữ liệu

- Xóa những dòng duplicate

In [25]:
df = df.dropDuplicates()
df.count()

                                                                                

1458644

### Nhận xét: Số lượng samples vẫn không đổi => tập dữ liệu không có duplicate

- Tạo assembler để làm đầu vào training cho mô hình 

In [None]:
num_cols = ['pickup_longitude', 'pickup_latitude','dropoff_longitude', 'dropoff_latitude']
assembler = VectorAssembler(inputCols= num_cols, outputCol = "features")
assembled_df = assembler.transform(df).select("features", 'trip_duration')
# Việc scaler không cần thiết trong trường hợp này
# scaler = MinMaxScaler(inputCol= 'features', outputCol = 'scaled_features')
# scaler_model = scaler.fit(assembled_df)
# scaler_df = scaler_model.transform(assembled_df)
# scaler_df.sample(withReplacement=False, fraction=0.01).show()
assembled_df.show()


+--------------------+-------------+
|            features|trip_duration|
+--------------------+-------------+
|[-73.982154846191...|          455|
|[-73.980415344238...|          663|
|[-73.979026794433...|         2124|
|[-74.010040283203...|          429|
|[-73.973052978515...|          435|
|[-73.982856750488...|          443|
|[-73.969017028808...|          341|
|[-73.969276428222...|         1551|
|[-73.999481201171...|          255|
|[-73.981048583984...|         1225|
|[-73.982650756835...|         1274|
|[-73.991531372070...|         1128|
|[-73.962982177734...|         1114|
|[-73.956306457519...|          260|
|[-73.992195129394...|         1414|
|[-73.955513000488...|          211|
|[-73.991165161132...|         2316|
|[-73.994255065917...|          731|
|[-74.003982543945...|         1317|
|[-73.98388671875,...|          251|
+--------------------+-------------+
only showing top 20 rows



## 3. Huấn luyện và đánh giá mô hình

- Chia tập dữ liệu thành 2 phần training và validation tỷ lệ 80-20

In [13]:
# training, validation = scaler_df.randomSplit([0.8, 0.2], seed = 42)
training, validation = assembled_df.randomSplit([0.8, 0.2], seed = 42)


- Khởi tạo và training cho mô hình

In [14]:
lr =  DecisionTreeRegressor(featuresCol = "features", labelCol = "trip_duration")
model = lr.fit(training)

                                                                                

- Xem xét mô hình một cách trực quan

In [15]:
print(model.toDebugString)

DecisionTreeRegressionModel: uid=DecisionTreeRegressor_12b6580f3c89, depth=5, numNodes=63, numFeatures=4
  If (feature 0 <= -73.91783905029297)
   If (feature 2 <= -73.9245376586914)
    If (feature 3 <= 40.70591354370117)
     If (feature 1 <= 40.73052978515625)
      If (feature 1 <= 40.71339416503906)
       Predict: 798.3456686291
      Else (feature 1 > 40.71339416503906)
       Predict: 1203.2577360277864
     Else (feature 1 > 40.73052978515625)
      If (feature 3 <= 40.68511962890625)
       Predict: 2068.2443454420836
      Else (feature 3 > 40.68511962890625)
       Predict: 1584.9986710963456
    Else (feature 3 > 40.70591354370117)
     If (feature 1 <= 40.71339416503906)
      If (feature 3 <= 40.738441467285156)
       Predict: 811.6071083755733
      Else (feature 3 > 40.738441467285156)
       Predict: 1498.089804858622
     Else (feature 1 > 40.71339416503906)
      If (feature 3 <= 40.723758697509766)
       Predict: 1020.4631605031506
      Else (feature 3 > 40.7237

- Kiểm tra mô hình bằng dữ liệu validation

In [16]:
predictions_val = model.transform(validation)
predictions_val.show()

[Stage 22:>                                                         (0 + 1) / 1]

+--------------------+-------------+------------------+
|            features|trip_duration|        prediction|
+--------------------+-------------+------------------+
|[-74.455558776855...|           50| 784.0699948290734|
|[-74.347068786621...|           22| 811.6071083755733|
|[-74.289016723632...|           81|    798.3456686291|
|[-74.208259582519...|           40| 784.0699948290734|
|[-74.181678771972...|            2|    798.3456686291|
|[-74.177650451660...|         4470|3042.1834798704303|
|[-74.172668457031...|           87| 811.6071083755733|
|[-74.126724243164...|         3190| 784.0699948290734|
|[-74.100105285644...|         1836|1584.9986710963456|
|[-74.093963623046...|         1002|    798.3456686291|
|[-74.090621948242...|         1049|    798.3456686291|
|[-74.088912963867...|           83| 784.0699948290734|
|[-74.079620361328...|          912|    798.3456686291|
|[-74.073432922363...|          160|    798.3456686291|
|[-74.047920227050...|            9| 784.0699948

                                                                                

- Đánh giá mô hình bằng các độ đo

In [17]:
evaluator = RegressionEvaluator(labelCol = 'trip_duration', predictionCol = 'prediction')
rmse_val = evaluator.setMetricName('rmse').evaluate(predictions_val)
r2_val = evaluator.setMetricName('r2').evaluate(predictions_val)
print(f"[VALIDATION] RMSE: {rmse_val:.2f}, R²: {r2_val:.2f}")



[VALIDATION] RMSE: 3258.98, R²: 0.02


                                                                                

## 4. Đưa ra dự đoán cho tập test và xuất kết quả ra file

- Load dữ liệu từ file test để làm input

In [18]:
df_test = spark.read.csv("file:///home/cuong/Downloads/Big_Data/nyc-taxi-trip-duration/test.csv", header = True, inferSchema = True)
df_test.show()

                                                                                

+---------+---------+-------------------+---------------+------------------+------------------+------------------+------------------+------------------+
|       id|vendor_id|    pickup_datetime|passenger_count|  pickup_longitude|   pickup_latitude| dropoff_longitude|  dropoff_latitude|store_and_fwd_flag|
+---------+---------+-------------------+---------------+------------------+------------------+------------------+------------------+------------------+
|id3004672|        1|2016-06-30 23:59:58|              1|-73.98812866210938| 40.73202896118164|-73.99017333984375| 40.75667953491211|                 N|
|id3505355|        1|2016-06-30 23:59:53|              1|-73.96420288085938| 40.67999267578125|-73.95980834960938| 40.65540313720703|                 N|
|id1217141|        1|2016-06-30 23:59:47|              1| -73.9974365234375| 40.73758316040039|-73.98616027832031|40.729522705078125|                 N|
|id2150126|        2|2016-06-30 23:59:41|              1|-73.95606994628906| 40.77

- Gom dữ liệu thành assembler để phù hợp với mô hình

In [19]:
num_cols = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude','dropoff_latitude']
assembler = VectorAssembler(inputCols= num_cols, outputCol= 'features')
assembled_df = assembler.transform(df_test).select('id', 'features')
assembled_df.show()


+---------+--------------------+
|       id|            features|
+---------+--------------------+
|id3004672|[-73.988128662109...|
|id3505355|[-73.964202880859...|
|id1217141|[-73.997436523437...|
|id2150126|[-73.956069946289...|
|id1598245|[-73.97021484375,...|
|id0668992|[-73.991302490234...|
|id1765014|[-73.978309631347...|
|id0898117|[-74.012710571289...|
|id3905224|[-73.992332458496...|
|id1543102|[-73.993179321289...|
|id3024712|[-73.968528747558...|
|id3665810|[-73.982772827148...|
|id1836461|[-73.921104431152...|
|id3457080|[-73.986801147460...|
|id3376065|[-73.996345520019...|
|id3008739|[-73.968025207519...|
|id0902216|[-74.007713317871...|
|id3564824|[-73.984298706054...|
|id0820280|[-73.952598571777...|
|id0775088|[-73.966690063476...|
+---------+--------------------+
only showing top 20 rows



- Dự đoán kết quả

In [20]:
prediction_res = model.transform(assembled_df)
prediction_res.show()

+---------+--------------------+------------------+
|       id|            features|        prediction|
+---------+--------------------+------------------+
|id3004672|[-73.988128662109...| 784.0699948290734|
|id3505355|[-73.964202880859...|    798.3456686291|
|id1217141|[-73.997436523437...| 784.0699948290734|
|id2150126|[-73.956069946289...| 784.0699948290734|
|id1598245|[-73.97021484375,...| 784.0699948290734|
|id0668992|[-73.991302490234...| 784.0699948290734|
|id1765014|[-73.978309631347...|1020.4631605031506|
|id0898117|[-74.012710571289...| 811.6071083755733|
|id3905224|[-73.992332458496...|1714.8705337450374|
|id1543102|[-73.993179321289...| 784.0699948290734|
|id3024712|[-73.968528747558...|    798.3456686291|
|id3665810|[-73.982772827148...| 784.0699948290734|
|id1836461|[-73.921104431152...| 784.0699948290734|
|id3457080|[-73.986801147460...| 784.0699948290734|
|id3376065|[-73.996345520019...| 784.0699948290734|
|id3008739|[-73.968025207519...| 784.0699948290734|
|id0902216|[

- Chuẩn hóa theo yêu cầu

In [21]:
prediction_res = prediction_res.withColumn('trip_duration', col('prediction').cast('int')).select('id', 'trip_duration')
prediction_res.show()

+---------+-------------+
|       id|trip_duration|
+---------+-------------+
|id3004672|          784|
|id3505355|          798|
|id1217141|          784|
|id2150126|          784|
|id1598245|          784|
|id0668992|          784|
|id1765014|         1020|
|id0898117|          811|
|id3905224|         1714|
|id1543102|          784|
|id3024712|          798|
|id3665810|          784|
|id1836461|          784|
|id3457080|          784|
|id3376065|          784|
|id3008739|          784|
|id0902216|          784|
|id3564824|          784|
|id0820280|          784|
|id0775088|         1714|
+---------+-------------+
only showing top 20 rows



- Lưu kết quả vào file

In [22]:
prediction_res.coalesce(1) \
    .write.mode("overwrite") \
    .option("header", True) \
    .csv("file:///home/cuong/res")


                                                                                