In [0]:
# Read the trip-level parquet data for every path matching report_date=2024060*,
# keep only rows where incount is true and duration equals 0, and project the
# four columns that the later cells work with.
trip_source_path = "wasbs://telematics@dln2blobdatalanding.blob.core.chinacloudapi.cn/FUEL_RANKING_R_TRIP_V4/report_date=2024060*"
df = (
    spark.read.parquet(trip_source_path)
    .filter("incount = true AND duration = 0")
    .select("ESN", "ReportDate", "X_AvgVehicleWeight", "Trip")
)
# Number of rows that survived the filter (rendered as the cell output).
df.count()

2025188

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import col, lag, expr

# Within each (ESN, ReportDate) group, order rows by ascending Trip so that
# every row can be compared with the row immediately before it.
window_spec = Window.partitionBy("ESN", "ReportDate").orderBy("Trip")

# Previous row's weight and trip number; the first row of each group has no
# predecessor, so both lag columns are null there.
prev_weight = lag("X_AvgVehicleWeight").over(window_spec)
prev_trip = lag("Trip").over(window_spec)
combined_df = df.withColumn("PrevWeight", prev_weight).withColumn("Prevtrip", prev_trip)

# Row-over-row deltas: change in X_AvgVehicleWeight and gap between trip numbers.
combined_df_1 = (
    combined_df
    .withColumn("WeightDiff", col("X_AvgVehicleWeight") - col("PrevWeight"))
    .withColumn("Trip_difference", col("Trip") - col("Prevtrip"))
)
combined_df_1.display()


ESN,ReportDate,X_AvgVehicleWeight,Trip,PrevWeight,Prevtrip,WeightDiff,Trip_difference
71117951,20240601,29.15800000000001,8.0,,,,
71119445,20240601,39.83999999999996,1.0,,,,
71119445,20240601,31.00200000000001,3.0,39.83999999999996,1.0,-8.837999999999951,2.0
71119580,20240601,11.410000000000004,1.0,,,,
71119719,20240601,29.851999999999997,3.0,,,,
71119719,20240601,33.706,9.0,29.851999999999997,3.0,3.8540000000000023,6.0
71120871,20240604,26.414000000000016,2.0,,,,
71120871,20240604,30.493999999999996,4.0,26.414000000000016,2.0,4.079999999999977,2.0
71120871,20240609,28.929999999999986,9.0,,,,
71120872,20240602,22.21800000000002,5.0,,,,


In [0]:
# Drop rows whose WeightDiff is null (for example the first row of each
# ESN/ReportDate group, which has no previous trip to compare against).
combined_df_2 = combined_df_1.dropna(subset=["WeightDiff"])

# Of the remaining rows, keep those whose weight differs from the previous
# trip and whose trip number is exactly one higher than the previous trip's.
weight_changed = col("WeightDiff") != 0
consecutive_trip = col("Trip_difference") == 1
combined_df_3 = combined_df_2.filter(weight_changed).filter(consecutive_trip)
combined_df_3.count()

325652

In [0]:
# Show the consecutive-trip rows without the Trip_difference column
# (every remaining row has Trip_difference == 1 after the filter above).
(
    combined_df_3
    .drop(col("Trip_difference"))
    .display()
)

ESN,ReportDate,X_AvgVehicleWeight,Trip,Prevtrip,PrevWeight,WeightDiff
71122992,20240608,15.057999999999996,2.0,1.0,22.381999999999984,-7.323999999999987
71122992,20240608,31.682000000000095,3.0,2.0,15.057999999999996,16.624000000000102
71123079,20240607,41.12999999999989,2.0,1.0,22.239999999999984,18.889999999999905
71126061,20240609,50.24200000000002,9.0,8.0,47.984,2.258000000000017
71129606,20240601,53.05600000000006,2.0,1.0,55.16599999999998,-2.1099999999999213
71129610,20240608,44.35000000000005,5.0,4.0,41.63000000000002,2.72000000000002
76194030,20240601,38.56299999999998,12.0,11.0,36.11999999999996,2.443000000000019
76208550,20240608,19.600000000000005,6.0,5.0,17.31800000000001,2.2819999999999965
76208550,20240608,17.598000000000013,7.0,6.0,19.600000000000005,-2.001999999999992
76225547,20240608,17.43399999999999,28.0,27.0,14.794000000000024,2.639999999999967


In [0]:
# Use a confidence-interval-style band to form the thresholds:
# rows whose WeightDiff lies outside mean +/- NUM_STD_DEVS * std_dev are kept.
from pyspark.sql.functions import col, avg, stddev, lit

# Half-width of the band, expressed in standard deviations of WeightDiff.
NUM_STD_DEVS = 2

# Single aggregate Row holding the mean and standard deviation of WeightDiff.
stats_df = combined_df_3.select(
    avg(col("WeightDiff")).alias("mean"),
    stddev(col("WeightDiff")).alias("stddev"),
).collect()[0]
mean = stats_df["mean"]
# Stored as `std_dev` rather than `stddev` so the imported pyspark `stddev`
# function is not overwritten by a float for the rest of the notebook.
std_dev = stats_df["stddev"]

# The aggregates come back as None when combined_df_3 has no rows (and the
# standard deviation may also be None with too few rows); fail with a clear
# message instead of a TypeError on `2 * None`.
if mean is None or std_dev is None:
    raise ValueError(
        "Cannot derive WeightDiff thresholds: combined_df_3 has too few rows "
        "to compute a mean and standard deviation."
    )

threshold = NUM_STD_DEVS * std_dev
lower_bound = mean - threshold
upper_bound = mean + threshold

print(f"Mean: {mean}, Std Dev: {std_dev}, Threshold: {threshold}")
print(f"Lower Bound: {lower_bound}, Upper Bound: {upper_bound}")

# Keep only the rows at or beyond either bound, i.e. the unusually large
# weight drops or gains between consecutive trips.
combined_df_4 = combined_df_3.filter((col("WeightDiff") <= lower_bound) | (col("WeightDiff") >= upper_bound))
combined_df_4.drop(col("Trip_difference")).display()

Mean: -0.4499510459017602, Std Dev: 9.703212413025108, Threshold: 19.406424826050216
Lower Bound: -19.856375871951975, Upper Bound: 18.956473780148457


ESN,ReportDate,X_AvgVehicleWeight,Trip,Prevtrip,PrevWeight,WeightDiff
76292610,20240605,15.044000000000006,4.0,3.0,42.51399999999994,-27.46999999999993
76299076,20240609,40.372000000000014,3.0,2.0,17.075999999999983,23.29600000000003
76306727,20240602,72.59399999999995,3.0,2.0,17.778,54.81599999999995
76309325,20240603,19.05999999999999,2.0,1.0,52.86,-33.80000000000001
76317501,20240609,46.92199999999991,2.0,1.0,16.47399999999999,30.44799999999992
76317501,20240609,21.138,3.0,2.0,46.92199999999991,-25.78399999999991
76317501,20240609,48.72599999999997,4.0,3.0,21.138,27.58799999999996
76646127,20240604,13.12,3.0,2.0,39.28600000000012,-26.16600000000012
76649150,20240602,51.21199999999998,4.0,3.0,27.29,23.921999999999983
76649181,20240608,81.174,5.0,4.0,12.651999999999996,68.522


In [0]:
# Spot-check the rows in combined_df_4 that belong to a single ESN value.
(
    combined_df_4
    .filter(col("ESN") == "76727917")
    .display()
)

ESN,ReportDate,X_AvgVehicleWeight,Trip,Prevtrip,PrevWeight,WeightDiff,Trip_difference
76727917,20240605,43.333999999999975,7.0,6.0,14.668,28.665999999999976,1.0
