In [0]:
from math import radians, cos, sin, asin, sqrt
from pyspark.sql.functions import isnan, when, count, col,round
from pyspark.sql import Window
import pyspark.sql.functions as F
import numpy as np
import matplotlib.pyplot as plt
from pyspark.ml.stat import Correlation
from pyspark.ml.linalg import DenseMatrix, Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import *

In [0]:
df_dico = {}
for n in [2018,2019,2020,2021]:  
    path = f"/mnt/datalake/tmp/guillaume/profiling/production/raw_data/{n}_raw_data_user"
    df_dico[n] = spark.read.option("header",True).option("inferSchema",True).csv(path)

In [0]:
path = "/mnt/processed-dev/hist/Daily/CutRide/2022/*/*/"
df_2022=spark.read.parquet(path).select("dateentry","rideid","latitude","longitude","deviceid","calculated_speed","calculated_acceleration")
df_2022=df_2022.withColumnRenamed("calculated_speed","speed").withColumnRenamed("calculated_acceleration","acceleration")

In [0]:
df_2018=df_dico[2018].select("dateentry","rideid","latitude","longitude","deviceid","speed","acceleration")
df_2019=df_dico[2019].select("dateentry","rideid","latitude","longitude","deviceid","speed","acceleration")
df_2020=df_dico[2020].select("dateentry","rideid","latitude","longitude","deviceid","speed","acceleration")
df_2021=df_dico[2021].select("dateentry","rideid","latitude","longitude","deviceid","speed","acceleration")

In [0]:
df_2018.select("deviceid").count()

In [0]:
df_2018.select("rideid").count()

In [0]:
df_2018.select("rideid").distinct().count()

In [0]:
df_2018.select("deviceid").distinct().count()

In [0]:
def get_distance(longit_a, latit_a, longit_b, latit_b):
  # Transform to radians
  longit_a, latit_a, longit_b, latit_b = map(radians, [longit_a,  latit_a, longit_b, latit_b])
  dist_longit = longit_b - longit_a
  dist_latit = latit_b -latit_a
  # Calculate area
  area = sin(dist_latit/2)**2 + cos(latit_a) * cos(latit_b) * sin(dist_longit/2)**2
  # Calculate the central angle
  central_angle = 2 * asin(sqrt(area))
  radius = 6371
  # Calculate Distance
  distance = central_angle * radius
  return abs(distance)

In [0]:
#preparer les données pour calculer la distance de chaque trajet avec la fonction(lag)
def data_distance(data):
  w = Window().partitionBy("rideid").orderBy(F.col("deviceid").asc(), F.col("dateentry").asc())
  data_2 = data
  data_2 = data_2.select("*", F.lag("dateentry").over(w).alias("previousDateentry")).na.drop()
  data_2 = data_2.select("*", F.lag("deviceid").over(w).alias("previousDeviceid")).na.drop()
  data_2 = data_2.select("*", F.lag("latitude").over(w).alias("previousLatitude")).na.drop()
  data_2 = data_2.select("*", F.lag("longitude").over(w).alias("previousLongitude")).na.drop()
  data_2 = data_2.select("*", F.lag("speed").over(w).alias("previousSpeed")).na.drop()
  data_2 = data_2.select("*", F.lag("acceleration").over(w).alias("previousAcceleration")).na.drop()
  data_distance= data_2.withColumn("distance",get_distance(data_2.longitude, data_2.latitude, data_2.previousLongitude, data_2.previousLatitude))
  return data_distance


In [0]:
df_2018=data_distance(df_2018)
df_2019=data_distance(df_2019)
df_2020=data_distance(df_2020)
df_2021=data_distance(df_2021)
df_2022=data_distance(df_2022) 

In [0]:
def all_data():
  df1=df_2018.union(df_2019)
  df2=df1.union(df_2020)
  df3=df2.union(df_2021)
  df4=df3.union(df_2022)
  return df4
data=all_data()

In [0]:
#Convert dateentry to date
#add month and week
def get_date(data):
  data=(data.withColumn("date", F.to_date(F.from_unixtime(F.col('dateentry')/1000)))
                     .withColumn("month",F.month("date"))
                     .withColumn("week",F.weekofyear("date"))
                     .withColumn("year",F.year("date"))
                     )
  return data
data_date=get_date(data)

In [0]:
data_date=data_date.filter(F.col("year")!=1999)

In [0]:
#le nombre de km qui ont fait tt  utilisateurs , chaque mois de tt les années
data_group_semaine=data_date.sort(data_date["date"].asc()).groupby("year","week").agg(F.count("rideid").alias('nombre_trajet'),F.count("deviceid").alias('nombre_device'), F.avg("speed").alias('vitesse_moyenne'), F.avg("acceleration").alias('acceleration_moyenne'),F.sum("distance").alias('distance_totale'))

In [0]:
data_group_semaine.select(F.max("nombre_trajet")).display()

max(nombre_trajet)
4314990


In [0]:
data_group_semaine.select(F.max("nombre_device")).display()

max(nombre_device)
4314990


In [0]:
data_group_semaine.display()

year,week,nombre_trajet,nombre_device,vitesse_moyenne,acceleration_moyenne,distance_totale
2018,22,769085,769085,11.44951329624527,0.0026194133119693,14158.942395113072
2018,24,389359,389359,11.222946055784368,-0.0029093817261904,6905.734532924022
2018,23,298538,298538,11.387792358572502,-0.0042417287013506,5534.113717338772
2018,25,407939,407939,10.56282323380442,-0.0058200052422516,6929.697690504667
2019,14,552647,552647,12.993455013585058,-0.0210372302022075,18922.510142535604
2019,13,480020,480020,10.404429744784824,0.0066401769875706,10609.333100604505
2019,12,601086,601086,11.572293608793782,-0.0131774972274883,15961.979021923811
2019,11,567541,567541,10.684861563989577,-0.004563066247244,12000.663704361985
2019,19,3797886,3797886,12.16257118764966,0.0366843660227841,68532.3245012705
2019,20,4008212,4008212,11.987551914305396,0.0382206666059144,72371.28008305357


In [0]:
dataset=data_group_semaine.select("nombre_device","vitesse_moyenne","distance_totale","acceleration_moyenne")

In [0]:
trainDF, testDF = dataset.randomSplit([0.8, 0.2], seed=42)
print(trainDF.cache().count()) # Cache because accessing training data multiple times
print(testDF.count())

In [0]:
from pyspark.ml.feature import VectorAssembler,StringIndexer
from pyspark.ml.feature import StandardScaler
features = dataset.drop('distance_totale').columns
vector = VectorAssembler(inputCols=features, outputCol='features')
scale=StandardScaler(inputCol='features',outputCol='standardized')

In [0]:
from pyspark.ml.regression import LinearRegression 
# lr = LinearRegression(featuresCol ='features', labelCol ='distance_totale') 
lr = LinearRegression(featuresCol = 'features', labelCol='distance_totale', maxIter=10, regParam=0.3, elasticNetParam=0.8)

In [0]:
from pyspark.ml import Pipeline
 
# Define the pipeline based on the stages created in previous steps.
pipeline = Pipeline(stages=[vector,scale, lr ])
 
# Define the pipeline model.
pipelineModel = pipeline.fit(trainDF)
 
# Apply the pipeline model to the test dataset.
predDF = pipelineModel.transform(testDF)

In [0]:
from pyspark.ml import Pipeline
 
# Define the pipeline based on the stages created in previous steps.
pipeline = Pipeline(stages=[vector,scale, lr ])
 
# Define the pipeline model.
pipelineModel = pipeline.fit(trainDF)
 
# Apply the pipeline model to the test dataset.
predDF = pipelineModel.transform(testDF)

In [0]:
from pyspark.ml.evaluation import RegressionEvaluator
lr_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol='distance_totale',metricName="r2")
print("R Squared (R2) on test data = %g" % lr_evaluator.evaluate(predDF))