In [0]:
from math import radians, cos, sin, asin, sqrt
from pyspark.sql import Window
import pyspark.sql.functions as F
import numpy as np
import matplotlib.pyplot as plt

In [0]:
# Load the 2021 ride CSV (header row, inferred column types) and make sure
# the "date" column is a proper date so it can be aggregated and diffed later.
data = (
    spark.read
         .csv('/mnt/datalake/tmp/amani/ride_data/2021_ride', header=True, inferSchema=True)
         .withColumn("date", F.to_date("date"))
)

In [0]:
# Count the occurrences of each (ride type, device) pair, together with the
# dates of its first and last ride.
def occurence(data):
  """Aggregate rides per ``type_ride_id`` / ``deviceid`` pair.

  Args:
    data: Spark DataFrame with at least the columns ``type_ride_id``,
      ``deviceid``, ``date``, ``vitesse_moy`` and ``distance_trajet``.

  Returns:
    One row per (``type_ride_id``, ``deviceid``) with:
      * ``debut_temps``    - earliest ``date`` in the group,
      * ``fin_temps``      - latest ``date`` in the group,
      * ``occurence``      - number of rows with a non-null ``type_ride_id``,
      * ``vitess_moyenne`` - average of ``vitesse_moy``,
      * ``distance_moy``   - average of ``distance_trajet``.
  """
  # min/max are used instead of sort() + first()/last(): a sort placed before
  # groupby() is not guaranteed to survive the aggregation shuffle, so
  # first()/last() could return an arbitrary date of the group.
  df = data.groupby("type_ride_id", "deviceid").agg(
      F.min("date").alias("debut_temps"),
      F.max("date").alias("fin_temps"),
      F.count("type_ride_id").alias("occurence"),
      F.avg("vitesse_moy").alias("vitess_moyenne"),
      F.avg("distance_trajet").alias("distance_moy"),
  )
  return df
data_occurence=occurence(data)

In [0]:
# Add the elapsed time between the first and last ride of each
# (ride type, device) group, expressed in days, weeks and months.

def calcule_diff_temps(data):
  """Append the ``diff_temps_j`` / ``diff_temps_s`` / ``diff_temps_m`` columns.

  Args:
    data: Spark DataFrame with ``debut_temps`` and ``fin_temps`` columns.

  Returns:
    The input DataFrame with three extra columns: the number of days between
    ``debut_temps`` and ``fin_temps``, that number divided by 7 (weeks) and
    divided by 30 (months).
  """
  jours = F.datediff(F.col("fin_temps"), F.col("debut_temps"))
  avec_jours = data.withColumn("diff_temps_j", jours)
  avec_semaines = avec_jours.withColumn("diff_temps_s", F.col("diff_temps_j") / 7)
  return avec_semaines.withColumn("diff_temps_m", F.col("diff_temps_j") / 30)

data_diff_temps = calcule_diff_temps(data_occurence)


In [0]:
# Keep only the groups whose elapsed time (in days) is non-zero, i.e. drop the
# groups whose first and last ride fall on the same day (one-off trips).
data_filter = data_diff_temps.where(F.col("diff_temps_j") != 0)

In [0]:
# The score is the number of occurrences relative to the elapsed time.

# It is computed per day, per week and per month.
def score(data):
  """Append ``score_jour``, ``score_semaine`` and ``score_mois`` columns.

  Each score is ``occurence`` divided by the matching elapsed-time column
  (``diff_temps_j``, ``diff_temps_s``, ``diff_temps_m``).

  Args:
    data: Spark DataFrame with ``occurence`` and the three ``diff_temps_*``
      columns.

  Returns:
    The input DataFrame with the three score columns added.
  """
  colonnes = (
      ("score_jour", "diff_temps_j"),
      ("score_semaine", "diff_temps_s"),
      ("score_mois", "diff_temps_m"),
  )
  df_score = data
  for nom_score, nom_duree in colonnes:
    df_score = df_score.withColumn(nom_score, F.col('occurence') / F.col(nom_duree))
  return df_score
data_score = score(data_filter)

In [0]:
# Classification rules for the "type" label (period = time between first and last ride):
# - period < 2 weeks => "exceptionnel"
# - period from 2 weeks up to 1 month (inclusive) => "occasionnel"
# - period > 1 month:
#     case 1: occurrence > 24 and period > 3 months => "quotidien"
#     case 2: occurrence == 2: if the distance > 35 (10% of the data with occurrence == 2) => "exceptionnel", otherwise "occasionnel"
#     case 3: occurrence in [3, 23] and period <= 3 months => "occasionnel"

In [0]:
def data_type_trajet(data):
  """Label each row with a trip category in a new ``type`` column.

  The rules are evaluated in order and the first match wins:
    1. ``diff_temps_s`` < 2                              -> "exceptionnel"
    2. ``diff_temps_m`` <= 1 and ``diff_temps_s`` >= 2   -> "occasionnel"
    3. ``occurence`` > 24 and ``diff_temps_m`` > 3       -> "quotidien"
    4. ``occurence`` == 2 and ``distance_moy`` > 35      -> "exceptionnel"
    5. anything else                                     -> "occasionnel"

  Args:
    data: Spark DataFrame with ``diff_temps_s``, ``diff_temps_m``,
      ``occurence`` and ``distance_moy`` columns.

  Returns:
    The input DataFrame with the ``type`` column added.
  """
  semaines = F.col("diff_temps_s")
  mois = F.col("diff_temps_m")
  nb_occurences = F.col("occurence")
  distance = F.col("distance_moy")

  moins_de_deux_semaines = semaines < 2
  jusqu_a_un_mois = (mois <= 1) & (semaines >= 2)
  frequent_et_durable = (nb_occurences > 24) & (mois > 3)
  deux_longs_trajets = (nb_occurences == 2) & (distance > 35)

  categorie = (
      F.when(moins_de_deux_semaines, "exceptionnel")
       .when(jusqu_a_un_mois, "occasionnel")
       .when(frequent_et_durable, "quotidien")
       .when(deux_longs_trajets, "exceptionnel")
       .otherwise("occasionnel")
  )
  return data.withColumn("type", categorie)
data_type = data_type_trajet(data_score)


In [0]:
# Label every (ride type, device) group. This uses data_diff_temps, i.e. the
# aggregates before the zero-day groups are filtered out.
data_type_totale = data_type_trajet(data_diff_temps)

In [0]:
# Number of (ride type, device) groups per category, largest first.
type_data_totale = (
    data_type_totale
    .groupby("type")
    .count()
    .orderBy('count', ascending=False)
)
# Share of each category: its count over the grand total (empty window = all rows).
type_percent = type_data_totale.withColumn(
    'percent',
    F.col('count') / F.sum('count').over(Window.partitionBy()) * 100,
)
type_percent.display()

type,count,percent
exceptionnel,17505,83.54810996563575
occasionnel,3253,15.525964108438336
quotidien,194,0.9259259259259258


In [0]:
# Join the labelled groups back to the individual rides so that every ride id
# gets the category of its ride type.
#
# The join is done by column name rather than with
# `data_type_totale.type_ride_id == data.type_ride_id`: data_type_totale is
# derived from `data`, so both references share the same lineage, and the
# follow-up `.drop(data.type_ride_id)` could remove either side's column.
# A name-based equi-join keeps exactly one `type_ride_id` column.
#
# NOTE(review): the groups are built per (type_ride_id, deviceid) but the join
# key is type_ride_id only; if one type_ride_id can belong to several devices
# this multiplies rows - confirm that type_ride_id is unique per device.
data_jointure = data_type_totale.join(
    data.select("rideid", "type_ride_id"),
    on="type_ride_id",
    how="left",
)


In [0]:
# Number of individual rides per category after the join, largest first.
type_data_jounture = (
    data_jointure
    .groupby("type")
    .count()
    .orderBy('count', ascending=False)
)
# Share of each category: its count over the grand total (empty window = all rows).
type_percent_jointure = type_data_jounture.withColumn(
    'percent',
    F.col('count') / F.sum('count').over(Window.partitionBy()) * 100,
)
type_percent_jointure.display()


type,count,percent
exceptionnel,19810,37.72542895774219
occasionnel,17082,32.530326979109134
quotidien,15619,29.744244063148678


In [0]:
# Persist the labelled rides as CSV with a header row, replacing any previous export.
(
    data_jointure.write
    .option('header', True)
    .mode('overwrite')
    .csv('/mnt/datalake/tmp/amani/type_ride_data/2021_ride_type')
)

In [0]:
# Modelling table: four numeric columns plus the "type" label to predict.
dataset = data_type_totale.select(
    ["distance_moy", "vitess_moyenne", "occurence", "diff_temps_j", "type"]
)

In [0]:
# 80/20 train/test split with a fixed seed.
trainDF, testDF = dataset.randomSplit(weights=[0.8, 0.2], seed=42)

# The training data is accessed multiple times, so mark it as cached before
# the count triggers its computation.
trainDF.cache()
print(trainDF.count())
print(testDF.count())

Data Exploration

In [0]:
# Render the training split for visual inspection.
display(trainDF)

In [0]:
# Summary statistics for every column of the training split.
display(trainDF.summary())

In [0]:
from pyspark.ml.feature import StandardScaler, StringIndexer, VectorAssembler

# Encode the string category "type" as a numeric "label" column.
labelToIndex = StringIndexer(inputCol="type", outputCol="label")

# Every column of the modelling table except the label is a feature.
features = dataset.drop('type').columns

# Pack the feature columns into a single vector column named "features".
vector = VectorAssembler(inputCols=features, outputCol='features')

# Standardize the assembled vector into a separate "standardized" column.
scale = StandardScaler(inputCol='features', outputCol='standardized')

In [0]:
from pyspark.ml.classification import RandomForestClassifier

# Random forest on the assembled "features" vector, predicting the indexed "label".
# Each tree is trained on a 50% subsample of the training data, so the fit is
# stochastic: the seed is fixed (same value as the train/test split) to make
# the trained model and the reported accuracy reproducible across runs.
# Note: the forest reads the unscaled "features" column, not the
# "standardized" column produced by the StandardScaler stage.
forest_model = RandomForestClassifier(
    featuresCol='features',
    labelCol="label",
    predictionCol='prediction',
    maxDepth=30,
    impurity='gini',
    subsamplingRate=.5,
    seed=42,
)


In [0]:
from pyspark.ml import Pipeline

# Chain the stages built above: label indexing -> feature assembly ->
# standardization -> random forest.
pipeline = Pipeline(
    stages=[
        labelToIndex,
        vector,
        scale,
        forest_model,
    ]
)

# Fit all stages on the training split.
pipelineModel = pipeline.fit(trainDF)

# Run the fitted pipeline on the held-out split to get predictions.
predDF = pipelineModel.transform(testDF)

In [0]:
# Show the test rows together with every column added by the pipeline.
display(predDF)

In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# The label has three categories, so the binary area-under-ROC metric is not
# used here; overall accuracy is reported with the multiclass evaluator.
# The label/prediction column names are the evaluator defaults, spelled out
# to show what is being compared.
mcEvaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
print(f"Accuracy: {mcEvaluator.evaluate(predDF)}")