In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

from pyspark.sql.functions import col, when, to_timestamp, unix_timestamp
from pyspark.sql.types import DoubleType


# conf = SparkConf().setAppName('appName').setMaster('local')
sc = SparkContext.getOrCreate()
# spark = SparkSession(sc)

In [2]:
from pyspark.sql.types import (StructField,StructType, BooleanType, DoubleType,LongType, IntegerType)
from pyspark.sql.types import *

In [3]:
#///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

In [4]:
# Définition du type : SPOINT

spoint_schema = [StructField('lat', FloatType(),True),
                 StructField('lon', FloatType(),True)]
spoint = StructType(fields=spoint_schema)

In [5]:
# Définition du type : SECTION

section_schema = [StructField('lat1', FloatType(),True),
                  StructField('lon1', FloatType(),True),
                  StructField('lat2', FloatType(),True),
                  StructField('lon2', FloatType(),True)]
section = StructType(fields=section_schema)

In [6]:
# Définition du type : SLINE

sline_schema = [StructField('rints', ArrayType(section),True)]
sline = StructType(fields=sline_schema)

In [7]:
# Définition du type : USPOINT

uspoint_schema = [StructField('t1', LongType(),True),
                  StructField('t2', LongType(),True),
                  StructField('lat1', FloatType(),True),
                  StructField('lon1', FloatType(),True),
                  StructField('lat2', FloatType(),True),
                  StructField('lon2', FloatType(),True)]
uspoint = StructType(fields=uspoint_schema)

In [8]:
# Définition du type : MSPOINT

mspoint_schema = [StructField('rints', ArrayType(uspoint),True)]
mspoint = StructType(fields=mspoint_schema)

In [9]:
# Définition du type : UINT

uint_schema = [StructField('val', IntegerType(),True),
               StructField('t1', LongType(),True),
               StructField('t2', LongType(),True)]
uint = StructType(fields=uint_schema)

In [10]:
#///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

In [11]:
# Définition du type : MINT

mint_schema = [StructField('units', ArrayType(uint),True)]
mint = StructType(fields=mint_schema)

In [12]:
# Définition du type : USTRING

ustring_schema = [StructField('val', StringType(),True),
               StructField('t1', LongType(),True),
               StructField('t2', LongType(),True)]
ustring = StructType(fields=ustring_schema)

In [13]:
# Définition du type : MSTRING

mstring_schema = [StructField('units', ArrayType(ustring),True)]
mstring = StructType(fields=mstring_schema)

In [14]:
# Définition du type : UREAL

ureal_schema = [StructField('a', FloatType(),True),
              StructField('b', FloatType(),True),
              StructField('c', FloatType(),True),
              StructField('r', BooleanType(),True),
              StructField('t1', LongType(),True),
              StructField('t2', LongType(),True)]
ureal = StructType(fields=ureal_schema)

In [15]:
# Définition du type : MREAL

mreal_schema = [StructField('units', ArrayType(ureal),True)]
mreal = StructType(fields=mreal_schema)

In [16]:
# Définition du type : USINT

usint_schema = [StructField('val', IntegerType(),True),
              StructField('interval', section,True)]
usint = StructType(fields=usint_schema)

In [17]:
#///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

In [18]:
# Définition du type : MSINT

msint_schema = [StructField('units', ArrayType(usint),False)]
msint = StructType(fields=msint_schema)

In [19]:
# Définition du type : USSTRING

usstring_schema = [StructField('val', StringType(),True),
              StructField('interval', section,True)]
usstring = StructType(fields=usstring_schema)

In [20]:
# Définition du type : MSSTRING

msstring_schema = [StructField('units', ArrayType(usstring),False)]
msstring = StructType(fields=msstring_schema)

In [21]:
# Définition du type : USREAL

usreal_schema = [StructField('a', FloatType(),True),
                 StructField('b', FloatType(),True),
                 StructField('c', FloatType(),True),
                 StructField('r', BooleanType(),True),
                 StructField('interval', section,True)]
usreal = StructType(fields=usreal_schema)

In [22]:
# Définition du type : MSREAL

msreal_schema = [StructField('units', ArrayType(usreal),False)]
msreal = StructType(fields=msreal_schema)

In [23]:
#///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

In [24]:
# Définition du type : INTIME

intime_schema = [StructField('val', FloatType(),True),
                 StructField('t1', LongType(),True)]
intime = StructType(fields=intime_schema)

In [25]:
# Définition du type : INSPOINT

inspoint_schema = [StructField('val', FloatType(),True),
                   StructField('sp', spoint,True)]
inspoint = StructType(fields=inspoint_schema)

In [26]:
#///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

In [27]:
df = spark.read.option("header",True).option("inferSchema",True).csv("VGP-week3-data.csv")
df.printSchema()

root
 |-- kit_id: integer (nullable = true)
 |-- participant_id: integer (nullable = true)
 |-- time: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- lon: double (nullable = true)
 |-- PM2-5: string (nullable = true)
 |-- PM10: string (nullable = true)
 |-- PM1-0: string (nullable = true)
 |-- NO2: string (nullable = true)
 |-- BC: string (nullable = true)
 |-- activity: string (nullable = true)
 |-- event: string (nullable = true)



In [28]:
#On change les string "NULL" en null
df = df.withColumn(("BC"), when(col("BC") == "NULL", None).otherwise(col("BC")))
df = df.withColumn(("PM2-5"), when(col("PM2-5") == "NULL", None).otherwise(col("PM2-5")))
df = df.withColumn(("PM10"),  when(col("PM10") == "NULL",  None).otherwise(col("PM10")))
df = df.withColumn(("PM1-0"), when(col("PM1-0") == "NULL", None).otherwise(col("PM1-0")))
df = df.withColumn(("NO2"),   when(col("NO2") == "NULL",   None).otherwise(col("NO2")))
df = df.withColumn(("activity"), when(col("activity") == "NULL", None).otherwise(col("activity")))
df = df.withColumn(("event"), when(col("event") == "NULL", None).otherwise(col("event")))

In [29]:
df = df.withColumn('time',unix_timestamp('time', 'yyyy-MM-dd HH:mm:ss').alias('time'))
df.show()
df.printSchema()

+------+--------------+----------+----------------+----------------+-----+----+-----+----+----+--------+-----+
|kit_id|participant_id|      time|             lat|             lon|PM2-5|PM10|PM1-0| NO2|  BC|activity|event|
+------+--------------+----------+----------------+----------------+-----+----+-----+----+----+--------+-----+
|    80|       9999964|1573718400|48.7717766666667|        2.006005| null|null| null|null|null|    null| null|
|    80|       9999964|1573718410|48.7717766666667|        2.006005| null|null| null|null|null|    null| null|
|    80|       9999964|1573718420|48.7717766666667|        2.006005| null|null| null|null|null|    null| null|
|    80|       9999964|1573718430|48.7717766666667|        2.006005| null|null| null|null|null|    null| null|
|    80|       9999964|1573718440|48.7717766666667|        2.006005| null|null| null|null|null|    null| null|
|    80|       9999964|1573718450|48.7717766666667|        2.006005| null|null| null|null|null|    null| null|
|

In [30]:
#Conversion des types PM2.5, BC ... en double
#Remarque dans notre fichier de données j'ai eliminer le "+00" a chaque fois
df = df.withColumn("PM2-5",df["PM2-5"].cast(FloatType()))
df = df.withColumn("PM10",df["PM10"].cast(FloatType()))
df = df.withColumn("PM1-0",df["PM1-0"].cast(FloatType()))
df = df.withColumn("NO2",df["NO2"].cast(FloatType()))
df = df.withColumn("BC",df["BC"].cast(FloatType()))
df = df.withColumn("lat",df["lat"].cast(FloatType()))
df = df.withColumn("lon",df["lon"].cast(FloatType()))
df.printSchema()

root
 |-- kit_id: integer (nullable = true)
 |-- participant_id: integer (nullable = true)
 |-- time: long (nullable = true)
 |-- lat: float (nullable = true)
 |-- lon: float (nullable = true)
 |-- PM2-5: float (nullable = true)
 |-- PM10: float (nullable = true)
 |-- PM1-0: float (nullable = true)
 |-- NO2: float (nullable = true)
 |-- BC: float (nullable = true)
 |-- activity: string (nullable = true)
 |-- event: string (nullable = true)



In [31]:
df_temp = df
df_temp.show()

+------+--------------+----------+---------+---------+-----+----+-----+----+----+--------+-----+
|kit_id|participant_id|      time|      lat|      lon|PM2-5|PM10|PM1-0| NO2|  BC|activity|event|
+------+--------------+----------+---------+---------+-----+----+-----+----+----+--------+-----+
|    80|       9999964|1573718400| 48.77178| 2.006005| null|null| null|null|null|    null| null|
|    80|       9999964|1573718410| 48.77178| 2.006005| null|null| null|null|null|    null| null|
|    80|       9999964|1573718420| 48.77178| 2.006005| null|null| null|null|null|    null| null|
|    80|       9999964|1573718430| 48.77178| 2.006005| null|null| null|null|null|    null| null|
|    80|       9999964|1573718440| 48.77178| 2.006005| null|null| null|null|null|    null| null|
|    80|       9999964|1573718450| 48.77178| 2.006005| null|null| null|null|null|    null| null|
|    80|       9999964|1573718460| 48.77178| 2.006005| null|null| null|null|null|    null| null|
|    80|       9999964|1573718

In [32]:
def change_column_names(columns):
    return [c.replace('-', '_') for c in columns]

df_temp = df_temp.toDF(*change_column_names(df_temp.columns))
df_temp.show()
df_temp.count()

+------+--------------+----------+---------+---------+-----+----+-----+----+----+--------+-----+
|kit_id|participant_id|      time|      lat|      lon|PM2_5|PM10|PM1_0| NO2|  BC|activity|event|
+------+--------------+----------+---------+---------+-----+----+-----+----+----+--------+-----+
|    80|       9999964|1573718400| 48.77178| 2.006005| null|null| null|null|null|    null| null|
|    80|       9999964|1573718410| 48.77178| 2.006005| null|null| null|null|null|    null| null|
|    80|       9999964|1573718420| 48.77178| 2.006005| null|null| null|null|null|    null| null|
|    80|       9999964|1573718430| 48.77178| 2.006005| null|null| null|null|null|    null| null|
|    80|       9999964|1573718440| 48.77178| 2.006005| null|null| null|null|null|    null| null|
|    80|       9999964|1573718450| 48.77178| 2.006005| null|null| null|null|null|    null| null|
|    80|       9999964|1573718460| 48.77178| 2.006005| null|null| null|null|null|    null| null|
|    80|       9999964|1573718

59972

In [33]:
liste_id = df_temp.select('participant_id').distinct().rdd.map(lambda r: r[0])
liste_id.collect()

[9999920,
 9999955,
 9999975,
 9999936,
 9999930,
 9999960,
 9999964,
 9999962,
 999992]

In [34]:
liste_id = [9999964]

In [35]:
data_type = StructType([StructField('id_participant', IntegerType(),True), StructField('trip', mspoint,True), StructField('PM2_5_t', mreal,True), StructField('PM_10_t', mreal,True), StructField('PM1_0_t', mreal,True), StructField('NO2_t', mreal,True), StructField('BC_t', mreal,True), StructField('activity_t', mstring,True), StructField('event_t', mstring,True)])
dataset = spark.createDataFrame(sc.emptyRDD(), data_type)
dataset.show()

+--------------+----+-------+-------+-------+-----+----+----------+-------+
|id_participant|trip|PM2_5_t|PM_10_t|PM1_0_t|NO2_t|BC_t|activity_t|event_t|
+--------------+----+-------+-------+-------+-----+----+----------+-------+
+--------------+----+-------+-------+-------+-----+----+----------+-------+



In [37]:
df_one_participant.show()

+------+--------------+----------+---------+---------+-----+----+-----+---+-----+--------+-------------------+
|kit_id|participant_id|      time|      lat|      lon|PM2_5|PM10|PM1_0|NO2|   BC|activity|              event|
+------+--------------+----------+---------+---------+-----+----+-----+---+-----+--------+-------------------+
|    80|       9999964|1573804980| 48.77173|2.0058634|  3.0| 4.0|  4.0|7.0|376.0|  Bureau|Arrêter De Cuisiner|
|    80|       9999964|1573804990| 48.77173|2.0058634|  3.0| 4.0|  4.0|7.0|376.0|  Bureau|Arrêter De Cuisiner|
|    80|       9999964|1573805000| 48.77173|2.0058634|  3.0| 4.0|  4.0|7.0|376.0|  Bureau|Arrêter De Cuisiner|
|    80|       9999964|1573805010| 48.77173|2.0058634|  3.0| 4.0|  4.0|7.0|376.0|  Bureau|Arrêter De Cuisiner|
|    80|       9999964|1573805020| 48.77173|2.0058634|  3.0| 4.0|  4.0|7.0|376.0|  Bureau|Arrêter De Cuisiner|
|    80|       9999964|1573805030| 48.77173|2.0058634|  3.0| 4.0|  4.0|7.0|376.0|  Bureau|Arrêter De Cuisiner|
|

In [59]:
# New Cell Amir
df_temp = df_temp.where(df_temp['PM2_5'].isNotNull() & df_temp['PM1_0'].isNotNull() & df_temp['PM10'].isNotNull() & df_temp['NO2'].isNotNull() & df_temp['BC'].isNotNull() & df_temp['activity'].isNotNull() & df_temp['event'].isNotNull())
# new_participant_row = spark.createDataFrame(sc.emptyRDD(), StructType([StructField('id_participant', IntegerType(),True), StructField('trip', mspoint,True), StructField('PM2_5_t', mreal,True),StructField('PM_10_t', mreal,True), StructField('PM1_0_t', mreal,True), StructField('NO2_t', mreal,True), StructField('BC_t', mreal,True), StructField('activity_t', mstring,True), StructField('event_t', mstring,True),StructField('PM2_5_t', mreal,True), StructField('PM_10_s', msreal,True), StructField('PM1_0_s', msreal,True), StructField('NO2_s', msreal,True), StructField('BC_s', msreal,True), StructField('activity_s', msstring,True), StructField('event_s', msstring,True)]))
new_participant_row = spark.createDataFrame(sc.emptyRDD(), StructType([StructField('id_participant', IntegerType(),True), StructField('trip', mspoint,True), StructField('PM2_5_t', mreal,True),StructField('PM_10_t', mreal,True), StructField('PM1_0_t', mreal,True), StructField('NO2_t', mreal,True), StructField('BC_t', mreal,True), StructField('activity_t', mstring,True), StructField('event_t', mstring,True)]))    
for id_participant in liste_id : 
    #MOUCHARD 1
    print("participant avec id: {}".format(id_participant))
    
    # Prendre toutes les données d'un seul participant et les trier par le temps
    df_one_participant = df_temp.where(col("participant_id") == id_participant)
    df_one_participant = df_one_participant.sort("time")
    
    trip_df_temp = spark.createDataFrame(sc.emptyRDD(), mspoint)
    PM2_5_df_temp = spark.createDataFrame(sc.emptyRDD(), mreal)
    PM10_df_temp = spark.createDataFrame(sc.emptyRDD(), mreal)
    PM1_0_df_temp = spark.createDataFrame(sc.emptyRDD(), mreal)
    NO2_df_temp = spark.createDataFrame(sc.emptyRDD(), mreal)
    BC_df_temp = spark.createDataFrame(sc.emptyRDD(), mreal)
    activity_df_temp = spark.createDataFrame(sc.emptyRDD(), mstring)
    event_df_temp = spark.createDataFrame(sc.emptyRDD(), mstring)
    #df_one_participant.show()
    
    # A partir de la je reprend ma methode
    
    #Creation d'une nouvelle DF temp pour faire la jointure et regrouper les tuples (1 avec 2 , 3 avec 4, ect...)
    temp = df_one_participant
    #On renome les colonnes de temp pour qu'il y'ait pas d'ambiguité
    temp = temp.withColumnRenamed('kit_id','kit_id_temp')
    temp = temp.withColumnRenamed('participant_id','participant_id_temp')
    temp = temp.withColumnRenamed('time','time_temp')
    temp = temp.withColumnRenamed('lat','lat_temp')
    temp = temp.withColumnRenamed('lon','lon_temp')
    temp = temp.withColumnRenamed('PM2_5','PM2_5_temp')
    temp = temp.withColumnRenamed('PM10','PM10_temp')
    temp = temp.withColumnRenamed('PM1_0','PM1_0_temp')
    temp = temp.withColumnRenamed('NO2','NO2_temp')
    temp = temp.withColumnRenamed('BC','BC_temp')
    temp = temp.withColumnRenamed('activity','activity_temp')
    temp = temp.withColumnRenamed('event','event_temp')
    
    #temp.show()
    #On decremente le time de temp pour pouvoir faire la jointure avec == (car j'ai pas pu le faire avec == time_temp + 10)
    temp = temp.withColumn('time_temp', temp.time_temp - 10)
    #On fait la jointure
    #On fais le rename ici car y'avais un bug de spark 
    df_one_participant = df_one_participant.withColumnRenamed('time','time')
    x = df_one_participant.join(temp, [temp['time_temp'] == df_one_participant['time'] ])
    # On decremente le time_temp pour revenir a un etat cohérent
    y = x.select(x.lat,x.lon,x['time'],x['time_temp'] + 10, x.lat_temp,x.lon_temp, x['PM2_5'], x['PM2_5_temp'], x['PM10'], x['PM10_temp'], x['PM1_0'], x['PM1_0_temp'], x['NO2'], x['NO2_temp'], x['BC'], x['BC_temp'], x['activity'], x['activity_temp'], x['event'], x['event_temp'])
    #y.show()
    
    # Mettre les données dans les df temporaires
        
PM2_5_df_temp = y.select(y.columns).rdd.map(lambda r :(0.0, ((r[7]-r[6])/(r[3]-r[2])), (((r[6]*r[3])-(r[7]*r[2]))/((r[3]-r[2]))),False,r[2],r[3]),mreal).toDF()
PM10_df_temp = y.select(y.columns).rdd.map(lambda r : (0.0, ((r[9]-r[8])/(r[3]-r[2])), (((r[8]*r[3])-(r[9]*r[2]))/((r[3]-r[2]))),False,r[2],r[3]), mreal).toDF()
PM1_0_df_temp = y.select(y.columns).rdd.map(lambda r :(0.0, ((r[11]-r[10])/(r[3]-r[2])), (((r[10]*r[3])-(r[11]*r[2]))/((r[3]-r[2]))), False, r[2], r[3]), mreal).toDF()
NO2_df_temp = y.select(y.columns).rdd.map(lambda r :  (0.0, ((r[13]-r[12])/(r[3]-r[2])), (((r[12]*r[3])-(r[13]*r[2]))/((r[3]-r[2]))), False, r[2], r[3]), mreal).toDF()
BC_df_temp = y.select(y.columns).rdd.map(lambda r :   (0.0, ((r[15]-r[14])/(r[3]-r[2])), (((r[14]*r[3])-(r[15]*r[2]))/((r[3]-r[2]))), False, r[2], r[3]), mreal).toDF()
activity_df_temp = y.select(y.columns).rdd.map(lambda r :(r[16], r[2], r[3]), mstring).toDF()
event_df_temp = y.select(y.columns).rdd.map(lambda r :(r[18], r[2], r[3]), mstring).toDF()
    
trip_df_temp = y.select(y.columns).rdd.map(lambda r :(r[2], r[3], r[0], r[1], r[4], r[5]), mspoint).toDF()
    
#trip_df_temp = trip_temp.union(spark.createDataFrame([[([(time, t_temp, lat, lon, row.lat, row.lon)])]], mspoint))
#PM2_5_df_temp.show(2000)
# On rename les colonnes car ça a changer 

PM2_5_df_temp = PM2_5_df_temp.withColumnRenamed('_1','a')
PM2_5_df_temp = PM2_5_df_temp.withColumnRenamed('_2','b')
PM2_5_df_temp = PM2_5_df_temp.withColumnRenamed('_3','c')
PM2_5_df_temp = PM2_5_df_temp.withColumnRenamed('_4','r')
PM2_5_df_temp = PM2_5_df_temp.withColumnRenamed('_5','t1')
PM2_5_df_temp = PM2_5_df_temp.withColumnRenamed('_6','t2')

PM10_df_temp = PM10_df_temp.withColumnRenamed('_1','a')
PM10_df_temp = PM10_df_temp.withColumnRenamed('_2','b')
PM10_df_temp = PM10_df_temp.withColumnRenamed('_3','c')
PM10_df_temp = PM10_df_temp.withColumnRenamed('_4','r')
PM10_df_temp = PM10_df_temp.withColumnRenamed('_5','t1')
PM10_df_temp = PM10_df_temp.withColumnRenamed('_6','t2')

PM1_0_df_temp = PM1_0_df_temp.withColumnRenamed('_1','a')
PM1_0_df_temp = PM1_0_df_temp.withColumnRenamed('_2','b')
PM1_0_df_temp = PM1_0_df_temp.withColumnRenamed('_3','c')
PM1_0_df_temp = PM1_0_df_temp.withColumnRenamed('_4','r')
PM1_0_df_temp = PM1_0_df_temp.withColumnRenamed('_5','t1')
PM1_0_df_temp = PM1_0_df_temp.withColumnRenamed('_6','t2')

NO2_df_temp = NO2_df_temp.withColumnRenamed('_1','a')
NO2_df_temp = NO2_df_temp.withColumnRenamed('_2','b')
NO2_df_temp = NO2_df_temp.withColumnRenamed('_3','c')
NO2_df_temp = NO2_df_temp.withColumnRenamed('_4','r')
NO2_df_temp = NO2_df_temp.withColumnRenamed('_5','t1')
NO2_df_temp = NO2_df_temp.withColumnRenamed('_6','t2')

BC_df_temp = BC_df_temp.withColumnRenamed('_1','a')
BC_df_temp = BC_df_temp.withColumnRenamed('_2','b')
BC_df_temp = BC_df_temp.withColumnRenamed('_3','c')
BC_df_temp = BC_df_temp.withColumnRenamed('_4','r')
BC_df_temp = BC_df_temp.withColumnRenamed('_5','t1')
BC_df_temp = BC_df_temp.withColumnRenamed('_6','t2')

activity_df_temp = activity_df_temp.withColumnRenamed('_1','val')
activity_df_temp = activity_df_temp.withColumnRenamed('_2','t1')
activity_df_temp = activity_df_temp.withColumnRenamed('_3','t2')

event_df_temp = event_df_temp.withColumnRenamed('_1','val')
event_df_temp = event_df_temp.withColumnRenamed('_2','t1')
event_df_temp = event_df_temp.withColumnRenamed('_3','t2')

trip_df_temp.show(200)

participant avec id: 9999964
+----------+----------+------------------+------------------+------------------+------------------+
|        _1|        _2|                _3|                _4|                _5|                _6|
+----------+----------+------------------+------------------+------------------+------------------+
|1573804980|1573804990|   48.771728515625|2.0058634281158447|   48.771728515625|2.0058634281158447|
|1573804990|1573805000|   48.771728515625|2.0058634281158447|   48.771728515625|2.0058634281158447|
|1573805000|1573805010|   48.771728515625|2.0058634281158447|   48.771728515625|2.0058634281158447|
|1573805010|1573805020|   48.771728515625|2.0058634281158447|   48.771728515625|2.0058634281158447|
|1573805020|1573805030|   48.771728515625|2.0058634281158447|   48.771728515625|2.0058634281158447|
|1573805760|1573805770|48.771732330322266| 2.005850076675415|48.771732330322266| 2.005850076675415|
|1573805770|1573805780|48.771732330322266| 2.005850076675415| 48.771682

In [65]:
#Suite Cell Amir
d = [(id_participant, trip_df_temp, PM2_5_df_temp, PM10_df_temp, PM1_0_df_temp, NO2_df_temp, BC_df_temp, activity_df_temp, event_df_temp)]
new_participant_row = spark.createDataFrame(sc.emptyRDD(), StructType([StructField('id_participant', IntegerType(),True), StructField('trip', mspoint,True), StructField('PM2_5_t', mreal,True), StructField('PM_10_t', mreal,True), StructField('PM1_0_t', mreal,True), StructField('NO2_t', mreal,True), StructField('BC_t', mreal,True), StructField('activity_t', mstring,True), StructField('event_t', mstring,True)]))
#dataset = dataset.union(new_participant_row)
#dataset.show()
#trip_df_temp.rdd.collect()[0]
print(d)
new_participant_row.printSchema()
# print ('tototootototototo ///////////////absabs/////////')

#trip_df_temp.rdd.collect()

[(9999964, DataFrame[_1: bigint, _2: bigint, _3: double, _4: double, _5: double, _6: double], DataFrame[a: double, b: double, c: double, r: boolean, t1: bigint, t2: bigint], DataFrame[a: double, b: double, c: double, r: boolean, t1: bigint, t2: bigint], DataFrame[a: double, b: double, c: double, r: boolean, t1: bigint, t2: bigint], DataFrame[a: double, b: double, c: double, r: boolean, t1: bigint, t2: bigint], DataFrame[a: double, b: double, c: double, r: boolean, t1: bigint, t2: bigint], DataFrame[val: string, t1: bigint, t2: bigint], DataFrame[val: string, t1: bigint, t2: bigint])]
root
 |-- id_participant: integer (nullable = true)
 |-- trip: struct (nullable = true)
 |    |-- rints: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- t1: long (nullable = true)
 |    |    |    |-- t2: long (nullable = true)
 |    |    |    |-- lat1: float (nullable = true)
 |    |    |    |-- lon1: float (nullable = true)
 |    |    |    |-- lat2: float (

In [None]:
df_temp = df_temp.where(df_temp['PM2_5'].isNotNull() & df_temp['PM1_0'].isNotNull() & df_temp['PM10'].isNotNull() & df_temp['NO2'].isNotNull() & df_temp['BC'].isNotNull() & df_temp['activity'].isNotNull() & df_temp['event'].isNotNull())
# new_participant_row = spark.createDataFrame(sc.emptyRDD(), StructType([StructField('id_participant', IntegerType(),True), StructField('trip', mspoint,True), StructField('PM2_5_t', mreal,True),StructField('PM_10_t', mreal,True), StructField('PM1_0_t', mreal,True), StructField('NO2_t', mreal,True), StructField('BC_t', mreal,True), StructField('activity_t', mstring,True), StructField('event_t', mstring,True),StructField('PM2_5_t', mreal,True), StructField('PM_10_s', msreal,True), StructField('PM1_0_s', msreal,True), StructField('NO2_s', msreal,True), StructField('BC_s', msreal,True), StructField('activity_s', msstring,True), StructField('event_s', msstring,True)]))
new_participant_row = spark.createDataFrame(sc.emptyRDD(), StructType([StructField('id_participant', IntegerType(),True), StructField('trip', mspoint,True), StructField('PM2_5_t', mreal,True),StructField('PM_10_t', mreal,True), StructField('PM1_0_t', mreal,True), StructField('NO2_t', mreal,True), StructField('BC_t', mreal,True), StructField('activity_t', mstring,True), StructField('event_t', mstring,True)]))    
for id_participant in liste_id:   
        
    #MOUCHARD 1
    print("participant avec id: {}".format(id_participant))
    
    # Prendre toutes les données d'un seul participant et les trier par le temps
    df_one_participant = df_temp.where(col("participant_id") == id_participant)
    df_one_participant = df_one_participant.sort("time")
    
    #initialiser les df temporaires
    trip_temp = spark.createDataFrame(sc.emptyRDD(), mspoint)
    PM2_5_temp = spark.createDataFrame(sc.emptyRDD(), mreal)
    PM10_temp = spark.createDataFrame(sc.emptyRDD(), mreal)
    PM1_0_temp = spark.createDataFrame(sc.emptyRDD(), mreal)
    NO2_temp = spark.createDataFrame(sc.emptyRDD(), mreal)
    BC_temp = spark.createDataFrame(sc.emptyRDD(), mreal)
    activity_temp = spark.createDataFrame(sc.emptyRDD(), mstring)
    event_temp = spark.createDataFrame(sc.emptyRDD(), mstring)
    
#     PM2_5_s_temp = spark.createDataFrame(sc.emptyRDD(), msreal)
#     PM10_s_temp = spark.createDataFrame(sc.emptyRDD(), msreal)
#     PM1_0_s_temp = spark.createDataFrame(sc.emptyRDD(), msreal)
#     NO2_s_temp = spark.createDataFrame(sc.emptyRDD(), msreal)
#     BC_s_temp = spark.createDataFrame(sc.emptyRDD(), msreal)
#     activity_s_temp = spark.createDataFrame(sc.emptyRDD(), msstring)
#     event_s_temp = spark.createDataFrame(sc.emptyRDD(), msstring)
    
    
    # Prendre les premieres données
    row = df_one_participant.first()
    kit_id = row.kit_id
    time=row.time+3600*id_participant
    lat=row.lat
    lon=row.lon
    PM2_5=row.PM2_5
    PM10=row.PM10
    PM1_0=row.PM1_0
    NO2=row.NO2
    BC=row.BC
    activity=row.activity
    event=row.event
    
    # Boucler toutes les données du participant
    while (df_one_participant.count() > 0):
        row = df_one_participant.first()

        if (row.time+3600*id_participant == time):
            time = row.time+3600*id_participant-1
        t_temp = row.time+3600*id_participant;
        
#         # Temporel
#         dpm2_5 = [(0.0, (row.PM2_5-PM2_5)/(t_temp-time), (PM2_5*t_temp-row.PM2_5*time)/(t_temp-time), False, time, t_temp)]
#         df_pm2_5 = spark.createDataFrame(dpm2_5, ureal)
#         print("ureal pm2_5:")
#         df_pm2_5.show()
        
#         dpm10 = [(0.0, (row.PM10-PM10)/(t_temp-time), (PM10*t_temp-row.PM10*time)/(t_temp-time), False, time, t_temp)]
#         df_pm10 = spark.createDataFrame(dpm10, ureal)
        
#         dpm1_0 = [(0.0, (row.PM1_0-PM1_0)/(t_temp-time), (PM1_0*t_temp-row.PM1_0*time)/(t_temp-time), False, time, t_temp)]
#         df_pm1_0 = spark.createDataFrame(dpm1_0, ureal)
        
#         dno2 = [(0.0, (row.NO2-NO2)/(t_temp-time), (NO2*t_temp-row.NO2*time)/(t_temp-time), False, time, t_temp)]
#         df_no2 = spark.createDataFrame(dno2, ureal)
        
#         dbc = [(0.0, (row.BC-BC)/(t_temp-time), (BC*t_temp-row.BC*time)/(t_temp-time), False, time, t_temp)]
#         df_bc = spark.createDataFrame(dbc, ureal)
        
#         dactivity = [(activity, time, t_temp)]
#         df_activity = spark.createDataFrame(dactivity, ustring)
        
#         devent = [(row.event, time, t_temp)]
#         df_event = spark.createDataFrame(devent, ustring)
        
#         # Spatial
#         dsp = [(time, t_temp, lat, lon, row.lat, row.lon)]
#         df_sp = spark.createDataFrame(dsp, uspoint)
        
#         dspm2_5 = sc.parallelize([0, (row.PM2_5-PM2_5)/(row.lat-lat), (PM2_5*row.lat-row.PM2_5*lat)/(row.lat-lat), 0, SECTION(lat, lon, row.lat, row.lon)])
#         df_s_pm2_5 = spark.createDataFrame(dspm2_5, usreal)
#         dspm10 = sc.parallelize([0, (row.PM10-PM10)/(row.lat-lat), (PM10*row.lat-row.PM10*lat)/(row.lat-lat), 0, SECTION(lat, lon, row.lat, row.lon)])
#         df_s_pm10 = spark.createDataFrame(dspm10, usreal)
#         dspm1_0 = sc.parallelize([0, (row.PM1_0-PM1_0)/(row.lat-lat), (PM1_0*row.lat-row.PM1_0*lat)/(row.lat-lat), 0, SECTION(lat, lon, row.lat, row.lon)])
#         df_s_pm1_0 = spark.createDataFrame(dspm1_0, usreal)
#         dsno2 = sc.parallelize([0, (row.NO2-NO2)/(row.lat-lat), (NO2*row.lat-row.NO2*lat)/(row.lat-lat), 0, SECTION(lat, lon, row.lat, row.lon)])
#         df_s_no2 = spark.createDataFrame(dsno2, usreal)
#         dsbc = sc.parallelize([0, (row.BC-BC)/(row.lat-lat), (BC10*row.lat-row.BC10*lat)/(row.lat-lat), 0, SECTION(lat, lon, row.lat, row.lon)])
#         df_s_bc = spark.createDataFrame(dsbc, usreal)
#         dsactivity = sc.parallelize([row.activity, SECTION(lat, lon, row.lat, row.lon)])
#         df_s_activity = spark.createDataFrame(dsactivity, usstring)
#         dsevent = sc.parallelize([row.event, SECTION(lat, lon, row.lat, row.lon)])
#         df_s_activity = spark.createDataFrame(dsevent, usstring)
        
        # Mettre les données dans les df temporaires
        
        PM2_5_temp = PM2_5_temp.union(spark.createDataFrame([[([(0.0, (row.PM2_5-PM2_5)/(t_temp-time), (PM2_5*t_temp-row.PM2_5*time)/(t_temp-time), False, time, t_temp)])]], mreal))
        PM10_temp = PM10_temp.union(spark.createDataFrame([[([(0.0, (row.PM10-PM10)/(t_temp-time), (PM10*t_temp-row.PM10*time)/(t_temp-time), False, time, t_temp)])]], mreal))
        PM1_0_temp = PM1_0_temp.union(spark.createDataFrame([[([(0.0, (row.PM1_0-PM1_0)/(t_temp-time), (PM1_0*t_temp-row.PM1_0*time)/(t_temp-time), False, time, t_temp)])]], mreal))
        NO2_temp = NO2_temp.union(spark.createDataFrame([[([(0.0, (row.NO2-NO2)/(t_temp-time), (NO2*t_temp-row.NO2*time)/(t_temp-time), False, time, t_temp)])]], mreal))
        BC_temp = BC_temp.union(spark.createDataFrame([[([(0.0, (row.BC-BC)/(t_temp-time), (BC*t_temp-row.BC*time)/(t_temp-time), False, time, t_temp)])]], mreal))
        activity_temp = activity_temp.union(spark.createDataFrame([[([(activity, time, t_temp)])]], mstring))
        event_temp = event_temp.union(spark.createDataFrame([[([(row.event, time, t_temp)])]], mstring))
        
        trip_temp = trip_temp.union(spark.createDataFrame([[([(time, t_temp, lat, lon, row.lat, row.lon)])]], mspoint))
#         PM2_5_s_temp = PM2_5_s_temp.union([df_s_pm2_5])
#         PM10_s_temp = PM10_s_temp.union([df_s_pm10])
#         PM1_0_s_temp = PM1_0_s_temp.union([df_s_pm1_0])
#         NO2_s_temp = NO2_s_temp.union([df_s_no2])
#         BC_s_temp = BC_s_temp.union([df_s_bc])
#         activity_s_temp = activity_s_temp.union([df_s_activity])
#         event_s_temp = event_s_temp.union([de_s_event])
#         print("mreal pm2_5:")
#         PM2_5_temp.show()
        
        # Décallage des variables
        kit_id = row.kit_id
        time=row.time+3600*id_participant
        lat=row.lat
        lon=row.lon
        PM2_5=row.PM2_5
        PM10=row.PM10
        PM1_0=row.PM1_0
        NO2=row.NO2
        BC=row.BC
        activity=row.activity
        event=row.event
        
        # Passer à la ligne suivante
        df_one_participant = df_one_participant.where(col("time") > row.time)
        
    # FIN WHILE
    
#     d = sc.parallelize([id_participant, trip_temp, PM2_5_temp, PM10_temp, PM1_0_temp, NO2_temp, BC_temp, activity_temp, event_temp, PM2_5_s_temp, PM10_s_temp, PM1_0_s_temp, NO2_s_temp, BC_s_temp, activity_s_temp, event_s_temp ])
#     new_participant_row = spark.createDataFrame(d, StructType([StructField('id_participant', IntegerType(),True), StructField('trip', mspoint,True), StructField('PM2_5_t', mreal,True), StructField('PM_10_t', mreal,True), StructField('PM1_0_t', mreal,True), StructField('NO2_t', mreal,True), StructField('BC_t', mreal,True), StructField('activity_t', mstring,True), StructField('event_t', mstring,True),StructField('PM2_5_t', mreal,True), StructField('PM_10_s', msreal,True), StructField('PM1_0_s', msreal,True), StructField('NO2_s', msreal,True), StructField('BC_s', msreal,True), StructField('activity_s', msstring,True), StructField('event_s', msstring,True)]))
    d = [(id_participant, trip_temp.rdd.collect()[0], PM2_5_temp.rdd.collect()[0], PM10_temp.rdd.collect()[0], PM1_0_temp.rdd.collect()[0], NO2_temp.rdd.collect()[0], BC_temp.rdd.collect()[0], activity_temp.rdd.collect()[0], event_temp.rdd.collect()[0])]
    new_participant_row = spark.createDataFrame(d, StructType([StructField('id_participant', IntegerType(),True), StructField('trip', mspoint,True), StructField('PM2_5_t', mreal,True), StructField('PM_10_t', mreal,True), StructField('PM1_0_t', mreal,True), StructField('NO2_t', mreal,True), StructField('BC_t', mreal,True), StructField('activity_t', mstring,True), StructField('event_t', mstring,True)]))
    
    dataset = dataset.union(new_participant_row)
# FIN DU FOR
dataset.show()
dataset.toPandas().to_csv('dataset.csv')

In [None]:
dfffff = spark.createDataFrame([('0.38030685472943737', '0.34728188900913715')])

In [None]:
dfffff.count()

In [None]:
dfffff.printSchema()

In [None]:
dfffff.rdd.collect()

In [None]:
data_type = StructType([StructField('id_participant', IntegerType(),True), StructField('trip', mspoint,True), StructField('PM2_5_t', mreal,True), StructField('PM_10_t', mreal,True), StructField('PM1_0_t', mreal,True), StructField('NO2_t', mreal,True), StructField('BC_t', mreal,True), StructField('activity_t', mstring,True), StructField('event_t', mstring,True)])
dataset = spark.createDataFrame(sc.emptyRDD(), data_type)
dataset.show()

trip_temp = spark.createDataFrame(sc.emptyRDD(), mspoint)
PM2_5_temp = spark.createDataFrame(sc.emptyRDD(), mreal)
PM10_temp = spark.createDataFrame(sc.emptyRDD(), mreal)
PM1_0_temp = spark.createDataFrame(sc.emptyRDD(), mreal)
NO2_temp = spark.createDataFrame(sc.emptyRDD(), mreal)
BC_temp = spark.createDataFrame(sc.emptyRDD(), mreal)
activity_temp = spark.createDataFrame(sc.emptyRDD(), mstring)
event_temp = spark.createDataFrame(sc.emptyRDD(), mstring)

PM2_5_temp = PM2_5_temp.union(spark.createDataFrame([[([(0.0, 0.0, 2.0, False, 0, 1)])]], mreal))
PM10_temp = PM10_temp.union(spark.createDataFrame([[([(0.0, 0.0, 2.0, False, 0, 1)])]], mreal))
PM1_0_temp = PM1_0_temp.union(spark.createDataFrame([[([(0.0, 0.0, 2.0, False, 0, 1)])]], mreal))
NO2_temp = NO2_temp.union(spark.createDataFrame([[([(0.0, 0.0, 2.0, False, 0, 1)])]], mreal))
BC_temp = BC_temp.union(spark.createDataFrame([[([(0.0, 0.0, 2.0, False, 0, 1)])]], mreal))
activity_temp = activity_temp.union(spark.createDataFrame([[([("home", 0, 1)])]], mstring))
event_temp = event_temp.union(spark.createDataFrame([[([("cooking", 0, 1)])]], mstring))
trip_temp = trip_temp.union(spark.createDataFrame([[([(0, 1, 4.5, 2.0, 4.8, 2.6)])]], mspoint))
trip_temp = trip_temp.union(spark.createDataFrame([[([(1, 2, 8.5, 4.0, 8.8, 5.6)])]], mspoint))

print(trip_temp.rdd)
print (trip_temp.rdd.collect())

d = [(id_participant, trip_temp.rdd.collect()[0], PM2_5_temp.collect()[0], PM10_temp.collect()[0], PM1_0_temp.collect()[0], NO2_temp.collect()[0], BC_temp.collect()[0], activity_temp.collect()[0], event_temp.collect()[0])]
new_participant_row = spark.createDataFrame(d, StructType([StructField('id_participant', IntegerType(),True), StructField('trip', mspoint,True), StructField('PM2_5_t', mreal,True), StructField('PM_10_t', mreal,True), StructField('PM1_0_t', mreal,True), StructField('NO2_t', mreal,True), StructField('BC_t', mreal,True), StructField('activity_t', mstring,True), StructField('event_t', mstring,True)]))

print("dataframe new_participant_row")
new_participant_row.show()

dataset = dataset.union(new_participant_row)
print("dataframe dataset")
dataset.show()


In [None]:
dataset.toPandas().to_csv('mycsv.csv')