In [7]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

from pyspark.sql.functions import col, when, to_timestamp, unix_timestamp
from pyspark.sql.types import DoubleType


# conf = SparkConf().setAppName('appName').setMaster('local')
sc = SparkContext.getOrCreate()
# spark = SparkSession(sc)

In [8]:
from pyspark.sql.types import (StructField,StructType, BooleanType, DoubleType,LongType, IntegerType)
from pyspark.sql.types import *

In [9]:
#///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

In [10]:
# Définition du type : SPOINT

spoint_schema = [StructField('lat', FloatType(),True),
                 StructField('lon', FloatType(),True)]
spoint = StructType(fields=spoint_schema)

In [11]:
# Définition du type : SECTION

section_schema = [StructField('lat1', FloatType(),True),
                  StructField('lon1', FloatType(),True),
                  StructField('lat2', FloatType(),True),
                  StructField('lon2', FloatType(),True)]
section = StructType(fields=section_schema)

In [12]:
# Définition du type : SLINE

sline_schema = [StructField('rints', ArrayType(section),True)]
sline = StructType(fields=sline_schema)

In [13]:
# Définition du type : USPOINT

uspoint_schema = [StructField('t1', LongType(),True),
                  StructField('t2', LongType(),True),
                  StructField('lat1', FloatType(),True),
                  StructField('lon1', FloatType(),True),
                  StructField('lat2', FloatType(),True),
                  StructField('lon2', FloatType(),True)]
uspoint = StructType(fields=uspoint_schema)

In [14]:
# Définition du type : MSPOINT

mspoint_schema = [StructField('rints', ArrayType(uspoint),True)]
mspoint = StructType(fields=mspoint_schema)

In [15]:
# Définition du type : UINT

uint_schema = [StructField('val', IntegerType(),True),
               StructField('t1', LongType(),True),
               StructField('t2', LongType(),True)]
uint = StructType(fields=uint_schema)

In [16]:
#///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

In [17]:
# Définition du type : MINT

mint_schema = [StructField('units', ArrayType(uint),True)]
mint = StructType(fields=mint_schema)

In [18]:
# Définition du type : USTRING

ustring_schema = [StructField('val', StringType(),True),
               StructField('t1', LongType(),True),
               StructField('t2', LongType(),True)]
ustring = StructType(fields=ustring_schema)

In [19]:
# Définition du type : MSTRING

mstring_schema = [StructField('units', ArrayType(ustring),True)]
mstring = StructType(fields=mstring_schema)

In [20]:
# Définition du type : UREAL

ureal_schema = [StructField('a', FloatType(),True),
              StructField('b', FloatType(),True),
              StructField('c', FloatType(),True),
              StructField('r', BooleanType(),True),
              StructField('t1', LongType(),True),
              StructField('t2', LongType(),True)]
ureal = StructType(fields=ureal_schema)

In [21]:
# Définition du type : MREAL

mreal_schema = [StructField('units', ArrayType(ureal),True)]
mreal = StructType(fields=mreal_schema)

In [22]:
# Définition du type : USINT

usint_schema = [StructField('val', IntegerType(),True),
              StructField('interval', section,True)]
usint = StructType(fields=usint_schema)

In [23]:
#///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

In [24]:
# Définition du type : MSINT

msint_schema = [StructField('units', ArrayType(usint),False)]
msint = StructType(fields=msint_schema)

In [25]:
# Définition du type : USSTRING

usstring_schema = [StructField('val', StringType(),True),
              StructField('interval', section,True)]
usstring = StructType(fields=usstring_schema)

In [26]:
# Définition du type : MSSTRING

msstring_schema = [StructField('units', ArrayType(usstring),False)]
msstring = StructType(fields=msstring_schema)

In [27]:
# Définition du type : USREAL

usreal_schema = [StructField('a', FloatType(),True),
                 StructField('b', FloatType(),True),
                 StructField('c', FloatType(),True),
                 StructField('r', BooleanType(),True),
                 StructField('interval', section,True)]
usreal = StructType(fields=usreal_schema)

In [28]:
# Définition du type : MSREAL

msreal_schema = [StructField('units', ArrayType(usreal),False)]
msreal = StructType(fields=msreal_schema)

In [29]:
#///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

In [30]:
# Définition du type : INTIME

intime_schema = [StructField('val', FloatType(),True),
                 StructField('t1', LongType(),True)]
intime = StructType(fields=intime_schema)

In [31]:
# Définition du type : INSPOINT

inspoint_schema = [StructField('val', FloatType(),True),
                   StructField('sp', spoint,True)]
inspoint = StructType(fields=inspoint_schema)

In [32]:
#///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

In [33]:
df = spark.read.option("header",True).option("inferSchema",True).csv("VGP-week3-data.csv")
df.printSchema()

root
 |-- kit_id: integer (nullable = true)
 |-- participant_id: integer (nullable = true)
 |-- time: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- lon: double (nullable = true)
 |-- PM2-5: string (nullable = true)
 |-- PM10: string (nullable = true)
 |-- PM1-0: string (nullable = true)
 |-- NO2: string (nullable = true)
 |-- BC: string (nullable = true)
 |-- activity: string (nullable = true)
 |-- event: string (nullable = true)



In [34]:
#On change les string "NULL" en null
df = df.withColumn(("BC"), when(col("BC") == "NULL", None).otherwise(col("BC")))
df = df.withColumn(("PM2-5"), when(col("PM2-5") == "NULL", None).otherwise(col("PM2-5")))
df = df.withColumn(("PM10"),  when(col("PM10") == "NULL",  None).otherwise(col("PM10")))
df = df.withColumn(("PM1-0"), when(col("PM1-0") == "NULL", None).otherwise(col("PM1-0")))
df = df.withColumn(("NO2"),   when(col("NO2") == "NULL",   None).otherwise(col("NO2")))
df = df.withColumn(("activity"), when(col("activity") == "NULL", None).otherwise(col("activity")))
df = df.withColumn(("event"), when(col("event") == "NULL", None).otherwise(col("event")))

In [35]:
df = df.withColumn('time',unix_timestamp('time', 'yyyy-MM-dd HH:mm:ss').alias('time'))
df.show()
df.printSchema()

+------+--------------+----------+----------------+----------------+-----+----+-----+----+----+--------+-----+
|kit_id|participant_id|      time|             lat|             lon|PM2-5|PM10|PM1-0| NO2|  BC|activity|event|
+------+--------------+----------+----------------+----------------+-----+----+-----+----+----+--------+-----+
|    80|       9999964|1573718400|48.7717766666667|        2.006005| null|null| null|null|null|    null| null|
|    80|       9999964|1573718410|48.7717766666667|        2.006005| null|null| null|null|null|    null| null|
|    80|       9999964|1573718420|48.7717766666667|        2.006005| null|null| null|null|null|    null| null|
|    80|       9999964|1573718430|48.7717766666667|        2.006005| null|null| null|null|null|    null| null|
|    80|       9999964|1573718440|48.7717766666667|        2.006005| null|null| null|null|null|    null| null|
|    80|       9999964|1573718450|48.7717766666667|        2.006005| null|null| null|null|null|    null| null|
|

In [36]:
#Conversion des types PM2.5, BC ... en double
#Remarque dans notre fichier de données j'ai eliminer le "+00" a chaque fois
df = df.withColumn("PM2-5",df["PM2-5"].cast(FloatType()))
df = df.withColumn("PM10",df["PM10"].cast(FloatType()))
df = df.withColumn("PM1-0",df["PM1-0"].cast(FloatType()))
df = df.withColumn("NO2",df["NO2"].cast(FloatType()))
df = df.withColumn("BC",df["BC"].cast(FloatType()))
df = df.withColumn("lat",df["lat"].cast(FloatType()))
df = df.withColumn("lon",df["lon"].cast(FloatType()))
df.printSchema()

root
 |-- kit_id: integer (nullable = true)
 |-- participant_id: integer (nullable = true)
 |-- time: long (nullable = true)
 |-- lat: float (nullable = true)
 |-- lon: float (nullable = true)
 |-- PM2-5: float (nullable = true)
 |-- PM10: float (nullable = true)
 |-- PM1-0: float (nullable = true)
 |-- NO2: float (nullable = true)
 |-- BC: float (nullable = true)
 |-- activity: string (nullable = true)
 |-- event: string (nullable = true)



In [37]:
df_temp = df
df_temp.show()

+------+--------------+----------+---------+---------+-----+----+-----+----+----+--------+-----+
|kit_id|participant_id|      time|      lat|      lon|PM2-5|PM10|PM1-0| NO2|  BC|activity|event|
+------+--------------+----------+---------+---------+-----+----+-----+----+----+--------+-----+
|    80|       9999964|1573718400| 48.77178| 2.006005| null|null| null|null|null|    null| null|
|    80|       9999964|1573718410| 48.77178| 2.006005| null|null| null|null|null|    null| null|
|    80|       9999964|1573718420| 48.77178| 2.006005| null|null| null|null|null|    null| null|
|    80|       9999964|1573718430| 48.77178| 2.006005| null|null| null|null|null|    null| null|
|    80|       9999964|1573718440| 48.77178| 2.006005| null|null| null|null|null|    null| null|
|    80|       9999964|1573718450| 48.77178| 2.006005| null|null| null|null|null|    null| null|
|    80|       9999964|1573718460| 48.77178| 2.006005| null|null| null|null|null|    null| null|
|    80|       9999964|1573718

In [38]:
def change_column_names(columns):
    return [c.replace('-', '_') for c in columns]

df_temp = df_temp.toDF(*change_column_names(df_temp.columns))
df_temp.show()
df_temp.count()

+------+--------------+----------+---------+---------+-----+----+-----+----+----+--------+-----+
|kit_id|participant_id|      time|      lat|      lon|PM2_5|PM10|PM1_0| NO2|  BC|activity|event|
+------+--------------+----------+---------+---------+-----+----+-----+----+----+--------+-----+
|    80|       9999964|1573718400| 48.77178| 2.006005| null|null| null|null|null|    null| null|
|    80|       9999964|1573718410| 48.77178| 2.006005| null|null| null|null|null|    null| null|
|    80|       9999964|1573718420| 48.77178| 2.006005| null|null| null|null|null|    null| null|
|    80|       9999964|1573718430| 48.77178| 2.006005| null|null| null|null|null|    null| null|
|    80|       9999964|1573718440| 48.77178| 2.006005| null|null| null|null|null|    null| null|
|    80|       9999964|1573718450| 48.77178| 2.006005| null|null| null|null|null|    null| null|
|    80|       9999964|1573718460| 48.77178| 2.006005| null|null| null|null|null|    null| null|
|    80|       9999964|1573718

59972

In [39]:
liste_id = df_temp.select('participant_id').distinct().rdd.map(lambda r: r[0])
liste_id.collect()

[9999920,
 9999955,
 9999975,
 9999936,
 9999930,
 9999960,
 9999964,
 9999962,
 999992]

In [40]:
liste_id = [9999964]

In [41]:
data_type = StructType([StructField('id_participant', IntegerType(),True), StructField('trip', mspoint,True), StructField('PM2_5_t', mreal,True), StructField('PM_10_t', mreal,True), StructField('PM1_0_t', mreal,True), StructField('NO2_t', mreal,True), StructField('BC_t', mreal,True), StructField('activity_t', mstring,True), StructField('event_t', mstring,True)])
dataset = spark.createDataFrame(sc.emptyRDD(), data_type)
dataset.show()

+--------------+----+-------+-------+-------+-----+----+----------+-------+
|id_participant|trip|PM2_5_t|PM_10_t|PM1_0_t|NO2_t|BC_t|activity_t|event_t|
+--------------+----+-------+-------+-------+-----+----+----------+-------+
+--------------+----+-------+-------+-------+-----+----+----------+-------+



In [None]:
#Cell Amir 

import pandas as pd
import numpy as np
df_temp = df_temp.where(df_temp['PM2_5'].isNotNull() & df_temp['PM1_0'].isNotNull() & df_temp['PM10'].isNotNull() & df_temp['NO2'].isNotNull() & df_temp['BC'].isNotNull() & df_temp['activity'].isNotNull() & df_temp['event'].isNotNull())
# new_participant_row = spark.createDataFrame(sc.emptyRDD(), StructType([StructField('id_participant', IntegerType(),True), StructField('trip', mspoint,True), StructField('PM2_5_t', mreal,True),StructField('PM_10_t', mreal,True), StructField('PM1_0_t', mreal,True), StructField('NO2_t', mreal,True), StructField('BC_t', mreal,True), StructField('activity_t', mstring,True), StructField('event_t', mstring,True),StructField('PM2_5_t', mreal,True), StructField('PM_10_s', msreal,True), StructField('PM1_0_s', msreal,True), StructField('NO2_s', msreal,True), StructField('BC_s', msreal,True), StructField('activity_s', msstring,True), StructField('event_s', msstring,True)]))
new_participant_row = spark.createDataFrame(sc.emptyRDD(), StructType([StructField('id_participant', IntegerType(),True), StructField('trip', mspoint,True), StructField('PM2_5_t', mreal,True),StructField('PM_10_t', mreal,True), StructField('PM1_0_t', mreal,True), StructField('NO2_t', mreal,True), StructField('BC_t', mreal,True), StructField('activity_t', mstring,True), StructField('event_t', mstring,True)]))    
for id_participant in liste_id:   
        
    #MOUCHARD 1
    print("participant avec id: {}".format(id_participant))
    
    # Prendre toutes les données d'un seul participant et les trier par le temps
    df_one_participant = df_temp.where(col("participant_id") == id_participant)
    df_one_participant = df_one_participant.sort("time")
#     df_one_participant.show(400000)
    #initialiser les df temporaires
    trip_temp = spark.createDataFrame(sc.emptyRDD(), mspoint)
    PM2_5_temp = spark.createDataFrame(sc.emptyRDD(), mreal)
    PM10_temp = spark.createDataFrame(sc.emptyRDD(), mreal)
    PM1_0_temp = spark.createDataFrame(sc.emptyRDD(), mreal)
    NO2_temp = spark.createDataFrame(sc.emptyRDD(), mreal)
    BC_temp = spark.createDataFrame(sc.emptyRDD(), mreal)
    activity_temp = spark.createDataFrame(sc.emptyRDD(), mstring)
    event_temp = spark.createDataFrame(sc.emptyRDD(), mstring)
    # Prendre les premieres données
    row = df_one_participant.first()
    kit_id = row.kit_id
    time=row.time+3600*id_participant
    lat=row.lat
    lon=row.lon
    PM2_5=row.PM2_5
    PM10=row.PM10
    PM1_0=row.PM1_0
    NO2=row.NO2
    BC=row.BC
    activity=row.activity
    event=row.event
#     Modif Amir
    df_one_participant_panda = df_one_participant.toPandas()
    #for row in df_one_participant_panda.index:
        #print(df_one_participant_panda['event'][row],df_one_participant_panda['time'][row])
        
    while (not df_one_participant_panda.empty):
        row = df_one_participant.first()

        if (row.time+3600*id_participant == time):
            time = row.time+3600*id_participant-1
        t_temp = row.time+3600*id_participant; 
    

participant avec id: 9999964


In [42]:
df_temp = df_temp.where(df_temp['PM2_5'].isNotNull() & df_temp['PM1_0'].isNotNull() & df_temp['PM10'].isNotNull() & df_temp['NO2'].isNotNull() & df_temp['BC'].isNotNull() & df_temp['activity'].isNotNull() & df_temp['event'].isNotNull())
# new_participant_row = spark.createDataFrame(sc.emptyRDD(), StructType([StructField('id_participant', IntegerType(),True), StructField('trip', mspoint,True), StructField('PM2_5_t', mreal,True),StructField('PM_10_t', mreal,True), StructField('PM1_0_t', mreal,True), StructField('NO2_t', mreal,True), StructField('BC_t', mreal,True), StructField('activity_t', mstring,True), StructField('event_t', mstring,True),StructField('PM2_5_t', mreal,True), StructField('PM_10_s', msreal,True), StructField('PM1_0_s', msreal,True), StructField('NO2_s', msreal,True), StructField('BC_s', msreal,True), StructField('activity_s', msstring,True), StructField('event_s', msstring,True)]))
new_participant_row = spark.createDataFrame(sc.emptyRDD(), StructType([StructField('id_participant', IntegerType(),True), StructField('trip', mspoint,True), StructField('PM2_5_t', mreal,True),StructField('PM_10_t', mreal,True), StructField('PM1_0_t', mreal,True), StructField('NO2_t', mreal,True), StructField('BC_t', mreal,True), StructField('activity_t', mstring,True), StructField('event_t', mstring,True)]))    
for id_participant in liste_id:   
        
    #MOUCHARD 1
    print("participant avec id: {}".format(id_participant))
    
    # Prendre toutes les données d'un seul participant et les trier par le temps
    df_one_participant = df_temp.where(col("participant_id") == id_participant)
    df_one_participant = df_one_participant.sort("time")
    
    #initialiser les df temporaires
    trip_temp = spark.createDataFrame(sc.emptyRDD(), mspoint)
    PM2_5_temp = spark.createDataFrame(sc.emptyRDD(), mreal)
    PM10_temp = spark.createDataFrame(sc.emptyRDD(), mreal)
    PM1_0_temp = spark.createDataFrame(sc.emptyRDD(), mreal)
    NO2_temp = spark.createDataFrame(sc.emptyRDD(), mreal)
    BC_temp = spark.createDataFrame(sc.emptyRDD(), mreal)
    activity_temp = spark.createDataFrame(sc.emptyRDD(), mstring)
    event_temp = spark.createDataFrame(sc.emptyRDD(), mstring)
    
#     PM2_5_s_temp = spark.createDataFrame(sc.emptyRDD(), msreal)
#     PM10_s_temp = spark.createDataFrame(sc.emptyRDD(), msreal)
#     PM1_0_s_temp = spark.createDataFrame(sc.emptyRDD(), msreal)
#     NO2_s_temp = spark.createDataFrame(sc.emptyRDD(), msreal)
#     BC_s_temp = spark.createDataFrame(sc.emptyRDD(), msreal)
#     activity_s_temp = spark.createDataFrame(sc.emptyRDD(), msstring)
#     event_s_temp = spark.createDataFrame(sc.emptyRDD(), msstring)
    
    
    # Prendre les premieres données
    row = df_one_participant.first()
    kit_id = row.kit_id
    time=row.time+3600*id_participant
    lat=row.lat
    lon=row.lon
    PM2_5=row.PM2_5
    PM10=row.PM10
    PM1_0=row.PM1_0
    NO2=row.NO2
    BC=row.BC
    activity=row.activity
    event=row.event
    
    # Boucler toutes les données du participant
    while (df_one_participant.count() > 0):
        row = df_one_participant.first()

        if (row.time+3600*id_participant == time):
            time = row.time+3600*id_participant-1
        t_temp = row.time+3600*id_participant;
        
#         # Temporel
#         dpm2_5 = [(0.0, (row.PM2_5-PM2_5)/(t_temp-time), (PM2_5*t_temp-row.PM2_5*time)/(t_temp-time), False, time, t_temp)]
#         df_pm2_5 = spark.createDataFrame(dpm2_5, ureal)
#         print("ureal pm2_5:")
#         df_pm2_5.show()
        
#         dpm10 = [(0.0, (row.PM10-PM10)/(t_temp-time), (PM10*t_temp-row.PM10*time)/(t_temp-time), False, time, t_temp)]
#         df_pm10 = spark.createDataFrame(dpm10, ureal)
        
#         dpm1_0 = [(0.0, (row.PM1_0-PM1_0)/(t_temp-time), (PM1_0*t_temp-row.PM1_0*time)/(t_temp-time), False, time, t_temp)]
#         df_pm1_0 = spark.createDataFrame(dpm1_0, ureal)
        
#         dno2 = [(0.0, (row.NO2-NO2)/(t_temp-time), (NO2*t_temp-row.NO2*time)/(t_temp-time), False, time, t_temp)]
#         df_no2 = spark.createDataFrame(dno2, ureal)
        
#         dbc = [(0.0, (row.BC-BC)/(t_temp-time), (BC*t_temp-row.BC*time)/(t_temp-time), False, time, t_temp)]
#         df_bc = spark.createDataFrame(dbc, ureal)
        
#         dactivity = [(activity, time, t_temp)]
#         df_activity = spark.createDataFrame(dactivity, ustring)
        
#         devent = [(row.event, time, t_temp)]
#         df_event = spark.createDataFrame(devent, ustring)
        
#         # Spatial
#         dsp = [(time, t_temp, lat, lon, row.lat, row.lon)]
#         df_sp = spark.createDataFrame(dsp, uspoint)
        
#         dspm2_5 = sc.parallelize([0, (row.PM2_5-PM2_5)/(row.lat-lat), (PM2_5*row.lat-row.PM2_5*lat)/(row.lat-lat), 0, SECTION(lat, lon, row.lat, row.lon)])
#         df_s_pm2_5 = spark.createDataFrame(dspm2_5, usreal)
#         dspm10 = sc.parallelize([0, (row.PM10-PM10)/(row.lat-lat), (PM10*row.lat-row.PM10*lat)/(row.lat-lat), 0, SECTION(lat, lon, row.lat, row.lon)])
#         df_s_pm10 = spark.createDataFrame(dspm10, usreal)
#         dspm1_0 = sc.parallelize([0, (row.PM1_0-PM1_0)/(row.lat-lat), (PM1_0*row.lat-row.PM1_0*lat)/(row.lat-lat), 0, SECTION(lat, lon, row.lat, row.lon)])
#         df_s_pm1_0 = spark.createDataFrame(dspm1_0, usreal)
#         dsno2 = sc.parallelize([0, (row.NO2-NO2)/(row.lat-lat), (NO2*row.lat-row.NO2*lat)/(row.lat-lat), 0, SECTION(lat, lon, row.lat, row.lon)])
#         df_s_no2 = spark.createDataFrame(dsno2, usreal)
#         dsbc = sc.parallelize([0, (row.BC-BC)/(row.lat-lat), (BC10*row.lat-row.BC10*lat)/(row.lat-lat), 0, SECTION(lat, lon, row.lat, row.lon)])
#         df_s_bc = spark.createDataFrame(dsbc, usreal)
#         dsactivity = sc.parallelize([row.activity, SECTION(lat, lon, row.lat, row.lon)])
#         df_s_activity = spark.createDataFrame(dsactivity, usstring)
#         dsevent = sc.parallelize([row.event, SECTION(lat, lon, row.lat, row.lon)])
#         df_s_activity = spark.createDataFrame(dsevent, usstring)
        
        # Mettre les données dans les df temporaires
        
        PM2_5_temp = PM2_5_temp.union(spark.createDataFrame([[([(0.0, (row.PM2_5-PM2_5)/(t_temp-time), (PM2_5*t_temp-row.PM2_5*time)/(t_temp-time), False, time, t_temp)])]], mreal))
        PM10_temp = PM10_temp.union(spark.createDataFrame([[([(0.0, (row.PM10-PM10)/(t_temp-time), (PM10*t_temp-row.PM10*time)/(t_temp-time), False, time, t_temp)])]], mreal))
        PM1_0_temp = PM1_0_temp.union(spark.createDataFrame([[([(0.0, (row.PM1_0-PM1_0)/(t_temp-time), (PM1_0*t_temp-row.PM1_0*time)/(t_temp-time), False, time, t_temp)])]], mreal))
        NO2_temp = NO2_temp.union(spark.createDataFrame([[([(0.0, (row.NO2-NO2)/(t_temp-time), (NO2*t_temp-row.NO2*time)/(t_temp-time), False, time, t_temp)])]], mreal))
        BC_temp = BC_temp.union(spark.createDataFrame([[([(0.0, (row.BC-BC)/(t_temp-time), (BC*t_temp-row.BC*time)/(t_temp-time), False, time, t_temp)])]], mreal))
        activity_temp = activity_temp.union(spark.createDataFrame([[([(activity, time, t_temp)])]], mstring))
        event_temp = event_temp.union(spark.createDataFrame([[([(row.event, time, t_temp)])]], mstring))
        
        trip_temp = trip_temp.union(spark.createDataFrame([[([(time, t_temp, lat, lon, row.lat, row.lon)])]], mspoint))
#         PM2_5_s_temp = PM2_5_s_temp.union([df_s_pm2_5])
#         PM10_s_temp = PM10_s_temp.union([df_s_pm10])
#         PM1_0_s_temp = PM1_0_s_temp.union([df_s_pm1_0])
#         NO2_s_temp = NO2_s_temp.union([df_s_no2])
#         BC_s_temp = BC_s_temp.union([df_s_bc])
#         activity_s_temp = activity_s_temp.union([df_s_activity])
#         event_s_temp = event_s_temp.union([de_s_event])
#         print("mreal pm2_5:")
#         PM2_5_temp.show()
        
        # Décallage des variables
        kit_id = row.kit_id
        time=row.time+3600*id_participant
        lat=row.lat
        lon=row.lon
        PM2_5=row.PM2_5
        PM10=row.PM10
        PM1_0=row.PM1_0
        NO2=row.NO2
        BC=row.BC
        activity=row.activity
        event=row.event
        
        # Passer à la ligne suivante
        df_one_participant = df_one_participant.where(col("time") > row.time)
        
    # FIN WHILE
    
#     d = sc.parallelize([id_participant, trip_temp, PM2_5_temp, PM10_temp, PM1_0_temp, NO2_temp, BC_temp, activity_temp, event_temp, PM2_5_s_temp, PM10_s_temp, PM1_0_s_temp, NO2_s_temp, BC_s_temp, activity_s_temp, event_s_temp ])
#     new_participant_row = spark.createDataFrame(d, StructType([StructField('id_participant', IntegerType(),True), StructField('trip', mspoint,True), StructField('PM2_5_t', mreal,True), StructField('PM_10_t', mreal,True), StructField('PM1_0_t', mreal,True), StructField('NO2_t', mreal,True), StructField('BC_t', mreal,True), StructField('activity_t', mstring,True), StructField('event_t', mstring,True),StructField('PM2_5_t', mreal,True), StructField('PM_10_s', msreal,True), StructField('PM1_0_s', msreal,True), StructField('NO2_s', msreal,True), StructField('BC_s', msreal,True), StructField('activity_s', msstring,True), StructField('event_s', msstring,True)]))
    d = [(id_participant, trip_temp.rdd.collect()[0], PM2_5_temp.rdd.collect()[0], PM10_temp.rdd.collect()[0], PM1_0_temp.rdd.collect()[0], NO2_temp.rdd.collect()[0], BC_temp.rdd.collect()[0], activity_temp.rdd.collect()[0], event_temp.rdd.collect()[0])]
    new_participant_row = spark.createDataFrame(d, StructType([StructField('id_participant', IntegerType(),True), StructField('trip', mspoint,True), StructField('PM2_5_t', mreal,True), StructField('PM_10_t', mreal,True), StructField('PM1_0_t', mreal,True), StructField('NO2_t', mreal,True), StructField('BC_t', mreal,True), StructField('activity_t', mstring,True), StructField('event_t', mstring,True)]))
    
    dataset = dataset.union(new_participant_row)
# FIN DU FOR
dataset.show()
dataset.toPandas().to_csv('dataset.csv')

participant avec id: 9999964


Py4JJavaError: An error occurred while calling o55840.count.
: java.lang.OutOfMemoryError: Java heap space
	at java.util.HashMap.newNode(HashMap.java:1742)
	at java.util.HashMap.putVal(HashMap.java:641)
	at java.util.HashMap.putMapEntries(HashMap.java:514)
	at java.util.HashMap.putAll(HashMap.java:784)
	at org.codehaus.janino.UnitCompiler.buildLocalVariableMap(UnitCompiler.java:3689)
	at org.codehaus.janino.UnitCompiler.access$5800(UnitCompiler.java:226)
	at org.codehaus.janino.UnitCompiler$12.visitLocalVariableDeclarationStatement(UnitCompiler.java:3574)
	at org.codehaus.janino.UnitCompiler$12.visitLocalVariableDeclarationStatement(UnitCompiler.java:3542)
	at org.codehaus.janino.Java$LocalVariableDeclarationStatement.accept(Java.java:3712)
	at org.codehaus.janino.UnitCompiler.buildLocalVariableMap(UnitCompiler.java:3541)
	at org.codehaus.janino.UnitCompiler.buildLocalVariableMap(UnitCompiler.java:3598)
	at org.codehaus.janino.UnitCompiler.access$4700(UnitCompiler.java:226)
	at org.codehaus.janino.UnitCompiler$12.visitBlock(UnitCompiler.java:3560)
	at org.codehaus.janino.UnitCompiler$12.visitBlock(UnitCompiler.java:3542)
	at org.codehaus.janino.Java$Block.accept(Java.java:2969)
	at org.codehaus.janino.UnitCompiler.buildLocalVariableMap(UnitCompiler.java:3541)
	at org.codehaus.janino.UnitCompiler.buildLocalVariableMap(UnitCompiler.java:3604)
	at org.codehaus.janino.UnitCompiler.access$4800(UnitCompiler.java:226)
	at org.codehaus.janino.UnitCompiler$12.visitDoStatement(UnitCompiler.java:3561)
	at org.codehaus.janino.UnitCompiler$12.visitDoStatement(UnitCompiler.java:3542)
	at org.codehaus.janino.Java$DoStatement.accept(Java.java:3664)
	at org.codehaus.janino.UnitCompiler.buildLocalVariableMap(UnitCompiler.java:3541)
	at org.codehaus.janino.UnitCompiler.buildLocalVariableMap(UnitCompiler.java:3598)
	at org.codehaus.janino.UnitCompiler.access$4700(UnitCompiler.java:226)
	at org.codehaus.janino.UnitCompiler$12.visitBlock(UnitCompiler.java:3560)
	at org.codehaus.janino.UnitCompiler$12.visitBlock(UnitCompiler.java:3542)
	at org.codehaus.janino.Java$Block.accept(Java.java:2969)
	at org.codehaus.janino.UnitCompiler.buildLocalVariableMap(UnitCompiler.java:3541)
	at org.codehaus.janino.UnitCompiler.buildLocalVariableMap(UnitCompiler.java:3675)
	at org.codehaus.janino.UnitCompiler.access$5600(UnitCompiler.java:226)
	at org.codehaus.janino.UnitCompiler$12.visitWhileStatement(UnitCompiler.java:3569)
	at org.codehaus.janino.UnitCompiler$12.visitWhileStatement(UnitCompiler.java:3542)


In [None]:
dfffff = spark.createDataFrame([('0.38030685472943737', '0.34728188900913715')])

In [None]:
dfffff.count()

In [None]:
dfffff.printSchema()

In [None]:
dfffff.rdd.collect()

In [None]:
data_type = StructType([StructField('id_participant', IntegerType(),True), StructField('trip', mspoint,True), StructField('PM2_5_t', mreal,True), StructField('PM_10_t', mreal,True), StructField('PM1_0_t', mreal,True), StructField('NO2_t', mreal,True), StructField('BC_t', mreal,True), StructField('activity_t', mstring,True), StructField('event_t', mstring,True)])
dataset = spark.createDataFrame(sc.emptyRDD(), data_type)
dataset.show()

trip_temp = spark.createDataFrame(sc.emptyRDD(), mspoint)
PM2_5_temp = spark.createDataFrame(sc.emptyRDD(), mreal)
PM10_temp = spark.createDataFrame(sc.emptyRDD(), mreal)
PM1_0_temp = spark.createDataFrame(sc.emptyRDD(), mreal)
NO2_temp = spark.createDataFrame(sc.emptyRDD(), mreal)
BC_temp = spark.createDataFrame(sc.emptyRDD(), mreal)
activity_temp = spark.createDataFrame(sc.emptyRDD(), mstring)
event_temp = spark.createDataFrame(sc.emptyRDD(), mstring)

PM2_5_temp = PM2_5_temp.union(spark.createDataFrame([[([(0.0, 0.0, 2.0, False, 0, 1)])]], mreal))
PM10_temp = PM10_temp.union(spark.createDataFrame([[([(0.0, 0.0, 2.0, False, 0, 1)])]], mreal))
PM1_0_temp = PM1_0_temp.union(spark.createDataFrame([[([(0.0, 0.0, 2.0, False, 0, 1)])]], mreal))
NO2_temp = NO2_temp.union(spark.createDataFrame([[([(0.0, 0.0, 2.0, False, 0, 1)])]], mreal))
BC_temp = BC_temp.union(spark.createDataFrame([[([(0.0, 0.0, 2.0, False, 0, 1)])]], mreal))
activity_temp = activity_temp.union(spark.createDataFrame([[([("home", 0, 1)])]], mstring))
event_temp = event_temp.union(spark.createDataFrame([[([("cooking", 0, 1)])]], mstring))
trip_temp = trip_temp.union(spark.createDataFrame([[([(0, 1, 4.5, 2.0, 4.8, 2.6)])]], mspoint))
trip_temp = trip_temp.union(spark.createDataFrame([[([(1, 2, 8.5, 4.0, 8.8, 5.6)])]], mspoint))

print(trip_temp.rdd)
print (trip_temp.rdd.collect())

d = [(id_participant, trip_temp.rdd.collect()[0], PM2_5_temp.collect()[0], PM10_temp.collect()[0], PM1_0_temp.collect()[0], NO2_temp.collect()[0], BC_temp.collect()[0], activity_temp.collect()[0], event_temp.collect()[0])]
new_participant_row = spark.createDataFrame(d, StructType([StructField('id_participant', IntegerType(),True), StructField('trip', mspoint,True), StructField('PM2_5_t', mreal,True), StructField('PM_10_t', mreal,True), StructField('PM1_0_t', mreal,True), StructField('NO2_t', mreal,True), StructField('BC_t', mreal,True), StructField('activity_t', mstring,True), StructField('event_t', mstring,True)]))

print("dataframe new_participant_row")
new_participant_row.show()

dataset = dataset.union(new_participant_row)
print("dataframe dataset")
dataset.show()


In [None]:
dataset.toPandas().to_csv('mycsv.csv')