# Download Data

### Test
```
mkdir bronze
cd bronze
mkdir v1.0-test_meta
cd v1.0-trainval_meta
wget https://d36yt3mvayqw5m.cloudfront.net/public/v1.0/v1.0-test_meta.tgz
gunzip v1.0-test_meta.tgz
tar -xf v1.0-test_meta.tar
rm v1.0-test_meta.tar
```

### Trainval
```
mkdir bronze
cd bronze
mkdir v1.0-trainval_meta
cd v1.0-trainval_meta
wget https://d36yt3mvayqw5m.cloudfront.net/public/v1.0/v1.0-trainval_meta.tgz
gunzip v1.0-trainval_meta.tgz
tar -xf v1.0-trainval_meta.tar
rm v1.0-trainval_meta.tar
```

# Import Libraries

In [1]:
import pyspark
from pyspark.sql import types
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Create SparkSession

In [2]:
spark = SparkSession.builder \
    .master("local[*]") \
    .appName('test') \
    .getOrCreate()

# Available files to read

In [3]:
import glob

In [4]:
file_list_trainval = glob.glob('bronze\\v1.0-trainval_meta\\v1.0-trainval/*')
file_list_test = glob.glob('bronze\\v1.0-test_meta\\v1.0-test/*')

In [5]:
file_list_trainval

['bronze\\v1.0-trainval_meta\\v1.0-trainval\\attribute.json',
 'bronze\\v1.0-trainval_meta\\v1.0-trainval\\calibrated_sensor.json',
 'bronze\\v1.0-trainval_meta\\v1.0-trainval\\category.json',
 'bronze\\v1.0-trainval_meta\\v1.0-trainval\\ego_pose.json',
 'bronze\\v1.0-trainval_meta\\v1.0-trainval\\instance.json',
 'bronze\\v1.0-trainval_meta\\v1.0-trainval\\log.json',
 'bronze\\v1.0-trainval_meta\\v1.0-trainval\\map.json',
 'bronze\\v1.0-trainval_meta\\v1.0-trainval\\sample.json',
 'bronze\\v1.0-trainval_meta\\v1.0-trainval\\sample_annotation.json',
 'bronze\\v1.0-trainval_meta\\v1.0-trainval\\sample_data.json',
 'bronze\\v1.0-trainval_meta\\v1.0-trainval\\scene.json',
 'bronze\\v1.0-trainval_meta\\v1.0-trainval\\sensor.json',
 'bronze\\v1.0-trainval_meta\\v1.0-trainval\\visibility.json']

In [6]:
file_list_test

['bronze\\v1.0-test_meta\\v1.0-test\\attribute.json',
 'bronze\\v1.0-test_meta\\v1.0-test\\calibrated_sensor.json',
 'bronze\\v1.0-test_meta\\v1.0-test\\category.json',
 'bronze\\v1.0-test_meta\\v1.0-test\\ego_pose.json',
 'bronze\\v1.0-test_meta\\v1.0-test\\instance.json',
 'bronze\\v1.0-test_meta\\v1.0-test\\log.json',
 'bronze\\v1.0-test_meta\\v1.0-test\\map.json',
 'bronze\\v1.0-test_meta\\v1.0-test\\sample.json',
 'bronze\\v1.0-test_meta\\v1.0-test\\sample_annotation.json',
 'bronze\\v1.0-test_meta\\v1.0-test\\sample_data.json',
 'bronze\\v1.0-test_meta\\v1.0-test\\scene.json',
 'bronze\\v1.0-test_meta\\v1.0-test\\sensor.json',
 'bronze\\v1.0-test_meta\\v1.0-test\\visibility.json']

# How to read JSON files with PySpark

In [7]:
df_category = spark.read.option("multiline", True).json("bronze\\v1.0-test_meta\\v1.0-test\\category.json")

In [8]:
df_category.show()

+--------------------+--------------------+--------------------+
|         description|                name|               token|
+--------------------+--------------------+--------------------+
|  Adult subcategory.|human.pedestrian....|1fa93b757fc74fb19...|
|  Child subcategory.|human.pedestrian....|b1c6de4c57f14a538...|
|Wheelchairs. If a...|human.pedestrian....|b2d7c6c701254928a...|
|Strollers. If a p...|human.pedestrian....|6a5888777ca14867a...|
|A small electric ...|human.pedestrian....|403fede16c8842688...|
|     Police officer.|human.pedestrian....|bb867e20640142798...|
| Construction worker|human.pedestrian....|909f1237d34a49d6b...|
|All animals, e.g....|              animal|63a94dfa99bb47529...|
|Vehicle designed ...|         vehicle.car|fd69059b62a3469fb...|
|Gasoline or elect...|  vehicle.motorcycle|dfd26f200ade4d24b...|
|Human or electric...|     vehicle.bicycle|fc95c87b806f48f8a...|
|Bendy bus subcate...|   vehicle.bus.bendy|003edbfb9ca849ee8...|
|Rigid bus subcate...|   

In [9]:
df_visibility = spark.read.option("multiline", True).json("bronze\\v1.0-test_meta\\v1.0-test\\visibility.json")

# Create schema with PySpark

## Schemas

How to create a schema:
```python
from pyspark.sql import types

schema = types.StructType([
    types.StructField('hour', types.TimestampType(), True),
    types.StructField('zone', types.IntegerType(), True),
    types.StructField('revenue', types.DoubleType(), True),
    types.StructField('count', types.IntegerType(), True)
])
```

In [10]:
schema_log = types.StructType([
    types.StructField('token', types.StringType(), True),
    types.StructField('logfile', types.StringType(), True),
    types.StructField('vehicle', types.StringType(), True),
    types.StructField('date_captured', types.DateType(), True),
    types.StructField('location', types.StringType(), True)
])

In [11]:
schema_scene = types.StructType([
    types.StructField('token', types.StringType(), True),
    types.StructField('log_token', types.StringType(), True),
    types.StructField('nbr_samples', types.IntegerType(), True),
    types.StructField('first_sample_token', types.StringType(), True),
    types.StructField('last_sample_token', types.StringType(), True),
    types.StructField('name', types.StringType(), True),
    types.StructField('description', types.StringType(), True)
])

In [12]:
schema_sample = types.StructType([
    types.StructField('token', types.StringType(), True),
    types.StructField('timestamp', types.TimestampType(), True),
    types.StructField('prev', types.StringType(), True),
    types.StructField('next', types.StringType(), True),
    types.StructField('scene_token', types.StringType(), True)
])

In [13]:
schema_sample_data = types.StructType([
    types.StructField('token', types.StringType(), True),
    types.StructField('sample_token', types.StringType(), True),
    types.StructField('ego_pose_token', types.StringType(), True),
    types.StructField('calibrated_sensor_token', types.StringType(), True),
    types.StructField('timestamp', types.TimestampType(), True),
    types.StructField('fileformat', types.StringType(), True),
    types.StructField('is_key_frame', types.BooleanType(), True),
    types.StructField('height', types.IntegerType(), True),
    types.StructField('width', types.IntegerType(), True),
    types.StructField('filename', types.StringType(), True),
    types.StructField('prev', types.StringType(), True),
    types.StructField('next', types.StringType(), True)
])

In [14]:
schema_sample_annotation = types.StructType([
    types.StructField('token', types.StringType(), True),
    types.StructField('sample_token', types.StringType(), True),
    types.StructField('instance_token', types.StringType(), True),
    types.StructField('visibility_token', types.DecimalType(), True),
    types.StructField('attribute_tokens', types.StringType(), True),
    types.StructField('translation', types.StringType(), True),
    types.StructField('size', types.StringType(), True),
    types.StructField('rotation', types.StringType(), True),
    types.StructField('prev', types.StringType(), True),
    types.StructField('next', types.StringType(), True),
    types.StructField('num_lidar_pts', types.IntegerType(), True),
    types.StructField('num_radar_pts', types.IntegerType(), True)
])

In [15]:
schema_instance = types.StructType([
    types.StructField('token', types.StringType(), True),
    types.StructField('category_token', types.StringType(), True),
    types.StructField('nbr_annotations', types.IntegerType(), True),
    types.StructField('first_annotation_token', types.StringType(), True),
    types.StructField('last_annotation_token', types.StringType(), True)
])

## Read JSON files with defined Schema

### Trainval

In [16]:
file_list_trainval

['bronze\\v1.0-trainval_meta\\v1.0-trainval\\attribute.json',
 'bronze\\v1.0-trainval_meta\\v1.0-trainval\\calibrated_sensor.json',
 'bronze\\v1.0-trainval_meta\\v1.0-trainval\\category.json',
 'bronze\\v1.0-trainval_meta\\v1.0-trainval\\ego_pose.json',
 'bronze\\v1.0-trainval_meta\\v1.0-trainval\\instance.json',
 'bronze\\v1.0-trainval_meta\\v1.0-trainval\\log.json',
 'bronze\\v1.0-trainval_meta\\v1.0-trainval\\map.json',
 'bronze\\v1.0-trainval_meta\\v1.0-trainval\\sample.json',
 'bronze\\v1.0-trainval_meta\\v1.0-trainval\\sample_annotation.json',
 'bronze\\v1.0-trainval_meta\\v1.0-trainval\\sample_data.json',
 'bronze\\v1.0-trainval_meta\\v1.0-trainval\\scene.json',
 'bronze\\v1.0-trainval_meta\\v1.0-trainval\\sensor.json',
 'bronze\\v1.0-trainval_meta\\v1.0-trainval\\visibility.json']

In [17]:
df_log_trainval = spark.read.option("multiline", True).schema(schema_log).json('bronze\\v1.0-trainval_meta\\v1.0-trainval\\log.json')
df_scene_trainval = spark.read.option("multiline", True).schema(schema_scene).json('bronze\\v1.0-trainval_meta\\v1.0-trainval\\scene.json')
df_sample_trainval = spark.read.option("multiline", True).schema(schema_sample).json('bronze\\v1.0-trainval_meta\\v1.0-trainval\\sample.json')
df_sample_data_trainval = spark.read.option("multiline", True).schema(schema_sample_data).json('bronze\\v1.0-trainval_meta\\v1.0-trainval\\sample_data.json')
df_sample_annotation_trainval = spark.read.option("multiline", True).schema(schema_sample_annotation).json('bronze\\v1.0-trainval_meta\\v1.0-trainval\\sample_annotation.json')
df_instance_trainval = spark.read.option("multiline", True).schema(schema_instance).json('bronze\\v1.0-trainval_meta\\v1.0-trainval\\instance.json')

### Test

In [18]:
file_list_test

['bronze\\v1.0-test_meta\\v1.0-test\\attribute.json',
 'bronze\\v1.0-test_meta\\v1.0-test\\calibrated_sensor.json',
 'bronze\\v1.0-test_meta\\v1.0-test\\category.json',
 'bronze\\v1.0-test_meta\\v1.0-test\\ego_pose.json',
 'bronze\\v1.0-test_meta\\v1.0-test\\instance.json',
 'bronze\\v1.0-test_meta\\v1.0-test\\log.json',
 'bronze\\v1.0-test_meta\\v1.0-test\\map.json',
 'bronze\\v1.0-test_meta\\v1.0-test\\sample.json',
 'bronze\\v1.0-test_meta\\v1.0-test\\sample_annotation.json',
 'bronze\\v1.0-test_meta\\v1.0-test\\sample_data.json',
 'bronze\\v1.0-test_meta\\v1.0-test\\scene.json',
 'bronze\\v1.0-test_meta\\v1.0-test\\sensor.json',
 'bronze\\v1.0-test_meta\\v1.0-test\\visibility.json']

In [19]:
df_log_test = spark.read.option("multiline", True).schema(schema_log).json('bronze\\v1.0-test_meta\\v1.0-test\\log.json')
df_scene_test = spark.read.option("multiline", True).schema(schema_scene).json('bronze\\v1.0-test_meta\\v1.0-test\\scene.json')
df_sample_test = spark.read.option("multiline", True).schema(schema_sample).json('bronze\\v1.0-test_meta\\v1.0-test\\sample.json')
df_sample_data_test = spark.read.option("multiline", True).schema(schema_sample_data).json('bronze\\v1.0-test_meta\\v1.0-test\\sample_data.json')
df_sample_annotation_test = spark.read.option("multiline", True).schema(schema_sample_annotation).json('bronze\\v1.0-test_meta\\v1.0-test\\sample_annotation.json')
df_instance_test = spark.read.option("multiline", True).schema(schema_instance).json('bronze\\v1.0-test_meta\\v1.0-test\\instance.json')

In [20]:
df_log = df_log_trainval.union(df_log_test)
df_scene = df_scene_trainval.union(df_scene_test)
df_sample = df_sample_trainval.union(df_sample_test)
df_sample_data = df_sample_data_trainval.union(df_sample_data_test)
df_sample_annotation = df_sample_annotation_trainval.union(df_sample_annotation_test)
df_instance = df_instance_trainval.union(df_instance_test)

# Clean Data with PySpark - Silver

In [21]:
df_log = df_log.select('token', 'logfile', 'vehicle','location')

In [22]:
df_scene = df_scene.select('token','log_token','name','description')

In [23]:
df_sample = df_sample.select('token', 'timestamp','scene_token')

In [24]:
df_sample_data = df_sample_data.select('sample_token','ego_pose_token','timestamp','is_key_frame','filename')

In [25]:
df_sample_annotation = df_sample_annotation.select('sample_token','instance_token','visibility_token','attribute_tokens')

In [26]:
df_instance = df_instance.select('token','category_token','nbr_annotations')

## Store as Parquet(CSV for test)

In [27]:
# df_log.write.mode('overwrite').option("header",True).csv("silver/sources/log/log.csv")
# df_log.write.partitionBy("vehicle").mode('overwrite').parquet("silver/sources/log/")

In [28]:
# df_scene.write.mode('overwrite').option("header",True).csv("silver/sources/scene/scene.csv")
# df_scene.write.partitionBy("name").mode('overwrite').parquet("silver/sources/scene/")

In [29]:
# df_sample.write.mode('overwrite').option("header",True).csv("silver/sources/sample/sample.csv")
# df_sample.coalesce(10).write.mode('overwrite').parquet("silver/sources/sample/")

In [30]:
# df_sample_data.write.mode('overwrite').option("header",True).csv("silver/sources/sample_data/sample_data.csv")
# df_sample_data.write.partitionBy("sample_token").mode('overwrite').parquet("silver/sources/sample_data/")

In [31]:
# df_sample_annotation.write.mode('overwrite').option("header",True).csv("silver/sources/sample_data_annotation/sample_data_annotation.csv")
# df_sample_annotation.write.partitionBy("sample_token").mode('overwrite').parquet("silver/sources/sample_annotation/")

In [32]:
# df_instance.write.mode('overwrite').option("header",True).csv("silver/sources/instance/instance.csv")
# df_instance.write.partitionBy("category_token").mode('overwrite').parquet("silver/sources/instance/")

# Clean Data with PySpark - Gold

In [33]:
df_sample_data.filter(df.is_key_frame == True).limit(2).show()

NameError: name 'df' is not defined

In [None]:
spark.stop()