# Imports

In [None]:
import os
import sys

import pyspark.sql.functions as f
from pyspark.sql import SparkSession

# Point both the Spark workers and the driver at the interpreter running this
# kernel, so Spark can locate a Python executable.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

In [None]:
# Create the Spark session for this notebook (getOrCreate reuses an already
# running session instead of starting a second one).
spark = (
    SparkSession.builder
    .appName("pipedrive_deal_fields")
    .getOrCreate()
)

In [None]:
# Load the raw deal-fields export. The multiLine option lets Spark parse a JSON
# document that spans several lines rather than one JSON record per line.
df_deal_fields = (
    spark.read
    .option("multiLine", True)
    .json('data/pipedrive_deal_fields.json')
)

In [None]:
# Inspect the schema Spark inferred for the raw JSON document before reshaping it.
df_deal_fields.printSchema()

In [56]:
# Turn the top-level "data" array into one row per element, then promote the
# fields of each element struct to top-level columns.
df_deal_fields_data = (
    df_deal_fields
    .select(f.explode("data").alias("data_exploded"))
    .select("data_exploded.*")
)

In [57]:
# Check that the fields of each exploded "data" element are now top-level columns.
df_deal_fields_data.printSchema()

root
 |-- active_flag: boolean (nullable = true)
 |-- add_time: string (nullable = true)
 |-- add_visible_flag: boolean (nullable = true)
 |-- bulk_edit_allowed: boolean (nullable = true)
 |-- created_by_user_id: long (nullable = true)
 |-- description: string (nullable = true)
 |-- details_visible_flag: boolean (nullable = true)
 |-- edit_flag: boolean (nullable = true)
 |-- field_type: string (nullable = true)
 |-- filtering_allowed: boolean (nullable = true)
 |-- group_id: long (nullable = true)
 |-- id: long (nullable = true)
 |-- id_suffix: string (nullable = true)
 |-- important_flag: boolean (nullable = true)
 |-- is_subfield: boolean (nullable = true)
 |-- json_column_flag: boolean (nullable = true)
 |-- key: string (nullable = true)
 |-- last_updated_by_user_id: long (nullable = true)
 |-- link: string (nullable = true)
 |-- mandatory_flag: string (nullable = true)
 |-- name: string (nullable = true)
 |-- options: array (nullable = true)
 |    |-- element: struct (containsNull

In [70]:
# Projection applied to the flattened deal fields: the field identifiers, one
# row per entry of the "options" array, and the two time columns as timestamps.
# NOTE(review): explode() is documented to emit no row for a null or empty
# array (explode_outer is the variant that keeps them), so fields without
# options presumably drop out here. Confirm that is intended.
# NOTE(review): the string -> timestamp cast presumably uses the Spark session
# time zone. Confirm it matches the zone of the source values.
column_to_keep = [
    'id',
    'key',
    f.explode('options').alias('options'),
    *[f.col(time_col).cast("timestamp") for time_col in ('add_time', 'update_time')],
]

In [71]:
# Apply the projection; select() accepts the list of columns directly.
df_deal_fields_data_options = df_deal_fields_data.select(column_to_keep)

In [73]:
# Inspect the schema after exploding "options" and casting the time columns.
df_deal_fields_data_options.printSchema()

root
 |-- id: long (nullable = true)
 |-- key: string (nullable = true)
 |-- options: struct (nullable = true)
 |    |-- color: string (nullable = true)
 |    |-- id: string (nullable = true)
 |    |-- label: string (nullable = true)
 |-- add_time: timestamp (nullable = true)
 |-- update_time: timestamp (nullable = true)



In [74]:
# Final flat table: keep the field identifiers and timestamps as they are, and
# lift the option's id and label out of the "options" struct under prefixed
# names so they do not collide with the field-level "id" column.
df_deal_fields_clean = df_deal_fields_data_options.select(
    *[f.col(kept) for kept in ("id", "key", "add_time", "update_time")],
    f.col("options.id").alias("option_id"),
    f.col("options.label").alias("option_label"),
)

In [75]:
# Verify the final layout: field id/key, both timestamps, and the option id/label.
df_deal_fields_clean.printSchema()

root
 |-- id: long (nullable = true)
 |-- key: string (nullable = true)
 |-- add_time: timestamp (nullable = true)
 |-- update_time: timestamp (nullable = true)
 |-- option_id: string (nullable = true)
 |-- option_label: string (nullable = true)



In [76]:
# Preview a handful of rows only; show(5) limits the printed output to five rows.
df_deal_fields_clean.show(5)

+-----+-----------+-------------------+-------------------+---------+--------------------+
|   id|        key|           add_time|        update_time|option_id|        option_label|
+-----+-----------+-------------------+-------------------+---------+--------------------+
|12457|     status|2017-06-20 18:34:47|2017-06-20 18:34:45|     open|              Aberto|
|12457|     status|2017-06-20 18:34:47|2017-06-20 18:34:45|     lost|             Perdido|
|12457|     status|2017-06-20 18:34:47|2017-06-20 18:34:45|      won|               Ganho|
|12457|     status|2017-06-20 18:34:47|2017-06-20 18:34:45|  deleted|            Excluído|
|12468|lost_reason|2017-06-20 18:34:47|2024-07-25 19:30:02|     1269|TRIALS e SDR - Mo...|
+-----+-----------+-------------------+-------------------+---------+--------------------+
only showing top 5 rows

