# Imports

In [None]:
import os
import sys

import pyspark.sql.functions as f
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType

# Required for Spark to find Python executable
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [2]:
spark = SparkSession.builder.appName("pipedrive_deals").getOrCreate()

In [3]:
df_deals = spark.read.json('data/pipedrive_deals.json', multiLine=True)

In [None]:
# df_deal_fields.count()
df_deals.printSchema()

In [19]:
column_to_keep = [
    'active',
    'deleted',
    f.col('next_activity_date').cast("date"),
    f.col('last_activity_date').cast("date"),
    f.col('close_time').cast("timestamp"),
    f.col('won_time').cast("timestamp"),
    f.col('lost_time').cast("timestamp"),
    f.col('add_time').cast("timestamp"),
    f.col('update_time').cast("timestamp"),
    f.col('stage_change_time').cast("timestamp"),
    'id',
    'stage_id',
    f.col('next_activity_id').cast('long'),
    f.col('last_activity_id').cast('long'),
    'pipeline_id',
    'activities_count',
    'done_activities_count',
    'undone_activities_count',
    'stage_order_nr',
    f.col('person_id.value').alias("person_id"),
    f.col('user_id.value').alias("user_id"),
    f.split(f.col('label'), ',').cast('array<int>').alias('labels'),
    'status',
    'title',
    'lost_reason'
]

In [20]:
custom_columns_to_keep = [
    {
        'key': '887fca406fde38eaf6ecdedbc0a2da27c8b2d1cb',
        'alias': 'data_renovacao_anual',
        'cast': 'date'
    },
    {
        'key': 'd867c2c23f0e995c0381e33f0a06f2174ed111a8',
        'alias': 'cs_data_atividade_renovacao',
        'cast': 'date'
    },
    {
        'key': '97a9344d1155a76ecbbb118db5265c29cab5c26f',
        'alias': 'plano',
        'cast': 'int'
    },
    {
        'key': '46c1f54f65dbbe8ec25d9897d3645306f15f1364',
        'alias': 'possui_secretaria',
        'cast': 'int'
    },
    {
        'key': '6daed313ce9dbbd6c1d1cf12a1cba7358a453d27',
        'alias': 'possui_computador_clinica',
        'cast': 'int'
    },
    {
        'key': 'cd1807339972e4fba0dec4509635a1e897fc9de5',
        'alias': 'qual_problema_deseja_resolver',
        'cast': 'array<int>',
        'split': True
    },
    {
        'key': 'ed8a8d7d4ca118d2472eea5b8a0ac2bc6cbf2052',
        'alias': 'resultado_desejado',
        'cast': 'array<int>',
        'split': True
    },
    {
        'key': '9fc8a3a860f1fee5ca055bb289b45b66ea0e7a9b',
        'alias': 'detalhes_cancelamento',
        'cast': 'string'
    },
    {
        'key': '4f9ad922ecba0372694470a41b6c13ece01ec6d3',
        'alias': 'origem_vendas',
        'cast': 'string'
    },
    {
        'key': 'ba8520544e3f3fb8e8b8b5a6e42f475a3fc3a307',
        'alias': 'link_vendas',
        'cast': 'string'
    },
    {
        'key': '58b5a663994afc6258d0d7051856ab675b6118d6',
        'alias': 'papel_usuario',
        'cast': 'string'
    },
    {
        'key': 'eabdab9be01e6e50406c904159ed4639f8a0a2e0',
        'alias': 'cs_ponto_de_atencao',
        'cast': 'string'
    }
]

In [21]:
existing_columns = df_deals.columns

for custom_column in custom_columns_to_keep:
    key = custom_column['key']
    if key in existing_columns:

        if 'split' in custom_column and custom_column['split']:
            column_to_keep.append(f.split(f.col(key), ',')
                                  .alias(custom_column['alias'])
                                  .cast(custom_column['cast']))
            continue

        column_to_keep.append(f.col(key)
                              .alias(custom_column['alias'])
                              .cast(custom_column['cast']))

print(column_to_keep)

['active', 'deleted', Column<'CAST(next_activity_date AS DATE)'>, Column<'CAST(last_activity_date AS DATE)'>, Column<'CAST(close_time AS TIMESTAMP)'>, Column<'CAST(won_time AS TIMESTAMP)'>, Column<'CAST(lost_time AS TIMESTAMP)'>, Column<'CAST(add_time AS TIMESTAMP)'>, Column<'CAST(update_time AS TIMESTAMP)'>, Column<'CAST(stage_change_time AS TIMESTAMP)'>, 'id', 'stage_id', Column<'CAST(next_activity_id AS BIGINT)'>, Column<'CAST(last_activity_id AS BIGINT)'>, 'pipeline_id', 'activities_count', 'done_activities_count', 'undone_activities_count', 'stage_order_nr', Column<'person_id.value AS person_id'>, Column<'user_id.value AS user_id'>, Column<'split(label, ,, -1) AS labels'>, 'status', 'title', 'lost_reason', Column<'CAST(887fca406fde38eaf6ecdedbc0a2da27c8b2d1cb AS data_renovacao_anual AS DATE)'>, Column<'CAST(d867c2c23f0e995c0381e33f0a06f2174ed111a8 AS cs_data_atividade_renovacao AS DATE)'>, Column<'CAST(97a9344d1155a76ecbbb118db5265c29cab5c26f AS plano AS INT)'>, Column<'CAST(46c1f

In [22]:
df_deal_fields_selected = df_deals.select(*column_to_keep)

In [23]:
# df_deal_fields_data_options.show(2)
df_deal_fields_selected.printSchema()

root
 |-- active: boolean (nullable = true)
 |-- deleted: boolean (nullable = true)
 |-- next_activity_date: date (nullable = true)
 |-- last_activity_date: date (nullable = true)
 |-- close_time: timestamp (nullable = true)
 |-- won_time: timestamp (nullable = true)
 |-- lost_time: timestamp (nullable = true)
 |-- add_time: timestamp (nullable = true)
 |-- update_time: timestamp (nullable = true)
 |-- stage_change_time: timestamp (nullable = true)
 |-- id: long (nullable = true)
 |-- stage_id: long (nullable = true)
 |-- next_activity_id: long (nullable = true)
 |-- last_activity_id: long (nullable = true)
 |-- pipeline_id: long (nullable = true)
 |-- activities_count: long (nullable = true)
 |-- done_activities_count: long (nullable = true)
 |-- undone_activities_count: long (nullable = true)
 |-- stage_order_nr: long (nullable = true)
 |-- person_id: long (nullable = true)
 |-- user_id: long (nullable = true)
 |-- labels: array (nullable = true)
 |    |-- element: integer (containsN