In this notebook I consume Jira's REST API to extract issues. The pipeline follows these ELT(L) steps:
- E - Extract issues from Jira
- L - Save data to a JSON file
- T - Transform the data, keeping only the required fields
- L - Store the transformed data into a new JSON file 

In [None]:
import math
import os
from datetime import datetime as dt, timedelta, timezone
from os import listdir
from os.path import join
from pathlib import Path

import pyspark.sql.functions as f
import requests
from pyspark.errors.exceptions.captured import AnalysisException
from pyspark.sql import SparkSession

In [None]:
# Fail fast (KeyError) when the credential is missing, but never echo the
# secret itself: notebook outputs are saved with the file and easily shared.
os.environ['jira_base64_auth_token']
print("jira_base64_auth_token is set")

# EXTRACT + LOAD

In [None]:
# Extract: page through Jira's issue search and dump every raw page response
# to jira_issues_json_<page>.json in the current working directory.

# Page size requested from Jira; the server may answer with a smaller page.
max_results_per_page = 100
fields_to_return = ("lastViewed,assignee,reporter,customfield_10710,issuetype,project,resolutiondate,customfield_10709,"
                    "customfield_10016,updated,customfield_10015,summary,duedate,customfield_10590,customfield_10591,"
                    "customfield_10588,customfield_10589,customfield_10469,priority,customfield_10578,status,creator,"
                    "created,parent,customfield_10344,customfield_10314,customfield_10311,customfield_10312,"
                    "customfield_10273,customfield_10337,customfield_10572,customfield_10749,customfield_10750,"
                    "customfield_10751,customfield_10748,customfield_10747")
# jql = "created > '-3d' or updated > '-3d'"
jql = "KEY IN (DAK-520, DAK-532, DAK-588, SDA-2854, SDA-2859, SD-15702, SDE-2544, SDC-677)"

# NOTE(review): Atlassian has announced the deprecation of this endpoint in
# favour of /rest/api/3/search/jql - confirm and plan the migration.
search_url = "https://company.atlassian.net/rest/api/3/search"
# Upper bound (seconds) so a stalled connection cannot hang the notebook forever.
request_timeout_seconds = 60

total_pages = 1
current_page = 0
# Page size actually used to compute offsets; replaced by the value the
# server reports on the first response.
page_size = max_results_per_page

headers = {
    'Accept': 'application/json',
    'Authorization': os.environ['jira_base64_auth_token']
}

output_dir = Path().absolute()

while current_page < total_pages:
    start_at = current_page * page_size
    # Let requests URL-encode the values: the JQL contains spaces, commas,
    # parentheses and (in the commented variant) quotes.
    query_params = {
        "expand": "",
        "maxResults": max_results_per_page,
        "fields": fields_to_return,
        "jql": jql,
        "startAt": start_at,
    }

    response = requests.get(search_url, params=query_params, headers=headers,
                            timeout=request_timeout_seconds)

    if response.status_code != 200:
        raise Exception(f"Erro na requisicao da API: {response.status_code} - {response.text}")

    # The number of pages only needs to be computed on the first iteration.
    if current_page == 0:
        json_response = response.json()
        # Use the page size the server actually applied, otherwise issues
        # would be skipped whenever Jira caps maxResults below our request.
        page_size = json_response.get('maxResults') or max_results_per_page
        total_pages = math.ceil(json_response['total'] / page_size)

    current_page += 1

    response_text = response.text
    file_path = f"{output_dir}/jira_issues_json_{current_page}.json"

    with open(file_path, 'w', encoding="utf-8") as json_file:
        json_file.write(response_text)

    print(f"Carregado pagina {current_page} de {total_pages}")

# Remove page files left over from an earlier run that produced more pages
# than this one; the transform step would otherwise read the stale data.
for stale_file in output_dir.glob("jira_issues_json_*.json"):
    page_number_text = stale_file.stem.rsplit("_", 1)[-1]
    if page_number_text.isdigit() and int(page_number_text) > current_page:
        stale_file.unlink()


In [None]:
# Show the pagination metadata reported by the most recent API response.
json_response = response.json()

for metadata_key in ('maxResults', 'total'):
    print(json_response[metadata_key])

# Transform 

In [None]:
# Get the active Spark session, or create one named after this job.
spark = (
    SparkSession.builder
    .appName("jira_extract_issues")
    .getOrCreate()
)

Create a new list of issues keeping only the required fields and renaming custom ones

In [None]:
# Directory where the extract step wrote the raw page files.
file_path = f"{Path().absolute()}"

# Only load the raw page files produced by the extract step
# (jira_issues_json_<page>.json). The transformed output is saved in the same
# directory under jira_issues_json_transformed.json and must not be read back
# in; unrelated .json files in the directory are ignored as well.
page_file_prefix = "jira_issues_json_"
page_file_suffix = ".json"
json_files = sorted(
    name for name in listdir(file_path)
    if name.startswith(page_file_prefix)
    and name.endswith(page_file_suffix)
    and name[len(page_file_prefix):-len(page_file_suffix)].isdigit()
)

json_files_list = [join(file_path, name) for name in json_files]

# Fail with a clear message instead of an opaque Spark error on an empty list.
if not json_files_list:
    raise FileNotFoundError(f"No jira_issues_json_<page>.json files found in {file_path}")

# multiLine=True: each page file holds one JSON document, not one record per line.
df = spark.read.json(json_files_list, multiLine=True)

In [None]:
# Sanity check on the raw load: number of rows in `df`
# (expected to be one row per page file, since the files are read with multiLine=True).
# Optional inspection helpers, uncomment when debugging:
# df.printSchema()
# df.show()
df.count()

In [None]:
# Keep only the array of issues; the pagination metadata columns are not needed.
df_issues = df.select(f.col("issues"))

In [None]:
# Row count after projecting the `issues` column (a select does not change the row count).
# Optional inspection helpers, uncomment when debugging:
# df_issues.show(truncate=False)
# df_issues.printSchema()
df_issues.count()

In [None]:
# Turn the array of issues into one row per issue, then discard the array column.
df_issues_flat = (
    df_issues
    .withColumn("issues_exploded", f.explode(f.col("issues")))
    .drop("issues")
)

In [None]:
# Print the schema of the exploded issues; the nested field paths used by the
# column projection come from this structure.
df_issues_flat.printSchema()
# Optional inspection helpers, uncomment when debugging:
# df_issues_flat.show(truncate=False)
# df_issues_flat.count()

In [None]:
# Projection applied to every exploded issue: keep only the required fields,
# flatten nested structs and give the Jira custom fields readable names.
columns_to_keep = ["issues_exploded.id",
                   "issues_exploded.key",
                   f.col("issues_exploded.fields.parent.key").alias("parent_key"),
                   f.col("issues_exploded.fields.lastViewed").cast("timestamp").alias("last_viewed"),
                   f.col("issues_exploded.fields.assignee.accountId").alias("assignee_id"),
                   f.col("issues_exploded.fields.assignee.displayName").alias("assignee_name"),
                   f.col("issues_exploded.fields.reporter.accountId").alias("reporter_id"),
                   f.col("issues_exploded.fields.reporter.displayName").alias("reporter_name"),
                   f.col("issues_exploded.fields.customfield_10710.value").alias("pilar_de_atuacao"),
                   f.col("issues_exploded.fields.issuetype.name").alias("issue_type"),
                   f.col("issues_exploded.fields.project.id").alias("project_id"),
                   f.col("issues_exploded.fields.project.key").alias("project_key"),
                   f.col("issues_exploded.fields.project.name").alias("project_name"),
                   f.col("issues_exploded.fields.resolutiondate").cast("timestamp").alias("resolution_date"),
                   f.col("issues_exploded.fields.customfield_10709.value").alias("time_solicitante"),
                   f.col("issues_exploded.fields.customfield_10016").cast("int").alias("story_point_estimate"),
                   f.col("issues_exploded.fields.updated").cast("timestamp").alias("updated"),
                   f.col("issues_exploded.fields.customfield_10015").alias("start_date"),
                   f.col("issues_exploded.fields.summary"),
                   f.col("issues_exploded.fields.duedate").cast("date").alias("due_date"),
                   f.col("issues_exploded.fields.priority.name").alias("priority"),
                   f.col("issues_exploded.fields.status.name").alias("status"),
                   f.col("issues_exploded.fields.creator.accountId").alias("creator_id"),
                   f.col("issues_exploded.fields.creator.displayName").alias("creator_name"),
                   f.col("issues_exploded.fields.created").cast("timestamp").alias("created"),

                   # `value` is an array for this field; only its first element is kept.
                   f.col("issues_exploded.fields.customfield_10344.value").getItem(0).alias("dentro_cobertura_testes"),

                   f.col("issues_exploded.fields.customfield_10314.value").alias("stack"),
                   f.col("issues_exploded.fields.customfield_10311.value").alias("funcionalidade"),
                   f.col("issues_exploded.fields.customfield_10312.value").alias("origem"),
                   f.col("issues_exploded.fields.customfield_10273.value").alias("criticidade"),
                   f.col("issues_exploded.fields.customfield_10337.value").alias("severidade"),
                   f.col("issues_exploded.fields.customfield_10572.value").alias("causa_do_problema"),

                   # Following the adjustments made by Bernardo in Aug/2024 to keep the same
                   # standard fields across all Jira projects, the handling below was needed
                   # so that issues get their data from the correct field.
                   # Logic: if the new field is filled use the new field, otherwise the old one.
                   # "To Release (Padrão)" or "To Release"
                   f.coalesce("issues_exploded.fields.customfield_10750",
                              "issues_exploded.fields.customfield_10590").cast("date").alias("to_release_date"),
                   # "Blocked (Padrão)" or "Blocked"
                   f.coalesce("issues_exploded.fields.customfield_10751",
                              "issues_exploded.fields.customfield_10591").cast("date").alias("blocked_date"),
                   # "In Progress (Padrão)" or "In Progress"
                   f.coalesce("issues_exploded.fields.customfield_10748",
                              "issues_exploded.fields.customfield_10588").cast("date").alias("in_progress_date"),
                   # "Data da Solicitação (Padrão)" or "Data da Solicitação"
                   f.coalesce("issues_exploded.fields.customfield_10747",
                              "issues_exploded.fields.customfield_10578").cast("date").alias("data_da_solicitacao"),
                   # "Review (Padrão)" or "To Review"
                   f.coalesce("issues_exploded.fields.customfield_10749",
                              "issues_exploded.fields.customfield_10589").cast("date").alias("to_review_date"),

                   ]

# `solicitantes` (customfield_10469) needs special handling because its
# inferred type depends on the data returned by Jira.
try:
    column_solicitantes_type = df_issues_flat.select("issues_exploded.fields.customfield_10469").dtypes[0][1]

    # Per the original author: the column is inferred as a plain string when it
    # only holds null values; in that case it is simply loaded as Array<String>.
    if column_solicitantes_type != 'string':
        columns_to_keep.append(f.col("issues_exploded.fields.customfield_10469.displayName").alias("solicitantes"))
    else:
        columns_to_keep.append(f.split(f.col("issues_exploded.fields.customfield_10469"), ",").alias("solicitantes"))

except AnalysisException as e:
    # A missing field is expected: it means no issue in this batch carries it.
    # Any other analysis error is a real problem and is re-raised untouched.
    if not e.desc.startswith("[FIELD_NOT_FOUND]"):
        raise
    # Still emit the column (as a typed null) so the resulting schema is the
    # same regardless of which issues happened to be in the batch.
    columns_to_keep.append(f.lit(None).cast("array<string>").alias("solicitantes"))

df_issues_flat_clean = df_issues_flat.select(*columns_to_keep)

In [None]:
# Print the schema of the projected DataFrame to verify column names and casts.
df_issues_flat_clean.printSchema()
# Optional inspection helper, uncomment when debugging:
# df_issues_flat_clean.show()

In [None]:
# Stamp every row with the load time expressed at a fixed UTC-3 offset
# (presumably Brasilia time - confirm). The current time is taken explicitly
# in UTC so the result does not depend on the timezone of the host running
# the kernel.
load_datetime = (dt.now(timezone.utc) - timedelta(hours=3)).strftime('%Y-%m-%d %H:%M:%S')
df_issues_flat_clean = df_issues_flat_clean.withColumn('load_datetime', f.to_timestamp(f.lit(load_datetime)))

In [None]:
# Preview rows of the final DataFrame, including the `load_datetime` column.
# Optional inspection helper, uncomment when debugging:
# df_issues_flat_clean.printSchema()
df_issues_flat_clean.show()

# Load

In [None]:
# Load: persist the transformed issues next to the raw page files.
output_path = f"{Path().absolute()}/jira_issues_json_transformed.json"
# One JSON document (string) per row, collected to the driver.
json_array = df_issues_flat_clean.toJSON().collect()

with open(output_path, 'w', encoding="utf-8") as json_file:
    # writelines() adds no separators, so terminate each record explicitly;
    # the file is then newline-delimited JSON rather than one unparseable
    # run of concatenated objects.
    json_file.writelines(f"{row}\n" for row in json_array)

In [None]:
# Location of the last-sync marker file on the landing volume.
file_path = "/Volumes/landing/inc/last_sync.json"