# Final Project: ETL Process for Close Approach JSON Data

In [1]:
import os
# Find the latest version of spark 3.0 from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.0.3'
spark_version = 'spark-3.0.3'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
0% [Connecting to archive.ubuntu.com (91.189.88.152)] [Connecting to security.u0% [Connecting to archive.ubuntu.com (91.189.88.152)] [Connecting to security.u0% [1 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com (91.189.88.152)                                                                               Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
0% [1 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com (91.189.88.152)                                                                               Get:3 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
0% [1 InRelease gpgv 3,626 B] [Waiting for headers] [Connecting to security.ubu                                                                               Ign:4 https://developer.download.nvidia.com/compute/machine-learni

In [2]:
# Download the Postgres JDBC driver jar that will allow Spark to interact with Postgres.
# wget saves it into the current working directory as postgresql-42.2.16.jar; the
# Spark session configuration refers to this jar by that file name.
!wget https://jdbc.postgresql.org/download/postgresql-42.2.16.jar

--2022-01-13 02:43:20--  https://jdbc.postgresql.org/download/postgresql-42.2.16.jar
Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228
Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1002883 (979K) [application/java-archive]
Saving to: ‘postgresql-42.2.16.jar’


2022-01-13 02:43:20 (11.4 MB/s) - ‘postgresql-42.2.16.jar’ saved [1002883/1002883]



In [3]:
# Create (or reuse) the Spark session for this notebook.
from pyspark.sql import SparkSession

# The extra classpath entry puts the downloaded Postgres JDBC driver jar on the driver's classpath.
spark = (
    SparkSession.builder
    .appName("Neo_Json")
    .config("spark.driver.extraClassPath", "/content/postgresql-42.2.16.jar")
    .getOrCreate()
)

In [4]:
# Import pySpark libraries
from pyspark import SparkFiles
import pyspark.sql.functions as F
import pyspark.sql.types as T

In [5]:
def process_cad_data(json_filename, url_endpoint):
  """
  Build the dataframe to load into postgres from one close-approach JSON file.

  The file at url_endpoint is registered with Spark, read as multi-line JSON,
  and each entry of its "data" array is spread into eleven named columns.
  "cd" is parsed as a timestamp and the distance, velocity and "h" columns are
  cast to decimal(24,16); the remaining columns are left as read.

  params:
    json_filename: name under which the added file is looked up via SparkFiles.get
    url_endpoint: url that Spark fetches the json file from

  returns:
    final dataframe to be loaded into postgres table
  """

  # column names, in the same order as the positions read from each "data" entry
  field_names = ['des', 'orbit_id', 'jd', 'cd', 'dist', 'dist_min',
                 'dist_max', 'v_rel', 'v_inf', 't_sigma_f', 'h']
  # columns converted to a fixed-precision decimal, in conversion order
  decimal_fields = ['dist', 'dist_min', 'dist_max', 'v_rel', 'v_inf', 'h']

  # register the remote file with Spark, then read its local copy
  spark.sparkContext.addFile(url_endpoint)
  local_json_path = SparkFiles.get(json_filename)
  raw_df = spark.read.json(local_json_path, multiLine=True)

  # one row per entry of the top-level "data" array
  exploded_df = raw_df.select(F.explode("data").alias('data'))

  # spread each entry's positional items into named columns
  entry = F.col('data')
  named_columns = [entry.getItem(position).alias(name)
                   for position, name in enumerate(field_names)]
  result_df = exploded_df.select(*named_columns)

  # parse the close-approach date string into a timestamp
  result_df = result_df.withColumn("cd", F.to_timestamp(F.col("cd"), 'yyyy-MMM-dd HH:mm'))

  # cast the numeric columns in place; withColumn keeps the column order
  decimal_type = T.DecimalType(precision=24, scale=16)
  for name in decimal_fields:
    result_df = result_df.withColumn(name, F.col(name).cast(decimal_type))

  return result_df

In [6]:
# Name of the JSON file to process and the S3 URL it is fetched from.
# The URL ends with the file name, which is also the name the file is looked up by after download.
json_filename = "sample_json.json"
url_endpoint = f"https://ucb-neo-project.s3.us-east-2.amazonaws.com/json/{json_filename}"

In [7]:
# Fetch and transform the JSON file; the returned dataframe is the one written to the postgres table.
cad_final_df = process_cad_data(json_filename, url_endpoint)

In [8]:
# Preview the transformed rows to sanity-check the parsed timestamp and decimal columns.
cad_final_df.show()

+---+--------+-----------------+-------------------+------------------+------------------+------------------+------------------+------------------+---------+-------------------+
|des|orbit_id|               jd|                 cd|              dist|          dist_min|          dist_max|             v_rel|             v_inf|t_sigma_f|                  h|
+---+--------+-----------------+-------------------+------------------+------------------+------------------+------------------+------------------+---------+-------------------+
|433|     659|2426371.671249267|1931-01-30 04:07:00|0.1740731458281430|0.1740730212498710|0.1740732704086420|5.9208185341028800|5.9182327415933800|  < 00:01|10.4300000000000000|
|433|     659|2442435.819062972|1975-01-23 07:39:00|0.1511341908798190|0.1511341417827440|0.1511342399769020|5.8253076832997600|5.8222804658278100|  < 00:01|10.4300000000000000|
|433|     659|2455957.958753891|2012-01-31 11:01:00|0.1786758135906200|0.1786757924967120|0.1786758346845290|5

In [9]:
# Print the column types that will be written to the postgres table.
cad_final_df.printSchema()

root
 |-- des: string (nullable = true)
 |-- orbit_id: string (nullable = true)
 |-- jd: string (nullable = true)
 |-- cd: timestamp (nullable = true)
 |-- dist: decimal(24,16) (nullable = true)
 |-- dist_min: decimal(24,16) (nullable = true)
 |-- dist_max: decimal(24,16) (nullable = true)
 |-- v_rel: decimal(24,16) (nullable = true)
 |-- v_inf: decimal(24,16) (nullable = true)
 |-- t_sigma_f: string (nullable = true)
 |-- h: decimal(24,16) (nullable = true)



## Load Close Approaches data into AWS RDS Postgres database instance

### Connect to AWS RDS Postgres database

In [10]:
# Prompt for the database password at run time so it is not stored in the notebook
from getpass import getpass
password = getpass('Enter database password')

# Configure settings for RDS
# jdbc_url names the host, port and database; config holds the connection
# properties (user, password, JDBC driver class) passed to the JDBC writer.
jdbc_url="jdbc:postgresql://neo-db.ctohlxwhjvlb.us-east-1.rds.amazonaws.com:5432/neo"
config = {"user":"postgres", 
          "password": password, 
          "driver":"org.postgresql.Driver"}

Enter database password··········


### Write the close-approach (CAD) dataframe to the Postgres table

In [12]:
# Write the dataframe to the public.cad table over JDBC using the connection settings above.
# NOTE(review): 'overwrite' replaces whatever public.cad already holds — confirm this is intended rather than 'append'.
mode = 'overwrite'
cad_final_df.write.jdbc(url=jdbc_url, table='public.cad', mode=mode, properties=config)