In [53]:
import json
import os

import requests
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace
from sqlalchemy import create_engine

# Reuse the Spark session if the kernel already provides one; otherwise create
# it, so this cell also works after "Restart Kernel -> Run All".
spark = SparkSession.builder.getOrCreate()

# Load the books dataset (no explicit schema is passed to the reader), then
# print the resulting schema and a preview of the rows.
df = spark.read.json("/home/naum/studies/bases/books.json")
df.printSchema()
df.show()


root
 |-- _id: string (nullable = true)
 |-- authors: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- categories: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- isbn: string (nullable = true)
 |-- longDescription: string (nullable = true)
 |-- pageCount: long (nullable = true)
 |-- publishedDate: struct (nullable = true)
 |    |-- $date: string (nullable = true)
 |-- shortDescription: string (nullable = true)
 |-- status: string (nullable = true)
 |-- thumbnailUrl: string (nullable = true)
 |-- title: string (nullable = true)

+---+--------------------+--------------------+----------+--------------------+---------+--------------------+--------------------+-------+--------------------+--------------------+
|_id|             authors|          categories|      isbn|     longDescription|pageCount|       publishedDate|    shortDescription| status|        thumbnailUrl|               title|
+---+--------------------+------------------

In [54]:
# Print the DataFrame's columns with their types and nullability as a tree.
df.printSchema()

root
 |-- _id: string (nullable = true)
 |-- authors: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- categories: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- isbn: string (nullable = true)
 |-- longDescription: string (nullable = true)
 |-- pageCount: long (nullable = true)
 |-- publishedDate: struct (nullable = true)
 |    |-- $date: string (nullable = true)
 |-- shortDescription: string (nullable = true)
 |-- status: string (nullable = true)
 |-- thumbnailUrl: string (nullable = true)
 |-- title: string (nullable = true)



In [55]:
# Read the JSON file into a DataFrame. "json" is the short name of the data
# source that the legacy "org.apache.spark.sql.json" alias refers to.
df = spark.read.format("json").load("/home/naum/studies/bases/books.json")

# Change data types: render the nested columns (arrays / struct) as plain
# strings and store pageCount as a float.
# NOTE(review): presumably done so the frame can be exported as flat CSV / SQL
# columns -- confirm that float (rather than integer) pageCount is intended.
df = (
    df.withColumn("authors", col("authors").cast("string"))
      .withColumn("categories", col("categories").cast("string"))
      .withColumn("publishedDate", col("publishedDate").cast("string"))
      .withColumn("pageCount", col("pageCount").cast("float"))
)

# Remove the wrapping "[" / "]" (arrays) and "{" / "}" (struct) left by the
# string cast. The patterns are anchored so that only the outermost wrapper is
# stripped and brackets that are part of the actual text are kept. Raw strings
# avoid Python's invalid-escape-sequence warning for "\[" and "\]".
df = (
    df.withColumn("authors", regexp_replace(col("authors"), r"^\[|\]$", ""))
      .withColumn("categories", regexp_replace(col("categories"), r"^\[|\]$", ""))
      .withColumn("publishedDate", regexp_replace(col("publishedDate"), r"^\{|\}$", ""))
)

In [56]:
# Destination path of the CSV export.
csv_file_path = "/home/naum/studies/bases/books.csv"

# Write the DataFrame as CSV with a header row, overwriting any previous
# export at the same path, then preview the rows that were written.
(
    df.write
      .mode("overwrite")
      .option("header", True)
      .csv(csv_file_path)
)
df.show()

+---+--------------------+--------------------+----------+--------------------+---------+--------------------+--------------------+-------+--------------------+--------------------+
|_id|             authors|          categories|      isbn|     longDescription|pageCount|       publishedDate|    shortDescription| status|        thumbnailUrl|               title|
+---+--------------------+--------------------+----------+--------------------+---------+--------------------+--------------------+-------+--------------------+--------------------+
|  1|W. Frank Ableson,...| Open Source, Mobile|1933988673|Android is an ope...|    416.0|2009-04-01T00:00:...|Unlocking Android...|PUBLISH|https://s3.amazon...|   Unlocking Android|
|  2|W. Frank Ableson,...|                Java|1935182722|When it comes to ...|    592.0|2011-01-14T00:00:...|Android in Action...|PUBLISH|https://s3.amazon...|Android in Action...|
|  3|         Gojko Adzic|Software Engineering|1617290084|                NULL|      0.0|2

In [66]:
# Convert the PySpark DataFrame into a pandas DataFrame.
df_pandas = df.toPandas()

# PostgreSQL connection settings. Set BOOKS_DB_URL to point at another
# database without writing its credentials into the notebook; the fallback is
# the local development database this notebook was originally wired to.
db_url = os.environ.get(
    "BOOKS_DB_URL",
    "postgresql://postgres:changeme@localhost:5432/postgres",
)
engine = create_engine(db_url)

# Write the pandas DataFrame to PostgreSQL; "books" is the target table.
# if_exists="append" adds to an existing table, so re-running this cell
# inserts the same rows again. The call is kept as the last expression so its
# return value is shown as the cell output.
df_pandas.to_sql("books", con=engine, if_exists="append", index=False)

431