In [6]:
from pyspark.sql import SparkSession,Row,DataFrame
from delta import configure_spark_with_delta_pip
from delta.tables import DeltaTable
import pyspark.sql.functions as F
import os.path as path
import traceback


# Extra Maven artifacts required in addition to Delta Lake itself.
# They are passed to configure_spark_with_delta_pip(extra_packages=...) rather
# than set via .config("spark.jars.packages", ...) for two reasons:
#   1. "spark.jars.packages" is a single comma-separated value, so calling
#      .config() twice with that key keeps only the last coordinate;
#   2. configure_spark_with_delta_pip sets "spark.jars.packages" itself (to the
#      Delta artifact plus extra_packages), replacing whatever was set before.
extra_packages = [
    "uk.co.gresearch.spark:spark-extension_2.12:2.11.0-3.5",
    "org.apache.spark:spark-avro_2.12:3.5.0",
]

# Local 4-thread session with the Delta SQL extension and catalog enabled.
builder = (
    SparkSession.builder
    .appName("emp")
    .master("local[4]")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.warehouse.dir", "/Users/eduardoalberto/LoadFile/output/")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

spark = configure_spark_with_delta_pip(builder, extra_packages=extra_packages).getOrCreate()

sc = spark.sparkContext
# "OFF" disables Spark logging entirely for this session.
sc.setLogLevel("OFF")
print('PySpark Version :'+spark.version)
print('PySpark Version :'+sc.version)

# Last expression of the cell: renders the session summary.
spark

PySpark Version :3.5.4
PySpark Version :3.5.4


In [2]:
# Read the input CSV (first row is the header, column types are inferred)
# and append an `id_number` column produced by monotonically_increasing_id().
csv_path = "/Users/eduardoalberto/LoadFile/input/kc_house_data.csv"

df = (
    spark.read
    .csv(csv_path, header=True, inferSchema=True)
    .withColumn("id_number", F.monotonically_increasing_id())
)

# Preview the first rows without truncating cell contents.
df.show(truncate=False)

                                                                                

+----------+--------+---------+--------+---------+-----------+--------+------+----------+----+---------+-----+----------+-------------+--------+------------+-------+-------+--------+-------------+----------+---------+
|id        |dt_date |price    |bedrooms|bathrooms|sqft_living|sqft_lot|floors|waterfront|view|condition|grade|sqft_above|sqft_basement|yr_built|yr_renovated|zipcode|lat    |long    |sqft_living15|sqft_lot15|id_number|
+----------+--------+---------+--------+---------+-----------+--------+------+----------+----+---------+-----+----------+-------------+--------+------------+-------+-------+--------+-------------+----------+---------+
|7129300520|20141013|221900.0 |3       |1.0      |1180       |5650    |1.0   |0         |0   |3        |7    |1180      |0            |1955    |0           |98178  |47.5112|-122.257|1340         |5650      |0        |
|6414100192|20141209|538000.0 |3       |2.25     |2570       |7242    |2.0   |0         |0   |3        |7    |2170      |400    

### GERA DELTATABLE

In [3]:
# Directory where the Delta files for the table are stored.
path_delta = "/Users/eduardoalberto/LoadFile/output/emp"

# Stamp every row with the load date; this column is also the partition key.
df = df.withColumn("dt_ref_carga", F.current_date())

# Overwrite the table default.tb_emp (data and schema) at `path_delta`,
# partitioned by load date.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("dt_ref_carga")
    .option("overwriteSchema", "true")
    .option("path", path_delta)
    .saveAsTable("default.tb_emp")
)


# df.write.format("delta")\
#         .option("overwriteSchema", "true")\
#         .partitionBy("dt_ref_carga")\
#         .mode("overwrite")\
#         .saveAsTable("default.tb_emp")\



                                                                                

In [4]:
# Show the Delta commit history of tb_emp (one row per table version).
history_df = spark.sql("describe history tb_emp")
history_df.show()


+-------+--------------------+------+--------+--------------------+--------------------+----+--------+---------+-----------+--------------+-------------+--------------------+------------+--------------------+
|version|           timestamp|userId|userName|           operation| operationParameters| job|notebook|clusterId|readVersion|isolationLevel|isBlindAppend|    operationMetrics|userMetadata|          engineInfo|
+-------+--------------------+------+--------+--------------------+--------------------+----+--------+---------+-----------+--------------+-------------+--------------------+------------+--------------------+
|     11|2025-04-15 22:17:...|  NULL|    NULL|CREATE OR REPLACE...|{isManaged -> fal...|NULL|    NULL|     NULL|         10|  Serializable|        false|{numFiles -> 1, n...|        NULL|Apache-Spark/3.5....|
|     10|2025-04-09 23:23:...|  NULL|    NULL|               WRITE|{mode -> Overwrit...|NULL|    NULL|     NULL|          9|  Serializable|        false|{numFiles -

### CONSULTA DELTATABLE

In [None]:
# spark.read.format("delta").load(path_delta).toPandas()

Unnamed: 0,id,dt_date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,id_number,dt_ref_carga
0,7129300520,20141013,221900.0,3.0,1.00,1180,5650,1.0,0,0,...,0,1955,0,98178,47.5112,-122.257,1340,5650,0,2025-04-15
1,6414100192,20141209,538000.0,3.0,2.25,2570,7242,2.0,0,0,...,400,1951,1991,98125,47.7210,-122.319,1690,7639,1,2025-04-15
2,5631500400,20150225,180000.0,2.0,1.00,770,10000,1.0,0,0,...,0,1933,0,98028,47.7379,-122.233,2720,8062,2,2025-04-15
3,2487200875,20141209,604000.0,4.0,3.00,1960,5000,1.0,0,0,...,910,1965,0,98136,47.5208,-122.393,1360,5000,3,2025-04-15
4,1954400510,20150218,510000.0,3.0,2.00,1680,8080,1.0,0,0,...,0,1987,0,98074,47.6168,-122.045,1800,7503,4,2025-04-15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21608,263000018,20140521,360000.0,3.0,2.50,1530,1131,3.0,0,0,...,0,2009,0,98103,47.6993,-122.346,1530,1509,21608,2025-04-15
21609,6600060120,20150223,400000.0,4.0,2.50,2310,5813,2.0,0,0,...,0,2014,0,98146,47.5107,-122.362,1830,7200,21609,2025-04-15
21610,1523300141,20140623,402101.0,2.0,0.75,1020,1350,2.0,0,0,...,0,2009,0,98144,47.5944,-122.299,1020,2007,21610,2025-04-15
21611,291310100,20150116,400000.0,3.0,2.50,1600,2388,2.0,0,0,...,0,2004,0,98027,47.5345,-122.069,1410,1287,21611,2025-04-15


### SCRIPT FULL

In [None]:
# End-to-end load: read the input CSV, add the id and load-date columns,
# and overwrite the Delta table default.tb_emp.
#
# CONF is the single source of truth for the paths. It is defined before the
# existence check so that the file being checked and the file being read
# cannot drift apart.
CONF = {'kcf_house': '/Users/eduardoalberto/LoadFile/input/kc_house_data.csv',
        'path_delta': '/Users/eduardoalberto/LoadFile/output/emp'
        }

# Kept under its original name; now derived from CONF instead of repeating the literal.
inptFile = CONF["kcf_house"]

try:
    if path.isfile(inptFile):

        # Read with header row and inferred types, then add the generated id
        # and the load date (also used as the partition column below).
        kc_house_dt = (
            spark.read
            .csv(CONF["kcf_house"], header=True, inferSchema=True)
            .withColumn("id_number", F.monotonically_increasing_id())
            .withColumn("dt_ref_carga", F.current_date())
        )

        # Overwrite table data and schema at CONF["path_delta"].
        (
            kc_house_dt.write
            .format("delta")
            .option("overwriteSchema", "true")
            .option("path", CONF["path_delta"])
            .partitionBy("dt_ref_carga")
            .mode("overwrite")
            .saveAsTable("default.tb_emp")
        )
        # mysql  (placeholder left by the author; no further step is implemented here)

    else:
        print("arquivo não existe!")

except Exception as e:
    # Report the failure with the full traceback; the cell keeps running.
    print(f"Ocorreu o seguinte erro: {e}")
    traceback.print_exc()

# NOTE: this runs even when the load above failed or was skipped, in which case
# it shows whatever default.tb_emp already contained.
spark.table("default.tb_emp").toPandas()

                                                                                

Unnamed: 0,id,dt_date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,id_number,dt_ref_carga
0,7129300520,20141013,221900.0,3.0,1.00,1180,5650,1.0,0,0,...,0,1955,0,98178,47.5112,-122.257,1340,5650,0,2025-04-16
1,6414100192,20141209,538000.0,3.0,2.25,2570,7242,2.0,0,0,...,400,1951,1991,98125,47.7210,-122.319,1690,7639,1,2025-04-16
2,5631500400,20150225,180000.0,2.0,1.00,770,10000,1.0,0,0,...,0,1933,0,98028,47.7379,-122.233,2720,8062,2,2025-04-16
3,2487200875,20141209,604000.0,4.0,3.00,1960,5000,1.0,0,0,...,910,1965,0,98136,47.5208,-122.393,1360,5000,3,2025-04-16
4,1954400510,20150218,510000.0,3.0,2.00,1680,8080,1.0,0,0,...,0,1987,0,98074,47.6168,-122.045,1800,7503,4,2025-04-16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21608,263000018,20140521,360000.0,3.0,2.50,1530,1131,3.0,0,0,...,0,2009,0,98103,47.6993,-122.346,1530,1509,21608,2025-04-16
21609,6600060120,20150223,400000.0,4.0,2.50,2310,5813,2.0,0,0,...,0,2014,0,98146,47.5107,-122.362,1830,7200,21609,2025-04-16
21610,1523300141,20140623,402101.0,2.0,0.75,1020,1350,2.0,0,0,...,0,2009,0,98144,47.5944,-122.299,1020,2007,21610,2025-04-16
21611,291310100,20150116,400000.0,3.0,2.50,1600,2388,2.0,0,0,...,0,2004,0,98027,47.5345,-122.069,1410,1287,21611,2025-04-16
