In [8]:
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip
from delta.tables import DeltaTable
import pyspark.sql.functions as F

# Extra Maven artifacts needed in addition to Delta Lake itself.
# They are passed to configure_spark_with_delta_pip() via `extra_packages`
# because that helper sets "spark.jars.packages" on the builder; values set
# with .config("spark.jars.packages", ...) beforehand are replaced, and two
# .config() calls with the same key keep only the last one anyway.
EXTRA_SPARK_PACKAGES = [
    "uk.co.gresearch.spark:spark-extension_2.12:2.11.0-3.5",
    "org.apache.spark:spark-avro_2.12:3.5.0",
]

# Parenthesized chain instead of backslash continuations, so a trailing
# backslash followed by a blank line can no longer break the statement.
builder = (
    SparkSession.builder
    .appName("Data with Nikk the Greek Spark Session")
    .master("local[4]")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# Adds the Delta Lake artifact plus the extra packages, then starts (or reuses) the session.
spark = configure_spark_with_delta_pip(
    builder, extra_packages=EXTRA_SPARK_PACKAGES
).getOrCreate()

sc = spark.sparkContext
sc.setLogLevel("OFF")  # silence Spark logging in the notebook output
print('PySpark Version :'+spark.version)
print('PySpark Version :'+spark.sparkContext.version)

spark


PySpark Version :3.5.4
PySpark Version :3.5.4


In [2]:
# Destination of the Delta copy of the movie dataset.
path_delta = "/Users/eduardoalberto/LoadFile/dataDelta/movie"

# Load the parquet source and overwrite whatever is stored at path_delta with it.
df = spark.read.parquet("/Users/eduardoalberto/LoadFile/parquet/movie")
(
    df.write
    .format("delta")
    .mode("overwrite")
    .save(path_delta)
)

                                                                                

### DataFrame

In [4]:
# Read the Delta table back from path_delta and show it without truncating cell values.
(
    spark.read
    .format("delta")
    .load(path_delta)
    .show(truncate=False)
)

                                                                                

+----------+-----------+-----------------------------------------------------------------------------+-----------------------------------------------------------------------------+-----------------+--------------+----------------+--------------------+
|nconst    |primaryName|birthYear                                                                    |deathYear                                                                    |primaryProfession|knownForTitles|profession_array|knownForTitles_array|
+----------+-----------+-----------------------------------------------------------------------------+-----------------------------------------------------------------------------+-----------------+--------------+----------------+--------------------+
|tt14495706|short      |La Rosace Magique                                                            |La Rosace Magique                                                            |0                |1877          |0               |1877          

In [None]:
# Parse the semicolon-delimited CSV export: the first row is the header and
# column types are inferred from the data.
dfs = (
    spark.read
    .options(delimiter=';', header="True", inferSchema="True")
    .csv("/Users/eduardoalberto/LoadFile/part-00000-055103f0-b275-4e27-b667-0c2c25d0636a-c000.csv")
)
dfs.printSchema()



root
 |-- id01: integer (nullable = true)
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- poster_path: string (nullable = true)
 |-- backdrop_path: string (nullable = true)
 |-- budget: integer (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: double (nullable = true)
 |-- path_poster: string (nullable = true)
 |-- dt_release: date (nullable = true)
 |-- revenue: integer (nullable = true)
 |-- runtime: double (nullable = true)
 |-- status: string (nullable = true)
 |-- tagline: string (nullable = true)
 |-- title: string (nullable = true)
 |-- video: boolean (nullable = true)
 |-- vote_average: integer (nullable = true)
 |-- vote_count: integer (nullable = true)
 |-- id02: integer (nullable = true)
 |-- name_geners: string (nullable = true)



                                                                                

In [6]:
# Convert the Spark DataFrame to pandas so the notebook renders it as a table.
# NOTE(review): toPandas() collects the whole DataFrame to the driver (the
# rendered index runs to 40119); consider dfs.limit(n).toPandas() for a preview.
dfs.toPandas()

Unnamed: 0,id01,id,name,poster_path,backdrop_path,budget,imdb_id,original_language,original_title,overview,...,revenue,runtime,status,tagline,title,video,vote_average,vote_count,id02,name_geners
0,28,,,,,31500000,tt0078788,en,Apocalypse Now,"At the height of the Vietnam war, Captain Benj...",...,89460381,153.0,Released,This is the end...,Apocalypse Now,False,8,2112,28,Action
1,28,,,,,31500000,tt0078788,en,Apocalypse Now,"At the height of the Vietnam war, Captain Benj...",...,89460381,153.0,Released,This is the end...,Apocalypse Now,False,8,2112,28,Action
2,28,,,,,31500000,tt0078788,en,Apocalypse Now,"At the height of the Vietnam war, Captain Benj...",...,89460381,153.0,Released,This is the end...,Apocalypse Now,False,8,2112,28,Action
3,28,,,,,31500000,tt0078788,en,Apocalypse Now,"At the height of the Vietnam war, Captain Benj...",...,89460381,153.0,Released,This is the end...,Apocalypse Now,False,8,2112,28,Action
4,28,,,,,31500000,tt0078788,en,Apocalypse Now,"At the height of the Vietnam war, Captain Benj...",...,89460381,153.0,Released,This is the end...,Apocalypse Now,False,8,2112,28,Action
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40116,10749,,,,,0,tt0329632,de,Sophiiiie!,The film starts in the early evening of a norm...,...,0,107.0,Released,,Sophiiiie!,False,6,1,10749,Romance
40117,10749,,,,,0,tt0329632,de,Sophiiiie!,The film starts in the early evening of a norm...,...,0,107.0,Released,,Sophiiiie!,False,6,1,10749,Romance
40118,10749,,,,,0,tt0329632,de,Sophiiiie!,The film starts in the early evening of a norm...,...,0,107.0,Released,,Sophiiiie!,False,6,1,10749,Romance
40119,10749,,,,,0,tt0329632,de,Sophiiiie!,The film starts in the early evening of a norm...,...,0,107.0,Released,,Sophiiiie!,False,6,1,10749,Romance


In [13]:
# Group by the descriptive movie columns plus genre, count the non-null
# `title` values in each group as `total`, and stamp every aggregated row
# with the current date in `dt_ref_carga`.
df01 = (
    dfs.groupBy(
        "name",
        "imdb_id",
        "overview",
        "revenue",
        "runtime",
        "status",
        "title",
        "vote_average",
        "vote_count",
        "popularity",
        "name_geners",
    )
    .agg(F.count("title").alias("total"))
    .withColumn("dt_ref_carga", F.current_date())
)

df01.toPandas()

Unnamed: 0,name,imdb_id,overview,revenue,runtime,status,title,vote_average,vote_count,popularity,name_geners,total,dt_ref_carga
0,,tt0119116,"In 2257, a taxi driver is unintentionally give...",263920180,126.0,Released,The Fifth Element,7,3962,24.30526,Drama,11966,2025-02-17
1,,tt0078788,"At the height of the Vietnam war, Captain Benj...",89460381,153.0,Released,Apocalypse Now,8,2112,13.5963,Action,4489,2025-02-17
2,Before... Collection,tt0381681,Nine years ago two strangers met by chance and...,15992615,80.0,Released,Before Sunset,7,734,7.048957,Crime,1682,2025-02-17
3,Heart of Gold Collection,tt0168629,"Selma, a Czech immigrant on the verge of blind...",40031879,140.0,Released,Dancer in the Dark,7,392,10.684806,Animation,1124,2025-02-17
4,Deuce Bigalow Collection,tt0205000,"Deuce Bigalow is a less than attractive, down ...",65535067,88.0,Released,Deuce Bigalow: Male Gigolo,5,314,6.567794,Music,487,2025-02-17
5,,tt0185125,A single mother in Madrid sees her only son di...,67872296,101.0,Released,All About My Mother,7,337,10.000915,Documentary,3415,2025-02-17
6,,tt0169547,"Lester Burnham, a depressed suburban father in...",356296601,122.0,Released,American Beauty,7,3438,20.726578,Fantasy,704,2025-02-17
7,Finding Nemo Collection,tt0266543,"Nemo, an adventurous young clownfish, is unexp...",940335536,100.0,Released,Finding Nemo,7,6292,25.497794,Adventure,1514,2025-02-17
8,In China They Eat Dogs Collection,tt0246692,"The last wish of the dying ""Monk"" is for his f...",0,95.0,Released,Old Men in New Cars: In China They Eat Dogs II,6,37,4.003761,Family,524,2025-02-17
9,,tt0268437,A lawyer becomes a fisherman from frustration....,0,97.0,Released,The Man Who Sued God,6,23,1.741405,Mystery,554,2025-02-17


In [22]:
# Target location of the Delta table that stores the aggregated result.
arqDelta = "/Users/eduardoalberto/LoadFile/dataDelta/ratingMovie"

# Overwrite the table (schema included) with df01, partitioned by dt_ref_carga.
(
    df01.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("dt_ref_carga")
    .option("overwriteSchema", "true")
    .save(arqDelta)
)



                                                                                

In [21]:
# Read the Delta table back from arqDelta and render it as a pandas DataFrame.
(
    spark.read
    .format("delta")
    .load(arqDelta)
    .toPandas()
)

Unnamed: 0,name,imdb_id,overview,revenue,runtime,status,title,vote_average,vote_count,popularity,name_geners,total,dt_ref_carga
0,,tt0119116,"In 2257, a taxi driver is unintentionally give...",263920180,126.0,Released,The Fifth Element,7,3962,24.30526,Drama,11966,2025-02-17
1,,tt0078788,"At the height of the Vietnam war, Captain Benj...",89460381,153.0,Released,Apocalypse Now,8,2112,13.5963,Action,4489,2025-02-17
2,Before... Collection,tt0381681,Nine years ago two strangers met by chance and...,15992615,80.0,Released,Before Sunset,7,734,7.048957,Crime,1682,2025-02-17
3,Heart of Gold Collection,tt0168629,"Selma, a Czech immigrant on the verge of blind...",40031879,140.0,Released,Dancer in the Dark,7,392,10.684806,Animation,1124,2025-02-17
4,Deuce Bigalow Collection,tt0205000,"Deuce Bigalow is a less than attractive, down ...",65535067,88.0,Released,Deuce Bigalow: Male Gigolo,5,314,6.567794,Music,487,2025-02-17
5,,tt0185125,A single mother in Madrid sees her only son di...,67872296,101.0,Released,All About My Mother,7,337,10.000915,Documentary,3415,2025-02-17
6,,tt0169547,"Lester Burnham, a depressed suburban father in...",356296601,122.0,Released,American Beauty,7,3438,20.726578,Fantasy,704,2025-02-17
7,Finding Nemo Collection,tt0266543,"Nemo, an adventurous young clownfish, is unexp...",940335536,100.0,Released,Finding Nemo,7,6292,25.497794,Adventure,1514,2025-02-17
8,In China They Eat Dogs Collection,tt0246692,"The last wish of the dying ""Monk"" is for his f...",0,95.0,Released,Old Men in New Cars: In China They Eat Dogs II,6,37,4.003761,Family,524,2025-02-17
9,,tt0268437,A lawyer becomes a fisherman from frustration....,0,97.0,Released,The Man Who Sued God,6,23,1.741405,Mystery,554,2025-02-17


### MERGE UPDATE