### Ingestión del archivo "movie.csv"

In [0]:
# Prints the built-in help text for the Databricks widgets utility.
# Informational only: no later cell in this notebook uses its output.
dbutils.widgets.help()

In [0]:
# Declare the "p_environment" text widget (empty default) and read its current
# value; it is later stamped onto every row as the "environment" column.
dbutils.widgets.text("p_environment", "")
v_environment = dbutils.widgets.get("p_environment")

In [0]:
# Declare the "p_file_date" text widget (default "2024-12-30") and read its
# current value. The value selects the bronze sub-folder to read from and is
# also written to the "file_date" column of the output.
dbutils.widgets.text("p_file_date", "2024-12-30")
v_file_date = dbutils.widgets.get("p_file_date")

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

#### Paso 1 - Leer el archivo CSV usando "DataFrameReader" de Spark

In [0]:
# importamos los data types
from pyspark.sql.types import *

In [0]:
# Explicit schema for movie.csv, so Spark does not have to infer column types.
# Each entry is (column name, Spark data type); every column is nullable.
movie_columns = [
    ("movieId", IntegerType()),
    ("title", StringType()),
    ("budget", DoubleType()),
    ("homePage", StringType()),
    ("overview", StringType()),
    ("popularity", DoubleType()),
    ("yearReleaseDate", StringType()),
    ("releaseDate", DateType()),
    ("revenue", DoubleType()),
    ("durationTime", IntegerType()),
    ("movieStatus", StringType()),
    ("tagline", StringType()),
    ("voteAverage", DoubleType()),
    ("voteCount", IntegerType()),
]

movie_schema = StructType(
    fields=[
        StructField(column_name, column_type, True)
        for column_name, column_type in movie_columns
    ]
)

In [0]:
# Load the movie.csv that belongs to the requested file date from the bronze
# layer. The first line of the file is treated as a header, and the explicit
# schema is applied instead of inferring types.
movie_csv_path = f"{bronze_folder_path}/{v_file_date}/movie.csv"

movie_df = (
    spark.read
    .option("header", True)
    .schema(movie_schema)
    .csv(movie_csv_path)
)

In [0]:
# Render the raw DataFrame to visually check that the CSV parsed as expected.
display(movie_df)

In [0]:
# List the storage mounts available in this workspace.
# Exploratory check only: no later cell in this notebook reads its output.
display(dbutils.fs.mounts())

In [0]:
%fs
ls /mnt/moviehistory2025/bronze

#### Paso 2 - Seleccionamos solo las columnas requeridas

In [0]:
# Approach 4: select through col() objects.
from pyspark.sql.functions import col

# Columns kept for the silver table; homePage, overview, movieStatus and
# tagline from the source schema are not carried forward.
required_columns = [
    "movieId",
    "title",
    "budget",
    "popularity",
    "yearReleaseDate",
    "releaseDate",
    "revenue",
    "durationTime",
    "voteAverage",
    "voteCount",
]

movies_selected_df = movie_df.select(*[col(column_name) for column_name in required_columns])

#### Paso 3 - Cambiar el nombre de las columnas según lo requerido

In [0]:
# Approach 1: rename columns one at a time with withColumnRenamed.
# Maps each camelCase source column to its snake_case target name; columns
# not listed here (title, budget, popularity, revenue) keep their names.
column_renames = {
    "movieId": "movie_id",
    "yearReleaseDate": "year_release_date",
    "releaseDate": "release_date",
    "durationTime": "duration_time",
    "voteAverage": "vote_average",
    "voteCount": "vote_count",
}

# Always start from movies_selected_df so re-running the cell is idempotent.
movies_renamed_df = movies_selected_df
for source_name, target_name in column_renames.items():
    movies_renamed_df = movies_renamed_df.withColumnRenamed(source_name, target_name)

#### Paso 4 - Agregar la columna "ingestion_date" al DF

In [0]:
from pyspark.sql.functions import current_timestamp, lit

# Approach 1: build the final DataFrame with audit columns.
# add_ingestion_date is not defined in this notebook; presumably it comes from
# the "../includes/common_functions" include - confirm there what it adds.
movies_with_ingestion_df = add_ingestion_date(movies_renamed_df)

# Stamp every row with the widget values as constant columns.
movies_final_df = (
    movies_with_ingestion_df
    .withColumn("environment", lit(v_environment))
    .withColumn("file_date", lit(v_file_date))
)

#### Paso 5 - Escribir datos en el datalake en formato "delta"

In [0]:
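# Disabled: partition-overwrite write, left commented out. The active write is the merge call in the following cell.
#overwrite_partition(movies_final_df, "movie_silver", "movies", "file_date")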

In [0]:
# Match target (tgt) and source (src) rows on movie id and file date.
# The two adjacent literals concatenate into a single condition string.
merge_condition = (
    "tgt.movie_id = src.movie_id "
    "AND tgt.file_date = src.file_date"
)

# merge_delte_lake is not defined in this notebook; presumably it comes from
# the "../includes/common_functions" include. Arguments are passed
# positionally, exactly as the helper is called elsewhere in this project.
merge_delte_lake(
    movies_final_df,
    "movie_silver",
    "movies",
    silver_folder_path,
    merge_condition,
    "file_date",
)

In [0]:
%sql
-- Sanity check: number of rows in movie_silver.movies for each file_date.
SELECT file_date, count(1)
FROM movie_silver.movies
GROUP BY file_date;

In [0]:
# Read the Delta files at the silver "movies" location and render them to
# verify the write.
# NOTE(review): this path is hard-coded, while the merge call above receives
# silver_folder_path - confirm both point at the same location, or build this
# path from silver_folder_path.
display(spark.read.format("delta").load("/mnt/moviehistory2025/silver/movies"))

In [0]:
# End the notebook run and hand the string "success" back as its exit value.
dbutils.notebook.exit("success")