# This notebook handles data ingestion

## Create Unity Catalog

In [0]:
%sql
-- Catalog that contains every layer of the hotel pipeline
CREATE CATALOG IF NOT EXISTS hotel_catalog;
USE CATALOG hotel_catalog;

-- One schema per medallion layer (bronze -> silver -> gold)
CREATE SCHEMA IF NOT EXISTS hotel_catalog.bronze;
CREATE SCHEMA IF NOT EXISTS hotel_catalog.silver;
CREATE SCHEMA IF NOT EXISTS hotel_catalog.gold;

-- Make bronze the default schema for the statements that follow
USE SCHEMA bronze;

## Import Libraries

In [0]:
import mlflow
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.sql.types import *

## Initialize Spark session

In [0]:
# getOrCreate() returns the already-running session when there is one,
# otherwise it builds a new session with this application name.
spark = (
    SparkSession.builder
    .appName("Hotel_Churn_Ingestion")
    .getOrCreate()
)

## Define schema based on dataset

In [0]:
# Explicit schema for the hotel bookings CSV. Every column is nullable and the
# fields are listed in the order in which they are applied to the file's columns.
schema = (
    StructType()
    # Booking outcome and arrival date parts
    .add("hotel", StringType(), True)
    .add("is_canceled", IntegerType(), True)
    .add("lead_time", IntegerType(), True)
    .add("arrival_date_year", IntegerType(), True)
    .add("arrival_date_month", StringType(), True)
    .add("arrival_date_week_number", IntegerType(), True)
    .add("arrival_date_day_of_month", IntegerType(), True)
    # Stay length and party size
    .add("stays_in_weekend_nights", IntegerType(), True)
    .add("stays_in_week_nights", IntegerType(), True)
    .add("adults", IntegerType(), True)
    .add("children", StringType(), True)  # read as a string, unlike the other guest counts
    .add("babies", IntegerType(), True)
    # Booking channel and guest history
    .add("meal", StringType(), True)
    .add("country", StringType(), True)
    .add("market_segment", StringType(), True)
    .add("distribution_channel", StringType(), True)
    .add("is_repeated_guest", IntegerType(), True)
    .add("previous_cancellations", IntegerType(), True)
    .add("previous_bookings_not_canceled", IntegerType(), True)
    # Room, deposit and intermediary details
    .add("reserved_room_type", StringType(), True)
    .add("assigned_room_type", StringType(), True)
    .add("booking_changes", IntegerType(), True)
    .add("deposit_type", StringType(), True)
    .add("agent", StringType(), True)
    .add("company", StringType(), True)
    .add("days_in_waiting_list", IntegerType(), True)
    .add("customer_type", StringType(), True)
    # Pricing, extras and final reservation status
    .add("adr", DoubleType(), True)
    .add("required_car_parking_spaces", IntegerType(), True)
    .add("total_of_special_requests", IntegerType(), True)
    .add("reservation_status", StringType(), True)
    .add("reservation_status_date", StringType(), True)  # kept as a raw string, not parsed to a date
)



## Load data from FileStore

In [0]:
file_path = "/FileStore/Tables/hotel_bookings.csv"

# Read the CSV using the explicit schema defined earlier; the first line of the
# file is treated as a header and schema inference is switched off.
hotel_booking_raw_df = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="false")
    .schema(schema)
    .load(file_path)
)

# count() is an action, so this line triggers the actual read of the file.
loaded_row_count = hotel_booking_raw_df.count()
loaded_column_count = len(hotel_booking_raw_df.columns)
print(f"Total records loaded: {loaded_row_count}")
print(f"Total columns: {loaded_column_count}")

## Data Quality Checks

In [0]:
print("Null counts in key columns:")
key_columns = ["hotel", "is_canceled", "lead_time", "arrival_date_year", "adults"]

# Count the nulls of every key column in ONE aggregation job instead of
# launching a separate filter + count job (and file scan) per column.
# when(<is null>, 1) yields 1 for null cells and NULL otherwise, and count()
# only counts non-NULL values, so each aggregate is that column's null count.
null_counts = hotel_booking_raw_df.agg(
    *[
        F.count(F.when(F.col(column).isNull(), 1)).alias(column)
        for column in key_columns
    ]
).first()

# A global aggregation always returns exactly one row, even for an empty
# DataFrame (all counts are 0 in that case).
for column in key_columns:
    print(f"{column}: {null_counts[column]} nulls")

# Check data types
print("\nSchema:")
hotel_booking_raw_df.printSchema()

## Create Table in Bronze Layer in Unity Catalog

In [0]:
# Target table, written as catalog.schema.table
bronze_table_name = "hotel_catalog.bronze.raw_hotel_bookings"

# Persist the raw DataFrame as a Delta table; "overwrite" replaces whatever
# the table held before, so re-running the notebook does not append duplicates.
hotel_booking_raw_df.write.saveAsTable(
    bronze_table_name,
    format="delta",
    mode="overwrite",
)

## Create initial statistics

In [0]:
print("\n=== Initial Data Statistics ===")

# Compute the total and the canceled count in a single aggregation job rather
# than running count() three separate times over the same data.
# when(is_canceled == 1, 1) is NULL for every other row (including NULL
# is_canceled), and count() skips NULLs, so this matches filter(...).count().
overall_stats = hotel_booking_raw_df.agg(
    F.count("*").alias("total_bookings"),
    F.count(F.when(F.col("is_canceled") == 1, 1)).alias("canceled_bookings"),
).first()
total_bookings = overall_stats["total_bookings"]
canceled_bookings = overall_stats["canceled_bookings"]

# Guard against an empty dataset, which would otherwise raise ZeroDivisionError.
cancel_rate_percent = (
    canceled_bookings / total_bookings * 100 if total_bookings else 0.0
)

print(f"Total bookings: {total_bookings:,}")
print(f"Cancelation rate: {cancel_rate_percent:.2f}%")


# Hotel distribution: bookings and cancellation rate per hotel type.
# F.sum / F.count are referenced through the module alias so the code does not
# depend on star-imported names that shadow Python's builtin sum().
hotel_dist = hotel_booking_raw_df.groupBy("hotel").agg(
    F.count("*").alias("total_bookings"),
    (F.sum("is_canceled") / F.count("*") * 100).alias("cancel_rate_percent")
)
hotel_dist.show()

# Year distribution: bookings per arrival year, in chronological order
year_dist = hotel_booking_raw_df.groupBy("arrival_date_year").agg(
    F.count("*").alias("total_bookings")
).orderBy("arrival_date_year")
year_dist.show()

print("\n Data ingestion complete! Data saved to Bronze layer.")