In [1]:
import os

# Point PySpark at the interpreter under ./env (path is relative to the
# current working directory). The same executable is used for both the
# worker-side (PYSPARK_PYTHON) and driver-side (PYSPARK_DRIVER_PYTHON) settings.
os.environ.update(
    dict.fromkeys(
        ("PYSPARK_PYTHON", "PYSPARK_DRIVER_PYTHON"),
        "./env/Scripts/python.exe",
    )
)

In [2]:
# Check for new files in the data directory.
#
# Names produced for later cells:
#   seen_files    - set of paths recorded in TRACKER_FILE by the previous run
#   current_files - set of "*.csv" regular files currently inside DATA_DIR
#   new_files     - current_files - seen_files
#   path          - sorted list of new_files
# Side effect: TRACKER_FILE is rewritten with the current file list, but only
# when DATA_DIR could actually be scanned.

import os
import json

# Constants
DATA_DIR = "data"
TRACKER_FILE = "file_tracker.json"

# Load previously seen files (if any). An unreadable tracker (for example a
# file truncated by an interrupted write) is treated as "nothing seen yet"
# instead of aborting the notebook.
seen_files = set()
if os.path.exists(TRACKER_FILE):
    try:
        with open(TRACKER_FILE, "r") as f:
            seen_files = set(json.load(f))
    except (ValueError, TypeError) as err:
        # json.JSONDecodeError is a ValueError; TypeError covers JSON content
        # that cannot be turned into a set of paths.
        print(f"Ignoring unreadable tracker file {TRACKER_FILE}: {err}")
        seen_files = set()

# Scan current files. A missing data directory means there is nothing to scan.
data_dir_present = os.path.isdir(DATA_DIR)
if data_dir_present:
    current_files = {
        os.path.join(DATA_DIR, file)
        for file in os.listdir(DATA_DIR)
        if file.endswith(".csv") and os.path.isfile(os.path.join(DATA_DIR, file))
    }
else:
    print(f"Data directory {DATA_DIR!r} not found; nothing to scan.")
    current_files = set()

# Identify new files
new_files = current_files - seen_files

# Store relative paths in `path`
path = sorted(new_files)

# Save current state for next run. Skipped when the directory was missing so
# that a temporarily absent folder does not wipe the record of seen files.
if data_dir_present:
    with open(TRACKER_FILE, "w") as f:
        json.dump(sorted(current_files), f)

# Output the new files found

print("New CSV files found:", path)


New CSV files found: ['data\\controllers_raw_scraped.csv', 'data\\headphones_raw_scraped.csv']


In [3]:
# Make sure the bronze layer's root folder exists; a no-op when it is already there.
os.makedirs(name="data_lake/bronze", exist_ok=True)

In [4]:
import pyspark
from delta import *

# Session builder for the "MyApp" application with the Delta Lake SQL
# extension and Delta catalog registered.
builder = (
    pyspark.sql.SparkSession.builder
    .appName("MyApp")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
)

# Let the delta helper finish configuring the builder, then start (or reuse)
# the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [5]:
# Process new files: load each newly discovered CSV and save it as a Delta
# table under data_lake/bronze/<first_word>, where <first_word> is the part of
# the file name (extension removed) before the first underscore.
if not path:
    print("No new files to process.")
else:
    ingested = []
    try:
        for file in path:
            filename = os.path.basename(file)
            # Strip the extension before splitting so a name without an
            # underscore (e.g. "controllers.csv") maps to "controllers"
            # instead of a directory literally named "controllers.csv".
            first_word = os.path.splitext(filename)[0].split('_')[0]
            target_dir = os.path.join("data_lake", "bronze", first_word)
            os.makedirs(target_dir, exist_ok=True)

            # Read the CSV file; the header row supplies column names and,
            # with schema inference off, every column is read as a string.
            df = spark.read.option("header", "true").csv(file, inferSchema=False)

            # Write to Delta Lake.
            # NOTE(review): "overwrite" replaces whatever the table already
            # holds, so two files sharing the same first word replace each
            # other - confirm this is intended rather than "append".
            df.write.format("delta").mode("overwrite").save(target_dir)
            ingested.append(file)
            print(f"Processed and saved {file} to Delta Lake in {first_word} directory.")
    finally:
        # The detection cell records every current file in TRACKER_FILE before
        # any of them is ingested. If the loop stopped early, remove the files
        # that were not ingested from the tracker so the next run retries them.
        pending = sorted(set(path) - set(ingested))
        if pending and os.path.exists(TRACKER_FILE):
            with open(TRACKER_FILE, "r") as tracker:
                tracked = set(json.load(tracker))
            with open(TRACKER_FILE, "w") as tracker:
                json.dump(sorted(tracked - set(pending)), tracker)
            print(f"Not ingested; will be retried on the next run: {pending}")


Processed and saved data\controllers_raw_scraped.csv to Delta Lake in controllers directory.
Processed and saved data\headphones_raw_scraped.csv to Delta Lake in headphones directory.


In [6]:
# Sanity-check the bronze "controllers" table by reading it back from Delta.
bronze_reader = spark.read.format("delta")
bronze_df = bronze_reader.load("data_lake/bronze/controllers")

# Row count
record_total = bronze_df.count()
print(f"Total records: {record_total}")

# Column names and types
bronze_df.printSchema()

# First five rows, with cell values shown in full
bronze_df.show(5, truncate=False)

# Optional: summary table produced by describe()
bronze_summary = bronze_df.describe()
bronze_summary.show()

Total records: 432
root
 |-- Title: string (nullable = true)
 |-- Number_of_Reviews: string (nullable = true)
 |-- Rating: string (nullable = true)
 |-- Bought_Last_Month: string (nullable = true)
 |-- Price_After_Discount: string (nullable = true)
 |-- MRP: string (nullable = true)
 |-- Image_URL: string (nullable = true)
 |-- Product_ID: string (nullable = true)
 |-- Star_Rating_Percentage: string (nullable = true)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------+----------------------------------+-------------------------+--------------------+------+--------------------------------------------------------------+----------+------------------------------------------------+
|Title                                                                                         