# Extracting Superstore Sales Dataset from Kaggle

In [1]:
!pip install kaggle
!pip install kagglehub

StatementMeta(, cfbf282b-6a59-4350-bde1-b2b94a8b3bba, 3, Finished, Available, Finished)

Collecting kaggle
  Downloading kaggle-1.7.4.5-py3-none-any.whl.metadata (16 kB)
Downloading kaggle-1.7.4.5-py3-none-any.whl (181 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m181.2/181.2 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hInstalling collected packages: kaggle
Successfully installed kaggle-1.7.4.5
Collecting kagglehub
  Downloading kagglehub-0.3.13-py3-none-any.whl.metadata (38 kB)
Downloading kagglehub-0.3.13-py3-none-any.whl (68 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.3/68.3 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: kagglehub
Successfully installed kagglehub-0.3.13


In [2]:
from datetime import datetime

# Start-of-run marker (naive local wall-clock time). The final logging cell
# subtracts it from the end time to compute the run duration.
PROCESSING_START_TIME = datetime.now()

StatementMeta(, cfbf282b-6a59-4350-bde1-b2b94a8b3bba, 4, Finished, Available, Finished)

In [3]:
# PIPELINE PARAMETERS
from datetime import datetime

# Where the data comes from (Kaggle dataset handle) and where it lands (OneLake folder).
KAGGLE_DATASET = "ishanshrivastava28/superstore-sales"
LANDING_PATH = "abfss://fabric_dev@onelake.dfs.fabric.microsoft.com/fabric_LH_sales.Lakehouse/Files/Landing"

# Read the clock once and derive the partition values from that single reading.
PROCESSING_TIME = datetime.now()
YEAR, MONTH = PROCESSING_TIME.year, PROCESSING_TIME.month

print(f" Parameters Loaded — Year: {YEAR}, Month: {MONTH}, Dataset: {KAGGLE_DATASET}")


StatementMeta(, cfbf282b-6a59-4350-bde1-b2b94a8b3bba, 5, Finished, Available, Finished)

 Parameters Loaded — Year: 2025, Month: 10, Dataset: ishanshrivastava28/superstore-sales


In [4]:
import os

import chardet
import kagglehub
import pandas as pd

# File types this cell knows how to load; matched case-insensitively.
DATA_FILE_EXTENSIONS = (".csv", ".xls", ".xlsx")
# Number of leading bytes of a CSV handed to chardet for encoding detection.
ENCODING_SAMPLE_BYTES = 100_000
# Last-resort codec: latin-1 maps every byte to a character, so decoding cannot fail.
FALLBACK_ENCODING = "latin-1"

print(" Downloading Kaggle dataset...")
path = kagglehub.dataset_download(KAGGLE_DATASET)
print(f" Dataset downloaded to: {path}")

# Locate data file. Candidates are sorted so the choice is deterministic when
# the dataset ships more than one file (os.walk order depends on the filesystem).
data_files = sorted(
    os.path.join(root, name)
    for root, _, names in os.walk(path)
    for name in names
    if name.lower().endswith(DATA_FILE_EXTENSIONS)
)
if not data_files:
    raise FileNotFoundError("No CSV or Excel file found in dataset.")
file_path = data_files[0]
if len(data_files) > 1:
    print(f" {len(data_files)} candidate data files found; using the first in sorted order.")

print(f" Found data file: {file_path}")

# Encoding only matters for text files, so it is detected for CSV input only.
detected = None
if file_path.lower().endswith(".csv"):
    # Detect encoding from a sample of the file head.
    with open(file_path, "rb") as f:
        detected = chardet.detect(f.read(ENCODING_SAMPLE_BYTES))["encoding"]
    print(f" Detected encoding: {detected}")

    # Load to pandas. The guess is based on a sample, so bytes further into the
    # file may not decode with it (or the codec name may be unknown to Python);
    # in that case retry with a codec that accepts every byte and say so.
    try:
        df = pd.read_csv(file_path, encoding=detected)
    except (UnicodeDecodeError, LookupError) as exc:
        print(f" Reading with encoding {detected} failed ({exc}); retrying with {FALLBACK_ENCODING}.")
        df = pd.read_csv(file_path, encoding=FALLBACK_ENCODING)
else:
    df = pd.read_excel(file_path)

print(f" Loaded dataset with {len(df):,} rows and {len(df.columns)} columns.")

StatementMeta(, cfbf282b-6a59-4350-bde1-b2b94a8b3bba, 6, Finished, Available, Finished)

 Downloading Kaggle dataset...
 Dataset downloaded to: /home/trusted-service-user/.cache/kagglehub/datasets/ishanshrivastava28/superstore-sales/versions/1
 Found data file: /home/trusted-service-user/.cache/kagglehub/datasets/ishanshrivastava28/superstore-sales/versions/1/Superstore.csv
 Detected encoding: Windows-1252
 Loaded dataset with 9,994 rows and 21 columns.


  0%|          | 0.00/1.80M [00:00<?, ?B/s]

In [5]:
from collections import Counter

from pyspark.sql import functions as F

# Spaces, "/" and "-" become "_"; parentheses are removed.
_COLUMN_NAME_TRANSLATION = str.maketrans(
    {" ": "_", "/": "_", "-": "_", "(": None, ")": None}
)


def clean_column_name(name):
    """Return ``name`` with outer whitespace stripped, separators mapped to
    ``_`` and parentheses removed (e.g. ``"Sub-Category"`` -> ``"Sub_Category"``)."""
    return name.strip().translate(_COLUMN_NAME_TRANSLATION)


# Convert to Spark DataFrame
spark_df = spark.createDataFrame(df)

# Clean column names in a single positional rename. Renaming one column at a
# time can leave two columns with the same name when different source names
# clean to the same result, so collisions are rejected up front.
clean_names = [clean_column_name(column) for column in spark_df.columns]
colliding = sorted(name for name, hits in Counter(clean_names).items() if hits > 1)
if colliding:
    raise ValueError(f"Column names collide after cleaning: {colliding}")
spark_df = spark_df.toDF(*clean_names)

# Add metadata columns. All three derive from the same PROCESSING_TIME reading,
# so a row's Processing_Time always falls inside its Year/Month partition
# (a separate clock read at write time could land in the next month).
spark_df = (
    spark_df
    .withColumn("Processing_Time", F.lit(PROCESSING_TIME))
    .withColumn("Year", F.lit(YEAR))
    .withColumn("Month", F.lit(MONTH))
)


StatementMeta(, cfbf282b-6a59-4350-bde1-b2b94a8b3bba, 7, Finished, Available, Finished)

In [6]:
# Append this run's rows to the Landing folder as Parquet, laid out in
# Year=/Month= sub-folders.
# NOTE: "append" never replaces existing files, so running the notebook again
# in the same month adds the full dataset a second time; runs can be told apart
# by the Processing_Time column.
# No `mergeSchema` option is set on the writer: Spark documents it as a Parquet
# read option, so it has no effect on a write. Readers that need to reconcile
# files with differing schemas set it on spark.read instead.
(
    spark_df.write
    .mode("append")
    .partitionBy("Year", "Month")
    .parquet(LANDING_PATH)
)

print(f" Successfully written to Landing: {LANDING_PATH}")

StatementMeta(, cfbf282b-6a59-4350-bde1-b2b94a8b3bba, 8, Finished, Available, Finished)

 Successfully written to Landing: abfss://fabric_dev@onelake.dfs.fabric.microsoft.com/fabric_LH_sales.Lakehouse/Files/Landing


In [7]:

# Number of rows in the frame that was written; stored in the log entry.
row_count = spark_df.count()

# Elapsed wall-clock time since the start-of-run marker, in minutes (2 decimals).
PROCESSING_END_TIME = datetime.now()
duration_seconds = (PROCESSING_END_TIME - PROCESSING_START_TIME).total_seconds()
duration_minutes = round(duration_seconds / 60, 2)

# One log record per run; the values below are listed in the same order as the
# column names.
log_columns = [
    "Dataset",
    "Start_Timestamp",
    "End_Timestamp",
    "run_duration",
    "Stage",
    "Destination",
    "Row_Count",
]
log_record = (
    KAGGLE_DATASET,
    PROCESSING_START_TIME,
    PROCESSING_END_TIME,
    duration_minutes,
    "Kaggle -> Landing",
    LANDING_PATH,
    row_count,
)
log_df = spark.createDataFrame([log_record], log_columns)

# Append the record to the run-history table.
log_df.write.mode("append").saveAsTable("dbo.pipeline_log")

print(" Pipeline load logged successfully.")

StatementMeta(, cfbf282b-6a59-4350-bde1-b2b94a8b3bba, 9, Finished, Available, Finished)

 Pipeline load logged successfully.
