In [0]:
# Import necessary libraries
from pyspark.sql import SparkSession

# Reuse the active Spark session when one exists; otherwise start a new one
# registered under the "SuperstoreSales" application name.
spark = SparkSession.builder.appName("SuperstoreSales").getOrCreate()

# Location of the superstore sales CSV (adjust if the file lives elsewhere).
superstore_sales_data_url = "/FileStore/tables/train-7.csv"  # Ensure this is the correct path

# Read the CSV, taking column names from the first row and letting Spark
# infer each column's type from the data.
df = (
    spark.read.format("csv")
    .options(header="true", inferSchema="true")
    .load(superstore_sales_data_url)
)

# Translation table for column names: a space becomes an underscore, and the
# remaining characters that Delta/Parquet reject in column names are deleted.
# The rejected set is " ,;{}()\n\t=" -- note that it includes curly braces.
_COLUMN_NAME_TRANSLATION = str.maketrans(" ", "_", "(),;{}\n\t=")


def clean_column_names(df):
    """Return ``df`` with column names that are safe to write to Delta.

    Spaces are replaced with underscores and the characters
    ``( ) , ; { } = \\n \\t`` are removed from every column name. Other
    characters (for example the hyphen in ``Sub-Category``) are left as they
    are.

    Args:
        df: A Spark DataFrame whose ``columns`` are to be normalised.

    Returns:
        A new DataFrame with the same data and column order, carrying the
        cleaned column names.
    """
    cleaned_names = [
        col_name.translate(_COLUMN_NAME_TRANSLATION) for col_name in df.columns
    ]
    # Rename every column positionally in a single projection rather than
    # stacking one withColumnRenamed() projection per column.
    return df.toDF(*cleaned_names)

# Normalise the header names so they are acceptable as Delta column names.
df = clean_column_names(df)

# Inspect the result: print the schema, then the first five rows.
df.printSchema()
df.show(5)

# Persist the cleaned raw data as a Delta table, replacing any existing copy.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .save("/FileStore/delta/superstore_raw")
)





root
 |-- Row_ID: integer (nullable = true)
 |-- Order_ID: string (nullable = true)
 |-- Order_Date: date (nullable = true)
 |-- Ship_Date: date (nullable = true)
 |-- Ship_Mode: string (nullable = true)
 |-- Customer_ID: string (nullable = true)
 |-- Customer_Name: string (nullable = true)
 |-- Segment: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Postal_Code: integer (nullable = true)
 |-- Region: string (nullable = true)
 |-- Product_ID: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Sub-Category: string (nullable = true)
 |-- Product_Name: string (nullable = true)
 |-- Sales: string (nullable = true)

+------+--------------+----------+----------+--------------+-----------+---------------+---------+-------------+---------------+----------+-----------+------+---------------+---------------+------------+--------------------+--------+
|Row_ID|      Order_ID|Order_Date