In [0]:
# Mount the GCS bucket into DBFS so the raw CSV files can be read by path.
bucket_name = "msklspace"
mount_name = "ayushi_ecom"
# The load cells in this notebook read from dbfs:/mnt/<mount_name>/..., so the
# bucket has to be mounted at exactly that location.
mount_point = f"/mnt/{mount_name}"

# dbutils.fs.mount raises when the mount point is already in use; skip the call
# on re-runs so the notebook survives "Run All".
already_mounted = any(m.mountPoint == mount_point for m in dbutils.fs.mounts())
if not already_mounted:
    dbutils.fs.mount(
        f"gs://{bucket_name}",
        mount_point,
        extra_configs={"fs.gs.project.id": "mentorsko-1700569460412"},
    )

In [0]:
%sql
-- Create the schema that holds the tables loaded from the raw CSV files.
-- IF NOT EXISTS keeps this cell re-runnable once the schema has been created.
CREATE SCHEMA IF NOT EXISTS bronze;

In [0]:
%sql
-- Make bronze the current schema so unqualified table names resolve to it.
USE SCHEMA bronze;

In [0]:
# Load customers.csv, using the first row as the header and letting Spark
# infer the column types.
df_cust = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("dbfs:/mnt/ayushi_ecom/customers.csv")
)

# Print the first rows to the cell output.
df_cust.show()

In [0]:
%sql
-- Load customers.csv into the customers table.
-- IF NOT EXISTS keeps the cell re-runnable: a plain CREATE TABLE fails once the
-- table exists, and COPY INTO skips files it has already loaded.
CREATE TABLE IF NOT EXISTS customers;

-- The table is created without columns; the schema is inferred from the CSV
-- and merged into the table by the mergeSchema options.
COPY INTO customers
FROM
  "dbfs:/mnt/ayushi_ecom/customers.csv" FILEFORMAT = CSV FORMAT_OPTIONS(
    "header" = "true",
    "inferSchema" = "true",
    "mergeSchema" = "true",
    "timestampFormat" = "dd-MM-yyyy HH.mm"
  ) COPY_OPTIONS("mergeSchema" = "true");

-- Turn on the Delta change data feed for this table.
ALTER TABLE
  customers
SET
  TBLPROPERTIES (delta.enableChangeDataFeed = true);

-- Preview the loaded rows (kept as the final statement of the cell).
SELECT
  *
FROM
  customers;

In [0]:
%sql
-- Rebuild the orders table from orders.csv.
-- IF EXISTS lets the cell run on a fresh workspace where the table has not
-- been created yet; a plain DROP TABLE fails in that case.
DROP TABLE IF EXISTS orders;

-- The table is created without columns; the schema is inferred from the CSV
-- and merged into the table by the mergeSchema options.
CREATE TABLE orders;

COPY INTO orders
FROM
  "dbfs:/mnt/ayushi_ecom/orders.csv" FILEFORMAT = CSV FORMAT_OPTIONS(
    "header" = "true",
    "inferSchema" = "true",
    "mergeSchema" = "true",
    "timestampFormat" = "dd-MM-yyyy HH.mm"
  ) COPY_OPTIONS("mergeSchema" = "true");

-- Turn on the Delta change data feed for this table.
ALTER TABLE
  orders
SET
  TBLPROPERTIES (delta.enableChangeDataFeed = true);

-- Preview the loaded rows (kept as the final statement of the cell).
SELECT
  *
FROM
  orders;

In [0]:
# Load addresses.csv, using the first row as the header and letting Spark
# infer the column types.
df_add = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("dbfs:/mnt/ayushi_ecom/addresses.csv")
)

# Render the DataFrame in the notebook output.
df_add.display()

In [0]:
import pandas as pd
import csv

# Location of the addresses file inside DBFS.
file_path = "/mnt/ayushi_ecom/addresses.csv"

# Read the complete file with pandas through the /dbfs local mount.
# dbutils.fs.head() returns only the first 64 KB by default, so parsing its
# output silently drops every row past that point.
# NOTE(review): assumes the cluster exposes the /dbfs FUSE mount -- confirm.
df_add = pd.read_csv(f"/dbfs{file_path}", quoting=csv.QUOTE_ALL)

# Show the DataFrame
display(df_add)

In [0]:
# Turn the pandas frame into a Spark DataFrame so it can be saved as a table.
spark_df = spark.createDataFrame(df_add)

# Save it as the Delta table "addresses", replacing any existing contents.
(
    spark_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("addresses")
)

In [0]:
%sql
-- Preview the rows of the addresses table.
SELECT
  *
FROM
  addresses;

In [0]:
%sql
-- Rebuild the order_items table from orders_items.csv.
-- IF EXISTS lets the cell run on a fresh workspace where the table has not
-- been created yet; a plain DROP TABLE fails in that case.
DROP TABLE IF EXISTS order_items;

-- The table is created without columns; the schema is inferred from the CSV
-- and merged into the table by the mergeSchema options.
CREATE TABLE order_items;

COPY INTO order_items
FROM
  "dbfs:/mnt/ayushi_ecom/orders_items.csv" FILEFORMAT = CSV FORMAT_OPTIONS(
    "header" = "true",
    "inferSchema" = "true",
    "mergeSchema" = "true",
    "timestampFormat" = "dd-MM-yyyy HH.mm"
  ) COPY_OPTIONS("mergeSchema" = "true");

-- Turn on the Delta change data feed for this table.
ALTER TABLE
  order_items
SET
  TBLPROPERTIES (delta.enableChangeDataFeed = true);

-- Preview the loaded rows (kept as the final statement of the cell).
SELECT
  *
FROM
  order_items;

In [0]:
%sql
-- Rebuild the payments table from payments.csv.
-- IF EXISTS lets the cell run on a fresh workspace where the table has not
-- been created yet; a plain DROP TABLE fails in that case.
DROP TABLE IF EXISTS payments;

-- The table is created without columns; the schema is inferred from the CSV
-- and merged into the table by the mergeSchema options.
CREATE TABLE payments;

COPY INTO payments
FROM
  "dbfs:/mnt/ayushi_ecom/payments.csv" FILEFORMAT = CSV FORMAT_OPTIONS(
    "header" = "true",
    "inferSchema" = "true",
    "mergeSchema" = "true",
    "timestampFormat" = "dd-MM-yyyy HH.mm"
  ) COPY_OPTIONS("mergeSchema" = "true");

-- Turn on the Delta change data feed for this table.
ALTER TABLE
  payments
SET
  TBLPROPERTIES (delta.enableChangeDataFeed = true);

-- Preview the loaded rows (kept as the final statement of the cell).
SELECT
  *
FROM
  payments;

In [0]:
%sql
-- Load payment_methods.csv into the payment_methods table.
-- IF NOT EXISTS keeps the cell re-runnable: a plain CREATE TABLE fails once the
-- table exists, and COPY INTO skips files it has already loaded.
CREATE TABLE IF NOT EXISTS payment_methods;

-- The table is created without columns; the schema is inferred from the CSV
-- and merged into the table by the mergeSchema options.
COPY INTO payment_methods
FROM
  "dbfs:/mnt/ayushi_ecom/payment_methods.csv" FILEFORMAT = CSV FORMAT_OPTIONS(
    "header" = "true",
    "inferSchema" = "true",
    "mergeSchema" = "true",
    "timestampFormat" = "dd-MM-yyyy HH.mm"
  ) COPY_OPTIONS("mergeSchema" = "true");

-- Preview the loaded rows.
SELECT
  *
FROM
  payment_methods;

In [0]:
%sql
-- Remove any previous copy of bronze.products; does nothing when it is absent.
DROP TABLE IF EXISTS
  bronze.products;

In [0]:
# Load products_1.csv with Spark, using the first row as the header and
# inferring the column types.
file_path = "/mnt/ayushi_ecom/products_1.csv"
df_prod = spark.read.options(header=True, inferSchema=True).csv(file_path)

# Render the DataFrame in the notebook output.
display(df_prod)

In [0]:
file_path = "dbfs:/mnt/ayushi_ecom/products_1.csv"

# pandas needs a local filesystem path, so swap the dbfs: scheme for the /dbfs
# mount. Reading the file directly loads all of it; dbutils.fs.head() returns
# only the first 64 KB by default and silently truncates larger files.
# NOTE(review): assumes the cluster exposes the /dbfs FUSE mount -- confirm.
local_path = file_path.replace("dbfs:", "/dbfs", 1)
df_prod = pd.read_csv(local_path, quoting=csv.QUOTE_ALL)
display(df_prod)

In [0]:
import pandas as pd
import csv

# Parse products_1.csv with Spark, using '"' as both the quote and the escape
# character, with a header row and inferred column types.
file_path = "dbfs:/mnt/ayushi_ecom/products_1.csv"
df_spark = (
    spark.read
    .options(header=True, inferSchema=True, quote='"', escape='"')
    .csv(file_path)
)

# Collect the Spark result into a pandas DataFrame.
df_prod = df_spark.toPandas()

# Render the pandas DataFrame in the notebook output.
display(df_prod)

In [0]:
# Turn the pandas frame into a Spark DataFrame so it can be saved as a table.
spark_df = spark.createDataFrame(df_prod)

# Save it as the Delta table bronze.products, replacing any existing contents.
(
    spark_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("bronze.products")
)

In [0]:
%sql
-- Preview the rows of bronze.products.
SELECT
  *
FROM
  bronze.products;

In [0]:
%sql
-- Rebuild the returns table from returns.csv.
-- IF EXISTS lets the cell run on a fresh workspace where the table has not
-- been created yet; a plain DROP TABLE fails in that case.
DROP TABLE IF EXISTS returns;

-- The table is created without columns; the schema is inferred from the CSV
-- and merged into the table by the mergeSchema options.
CREATE TABLE returns;

COPY INTO returns
FROM
  "dbfs:/mnt/ayushi_ecom/returns.csv" FILEFORMAT = CSV FORMAT_OPTIONS(
    "header" = "true",
    "inferSchema" = "true",
    "mergeSchema" = "true",
    "timestampFormat" = "dd-MM-yyyy HH.mm"
  ) COPY_OPTIONS("mergeSchema" = "true");

-- Preview the loaded rows.
SELECT
  *
FROM
  returns;

In [0]:
%sql
-- Rebuild the shipping_methods table from shipping_methods.csv.
-- IF EXISTS lets the cell run on a fresh workspace where the table has not
-- been created yet; a plain DROP TABLE fails in that case.
DROP TABLE IF EXISTS shipping_methods;

-- The table is created without columns; the schema is inferred from the CSV
-- and merged into the table by the mergeSchema options.
CREATE TABLE shipping_methods;

-- The statement ends with a single semicolon (the stray second one, which
-- formed an empty statement, is removed).
COPY INTO shipping_methods
FROM
  "dbfs:/mnt/ayushi_ecom/shipping_methods.csv" FILEFORMAT = CSV FORMAT_OPTIONS(
    "header" = "true",
    "inferSchema" = "true",
    "mergeSchema" = "true",
    "timestampFormat" = "dd-MM-yyyy HH.mm"
  ) COPY_OPTIONS("mergeSchema" = "true");

-- Preview the loaded rows.
SELECT
  *
FROM
  shipping_methods;

In [0]:
%sql
-- Rebuild the vendors table from vendors.csv.
-- IF EXISTS lets the cell run on a fresh workspace where the table has not
-- been created yet; a plain DROP TABLE fails in that case.
DROP TABLE IF EXISTS vendors;

-- The table is created without columns; the schema is inferred from the CSV
-- and merged into the table by the mergeSchema options.
CREATE TABLE vendors;

-- The statement ends with a single semicolon (the stray second one, which
-- formed an empty statement, is removed).
COPY INTO vendors
FROM
  "dbfs:/mnt/ayushi_ecom/vendors.csv" FILEFORMAT = CSV FORMAT_OPTIONS(
    "header" = "true",
    "inferSchema" = "true",
    "mergeSchema" = "true",
    "timestampFormat" = "dd-MM-yyyy HH.mm"
  ) COPY_OPTIONS("mergeSchema" = "true");

-- Preview the loaded rows.
SELECT
  *
FROM
  vendors;