In [0]:
bucket_name = "msklspace"
mount_name = "amzns"
mount_point = f"/mnt/databricks/{mount_name}"

# Mount the GCS bucket only when the mount point is not registered yet.
# dbutils.fs.mount() raises if the directory is already mounted, which would
# break any second run of this cell (mounts outlive the notebook session).
# NOTE(review): the cells below read from /mnt/ayushi_ecom, not from this
# mount point -- confirm which mount actually holds the CSV files.
already_mounted = any(m.mountPoint == mount_point for m in dbutils.fs.mounts())
if not already_mounted:
    dbutils.fs.mount(
        f"gs://{bucket_name}",
        mount_point,
        extra_configs={"fs.gs.project.id": "mentorsko-1700569460412"},
    )

In [0]:
%sql
-- Create the bronze schema used by the tables in this notebook.
-- IF NOT EXISTS keeps the cell re-runnable: a plain CREATE SCHEMA fails once
-- the schema is already there.
CREATE SCHEMA IF NOT EXISTS bronze;

In [0]:

# Load customers.csv, treating the first row as the header and letting Spark
# infer the column types.
df_cust = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("dbfs:/mnt/ayushi_ecom/customers.csv")
)

# Print the leading rows as plain text.
df_cust.show()

In [0]:
%sql
-- Make bronze the current schema, so the unqualified table names used by the
-- following statements resolve inside it.
USE SCHEMA bronze;

In [0]:
%sql
-- customers: define the Delta table in the current schema, load customers.csv
-- into it, then preview the result.
-- IF NOT EXISTS lets the cell be re-run without failing on an existing table;
-- COPY INTO skips files it has already loaded, so a re-run does not duplicate rows.
CREATE TABLE IF NOT EXISTS customers (
    CustomerID VARCHAR(50),
    FirstName VARCHAR(100),
    LastName VARCHAR(100),
    Email VARCHAR(255),
    -- NOTE(review): BIGINT cannot keep leading zeros or a '+' prefix -- confirm
    -- the phone numbers in the source file are plain digits.
    PhoneNumber BIGINT,
    DateOfBirth DATE,
    RegistrationDate DATE,
    PreferredPaymentMethodID VARCHAR(50)
)
USING DELTA;

COPY INTO customers
FROM 'dbfs:/mnt/ayushi_ecom/customers.csv'
FILEFORMAT = CSV
FORMAT_OPTIONS ('header' = 'true', 'inferSchema' = 'true');

SELECT * FROM customers;

In [0]:
%sql
-- orders: define the Delta table in the current schema, load orders.csv into
-- it, then preview the result.
-- IF NOT EXISTS lets the cell be re-run without failing on an existing table;
-- COPY INTO skips files it has already loaded, so a re-run does not duplicate rows.
CREATE TABLE IF NOT EXISTS orders (
    OrderID VARCHAR(50),
    CustomerID VARCHAR(50),
    OrderDate TIMESTAMP,
    ShippingDate TIMESTAMP,
    ExpectedDeliveryDate TIMESTAMP,
    ActualDeliveryDate TIMESTAMP,
    ShippingMethodID VARCHAR(50),
    VendorID VARCHAR(50)
)
USING DELTA;

COPY INTO orders
FROM 'dbfs:/mnt/ayushi_ecom/orders.csv'
FILEFORMAT = CSV
FORMAT_OPTIONS ('header' = 'true', 'inferSchema' = 'true');

SELECT * FROM orders;

In [0]:
# Load addresses.csv with Spark's CSV reader (header row, inferred types).
df_add = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("dbfs:/mnt/ayushi_ecom/addresses.csv")
)

# Render the result in the notebook's table viewer.
df_add.display()

In [0]:
import csv
import re  # no longer used by this cell; kept in case other cells rely on the import
from io import StringIO

import pandas as pd

# Read the file using dbutils.
# dbutils.fs.head() returns only the first 65536 bytes unless a larger limit is
# passed, so request the file's full size to avoid silently truncating the data.
file_path = "/mnt/ayushi_ecom/addresses.csv"
file_size = dbutils.fs.ls(file_path)[0].size
file_content = dbutils.fs.head(file_path, file_size)

# Parse with the csv module instead of a hand-written regex: it handles commas
# and line breaks inside quoted fields and removes the surrounding quote
# characters from the values. Blank lines (such as the one after the final
# newline) are dropped so they do not turn into empty rows.
data = [row for row in csv.reader(StringIO(file_content)) if row]

# First parsed row is the header; the rest are the records.
addresses_df = pd.DataFrame(data[1:], columns=data[0])

# Display the DataFrame
display(addresses_df)

In [0]:
import csv
from io import StringIO

import pandas as pd

# Use dbutils to read the CSV file.
# dbutils.fs.head() returns only the first 65536 bytes unless a larger limit is
# passed; requesting the file's full size keeps rows beyond that point from
# being silently dropped before they reach the addresses table.
file_path = "/mnt/ayushi_ecom/addresses.csv"
file_size = dbutils.fs.ls(file_path)[0].size
file_content = dbutils.fs.head(file_path, file_size)

# Convert the file content to a pandas DataFrame.
# NOTE: df_add held a Spark DataFrame in an earlier cell; from here on it is a
# pandas DataFrame, which the following cell converts back to Spark.
df_add = pd.read_csv(StringIO(file_content), quoting=csv.QUOTE_ALL)

# Show the DataFrame
display(df_add)

In [0]:
# Turn the pandas frame into a Spark DataFrame so it can be saved as a table.
spark_df = spark.createDataFrame(df_add)

# Persist it as the Delta table `addresses`, replacing any existing contents.
(
    spark_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("addresses")
)

In [0]:
%sql
-- Preview the rows stored in the addresses table.
-- (Disabled clean-up statement kept from the original cell: DROP TABLE addresses)
SELECT *
FROM addresses

In [0]:
%sql
-- order_items: define the Delta table in the current schema, load the CSV
-- into it, then preview the result.
-- IF NOT EXISTS lets the cell be re-run without failing on an existing table;
-- COPY INTO skips files it has already loaded, so a re-run does not duplicate rows.
CREATE TABLE IF NOT EXISTS order_items (
    OrderItemID VARCHAR(50),
    OrderID VARCHAR(50),
    ProductID VARCHAR(50),
    Quantity INT
)
USING DELTA;

-- NOTE(review): the source file is named orders_items.csv while the table is
-- order_items -- confirm the file name is spelled as intended.
COPY INTO order_items
FROM 'dbfs:/mnt/ayushi_ecom/orders_items.csv'
FILEFORMAT = CSV
FORMAT_OPTIONS ('header' = 'true', 'inferSchema' = 'true');

SELECT * FROM order_items;

In [0]:
%sql
-- payments: define the Delta table in the current schema, load payments.csv
-- into it, then preview the result.
-- IF NOT EXISTS lets the cell be re-run without failing on an existing table;
-- COPY INTO skips files it has already loaded, so a re-run does not duplicate rows.
CREATE TABLE IF NOT EXISTS payments (
    PaymentID VARCHAR(50),
    OrderID VARCHAR(50),
    PaymentDate TIMESTAMP,
    GiftCardUsage VARCHAR(3),
    GiftCardAmount DOUBLE,
    CouponUsage VARCHAR(3),
    CouponAmount DOUBLE,
    PaymentMethodID VARCHAR(50)
)
USING DELTA;

COPY INTO payments
FROM 'dbfs:/mnt/ayushi_ecom/payments.csv'
FILEFORMAT = CSV
FORMAT_OPTIONS ('header' = 'true', 'inferSchema' = 'true');

SELECT * FROM payments;

In [0]:
%sql
-- payment_methods: define the Delta table in the current schema, load
-- payment_methods.csv into it, then preview the result.
-- IF NOT EXISTS lets the cell be re-run without failing on an existing table;
-- COPY INTO skips files it has already loaded, so a re-run does not duplicate rows.
CREATE TABLE IF NOT EXISTS payment_methods (
    PaymentMethodID VARCHAR(50),
    MethodName VARCHAR(255)
)
USING DELTA;

COPY INTO payment_methods
FROM 'dbfs:/mnt/ayushi_ecom/payment_methods.csv'
FILEFORMAT = CSV
FORMAT_OPTIONS ('header' = 'true', 'inferSchema' = 'true');

SELECT * FROM payment_methods;

In [0]:
%sql
-- Drop the table if it already exists
-- (a later cell in this notebook writes `products` again with saveAsTable).
DROP TABLE IF EXISTS products;


In [0]:
# Imported here so the cell does not depend on an earlier cell having run.
import csv
from io import StringIO

import pandas as pd

# dbutils.fs.head() returns only the first 65536 bytes unless a larger limit is
# passed; requesting the file's full size keeps rows beyond that point from
# being silently dropped before they reach the products table.
file_path = "dbfs:/mnt/ayushi_ecom/products_1.csv"
file_size = dbutils.fs.ls(file_path)[0].size
file_content = dbutils.fs.head(file_path, file_size)

# Convert the file content to a pandas DataFrame.
# NOTE: the name df_add is reused for the products data because the following
# cell reads it; it no longer refers to addresses from this point on.
df_add = pd.read_csv(StringIO(file_content), quoting=csv.QUOTE_ALL)
display(df_add)

In [0]:
# Turn the pandas frame into a Spark DataFrame so it can be saved as a table.
spark_df = spark.createDataFrame(df_add)

# Persist it as the Delta table `products`, replacing any existing contents.
(
    spark_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("products")
)

In [0]:
%sql
-- Preview the rows stored in the products table.
SELECT *
FROM products

In [0]:
%sql
-- returns: define the Delta table in the current schema, load returns.csv
-- into it, then preview the result.
-- IF NOT EXISTS lets the cell be re-run without failing on an existing table;
-- COPY INTO skips files it has already loaded, so a re-run does not duplicate rows.
CREATE TABLE IF NOT EXISTS returns (
    OrderId VARCHAR(255),
    Return_reason VARCHAR(255)
)
USING DELTA;

COPY INTO returns
FROM 'dbfs:/mnt/ayushi_ecom/returns.csv'
FILEFORMAT = CSV
FORMAT_OPTIONS ('header' = 'true', 'inferSchema' = 'true');

SELECT * FROM returns;

In [0]:
%sql
-- shipping_methods: define the Delta table in the current schema, load
-- shipping_methods.csv into it, then preview the result.
-- IF NOT EXISTS lets the cell be re-run without failing on an existing table;
-- COPY INTO skips files it has already loaded, so a re-run does not duplicate rows.
CREATE TABLE IF NOT EXISTS shipping_methods (
    ShippingMethodID VARCHAR(50),
    MethodName VARCHAR(255),
    Cost_Rs INT
)
USING DELTA;

COPY INTO shipping_methods
FROM 'dbfs:/mnt/ayushi_ecom/shipping_methods.csv'
FILEFORMAT = CSV
FORMAT_OPTIONS ('header' = 'true', 'inferSchema' = 'true');

SELECT * FROM shipping_methods;

In [0]:
%sql
-- vendors: define the Delta table in the current schema, load vendors.csv
-- into it, then preview the result.
-- IF NOT EXISTS lets the cell be re-run without failing on an existing table;
-- COPY INTO skips files it has already loaded, so a re-run does not duplicate rows.
CREATE TABLE IF NOT EXISTS vendors (
    VendorID VARCHAR(50),
    VendorName VARCHAR(255)
)
USING DELTA;

COPY INTO vendors
FROM 'dbfs:/mnt/ayushi_ecom/vendors.csv'
FILEFORMAT = CSV
FORMAT_OPTIONS ('header' = 'true', 'inferSchema' = 'true');

SELECT * FROM vendors;