In [0]:
import requests
import xml.etree.ElementTree as ET

In [0]:
# Download the sample customers XML and keep a re-serialised copy in `data`
# for the cells that follow.
url = 'https://raw.githubusercontent.com/Aravindh0/Sample_xml/refs/heads/main/sample.xml'

# Without a timeout, requests waits indefinitely on a stalled connection.
response = requests.get(url, timeout=30)

if response.status_code != 200:
    # Stop here instead of only printing: later cells read `data`, and
    # continuing would either raise NameError there or silently reuse a
    # stale value left in the kernel by an earlier run.
    raise RuntimeError(f'Failed to fetch data: {response.status_code}')

try:
    # Parsing validates that the payload is well-formed XML.
    root = ET.fromstring(response.content)
except ET.ParseError as e:
    print(f'Error parsing XML: {e}')
    # Re-raise so the notebook does not carry on without a valid `data`.
    raise

# Serialise the parsed tree back to text (includes the XML declaration).
data = ET.tostring(root, encoding='utf8').decode('utf8')
print(data)  # Print the parsed XML data


<?xml version='1.0' encoding='utf8'?>
<customers>
    <customer>
        <Customer_ID>1</Customer_ID>
        <Name>John Doe</Name>
        <Age>30</Age>
        <Address>
            <City>New York</City>
            <District>Manhattan</District>
            <State>NY</State>
        </Address>
        <Orders>
            <Order>
                <Order_ID>1001</Order_ID>
                <Product>Book</Product>
                <Quantity>2</Quantity>
            </Order>
            <Order>
                <Order_ID>1002</Order_ID>
                <Product>Pen</Product>
                <Quantity>10</Quantity>
            </Order>
        </Orders>
    </customer>
    <customer>
        <Customer_ID>2</Customer_ID>
        <Name>Jane Smith</Name>
        <Age>25</Age>
        <Address>
            <City>Los Angeles</City>
            <District>Hollywood</District>
            <State>CA</State>
        </Address>
        <Orders>
            <Order>
                <Order_ID>1003</Order

In [0]:
# List the files currently under /FileStore/tables/ as a table.
display(dbutils.fs.ls('/FileStore/tables/'))


path,name,size,modificationTime
dbfs:/FileStore/tables/SAMPLE.zip,SAMPLE.zip,6624,1725622415000
dbfs:/FileStore/tables/authors.xml,authors.xml,1687,1725605601000
dbfs:/FileStore/tables/config-1.json,config-1.json,276,1723611879000
dbfs:/FileStore/tables/config.json,config.json,276,1723611517000
dbfs:/FileStore/tables/sample.xml,sample.xml,1153,1728966407000
dbfs:/FileStore/tables/test.txt,test.txt,47,1724915571000


In [0]:
# Write the XML text held in `data` to DBFS.
output = 'dbfs:/FileStore/tables/sample.xml'

# Third argument is the overwrite flag. It defaults to False, in which case
# the call fails when the target file already exists; passing True keeps
# this cell re-runnable.
dbutils.fs.put(output, data, True)

Wrote 1153 bytes.


True

In [0]:
from pyspark.sql import SparkSession

# Get the active Spark session, or create one, named "XML to DataFrame".
spark = (
    SparkSession.builder
    .appName("XML to DataFrame")
    .getOrCreate()
)

In [0]:
# Read the XML file from DBFS into a DataFrame, one row per <customer> element.
# The data source is chosen once with format('xml') and read with the generic
# load(); chaining .xml(path) after .format('xml') selected the format twice.
df = (
    spark.read
    .format('xml')
    .option('rowTag', 'customer')  # each <customer> element becomes a row
    .load('dbfs:/FileStore/tables/sample.xml')
)
display(df)

Address,Age,Customer_ID,Name,Orders
"List(New York, Manhattan, NY)",30,1,John Doe,"List(List(List(1001, Book, 2), List(1002, Pen, 10)))"
"List(Los Angeles, Hollywood, CA)",25,2,Jane Smith,"List(List(List(1003, Laptop, 1)))"


In [0]:
from pyspark.sql.functions import explode, col

# Step 1: produce one row per order.
# Customer scalars are kept as they are, the Address struct fields are lifted
# to top-level columns, and the Orders.Order array is exploded into an
# 'Order' struct column.
customer_cols = [col(name) for name in ("Customer_ID", "Name", "Age")]
address_cols = [
    col(f"Address.{name}").alias(name)
    for name in ("City", "District", "State")
]
order_col = explode(col("Orders.Order")).alias("Order")

df_flattened = df.select(*customer_cols, *address_cols, order_col)
display(df_flattened)

# Step 2: replace the 'Order' struct with its individual fields.
order_cols = [
    col(f"Order.{name}").alias(name)
    for name in ("Order_ID", "Product", "Quantity")
]
df_final = df_flattened.select(
    "Customer_ID", "Name", "Age", "City", "District", "State",
    *order_cols,
)

# Show the fully flattened result.
display(df_final)


Customer_ID,Name,Age,City,District,State,Order
1,John Doe,30,New York,Manhattan,NY,"List(1001, Book, 2)"
1,John Doe,30,New York,Manhattan,NY,"List(1002, Pen, 10)"
2,Jane Smith,25,Los Angeles,Hollywood,CA,"List(1003, Laptop, 1)"


Customer_ID,Name,Age,City,District,State,Order_ID,Product,Quantity
1,John Doe,30,New York,Manhattan,NY,1001,Book,2
1,John Doe,30,New York,Manhattan,NY,1002,Pen,10
2,Jane Smith,25,Los Angeles,Hollywood,CA,1003,Laptop,1
