In [0]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one; otherwise create it under this app name.
spark = (
    SparkSession.builder
    .appName("XML to DataFrame")
    .getOrCreate()
)

In [0]:
import requests
import xml.etree.ElementTree as ET

In [0]:
# Base names of the XML documents handled by this notebook.
collections = ['sample3', 'sample2', 'sample']

# Independent copy of the list above (same items, same order).
collection = list(collections)

In [0]:
# Echo the copied list as this cell's output.
collection

['sample3', 'sample2', 'sample']

In [0]:
# Download each XML document and store it in DBFS.
# A document is written only when the request succeeded AND the payload parsed
# as XML; otherwise the collection is skipped, so a failed fetch can never write
# the previous iteration's `data` under the wrong file name (or hit an undefined
# `data` on the first iteration).
for i in collections:
    url = f'https://1d55-2001-4490-4e9c-990e-bc6a-bb57-3bac-cea5.ngrok-free.app/{i}'
    # requests has no default timeout; without one an unresponsive endpoint
    # would block the cell indefinitely.
    response = requests.get(url, timeout=30)

    if response.status_code != 200:
        print(f'Failed tp fetch data :  {response.status_code}')
        continue

    try:
        # Parse first so that only well-formed XML is ever persisted.
        root = ET.fromstring(response.content)
    except ET.ParseError as e:
        print(f'Error parsing XML :  {e}')
        continue

    # Re-serialize the parsed tree to text, which is what dbutils.fs.put expects.
    data = ET.tostring(root, encoding = 'utf8').decode('utf8')

    # Storing all xml data in the dbfs storage, replacing any earlier copy.
    output = f'dbfs:/FileStore/tables/{i}.xml'
    dbutils.fs.put(output, data, overwrite = True)

Wrote 3342 bytes.
Wrote 3429 bytes.
Wrote 697 bytes.


In [0]:
%fs ls /FileStore/tables/


path,name,size,modificationTime
dbfs:/FileStore/tables/SAMPLE.zip,SAMPLE.zip,6624,1725622415000
dbfs:/FileStore/tables/authors.xml,authors.xml,1687,1725605601000
dbfs:/FileStore/tables/config-1.json,config-1.json,276,1723611879000
dbfs:/FileStore/tables/config.json,config.json,276,1723611517000
dbfs:/FileStore/tables/sample.xml,sample.xml,1153,1728966407000
dbfs:/FileStore/tables/sample2.xml,sample2.xml,3473,1729147307000
dbfs:/FileStore/tables/sample3.xml,sample3.xml,3386,1729147307000
dbfs:/FileStore/tables/test.txt,test.txt,47,1724915571000


In [0]:
# DBFS file stem -> XML element name passed to the reader as its `rowTag`.
tag = {'sample': 'customers', 'sample2': 'company', 'sample3': 'employees'}

# Load every stored document with its row tag and render the resulting frame.
for i in tag:
    row_tag = tag[i]
    df = (
        spark.read
        .format('xml')
        .option('rowTag', row_tag)
        .load(f'dbfs:/FileStore/tables/{i}.xml')
    )
    print(f'DataFrame of {i}')
    display(df)

DataFrame of sample


customer
"List(List(List(New York, Manhattan, NY), 30, 1, John Doe, List(List(List(1001, Book, 2), List(1002, Pen, 10)))), List(List(Los Angeles, Hollywood, CA), 25, 2, Jane Smith, List(List(List(1003, Laptop, 1)))))"


DataFrame of sample2


departments,location,name,products
"List(List(List(List(List(List(null, List(alice@techcorp.com, +1-555-1010), 10, 1001, Alice, List(List(Project X, Project Z)), Senior Software Engineer, 150000, List(List(Java, Python, C++)), null), List(null, List(bob@techcorp.com, +1-555-2020), 7, 1002, Bob, List(List(Project Y)), Data Scientist, 120000, List(List(Python, R, SQL)), null))), Engineering), List(List(List(List(null, List(carol@techcorp.com, +1-555-3030), 12, 2001, Carol, null, Sales Manager, 90000, null, List(List(USA, Canada))))), Sales), List(List(List(List(List(List(List(250000, Brand Awareness 2024, Ongoing), List(500000, Product Launch 2023, Completed))), List(david@techcorp.com, +1-555-4040), 5, 3001, David, null, Marketing Specialist, 85000, null, null))), Marketing)))","List(List(List(List(New York, 300, List(List(List(500000, Project X, Ongoing), List(1200000, Project Y, Completed)))), List(Chicago, 150, List(List(List(800000, Project Z, Planning)))))), List(789 Innovation Drive, San Francisco, USA, CA, 94107))",TechCorp,"List(List(List(Hardware, List(12 hours, Black, null, null, 10x15 cm, null), P001, 2022-05-01, TechWidget, 250, List(List(List(North America, 1250000, 5000), List(Europe, 750000, 3000)))), List(Software, List(null, null, 1 year, List(List(Windows, macOS, Linux)), null, 2.1), P002, 2023-01-15, TechGadget, 499, List(List(List(North America, 998000, 2000), List(Asia, 748500, 1500))))))"


DataFrame of sample3


employee
"List(List(1, List(New York, USA, 123 Main St, 10001), 30, Engineering, 2015-03-15, 2021-06-01, John Doe, Senior Software Engineer, List(List(List(500000, Project Alpha, Ongoing), List(300000, Project Beta, Completed))), 120000, List(List(Python, SQL, Java, AWS))), List(2, List(San Francisco, USA, 456 Market St, 94107), 28, Marketing, 2018-09-01, 2022-01-15, Jane Smith, Marketing Manager, List(List(List(120000, Campaign 2023, Completed), List(150000, Campaign 2024, Ongoing))), 95000, List(List(SEO, Content Strategy, Google Analytics))), List(3, List(Chicago, USA, 789 Wall St, 60605), 35, Finance, 2010-05-20, 2020-02-10, Alice Johnson, Finance Manager, List(List(List(200000, Budget Forecast 2023, Completed))), 130000, List(List(Financial Analysis, Risk Management, Excel, SAP))), List(4, List(Los Angeles, USA, 101 HR Lane, 90001), 42, Human Resources, 2008-08-25, 2019-09-05, Bob Williams, HR Director, List(List(List(50000, Employee Engagement, Ongoing))), 140000, List(List(Employee Relations, Recruitment, Training, HR Compliance))), List(5, List(Austin, USA, 102 Tech Park, 73301), 38, IT, 2012-11-11, 2020-07-21, Charlie Brown, IT Manager, List(List(List(250000, Network Overhaul, Completed), List(400000, Cloud Migration, Ongoing))), 125000, List(List(Networking, System Administration, Linux, Cybersecurity))))"


In [0]:
# Read sample.xml from DBFS with <customers> as the row element.
sample_df = (
    spark.read
    .format('xml')
    .option('rowTag', 'customers')
    .load('dbfs:/FileStore/tables/sample.xml')
)
sample_df.display()

customer
"List(List(List(New York, Manhattan, NY), 30, 1, John Doe, List(List(List(1001, Book, 2), List(1002, Pen, 10)))), List(List(Los Angeles, Hollywood, CA), 25, 2, Jane Smith, List(List(List(1003, Laptop, 1)))))"


In [0]:
from pyspark.sql.functions import explode,col

# Fan the `customer` array out into one row per customer struct.
df_flat = sample_df.select(explode(col('customer')).alias('customer'))
df_flat.display()

customer
"List(List(New York, Manhattan, NY), 30, 1, John Doe, List(List(List(1001, Book, 2), List(1002, Pen, 10))))"
"List(List(Los Angeles, Hollywood, CA), 25, 2, Jane Smith, List(List(List(1003, Laptop, 1))))"


In [0]:
# Flatten each customer struct into top-level columns and fan out its orders,
# giving one row per (customer, order) pair.
#
# The frame is derived from sample_df instead of reassigning df_flat from
# itself: a self-referential `df_flat = df_flat.select('customer...')` only
# works once, because after the first run df_flat no longer has a `customer`
# column. Building from the source frame keeps this cell safely re-runnable.
df_flat = (
    sample_df
    .select(explode('customer').alias('customer'))
    .select(
        'customer.Customer_ID',
        'customer.Name',
        'customer.Address.City',
        'customer.Address.District',
        'customer.Address.State',
        'customer.Age',
        explode('customer.Orders.Order').alias('order'),  # one row per order struct
    )
)

df_flat.display()

Customer_ID,Name,City,District,State,Age,order
1,John Doe,New York,Manhattan,NY,30,"List(1001, Book, 2)"
1,John Doe,New York,Manhattan,NY,30,"List(1002, Pen, 10)"
2,Jane Smith,Los Angeles,Hollywood,CA,25,"List(1003, Laptop, 1)"


In [0]:
# Customer-level columns carried through unchanged (District is not kept).
customer_columns = ['Customer_ID', 'Name', 'Age', 'City', 'State']

# Fields of the `order` struct, each promoted to a top-level column of the same name.
order_columns = [
    col(f'order.{field}').alias(field)
    for field in ('Order_ID', 'Product', 'Quantity')
]

df_flat_final = df_flat.select(*customer_columns, *order_columns)

df_flat_final.display()

Customer_ID,Name,Age,City,State,Order_ID,Product,Quantity
1,John Doe,30,New York,NY,1001,Book,2
1,John Doe,30,New York,NY,1002,Pen,10
2,Jane Smith,25,Los Angeles,CA,1003,Laptop,1


In [0]:
# Read sample2.xml from DBFS with <company> as the row element.
sample2_df = (
    spark.read
    .format('xml')
    .option('rowTag', 'company')
    .load('dbfs:/FileStore/tables/sample2.xml')
)
sample2_df.display()

departments,location,name,products
"List(List(List(List(List(List(null, List(alice@techcorp.com, +1-555-1010), 10, 1001, Alice, List(List(Project X, Project Z)), Senior Software Engineer, 150000, List(List(Java, Python, C++)), null), List(null, List(bob@techcorp.com, +1-555-2020), 7, 1002, Bob, List(List(Project Y)), Data Scientist, 120000, List(List(Python, R, SQL)), null))), Engineering), List(List(List(List(null, List(carol@techcorp.com, +1-555-3030), 12, 2001, Carol, null, Sales Manager, 90000, null, List(List(USA, Canada))))), Sales), List(List(List(List(List(List(List(250000, Brand Awareness 2024, Ongoing), List(500000, Product Launch 2023, Completed))), List(david@techcorp.com, +1-555-4040), 5, 3001, David, null, Marketing Specialist, 85000, null, null))), Marketing)))","List(List(List(List(New York, 300, List(List(List(500000, Project X, Ongoing), List(1200000, Project Y, Completed)))), List(Chicago, 150, List(List(List(800000, Project Z, Planning)))))), List(789 Innovation Drive, San Francisco, USA, CA, 94107))",TechCorp,"List(List(List(Hardware, List(12 hours, Black, null, null, 10x15 cm, null), P001, 2022-05-01, TechWidget, 250, List(List(List(North America, 1250000, 5000), List(Europe, 750000, 3000)))), List(Software, List(null, null, 1 year, List(List(Windows, macOS, Linux)), null, 2.1), P002, 2023-01-15, TechGadget, 499, List(List(List(North America, 998000, 2000), List(Asia, 748500, 1500))))))"


In [0]:
# Print the inferred nested schema; the flattening cell below addresses fields by these paths.
sample2_df.printSchema()

root
 |-- departments: struct (nullable = true)
 |    |-- department: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- employees: struct (nullable = true)
 |    |    |    |    |-- employee: array (nullable = true)
 |    |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |    |-- campaigns: struct (nullable = true)
 |    |    |    |    |    |    |    |-- campaign: array (nullable = true)
 |    |    |    |    |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |    |    |    |    |-- budget: long (nullable = true)
 |    |    |    |    |    |    |    |    |    |-- name: string (nullable = true)
 |    |    |    |    |    |    |    |    |    |-- status: string (nullable = true)
 |    |    |    |    |    |    |-- contact: struct (nullable = true)
 |    |    |    |    |    |    |    |-- email: string (nullable = true)
 |    |    |    |    |    |    |    |-- phone: string (n

In [0]:
from pyspark.sql.functions import explode, col

# ---------------------------------------------------------------------------
# Departments -> employees -> campaigns
# ---------------------------------------------------------------------------

# Step 1: one row per element of the 'departments.department' array.
df_exploded_departments = sample2_df.select(
    explode(col('departments.department')).alias('department')
)

# Step 2: keep the department name and fan out its employees (one row each).
df_flat_departments = df_exploded_departments.select(
    col('department.name').alias('Department_Name'),
    explode(col('department.employees.employee')).alias('employee'),
)

# Step 3: lift the employee's scalar fields to top-level columns and fan out
# the campaigns array.
# NOTE(review): explode() produces no row when the array is null or empty, so
# employees without campaigns do not appear in this frame or in
# df_flat_campaigns. Use explode_outer if they should be retained.
df_flat_employees = df_flat_departments.select(
    'Department_Name',
    col('employee.id').alias('Employee_ID'),
    col('employee.name').alias('Employee_Name'),
    col('employee.role').alias('Role'),
    col('employee.salary').alias('Salary'),
    col('employee.experience_years').alias('Experience_Years'),
    col('employee.contact.email').alias('Email'),
    col('employee.contact.phone').alias('Phone'),
    explode(col('employee.campaigns.campaign')).alias('campaign'),
)

# Step 4: replace the campaign struct with its three fields.
df_flat_campaigns = df_flat_employees.select(
    'Department_Name',
    'Employee_ID',
    'Employee_Name',
    'Role',
    'Salary',
    'Experience_Years',
    'Email',
    'Phone',
    col('campaign.name').alias('Campaign_Name'),
    col('campaign.budget').alias('Budget'),
    col('campaign.status').alias('Status'),
)

# ---------------------------------------------------------------------------
# Branch offices -> projects
# ---------------------------------------------------------------------------

# Step 5: one row per branch office, then one row per project of that office.
df_flat_location = (
    sample2_df
    .select(
        explode(col('location.branch_offices.branch_office')).alias('branch_office')
    )
    .select(
        col('branch_office.city').alias('Branch_City'),
        col('branch_office.employees').alias('Branch_Employees'),
        explode(col('branch_office.projects.project')).alias('project'),
    )
    .select(
        'Branch_City',
        'Branch_Employees',
        col('project.name').alias('Project_Name'),
        col('project.budget').alias('Project_Budget'),
        col('project.status').alias('Project_Status'),
    )
)

# ---------------------------------------------------------------------------
# Products -> sales
# ---------------------------------------------------------------------------

# Step 6: one row per product with selected feature fields lifted out, then one
# row per sale of that product.
df_flat_products = (
    sample2_df
    .select(explode(col('products.product')).alias('product'))
    .select(
        col('product.id').alias('Product_ID'),
        col('product.name').alias('Product_Name'),
        col('product.price').alias('Price'),
        col('product.launch_date').alias('Launch_Date'),
        col('product.category').alias('Category'),
        col('product.features.color').alias('Color'),
        col('product.features.size').alias('Size'),
        col('product.features.battery_life').alias('Battery_Life'),
        col('product.features.license').alias('License'),
        explode(col('product.sales.sale')).alias('sale'),
    )
    .select(
        'Product_ID',
        'Product_Name',
        'Price',
        'Launch_Date',
        'Category',
        'Color',
        'Size',
        'Battery_Life',
        'License',
        col('sale.region').alias('Sales_Region'),
        col('sale.revenue').alias('Revenue'),
        col('sale.units_sold').alias('Units_Sold'),
    )
)

# Render the three flattened frames.
df_flat_campaigns.display()
df_flat_location.display()
df_flat_products.display()


Department_Name,Employee_ID,Employee_Name,Role,Salary,Experience_Years,Email,Phone,Campaign_Name,Budget,Status
Marketing,3001,David,Marketing Specialist,85000,5,david@techcorp.com,+1-555-4040,Brand Awareness 2024,250000,Ongoing
Marketing,3001,David,Marketing Specialist,85000,5,david@techcorp.com,+1-555-4040,Product Launch 2023,500000,Completed


Branch_City,Branch_Employees,Project_Name,Project_Budget,Project_Status
New York,300,Project X,500000,Ongoing
New York,300,Project Y,1200000,Completed
Chicago,150,Project Z,800000,Planning


Product_ID,Product_Name,Price,Launch_Date,Category,Color,Size,Battery_Life,License,Sales_Region,Revenue,Units_Sold
P001,TechWidget,250,2022-05-01,Hardware,Black,10x15 cm,12 hours,,North America,1250000,5000
P001,TechWidget,250,2022-05-01,Hardware,Black,10x15 cm,12 hours,,Europe,750000,3000
P002,TechGadget,499,2023-01-15,Software,,,,1 year,North America,998000,2000
P002,TechGadget,499,2023-01-15,Software,,,,1 year,Asia,748500,1500


In [0]:
# Read sample3.xml from DBFS with <employees> as the row element.
sample3_df = (
    spark.read
    .format('xml')
    .option('rowTag', 'employees')
    .load('dbfs:/FileStore/tables/sample3.xml')
)
sample3_df.display()

employee
"List(List(1, List(New York, USA, 123 Main St, 10001), 30, Engineering, 2015-03-15, 2021-06-01, John Doe, Senior Software Engineer, List(List(List(500000, Project Alpha, Ongoing), List(300000, Project Beta, Completed))), 120000, List(List(Python, SQL, Java, AWS))), List(2, List(San Francisco, USA, 456 Market St, 94107), 28, Marketing, 2018-09-01, 2022-01-15, Jane Smith, Marketing Manager, List(List(List(120000, Campaign 2023, Completed), List(150000, Campaign 2024, Ongoing))), 95000, List(List(SEO, Content Strategy, Google Analytics))), List(3, List(Chicago, USA, 789 Wall St, 60605), 35, Finance, 2010-05-20, 2020-02-10, Alice Johnson, Finance Manager, List(List(List(200000, Budget Forecast 2023, Completed))), 130000, List(List(Financial Analysis, Risk Management, Excel, SAP))), List(4, List(Los Angeles, USA, 101 HR Lane, 90001), 42, Human Resources, 2008-08-25, 2019-09-05, Bob Williams, HR Director, List(List(List(50000, Employee Engagement, Ongoing))), 140000, List(List(Employee Relations, Recruitment, Training, HR Compliance))), List(5, List(Austin, USA, 102 Tech Park, 73301), 38, IT, 2012-11-11, 2020-07-21, Charlie Brown, IT Manager, List(List(List(250000, Network Overhaul, Completed), List(400000, Cloud Migration, Ongoing))), 125000, List(List(Networking, System Administration, Linux, Cybersecurity))))"


In [0]:
# Print the inferred schema: `employee` is an array of structs.
sample3_df.printSchema()

root
 |-- employee: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- _id: long (nullable = true)
 |    |    |-- address: struct (nullable = true)
 |    |    |    |-- city: string (nullable = true)
 |    |    |    |-- country: string (nullable = true)
 |    |    |    |-- street: string (nullable = true)
 |    |    |    |-- zipcode: long (nullable = true)
 |    |    |-- age: long (nullable = true)
 |    |    |-- department: string (nullable = true)
 |    |    |-- joining_date: date (nullable = true)
 |    |    |-- last_promotion_date: date (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- position: string (nullable = true)
 |    |    |-- projects: struct (nullable = true)
 |    |    |    |-- project: array (nullable = true)
 |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |-- budget: long (nullable = true)
 |    |    |    |    |    |-- name: string (nullable = true)
 |    |    |    

In [0]:
# Fan the `employee` array out into one row per employee struct.
df_flat = sample3_df.select(explode(col('employee')).alias('employee'))

df_flat.display()

employee
"List(1, List(New York, USA, 123 Main St, 10001), 30, Engineering, 2015-03-15, 2021-06-01, John Doe, Senior Software Engineer, List(List(List(500000, Project Alpha, Ongoing), List(300000, Project Beta, Completed))), 120000, List(List(Python, SQL, Java, AWS)))"
"List(2, List(San Francisco, USA, 456 Market St, 94107), 28, Marketing, 2018-09-01, 2022-01-15, Jane Smith, Marketing Manager, List(List(List(120000, Campaign 2023, Completed), List(150000, Campaign 2024, Ongoing))), 95000, List(List(SEO, Content Strategy, Google Analytics)))"
"List(3, List(Chicago, USA, 789 Wall St, 60605), 35, Finance, 2010-05-20, 2020-02-10, Alice Johnson, Finance Manager, List(List(List(200000, Budget Forecast 2023, Completed))), 130000, List(List(Financial Analysis, Risk Management, Excel, SAP)))"
"List(4, List(Los Angeles, USA, 101 HR Lane, 90001), 42, Human Resources, 2008-08-25, 2019-09-05, Bob Williams, HR Director, List(List(List(50000, Employee Engagement, Ongoing))), 140000, List(List(Employee Relations, Recruitment, Training, HR Compliance)))"
"List(5, List(Austin, USA, 102 Tech Park, 73301), 38, IT, 2012-11-11, 2020-07-21, Charlie Brown, IT Manager, List(List(List(250000, Network Overhaul, Completed), List(400000, Cloud Migration, Ongoing))), 125000, List(List(Networking, System Administration, Linux, Cybersecurity)))"


In [0]:
# Print the schema of df_flat as it stands when this cell runs.
df_flat.printSchema()

root
 |-- employee: struct (nullable = true)
 |    |-- _id: long (nullable = true)
 |    |-- address: struct (nullable = true)
 |    |    |-- city: string (nullable = true)
 |    |    |-- country: string (nullable = true)
 |    |    |-- street: string (nullable = true)
 |    |    |-- zipcode: long (nullable = true)
 |    |-- age: long (nullable = true)
 |    |-- department: string (nullable = true)
 |    |-- joining_date: date (nullable = true)
 |    |-- last_promotion_date: date (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- position: string (nullable = true)
 |    |-- projects: struct (nullable = true)
 |    |    |-- project: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- budget: long (nullable = true)
 |    |    |    |    |-- name: string (nullable = true)
 |    |    |    |    |-- status: string (nullable = true)
 |    |-- salary: long (nullable = true)
 |    |-- skills: struct (nullable = true)
 |    

In [0]:
# Flatten each employee struct into top-level columns and fan out its projects,
# giving one row per (employee, project) pair.
#
# The frame is derived from sample3_df instead of reassigning df_flat from
# itself: the self-referential form only works once (after the first run there
# is no `employee` column left) and silently depends on df_flat still holding
# the sample3 data rather than the earlier customer frame of the same name.
df_flat = (
    sample3_df
    .select(explode('employee').alias('employee'))
    .select(
        col('employee._id').alias('id'),
        col('employee.name').alias('name'),
        col('employee.age').alias('age'),
        col('employee.department').alias('department'),
        col('employee.joining_date').alias('joining_date'),
        col('employee.last_promotion_date').alias('last_promotion_date'),
        col('employee.position').alias('position'),
        col('employee.salary').alias('salary'),
        col('employee.address.city').alias('city'),
        col('employee.address.country').alias('country'),
        col('employee.address.street').alias('street'),
        col('employee.address.zipcode').alias('zipcode'),
        explode('employee.projects.project').alias('project'),  # one row per project struct
    )
)

df_flat.display()

id,name,age,department,joining_date,last_promotion_date,position,salary,city,country,street,zipcode,project
1,John Doe,30,Engineering,2015-03-15,2021-06-01,Senior Software Engineer,120000,New York,USA,123 Main St,10001,"List(500000, Project Alpha, Ongoing)"
1,John Doe,30,Engineering,2015-03-15,2021-06-01,Senior Software Engineer,120000,New York,USA,123 Main St,10001,"List(300000, Project Beta, Completed)"
2,Jane Smith,28,Marketing,2018-09-01,2022-01-15,Marketing Manager,95000,San Francisco,USA,456 Market St,94107,"List(120000, Campaign 2023, Completed)"
2,Jane Smith,28,Marketing,2018-09-01,2022-01-15,Marketing Manager,95000,San Francisco,USA,456 Market St,94107,"List(150000, Campaign 2024, Ongoing)"
3,Alice Johnson,35,Finance,2010-05-20,2020-02-10,Finance Manager,130000,Chicago,USA,789 Wall St,60605,"List(200000, Budget Forecast 2023, Completed)"
4,Bob Williams,42,Human Resources,2008-08-25,2019-09-05,HR Director,140000,Los Angeles,USA,101 HR Lane,90001,"List(50000, Employee Engagement, Ongoing)"
5,Charlie Brown,38,IT,2012-11-11,2020-07-21,IT Manager,125000,Austin,USA,102 Tech Park,73301,"List(250000, Network Overhaul, Completed)"
5,Charlie Brown,38,IT,2012-11-11,2020-07-21,IT Manager,125000,Austin,USA,102 Tech Park,73301,"List(400000, Cloud Migration, Ongoing)"


In [0]:
# Print the schema of df_flat as it stands when this cell runs.
df_flat.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- department: string (nullable = true)
 |-- joining_date: date (nullable = true)
 |-- last_promotion_date: date (nullable = true)
 |-- position: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- city: string (nullable = true)
 |-- country: string (nullable = true)
 |-- street: string (nullable = true)
 |-- zipcode: long (nullable = true)
 |-- project: struct (nullable = true)
 |    |-- budget: long (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- status: string (nullable = true)



In [0]:
# Employee-level columns carried through unchanged.
employee_columns = [
    'id',
    'name',
    'age',
    'department',
    'joining_date',
    'last_promotion_date',
    'position',
    'salary',
    'city',
    'country',
    'street',
    'zipcode',
]

# Fields of the `project` struct, promoted to top-level `project_<field>` columns.
project_columns = [
    col(f'project.{field}').alias(f'project_{field}')
    for field in ('budget', 'name', 'status')
]

df_final = df_flat.select(*employee_columns, *project_columns)

df_final.display()


id,name,age,department,joining_date,last_promotion_date,position,salary,city,country,street,zipcode,project_budget,project_name,project_status
1,John Doe,30,Engineering,2015-03-15,2021-06-01,Senior Software Engineer,120000,New York,USA,123 Main St,10001,500000,Project Alpha,Ongoing
1,John Doe,30,Engineering,2015-03-15,2021-06-01,Senior Software Engineer,120000,New York,USA,123 Main St,10001,300000,Project Beta,Completed
2,Jane Smith,28,Marketing,2018-09-01,2022-01-15,Marketing Manager,95000,San Francisco,USA,456 Market St,94107,120000,Campaign 2023,Completed
2,Jane Smith,28,Marketing,2018-09-01,2022-01-15,Marketing Manager,95000,San Francisco,USA,456 Market St,94107,150000,Campaign 2024,Ongoing
3,Alice Johnson,35,Finance,2010-05-20,2020-02-10,Finance Manager,130000,Chicago,USA,789 Wall St,60605,200000,Budget Forecast 2023,Completed
4,Bob Williams,42,Human Resources,2008-08-25,2019-09-05,HR Director,140000,Los Angeles,USA,101 HR Lane,90001,50000,Employee Engagement,Ongoing
5,Charlie Brown,38,IT,2012-11-11,2020-07-21,IT Manager,125000,Austin,USA,102 Tech Park,73301,250000,Network Overhaul,Completed
5,Charlie Brown,38,IT,2012-11-11,2020-07-21,IT Manager,125000,Austin,USA,102 Tech Park,73301,400000,Cloud Migration,Ongoing


In [0]:
# NOTE(review): df_final is already displayed by the cell that builds it; this repeat looks redundant.
df_final.display()

id,name,age,department,joining_date,last_promotion_date,position,salary,city,country,street,zipcode,project_budget,project_name,project_status
1,John Doe,30,Engineering,2015-03-15,2021-06-01,Senior Software Engineer,120000,New York,USA,123 Main St,10001,500000,Project Alpha,Ongoing
1,John Doe,30,Engineering,2015-03-15,2021-06-01,Senior Software Engineer,120000,New York,USA,123 Main St,10001,300000,Project Beta,Completed
2,Jane Smith,28,Marketing,2018-09-01,2022-01-15,Marketing Manager,95000,San Francisco,USA,456 Market St,94107,120000,Campaign 2023,Completed
2,Jane Smith,28,Marketing,2018-09-01,2022-01-15,Marketing Manager,95000,San Francisco,USA,456 Market St,94107,150000,Campaign 2024,Ongoing
3,Alice Johnson,35,Finance,2010-05-20,2020-02-10,Finance Manager,130000,Chicago,USA,789 Wall St,60605,200000,Budget Forecast 2023,Completed
4,Bob Williams,42,Human Resources,2008-08-25,2019-09-05,HR Director,140000,Los Angeles,USA,101 HR Lane,90001,50000,Employee Engagement,Ongoing
5,Charlie Brown,38,IT,2012-11-11,2020-07-21,IT Manager,125000,Austin,USA,102 Tech Park,73301,250000,Network Overhaul,Completed
5,Charlie Brown,38,IT,2012-11-11,2020-07-21,IT Manager,125000,Austin,USA,102 Tech Park,73301,400000,Cloud Migration,Ongoing
