In [1]:
import pandas as pd
import json

# Sample document: a "company" object holding a list of employees, each with
# its own list of projects, plus a sibling "address" object.
j_data = {
    "company": {
        "name": "Tech Solutions",
        "employees": [
            {
                "id": 1,
                "name": {"first": "John", "last": "Doe"},
                "position": "Software Engineer",
                "projects": [
                    {
                        "project_id": "p001",
                        "name": "AI Development",
                        "technologies": ["Python", "PyTorch", "TensorFlow"],
                        "timeline": {"start": "2023-01-10", "end": "2023-06-20"},
                    },
                    {
                        "project_id": "p002",
                        "name": "Cloud Migration",
                        "technologies": ["AWS", "Docker"],
                        "timeline": {"start": "2023-07-01", "end": "2023-12-15"},
                    },
                ],
            },
            {
                "id": 2,
                "name": {"first": "Jane", "last": "Smith"},
                "position": "Data Scientist",
                "projects": [
                    {
                        "project_id": "p003",
                        "name": "Predictive Analytics",
                        "technologies": ["Python", "Pandas", "scikit-learn"],
                        "timeline": {"start": "2023-03-15", "end": "2023-09-30"},
                    },
                ],
            },
        ],
    },
    "address": {
        "street": "123 Tech Lane",
        "city": "Innovate City",
        "state": "CA",
        "postal_code": "94043",
    },
}

# Flatten to one row per project. `record_path` walks down to the innermost
# list; every `meta` path names an ancestor field to repeat on each of those
# rows. The "address" object is not referenced, so it is left out of the frame.
n_df = pd.json_normalize(
    j_data,
    record_path=["company", "employees", "projects"],
    meta=[
        ["company", "name"],
        ["company", "employees", "id"],
        ["company", "employees", "name", "first"],
        ["company", "employees", "name", "last"],
        ["company", "employees", "position"],
    ],
)

n_df

Unnamed: 0,project_id,name,technologies,timeline.start,timeline.end,company.name,company.employees.id,company.employees.name.first,company.employees.name.last,company.employees.position
0,p001,AI Development,"[Python, PyTorch, TensorFlow]",2023-01-10,2023-06-20,Tech Solutions,1,John,Doe,Software Engineer
1,p002,Cloud Migration,"[AWS, Docker]",2023-07-01,2023-12-15,Tech Solutions,1,John,Doe,Software Engineer
2,p003,Predictive Analytics,"[Python, Pandas, scikit-learn]",2023-03-15,2023-09-30,Tech Solutions,2,Jane,Smith,Data Scientist


In [2]:
!pip install pyspark


Collecting pyspark
  Downloading pyspark-3.5.3.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.3-py2.py3-none-any.whl size=317840625 sha256=83370371366cdaf99b220372e08b7e0c04399bf2076b0de3d845cbef4dfc0c34
  Stored in directory: /root/.cache/pip/wheels/1b/3a/92/28b93e2fbfdbb07509ca4d6f50c5e407f48dce4ddbda69a4ab
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.3


In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, col
from io import StringIO

# Obtain the Spark session used by the cells below, labelled with the
# application name "ComplexJSON".
spark = (
    SparkSession.builder
    .appName("ComplexJSON")
    .getOrCreate()
)

In [4]:
# Serialize the in-memory dict to a single JSON document string.
json_str = json.dumps(j_data)

# Hand the string to Spark as a one-element RDD; no file on disk is involved.
json_rdd = spark.sparkContext.parallelize([json_str])

# Parse the RDD of JSON strings into a DataFrame (no explicit schema is given).
df = spark.read.json(json_rdd)

# Step 1: one row per employee, keeping the company name alongside.
df_exploded = df.select(
    col("company.name").alias("company_name"),
    explode(col("company.employees")).alias("employee"),
)

# Step 2: one row per (employee, project); employee fields are pulled out of
# the struct and renamed before the nested projects array is exploded.
df_projects = df_exploded.select(
    "company_name",
    col("employee.id").alias("employee_id"),
    col("employee.name.first").alias("first_name"),
    col("employee.name.last").alias("last_name"),
    col("employee.position"),
    explode(col("employee.projects")).alias("project"),
)

# Step 3: lift the project fields to top-level columns with flat names.
df_final = df_projects.select(
    "company_name",
    "employee_id",
    "first_name",
    "last_name",
    "position",
    col("project.project_id"),
    col("project.name").alias("project_name"),
    col("project.technologies"),
    col("project.timeline.start").alias("project_start"),
    col("project.timeline.end").alias("project_end"),
)

# Print the flattened result without truncating long cell values.
df_final.show(truncate=False)

+--------------+-----------+----------+---------+-----------------+----------+--------------------+------------------------------+-------------+-----------+
|company_name  |employee_id|first_name|last_name|position         |project_id|project_name        |technologies                  |project_start|project_end|
+--------------+-----------+----------+---------+-----------------+----------+--------------------+------------------------------+-------------+-----------+
|Tech Solutions|1          |John      |Doe      |Software Engineer|p001      |AI Development      |[Python, PyTorch, TensorFlow] |2023-01-10   |2023-06-20 |
|Tech Solutions|1          |John      |Doe      |Software Engineer|p002      |Cloud Migration     |[AWS, Docker]                 |2023-07-01   |2023-12-15 |
|Tech Solutions|2          |Jane      |Smith    |Data Scientist   |p003      |Predictive Analytics|[Python, Pandas, scikit-learn]|2023-03-15   |2023-09-30 |
+--------------+-----------+----------+---------+---------