In [24]:
import io
import os

import matplotlib.pyplot as plt
import pandas as pd
import pyarrow.parquet as pq
from PIL import Image

In [28]:

# Path to the source Parquet file.
# A raw string keeps the Windows backslashes literal: sequences such as
# "\C", "\P" and "\D" are not valid escapes and trigger a SyntaxWarning on
# Python 3.12+ when written in a normal string literal.
file_path = r'D:\Codes\Python\DL_Project\P1.parquet'

# Read the Parquet file into a PyArrow Table
table = pq.read_table(file_path)

# Convert the PyArrow Table to a Pandas DataFrame
df = table.to_pandas()

# List the columns available in the dataset
print(df.columns)


Index(['indices', 'plans', 'walls', 'colors', 'footprints', 'plan_captions'], dtype='object')


In [29]:


# Export every row of the Parquet dataset to disk: one PNG per image column
# and one text file per caption, each named after the row's 'indices' value.

# Load the Parquet file into a Pandas DataFrame
# (file_path is defined in the previous cell)
df = pd.read_parquet(file_path)

# Root output directory. Raw string: "\C", "\P", "\D" and "\d" are not valid
# escape sequences and warn on Python 3.12+ in a normal string literal.
database_dir = r'D:\Codes\Python\DL_Project\database'
os.makedirs(database_dir, exist_ok=True)

# One subfolder per exported column; the last entry holds the text captions,
# every entry before it holds images.
subfolders = ['plans', 'walls', 'colors', 'footprints', 'plan_captions']

# Create a subdirectory for each type of data
for subfolder in subfolders:
    subfolder_path = os.path.join(database_dir, subfolder)
    os.makedirs(subfolder_path, exist_ok=True)

# Iterate through each row of the DataFrame
for index, row in df.iterrows():
    # The 'indices' column supplies the base file name for this row
    idx = row['indices']

    # Image columns only: everything except the trailing 'plan_captions'
    for subfolder in subfolders[:-1]:
        # Cells that are not dicts are skipped
        if isinstance(row[subfolder], dict):
            # The encoded image is stored under the 'bytes' key
            image_data = row[subfolder]['bytes']

            # Decode and re-save as PNG; the context manager releases the
            # image's resources as soon as the file has been written.
            image_path = os.path.join(database_dir, subfolder, f"{idx}.png")  # Adjust file extension as needed
            with Image.open(io.BytesIO(image_data)) as image:
                image.save(image_path)

    # Save the caption to the 'plan_captions' subfolder using the index value.
    # An explicit UTF-8 encoding avoids locale-dependent failures on
    # non-ASCII caption text (the Windows default is a legacy code page).
    description_path = os.path.join(database_dir, 'plan_captions', f"{idx}.txt")
    with open(description_path, 'w', encoding='utf-8') as file:
        file.write(row['plan_captions'])

# Show the first few rows of the DataFrame to verify
print(df.head())


    indices                                              plans  \
0  00000000  {'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...   
1  00000001  {'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...   
2  00000010  {'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...   
3  00000100  {'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...   
4  00001000  {'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...   

                                               walls  \
0  {'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...   
1  {'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...   
2  {'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...   
3  {'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...   
4  {'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...   

                                              colors  \
0  {'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...   
1  {'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...   
2  {'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...   
3  {'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x