In [1]:
# Split a parquet file into two based on column names
import pandas as pd

# Read the original Parquet file
df = pd.read_parquet("../data/artifacts2/reqid_001/1757527232_1757278522.parquet")

# Select sensor1 columns
sensor1_cols = [col for col in df.columns if col.startswith("sensor1_")]
sensor1_df = df[["time", "TO"] + sensor1_cols]

# Select remaining columns (excluding sensor1 columns)
remaining_cols = [col for col in df.columns if col not in sensor1_cols]
remaining_cols = [col for col in remaining_cols if col not in ["time", "TO"]]
remaining_df = df[["time", "TO"] + remaining_cols]

# Write to separate Parquet files
sensor1_df.to_parquet("../data/artifacts2/reqid_001/1757527232_1757278522_sensor1.parquet", index=False)
remaining_df.to_parquet("../data/artifacts2/reqid_001/1757527232_1757278522_remaining.parquet", index=False)

In [3]:
# Read the original Parquet file
df = pd.read_parquet("../data/artifacts2/reqid_001/1757527232_1757278522.parquet")


In [4]:
df.head()

Unnamed: 0,time,TO,sensor1_min,sensor1_max,sensor2_min,sensor2_max,sensor2_mean,sensor3_min,sensor3_max,sensor3_mean
0,1743532199,10540337,65.83,65.27,10.0,98.0,55.59,12.0,99.0,54.11
1,1743618599,10540337,76.09,54.77,10.0,98.0,57.7,10.0,99.0,51.68
2,1743704999,10540337,67.66,66.12,10.0,99.0,50.29,10.0,98.0,53.25
3,1743791399,10540337,83.22,80.61,10.0,99.0,58.97,11.0,98.0,49.92
4,1743877799,10540337,78.29,66.14,10.0,99.0,59.6,10.0,99.0,61.02


In [5]:
sensor1_cols = [col for col in df.columns if col.startswith("sensor1_")]
sensor1_df = df[["time", "TO"] + sensor1_cols]

In [6]:
sensor1_df.head()

Unnamed: 0,time,TO,sensor1_min,sensor1_max
0,1743532199,10540337,65.83,65.27
1,1743618599,10540337,76.09,54.77
2,1743704999,10540337,67.66,66.12
3,1743791399,10540337,83.22,80.61
4,1743877799,10540337,78.29,66.14


In [7]:
# Select remaining columns (excluding sensor1 columns)
remaining_cols = [col for col in df.columns if col not in sensor1_cols]
remaining_cols = [col for col in remaining_cols if col not in ["time", "TO"]]
remaining_df = df[["time", "TO"] + remaining_cols]

In [8]:
remaining_df.head()

Unnamed: 0,time,TO,sensor2_min,sensor2_max,sensor2_mean,sensor3_min,sensor3_max,sensor3_mean
0,1743532199,10540337,10.0,98.0,55.59,12.0,99.0,54.11
1,1743618599,10540337,10.0,98.0,57.7,10.0,99.0,51.68
2,1743704999,10540337,10.0,99.0,50.29,10.0,98.0,53.25
3,1743791399,10540337,10.0,99.0,58.97,11.0,98.0,49.92
4,1743877799,10540337,10.0,99.0,59.6,10.0,99.0,61.02


In [9]:
# Now, let's read and merge parquet files from a folder
# 
import pandas as pd
import os

folder = "../data/artifacts2/reqid_001"
parquet_files = [f for f in os.listdir(folder) if f.endswith(".parquet")]

# Separate part files and full files
part_files = [f for f in parquet_files if "_part" in f]
print(part_files)
full_files = [f for f in parquet_files if "_part" not in f]
print(full_files)

dfs = []

['1757527232_1757278522_part2.parquet', '1757527232_1757278522_part1.parquet']
['1757565217_1757565282.parquet', '1757565217_1757565295.parquet', '1757527232_1757278522.parquet']


In [10]:
from collections import defaultdict
part_groups = defaultdict(list)

print("part groups: ", part_groups)

part groups:  defaultdict(<class 'list'>, {})


In [11]:
for f in part_files:
    print("f: ", f)
    prefix = f.split("_part")[0]
    part_groups[prefix].append(f)
    print("part groups: ", part_groups)

f:  1757527232_1757278522_part2.parquet
part groups:  defaultdict(<class 'list'>, {'1757527232_1757278522': ['1757527232_1757278522_part2.parquet']})
f:  1757527232_1757278522_part1.parquet
part groups:  defaultdict(<class 'list'>, {'1757527232_1757278522': ['1757527232_1757278522_part2.parquet', '1757527232_1757278522_part1.parquet']})


In [12]:

for prefix, files in part_groups.items():
    # Sort to ensure consistent order
    files = sorted(files)

print("files", files)

files ['1757527232_1757278522_part1.parquet', '1757527232_1757278522_part2.parquet']


In [13]:
for prefix, files in part_groups.items():
    # Sort to ensure consistent order
    files = sorted(files)
    part_dfs = [pd.read_parquet(os.path.join(folder, f)) for f in files]
    print("part_dfs: ", part_dfs)
    # # Merge column-wise (axis=1)
    # merged_df = pd.concat(part_dfs, axis=1)
    # # Remove duplicate columns if any (e.g., 'time', 'TO')
    # merged_df = merged_df.loc[:,~merged_df.columns.duplicated()]
    # dfs.append(merged_df)

part_dfs:  [          time        TO  sensor2_min  sensor2_max  sensor2_mean  sensor3_min  \
0   1743532199  10540337        10.00        98.00         55.59        12.00   
1   1743618599  10540337        10.00        98.00         57.70        10.00   
2   1743704999  10540337        10.00        99.00         50.29        10.00   
3   1743791399  10540337        10.00        99.00         58.97        11.00   
4   1743877799  10540337        10.00        99.00         59.60        10.00   
..         ...       ...          ...          ...           ...          ...   
77  1746728999  10540337         9.90        98.01         55.41        12.23   
78  1746728999  10540337        10.59        97.39         56.05        11.32   
79  1746815399  10540337        10.53        98.79         55.84        11.95   
80  1746815399  10540337        10.66        99.02         54.12        12.53   
81  1746815399  10540337        11.73        98.66         52.69        10.92   

    sensor3_max

In [14]:
for prefix, files in part_groups.items():
    # Sort to ensure consistent order
    files = sorted(files)
    part_dfs = [pd.read_parquet(os.path.join(folder, f)) for f in files]
    # print("part_dfs: ", part_dfs)
    # Merge column-wise (axis=1)
    merged_df = pd.concat(part_dfs, axis=1)
    print("merged_df: ", merged_df.head())
    # Remove duplicate columns if any (e.g., 'time', 'TO')
    merged_df = merged_df.loc[:,~merged_df.columns.duplicated()]
    # dfs.append(merged_df)
    print("merged_df: ", merged_df.head())

merged_df:           time        TO  sensor2_min  sensor2_max  sensor2_mean  sensor3_min  \
0  1743532199  10540337         10.0         98.0         55.59         12.0   
1  1743618599  10540337         10.0         98.0         57.70         10.0   
2  1743704999  10540337         10.0         99.0         50.29         10.0   
3  1743791399  10540337         10.0         99.0         58.97         11.0   
4  1743877799  10540337         10.0         99.0         59.60         10.0   

   sensor3_max  sensor3_mean        time        TO  sensor1_min  sensor1_max  
0         99.0         54.11  1743532199  10540337        65.83        65.27  
1         99.0         51.68  1743618599  10540337        76.09        54.77  
2         98.0         53.25  1743704999  10540337        67.66        66.12  
3         98.0         49.92  1743791399  10540337        83.22        80.61  
4         99.0         61.02  1743877799  10540337        78.29        66.14  
merged_df:           time        

In [15]:
# Now, let's read and merge parquet files from a folder
# 
import pandas as pd
import os

folder = "../data/artifacts2/reqid_001"
parquet_files = [f for f in os.listdir(folder) if f.endswith(".parquet")]

# Separate part files and full files
part_files = [f for f in parquet_files if "_part" in f]
print(part_files)
full_files = [f for f in parquet_files if "_part" not in f]
print(full_files)

dfs = []

# Handle part files: group by prefix before '_part'
from collections import defaultdict
part_groups = defaultdict(list)
for f in part_files:
    print("f: ", f)
    prefix = f.split("_part")[0]
    part_groups[prefix].append(f)

for prefix, files in part_groups.items():
    # Sort to ensure consistent order
    files = sorted(files)
    part_dfs = [pd.read_parquet(os.path.join(folder, f)) for f in files]
    # Merge column-wise (axis=1)
    merged_df = pd.concat(part_dfs, axis=1)
    # Remove duplicate columns if any (e.g., 'time', 'TO')
    merged_df = merged_df.loc[:,~merged_df.columns.duplicated()]
    dfs.append(merged_df)

# Handle full files
for f in full_files:
    df = pd.read_parquet(os.path.join(folder, f))
    dfs.append(df)

# Combine all data vertically (axis=0)
final_df = pd.concat(dfs, ignore_index=True)
print(final_df.shape)

['1757527232_1757278522_part2.parquet', '1757527232_1757278522_part1.parquet']
['1757565217_1757565282.parquet', '1757565217_1757565295.parquet', '1757527232_1757278522.parquet']
f:  1757527232_1757278522_part2.parquet
f:  1757527232_1757278522_part1.parquet
(296, 10)


In [16]:
final_df.head()

Unnamed: 0,time,TO,sensor2_min,sensor2_max,sensor2_mean,sensor3_min,sensor3_max,sensor3_mean,sensor1_min,sensor1_max
0,1743532199,10540337,10.0,98.0,55.59,12.0,99.0,54.11,65.83,65.27
1,1743618599,10540337,10.0,98.0,57.7,10.0,99.0,51.68,76.09,54.77
2,1743704999,10540337,10.0,99.0,50.29,10.0,98.0,53.25,67.66,66.12
3,1743791399,10540337,10.0,99.0,58.97,11.0,98.0,49.92,83.22,80.61
4,1743877799,10540337,10.0,99.0,59.6,10.0,99.0,61.02,78.29,66.14


In [18]:
# Now, let's read and merge parquet files from a folder and 
# maintain the order of operations for schema consistency
# 
import pandas as pd
import os

folder = "../data/artifacts2/reqid_001"
parquet_files = [f for f in os.listdir(folder) if f.endswith(".parquet")]

# Separate part files and full files
part_files = [f for f in parquet_files if "_part" in f]
print(part_files)
full_files = [f for f in parquet_files if "_part" not in f]
print(full_files)

dfs = []

# Handle full files
for f in full_files:
    df = pd.read_parquet(os.path.join(folder, f))
    dfs.append(df)

# Handle part files: group by prefix before '_part'
from collections import defaultdict
part_groups = defaultdict(list)
for f in part_files:
    print("f: ", f)
    prefix = f.split("_part")[0]
    part_groups[prefix].append(f)

for prefix, files in part_groups.items():
    # Sort to ensure consistent order
    files = sorted(files)
    part_dfs = [pd.read_parquet(os.path.join(folder, f)) for f in files]
    # Merge column-wise (axis=1)
    merged_df = pd.concat(part_dfs, axis=1)
    # Remove duplicate columns if any (e.g., 'time', 'TO')
    merged_df = merged_df.loc[:,~merged_df.columns.duplicated()]
    dfs.append(merged_df)

# # Handle full files
# for f in full_files:
#     df = pd.read_parquet(os.path.join(folder, f))
#     dfs.append(df)

# Combine all data vertically (axis=0)
final_df = pd.concat(dfs, ignore_index=True)
print(final_df.shape)

print(final_df.head())

['1757527232_1757278522_part2.parquet', '1757527232_1757278522_part1.parquet']
['1757565217_1757565282.parquet', '1757565217_1757565295.parquet', '1757527232_1757278522.parquet']
f:  1757527232_1757278522_part2.parquet
f:  1757527232_1757278522_part1.parquet
(296, 10)
         time        TO  sensor1_min  sensor1_max  sensor2_min  sensor2_max  \
0  1743532199  10540337         40.0         60.0         10.0         98.0   
1  1743618599  10032096         40.5         60.7         10.5         99.0   
2  1743704999  20012001         41.0         61.4         11.0         98.0   
3  1743791399  30045009         41.5         62.1         10.0         99.0   
4  1743877799  10540337         42.0         62.8         10.5         98.0   

   sensor2_mean  sensor3_min  sensor3_max  sensor3_mean  
0          50.0         10.0         99.0          52.0  
1          50.6         11.0         98.0          52.4  
2          51.2         12.0         97.0          52.8  
3          51.8         

In [21]:
# Now, let's read and merge parquet files from multiple folders and 
# maintain the order of operations for schema consistency
# 
import pandas as pd
import os
from collections import defaultdict

folders = ["../data/artifacts2/reqid_001", "../data/artifacts2/reqid002"]
dfs = []

for folder in folders:
    parquet_files = [f for f in os.listdir(folder) if f.endswith(".parquet")]

    # Separate part files and full files
    part_files = [f for f in parquet_files if "_part" in f]
    print(f"Part files in {folder}:", part_files)
    full_files = [f for f in parquet_files if "_part" not in f]
    print(f"Full files in {folder}:", full_files)

    # Handle full files
    for f in full_files:
        df = pd.read_parquet(os.path.join(folder, f))
        dfs.append(df)

    # Handle part files: group by prefix before '_part'
    part_groups = defaultdict(list)
    for f in part_files:
        print("f: ", f)
        print("inside for loop, folder: ", folder)
        prefix = f.split("_part")[0]
        part_groups[prefix].append(f)

    for prefix, files in part_groups.items():
        # Sort to ensure consistent order
        files = sorted(files)
        part_dfs = [pd.read_parquet(os.path.join(folder, f)) for f in files]
        # Merge column-wise (axis=1)
        merged_df = pd.concat(part_dfs, axis=1)
        # Remove duplicate columns if any (e.g., 'time', 'TO')
        merged_df = merged_df.loc[:, ~merged_df.columns.duplicated()]
        dfs.append(merged_df)

# Combine all data vertically (axis=0)
final_df = pd.concat(dfs, ignore_index=True)
print(final_df.shape)
print(final_df.head())

Part files in ../data/artifacts2/reqid_001: ['1757527232_1757278522_part2.parquet', '1757527232_1757278522_part1.parquet']
Full files in ../data/artifacts2/reqid_001: ['1757565217_1757565282.parquet', '1757565217_1757565295.parquet', '1757527232_1757278522.parquet']
f:  1757527232_1757278522_part2.parquet
inside for loop, folder:  ../data/artifacts2/reqid_001
f:  1757527232_1757278522_part1.parquet
inside for loop, folder:  ../data/artifacts2/reqid_001
Part files in ../data/artifacts2/reqid002: []
Full files in ../data/artifacts2/reqid002: ['1757527232_1757278782.parquet', '1757527232_1757278781.parquet']
(460, 10)
         time        TO  sensor1_min  sensor1_max  sensor2_min  sensor2_max  \
0  1743532199  10540337         40.0         60.0         10.0         98.0   
1  1743618599  10032096         40.5         60.7         10.5         99.0   
2  1743704999  20012001         41.0         61.4         11.0         98.0   
3  1743791399  30045009         41.5         62.1         10.

In [22]:
import pandas as pd
import os
import zipfile
from io import BytesIO
from collections import defaultdict

artifacts_folder = "../data/artifacts2"
dfs = []

# Find all zip files in the artifacts folder
zip_files = [f for f in os.listdir(artifacts_folder) if f.endswith(".zip")]
print("Zip files found:", zip_files)

for zip_filename in zip_files:
    zip_path = os.path.join(artifacts_folder, zip_filename)
    print(f"Processing {zip_path}...")
    with zipfile.ZipFile(zip_path, "r") as z:
        print("Files in zip:", z.namelist())
        # Group part files by prefix, collect full files
        part_groups = defaultdict(list)
        full_files = []
        for file in z.namelist():
            if file.endswith(".parquet"):
                if "_part" in os.path.basename(file):
                    prefix = os.path.basename(file).split("_part")[0]
                    part_groups[prefix].append(file)
                else:
                    full_files.append(file)

print("Full files:", full_files)
print("Part groups:", part_groups)

Zip files found: ['reqid002.zip', 'reqid_001.zip']
Processing ../data/artifacts2/reqid002.zip...
Files in zip: ['reqid002/', 'reqid002/1757527232_1757278782.parquet', 'reqid002/1757527232_1757278781.parquet']
Processing ../data/artifacts2/reqid_001.zip...
Files in zip: ['reqid_001/', 'reqid_001/1757565217_1757565282.parquet', 'reqid_001/1757527232_1757278522_part2.parquet', 'reqid_001/1757527232_1757278522_part1.parquet', 'reqid_001/1757565217_1757565295.parquet', 'reqid_001/1757527232_1757278522.parquet']
Full files: ['reqid_001/1757565217_1757565282.parquet', 'reqid_001/1757565217_1757565295.parquet', 'reqid_001/1757527232_1757278522.parquet']
Part groups: defaultdict(<class 'list'>, {'1757527232_1757278522': ['reqid_001/1757527232_1757278522_part2.parquet', 'reqid_001/1757527232_1757278522_part1.parquet']})


In [24]:
import pandas as pd
import os
import zipfile
from io import BytesIO
from collections import defaultdict

artifacts_folder = "../data/artifacts2"
dfs = []

# Find all zip files in the artifacts folder
zip_files = [f for f in os.listdir(artifacts_folder) if f.endswith(".zip")]
print("Zip files found:", zip_files)

for zip_filename in zip_files:
    zip_path = os.path.join(artifacts_folder, zip_filename)
    print(f"Processing {zip_path}...")
    with zipfile.ZipFile(zip_path, "r") as z:
        print("Files in zip:", z.namelist())
        # Group part files by prefix, collect full files
        part_groups = defaultdict(list)
        full_files = []
        for file in z.namelist():
            if file.endswith(".parquet"):
                if "_part" in os.path.basename(file):
                    prefix = os.path.basename(file).split("_part")[0]
                    part_groups[prefix].append(file)
                else:
                    full_files.append(file)
        # Handle full files
        for file in full_files:
            with z.open(file) as f:
                df = pd.read_parquet(BytesIO(f.read()))
                dfs.append(df)

# Combine all data vertically (axis=0)
final_df = pd.concat(dfs, ignore_index=True)
print(final_df.shape)
print(final_df.head())

Zip files found: ['reqid002.zip', 'reqid_001.zip']
Processing ../data/artifacts2/reqid002.zip...
Files in zip: ['reqid002/', 'reqid002/1757527232_1757278782.parquet', 'reqid002/1757527232_1757278781.parquet']
Processing ../data/artifacts2/reqid_001.zip...
Files in zip: ['reqid_001/', 'reqid_001/1757565217_1757565282.parquet', 'reqid_001/1757527232_1757278522_part2.parquet', 'reqid_001/1757527232_1757278522_part1.parquet', 'reqid_001/1757565217_1757565295.parquet', 'reqid_001/1757527232_1757278522.parquet']
(378, 10)
         time        TO  sensor1_min  sensor1_max  sensor2_min  sensor2_max  \
0  1743532199  10540337        65.83        65.27         10.0         98.0   
1  1743618599  10540337        76.09        54.77         10.0         98.0   
2  1743704999  10540337        67.66        66.12         10.0         99.0   
3  1743791399  10540337        83.22        80.61         10.0         99.0   
4  1743877799  10540337        78.29        66.14         10.0         99.0   

   

### BytesIO
BytesIO is used to treat the bytes read from a file-like object (such as a file inside a zip archive) as a file in memory.

When you use z.open(file) on a zipfile, it returns a file-like object, but pd.read_parquet() expects a file path or a file-like object that supports random access (seek/tell). Wrapping the bytes with BytesIO(f.read()) creates an in-memory binary stream that pandas can read as if it were a regular file.

In summary:

BytesIO(f.read()) allows you to read a Parquet file directly from a zip archive without extracting it to disk.
It converts the bytes from the zip file into a file-like object that pandas can process.

In [25]:
import pandas as pd
import os
import zipfile
from io import BytesIO
from collections import defaultdict

artifacts_folder = "../data/artifacts2"
dfs = []

# Find all zip files in the artifacts folder
zip_files = [f for f in os.listdir(artifacts_folder) if f.endswith(".zip")]
print("Zip files found:", zip_files)

for zip_filename in zip_files:
    zip_path = os.path.join(artifacts_folder, zip_filename)
    print(f"Processing {zip_path}...")
    with zipfile.ZipFile(zip_path, "r") as z:
        print("Files in zip:", z.namelist())
        # Group part files by prefix, collect full files
        part_groups = defaultdict(list)
        full_files = []
        for file in z.namelist():
            if file.endswith(".parquet"):
                if "_part" in os.path.basename(file):
                    prefix = os.path.basename(file).split("_part")[0]
                    part_groups[prefix].append(file)
                else:
                    full_files.append(file)
        # Handle full files
        for file in full_files:
            with z.open(file) as f:
                df = pd.read_parquet(BytesIO(f.read()))
                dfs.append(df)
        # Handle part files
        for prefix, files in part_groups.items():
            files = sorted(files)
            part_dfs = []
            for file in files:
                with z.open(file) as f:
                    part_dfs.append(pd.read_parquet(BytesIO(f.read())))
            merged_df = pd.concat(part_dfs, axis=1)
            merged_df = merged_df.loc[:, ~merged_df.columns.duplicated()]
            dfs.append(merged_df)

# Combine all data vertically (axis=0)
final_df = pd.concat(dfs, ignore_index=True)
print(final_df.shape)
print(final_df.head())

Zip files found: ['reqid002.zip', 'reqid_001.zip']
Processing ../data/artifacts2/reqid002.zip...
Files in zip: ['reqid002/', 'reqid002/1757527232_1757278782.parquet', 'reqid002/1757527232_1757278781.parquet']
Processing ../data/artifacts2/reqid_001.zip...
Files in zip: ['reqid_001/', 'reqid_001/1757565217_1757565282.parquet', 'reqid_001/1757527232_1757278522_part2.parquet', 'reqid_001/1757527232_1757278522_part1.parquet', 'reqid_001/1757565217_1757565295.parquet', 'reqid_001/1757527232_1757278522.parquet']
(460, 10)
         time        TO  sensor1_min  sensor1_max  sensor2_min  sensor2_max  \
0  1743532199  10540337        65.83        65.27         10.0         98.0   
1  1743618599  10540337        76.09        54.77         10.0         98.0   
2  1743704999  10540337        67.66        66.12         10.0         99.0   
3  1743791399  10540337        83.22        80.61         10.0         99.0   
4  1743877799  10540337        78.29        66.14         10.0         99.0   

   