**Merge Parquets**

This code merges Parquet files that had to be saved in chunks (because of storage limits) back into a single Parquet file.

In [1]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import os

# Merge the Grocery & Gourmet chunk files into a single Parquet file,
# processing one chunk at a time instead of loading all parts together.

# Path where the parts are saved
base_path = r"C:\Users\annik\OneDrive\Desktop\BIG DATA A3\parquet_parts"

# Output file
output_path = os.path.join(base_path, "Grocery_and_Gourmet_Merged_FULL.parquet")
writer = None

try:
    # Loop through chunks one at a time
    for i in range(1, 6):
        print(f"Processing part {i}...")
        chunk_path = os.path.join(base_path, f"Grocery_and_Gourmet_Merged_part{i}.parquet")
        chunk_df = pd.read_parquet(chunk_path)  # One chunk at a time

        # Convert to PyArrow Table
        table = pa.Table.from_pandas(chunk_df)

        # Create the writer lazily: its schema is taken from the first chunk
        if writer is None:
            writer = pq.ParquetWriter(output_path, table.schema)

        # Append to file
        writer.write_table(table)
finally:
    # Always close the writer, even when a chunk fails part-way through, so the
    # output file is finalized and its handle is released. The None guard covers
    # a failure that happens before the writer was ever created.
    if writer is not None:
        writer.close()

# Only reached when every part was written successfully
print("✅ Incrementally merged and saved without RAM KO.")

Processing part 1...
Processing part 2...
Processing part 3...
Processing part 4...
Processing part 5...
✅ Incrementally merged and saved without RAM KO.


In [2]:
import pyarrow.parquet as pq

# Preview the first 10 rows of the merged file without loading the whole file.

# Path to your full Parquet file
file_path = r"C:\Users\annik\OneDrive\Desktop\BIG DATA A3\parquet_parts\Grocery_and_Gourmet_Merged_FULL.parquet"

# Open the file; no row data is loaded at this point
parquet_file = pq.ParquetFile(file_path)

# Stream only the first batch (at most 10 rows). Reading the full dataset and
# slicing afterwards would materialize every row in memory first.
first_batch = next(parquet_file.iter_batches(batch_size=10), None)

if first_batch is None:
    # The file has no rows: build an empty frame that still carries the columns
    df_sample = parquet_file.schema_arrow.empty_table().to_pandas()
else:
    df_sample = first_batch.to_pandas()  # Convert to pandas DataFrame

# Show the sample data
print(df_sample)

   rating                                 title_x  \
0     5.0                      Excellent!  Yummy!   
1     5.0                       Delicious!!! Yum!   
2     5.0  Extremely Delicious, but expensive imo   
3     5.0                              Delicious!   
4     5.0                             Great taste   
5     5.0                                  Yummy!   
6     5.0          Excellent tea & smells divine!   
7     5.0        Like drinking a wedding bouquet!   
8     5.0                              Delicious!   
9     5.0               Yummy tea with a cool tin   

                                                text  images_x        asin  \
0  Excellent!! Yummy!  Great with other foods and...       NaN  B00CM36GAQ   
1  Excellent!  The best!  I use it with my beef a...       NaN  B074J5WVYH   
2  These are very tasty. They are extremely soft ...       NaN  B079TRNVHX   
3                                       My favorite!       NaN  B07194LN2Z   
4     Great for making bro

In [1]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import os

# Merge the 20 Automotive chunk files into a single Parquet file,
# processing one chunk at a time instead of loading all parts together.

# Folder where your parts are saved
base_path = r"C:\Users\annik\OneDrive\Desktop\BIG DATA A3"

# Output full merged file
output_path = os.path.join(base_path, "Automotive_Merged_FULL.parquet")
writer = None

try:
    # Loop through all 20 parts
    for i in range(1, 21):
        print(f"Processing part {i}...")
        chunk_path = os.path.join(base_path, f"Automotive_part{i}.parquet")
        chunk_df = pd.read_parquet(chunk_path)

        # Convert chunk to PyArrow Table
        table = pa.Table.from_pandas(chunk_df)

        # Create the writer lazily: its schema is taken from the first chunk
        if writer is None:
            writer = pq.ParquetWriter(output_path, table.schema)

        # Write to Parquet incrementally
        writer.write_table(table)
finally:
    # Always close the writer, even when a chunk fails part-way through, so the
    # output file is finalized and its handle is released. The None guard covers
    # a failure that happens before the writer was ever created.
    if writer is not None:
        writer.close()

# Only reached when every part was written successfully
print("✅ All 20 parts merged and saved at:\n" + output_path)

Processing part 1...
Processing part 2...
Processing part 3...
Processing part 4...
Processing part 5...
Processing part 6...
Processing part 7...
Processing part 8...
Processing part 9...
Processing part 10...
Processing part 11...
Processing part 12...
Processing part 13...
Processing part 14...
Processing part 15...
Processing part 16...
Processing part 17...
Processing part 18...
Processing part 19...
Processing part 20...
✅ All 20 parts merged and saved at:
C:\Users\annik\OneDrive\Desktop\BIG DATA A3\Automotive_Merged_FULL.parquet


In [None]:
import pyarrow.parquet as pq

# Preview the first 10 rows of the merged file without loading the whole file.

# Path to your full Parquet file
file_path = r"C:\Users\annik\OneDrive\Desktop\BIG DATA A3\Automotive_Merged_FULL.parquet"

# Open the file; no row data is loaded at this point
parquet_file = pq.ParquetFile(file_path)

# Stream only the first batch (at most 10 rows). Reading the full dataset and
# slicing afterwards would materialize every row in memory first.
first_batch = next(parquet_file.iter_batches(batch_size=10), None)

if first_batch is None:
    # The file has no rows: build an empty frame that still carries the columns
    df_sample = parquet_file.schema_arrow.empty_table().to_pandas()
else:
    df_sample = first_batch.to_pandas()  # Convert to pandas DataFrame

# Show the sample data
print(df_sample)