In [4]:
# Extract 334 random entries each from three .parquet files, creating a 1002-entry dataset
import pandas as pd
import glob
import os

def sample_and_combine(parquet_files, samples_per_file=334, output_file="sampled_dataset.parquet"):
    """
    Read several parquet files, draw a random sample of rows from each,
    and write the concatenated samples to a single parquet file.

    Args:
        parquet_files (list): Paths of the parquet files to sample from.
        samples_per_file (int): Number of rows to draw from each file. A file
            holding fewer rows contributes all of its rows and a warning is
            printed instead of the whole run failing.
        output_file (str): Path of the combined parquet file to write.

    Raises:
        ValueError: If ``parquet_files`` is empty (e.g. a glob pattern that
            matched nothing), since there would be nothing to combine.
    """
    parquet_files = list(parquet_files)
    if not parquet_files:
        # Fail early with an actionable message instead of pd.concat's
        # generic "No objects to concatenate".
        raise ValueError("No parquet files to sample from; check the file list or glob pattern.")

    sampled_dfs = []

    for file in parquet_files:
        print(f"Reading {file} ...")
        df = pd.read_parquet(file)

        # DataFrame.sample without replacement cannot draw more rows than
        # exist, so cap the request at the number of rows available.
        n_rows = min(samples_per_file, len(df))
        if n_rows < samples_per_file:
            print(f"Warning: {file} has only {len(df)} rows; sampling {n_rows} instead of {samples_per_file}")

        # Fixed seed so re-running the notebook selects the same rows.
        sampled = df.sample(n=n_rows, random_state=42)
        sampled_dfs.append(sampled)

    # ignore_index=True renumbers rows 0..N-1 across the per-file samples.
    combined_df = pd.concat(sampled_dfs, ignore_index=True)
    print(f"Final dataset shape: {combined_df.shape}")

    combined_df.to_parquet(output_file, index=False)
    print(f"Saved combined dataset to {output_file}")


if __name__ == "__main__":
    # Glob pattern for the source parquet files; adjust the pattern or paths as needed.
    source_pattern = "drive/MyDrive/Samsung - Benchmarking/data/original-summarization/*.parquet"

    # Keep only the first three matches, in sorted order.
    parquet_files = sorted(glob.glob(source_pattern))[:3]

    # 334 rows from each of the three files gives a 1002-row dataset.
    sample_and_combine(
        parquet_files,
        samples_per_file=334,
        output_file="summarization_dataset.parquet",
    )


Reading drive/MyDrive/Samsung - Benchmarking/data/original-summarization/test-00000-of-00001 (1).parquet ...
Reading drive/MyDrive/Samsung - Benchmarking/data/original-summarization/test-00000-of-00001 (2).parquet ...
Reading drive/MyDrive/Samsung - Benchmarking/data/original-summarization/test-00000-of-00001 (3).parquet ...
Final dataset shape: (1002, 3)
Saved combined dataset to summarization_dataset.parquet


In [5]:
# Print size of a dataframe
import pandas as pd
import glob
import os

# glob.glob returns a *list* of matching paths. Resolve it to one concrete file so
# read_parquet receives a single path and a missing file fails with a clear error.
mmlu_matches = sorted(glob.glob("drive/MyDrive/Samsung - Benchmarking/data/original-mmlu/*.parquet"))
if not mmlu_matches:
    raise FileNotFoundError("No .parquet file found in drive/MyDrive/Samsung - Benchmarking/data/original-mmlu/")
file = mmlu_matches[0]  # only the first match (in sorted order) is read
print(f"Reading {file} ...")
df = pd.read_parquet(file)
print(df.shape)  # (rows, columns)

Reading ['drive/MyDrive/Samsung - Benchmarking/data/original-mmlu/test-00000-of-00001.parquet'] ...
(14042, 4)


In [6]:
# Extract 1000 random entries from a single .parquet file
import pandas as pd
import glob
import os

# glob.glob returns a *list* of matching paths. Resolve it to one concrete file so
# read_parquet receives a single path and a missing file fails with a clear error.
mmlu_matches = sorted(glob.glob("drive/MyDrive/Samsung - Benchmarking/data/original-mmlu/*.parquet"))
if not mmlu_matches:
    raise FileNotFoundError("No .parquet file found in drive/MyDrive/Samsung - Benchmarking/data/original-mmlu/")
file = mmlu_matches[0]  # only the first match (in sorted order) is read
print(f"Reading {file} ...")
df = pd.read_parquet(file)
print(df.shape)  # (rows, columns)

output_file = "mmlu_dataset.parquet"
# Fixed seed so re-running the cell selects the same 1000 rows.
sampled = df.sample(n=1000, random_state=42)
sampled.to_parquet(output_file, index=False)
# Nothing is combined in this cell, so the message says "sampled" rather than "combined".
print(f"Saved sampled dataset to {output_file}")

Reading ['drive/MyDrive/Samsung - Benchmarking/data/original-mmlu/test-00000-of-00001.parquet'] ...
(14042, 4)
Saved combined dataset to mmlu_dataset.parquet


In [11]:
# Convert parquet to CSV
import pandas as pd

# Source parquet file and the CSV file written from it.
parquet_path = "drive/MyDrive/Samsung - Benchmarking/data/science_qa_dataset.parquet"
csv_path = "drive/MyDrive/Samsung - Benchmarking/data/science_qa_dataset.csv"

# Load the parquet data, then write it as CSV without the index column.
df = pd.read_parquet(parquet_path)
df.to_csv(csv_path, index=False)
