In [1]:
import pandas as pd
from multiprocessing import Pool, cpu_count
import numpy as np

# Load the student roster and the fee records from the working directory.
students_df = pd.read_csv("students.csv")
fees_df = pd.read_csv("student_fees.csv")

# Index every student record by its "Student ID" so a fee row can be matched
# with a single dictionary lookup.
#
# The records come from to_dict("records") rather than iterrows(): iterrows()
# builds one Series per row, which forces all of that row's values into a
# single common dtype (for example, integer columns turn into floats when the
# frame is all-numeric) and is much slower on large frames. Each value here
# is a plain {column: value} dict, which supports the same ["column"] access
# and ** unpacking as a row Series.
#
# If a "Student ID" appears more than once, the last row read wins.
students_dict = {
    record["Student ID"]: record for record in students_df.to_dict("records")
}

# Function to merge a chunk of the fees DataFrame
def merge_chunk(fee_chunk):
    """Join one slice of the fees table with the matching student records.

    Each fee row is looked up in the module-level ``students_dict`` by its
    "Student ID". Rows without a matching student are dropped, so the result
    behaves like an inner join. When a column name exists on both sides, the
    student's value overrides the fee row's value.

    Args:
        fee_chunk: DataFrame of fee rows; must contain a "Student ID" column.

    Returns:
        A list with one ``{column: value}`` dict per matched fee row, in the
        same order as the rows of ``fee_chunk``.
    """
    merged_chunk = []
    # to_dict("records") keeps each column's own value type. iterrows() would
    # build a Series per row and coerce all of its values to one common dtype
    # (e.g. an integer column becomes float next to a float column).
    for fee_record in fee_chunk.to_dict("records"):
        # Single lookup; stored student records are never None, so None
        # reliably means "no such student".
        student_data = students_dict.get(fee_record["Student ID"])
        if student_data is None:
            continue
        # Student fields are unpacked last so they win on a name clash.
        merged_chunk.append({**fee_record, **student_data})
    return merged_chunk

# Guard the pool start-up: on platforms whose multiprocessing start method is
# "spawn", every worker re-imports this module, and without the guard each
# worker would try to start a pool of its own.
if __name__ == "__main__":
    # One chunk (and one worker process) per CPU core.
    num_chunks = cpu_count()

    # Split the row positions instead of the DataFrame itself: calling
    # np.array_split directly on a DataFrame goes through the deprecated
    # DataFrame.swapaxes and emits a FutureWarning. Splitting the positions
    # yields the same chunk sizes, including empty chunks when there are
    # fewer rows than cores.
    chunk_positions = np.array_split(np.arange(len(fees_df)), num_chunks)
    fee_chunks = [fees_df.iloc[positions] for positions in chunk_positions]

    # Merge the chunks in parallel; pool.map returns results in chunk order.
    with Pool(num_chunks) as pool:
        results = pool.map(merge_chunk, fee_chunks)

    # Flatten the per-chunk lists into a single list of merged rows.
    merged_data = [row for chunk in results for row in chunk]

    # Build the final table from the merged rows.
    merged_df = pd.DataFrame(merged_data)

    # Write the merged table to disk without the positional index column.
    merged_file_path = "merged_students_fees_parallel.csv"
    merged_df.to_csv(merged_file_path, index=False)

    print(f"Merged file saved to: {merged_file_path}")


  return bound(*args, **kwds)


Merged file saved to: merged_students_fees_parallel.csv
