In [1]:
import pandas as pd
from pathlib import Path

## 2. Manual Data Splitting Across Nodes:
You will manually split the same dataset into parts and place each part on a different
node:
- Worker 1: Receives one portion of the dataset (`part2.csv`).
- Worker 2: Receives another portion of the dataset (`part3.csv`). In the
simplified single-worker setup, the one worker handles half the data instead.
- Master: Holds the remaining portion of the dataset (`part1.csv`).

Each node will only have access to its part of the dataset, which will later be combined
during distributed processing.

In [2]:
# Working directory: the source CSV is read from here and the split files
# are written back to the same place.
data_dir = Path(".")

source_csv = data_dir / "jan_2008.csv"
df_full = pd.read_csv(source_csv, low_memory=False)
print(f"Full dataset: {df_full.shape[0]:,} rows")

# Cut the frame into three contiguous row ranges. `n` is the floor of a third
# of the row count, so the final slice is left open-ended to pick up any
# leftover rows when the total is not divisible by 3.
n = len(df_full) // 3
df_part1, df_part2, df_part3 = (
    df_full.iloc[:n],
    df_full.iloc[n : 2 * n],
    df_full.iloc[2 * n :],
)

Full dataset: 605,765 rows


In [3]:
# Write each slice to its own CSV next to the source file; the positional
# index is dropped so the files contain only the original columns.
parts_by_filename = {
    "part1.csv": df_part1,
    "part2.csv": df_part2,
    "part3.csv": df_part3,
}
for part_filename, part_frame in parts_by_filename.items():
    part_frame.to_csv(data_dir / part_filename, index=False)

# Report how many rows each node's file holds.
print(f"Part 1 (master): {len(df_part1):,} rows")
print(f"Part 2 (worker1): {len(df_part2):,} rows")
print(f"Part 3 (worker2): {len(df_part3):,} rows")

Part 1 (master): 201,921 rows
Part 2 (worker1): 201,921 rows
Part 3 (worker2): 201,923 rows
