In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import random
import heapq


# Create the notebook's Spark session, or reuse one that is already running
spark = (
    SparkSession.builder
    .appName("SortLargeDataset")
    .getOrCreate()
)

In [None]:

# Generate a large dataset
def generate_large_dataset(file_path, num_records, seed=None, low=1, high=1000000000):
    """Write ``num_records`` random integers to ``file_path``, one per line.

    Args:
        file_path: Destination text file; an existing file is overwritten.
        num_records: How many integers to write. Zero or a negative value
            produces an empty file.
        seed: Optional seed. When given, the numbers come from a private
            generator so the same seed always yields the same file. When
            omitted, the module-level ``random`` state is used.
        low: Smallest value that may be generated (inclusive).
        high: Largest value that may be generated (inclusive).
    """
    # A private generator makes seeded runs reproducible without disturbing
    # the global random state; unseeded calls keep drawing from the global one.
    rng = random.Random(seed) if seed is not None else random
    with open(file_path, 'w') as f:
        f.writelines(f"{rng.randint(low, high)}\n" for _ in range(num_records))

# Write the demo input file
file_path = 'large_dataset.txt'
num_records = 1_000_000  # Kept small for demonstration purposes
generate_large_dataset(file_path, num_records)

# Read the file as a one-column DataFrame of text lines named "number"
df = spark.read.text(file_path).withColumnRenamed("value", "number")
df.show(5)

# Number of chunks to split the data into; adjust based on available resources
num_partitions = 10

# Cast the text column to integers, then spread the rows over the chunks
df = (
    df.withColumn("number", col("number").cast("int"))
    .repartition(num_partitions)
)

# Drop down to the RDD API for partition-level processing
rdd = df.rdd


def sort_rows_by_number(partition):
    """Return the rows of one partition ordered by their 'number' field."""
    rows = list(partition)
    rows.sort(key=lambda row: row['number'])
    return rows


# Each partition is sorted on its own; nothing is ordered across partitions yet
sorted_rdd = rdd.mapPartitions(sort_rows_by_number)

# Function to merge sorted partitions using a heap
def merge_sorted_partitions(partitions):
    """Return an iterator over every 'number' value in ascending order.

    Args:
        partitions: Iterable of partitions, each an iterable of records that
            support ``record['number']`` lookup.

    Returns:
        An iterator over all the 'number' values from all partitions, smallest
        first. Every value is pushed through a heap, so the result is ordered
        even if the incoming partitions are not.
    """
    heap = []
    values = (record['number'] for chunk in partitions for record in chunk)
    for value in values:
        heapq.heappush(heap, value)
    # Popping the heap until it is empty yields the values smallest-first
    ordered = [heapq.heappop(heap) for _ in range(len(heap))]
    return iter(ordered)

# Merge the sorted partitions into one globally ordered RDD.
# mapPartitions calls its function once per partition, so the sorted runs are
# first brought together into a single partition. Without that step each run
# would only be re-sorted on its own and the saved output would be a
# concatenation of independently sorted chunks, not a sorted dataset.
# merge_sorted_partitions orders every value it receives, so the combined
# partition comes out fully sorted.
# NOTE: this funnels all rows through a single task; for data that does not
# fit in one executor's memory, prefer a distributed sort such as
# DataFrame.orderBy("number").
merged_sorted_rdd = sorted_rdd.coalesce(1).mapPartitions(
    lambda partition: merge_sorted_partitions([partition]))

# Convert the RDD of plain numbers back to a single-column DataFrame
sorted_df = merged_sorted_rdd.map(lambda x: (x,)).toDF(["number"])

# Save the sorted DataFrame as CSV.
# mode("overwrite") lets the notebook be re-run without failing because the
# output directory already exists; coalesce(1) keeps the output in one part file.
output_file = 'sorted_large_dataset'
sorted_df.coalesce(1).write.mode("overwrite").csv(output_file, header=True)

In [None]:
# Shut down the Spark session now that the sorted output has been written
spark.stop()