# Implement and demonstrate dataset sampling using the sample() and takeSample() methods in PySpark

In [None]:
from pyspark.sql import SparkSession

# Step 1: Obtain the SparkSession used by every later cell. getOrCreate()
# reuses a session that is already active instead of building a second one.
spark = (
    SparkSession.builder
    .appName("SamplingExample")
    .getOrCreate()
)

In [None]:
# Step 2: Load students.csv into a DataFrame. The first line of the file
# supplies the column names, and column types are inferred from the data
# rather than all being read as strings.
df = spark.read.csv(
    "students.csv",
    header=True,
    inferSchema=True,
)

In [None]:
# 1. Preview the dataset: print a banner, then the first 5 rows as a table
print("=== First 5 rows of dataset ===")
df.show(n=5)

In [None]:
# 2. Print schema: a banner followed by the DataFrame's column names and
# data types in tree form
print("=== Schema of dataset ===")
df.printSchema()

In [None]:
# 3. Random sample without replacement.
# fraction=0.3 is the probability of keeping each row, so the result holds
# roughly (not exactly) 30% of the rows. The seed is fixed to make reruns
# reproducible. Only the first 10 sampled rows are displayed.
print("=== Sample (30% without replacement) ===")
(
    df.sample(withReplacement=False, fraction=0.3, seed=42)
    .show(n=10)
)

In [None]:
# 4. Random sample with replacement.
# With replacement, fraction=0.2 is the expected number of times each row is
# drawn, so the same row may appear more than once and the result size is
# only approximately 20% of the dataset. Only the first 10 sampled rows are
# displayed.
print("=== Sample (20% with replacement) ===")
(
    df.sample(withReplacement=True, fraction=0.2, seed=42)
    .show(n=10)
)

In [None]:
# 5. takeSample without replacement: unlike sample(), this asks for a fixed
# number of rows (5) and returns them to the driver as a Python list instead
# of a DataFrame, so it goes through the underlying RDD.
print("=== takeSample: 5 rows (without replacement) ===")
sampled_rows = df.rdd.takeSample(withReplacement=False, num=5, seed=42)
# Each element is printed on its own line.
for row in sampled_rows:
    print(row)

In [None]:
# 6. takeSample with replacement: again requests 5 rows as a Python list on
# the driver, but the same row may be drawn more than once.
print("=== takeSample: 5 rows (with replacement) ===")
sampled_rows_wr = df.rdd.takeSample(withReplacement=True, num=5, seed=42)
# Each element is printed on its own line.
for row in sampled_rows_wr:
    print(row)

In [None]:
# 7. Count total rows (to compare with sampled data size)
# count() runs a job over the whole dataset and returns the number of rows.
print("Total rows in dataset:", df.count())

# Stop Spark session
# NOTE(review): the stop call below is left disabled — presumably so the
# session stays usable when cells are re-run; confirm, and enable it once the
# notebook is finished so the session's resources are released.
# spark.stop()