**DATA SAMPLING**

In [None]:
from pyspark.sql import SparkSession

def combine_and_sample_correctly_with_multiple_columns(
    job_desc_file: str,
    resumes_file: str,
    output_file: str,
    sample_size: int = 5700,
) -> None:
    """Cross-join job descriptions with resumes and save a random sample as CSV.

    Both CSV files are loaded with Spark, every job-description row is paired
    with every resume row (Cartesian product), and ``sample_size`` of those
    pairs are written to ``output_file``.

    Args:
        job_desc_file: Path of the job-descriptions CSV (read with a header
            row and schema inference).
        resumes_file: Path of the resumes CSV (read with a header row and
            multi-line quoted fields enabled).
        output_file: Destination path handed to Spark's CSV writer; existing
            output at that path is overwritten.
        sample_size: Number of combined rows to keep. If it exceeds the size
            of the Cartesian product, every combined row is written.

    Raises:
        ValueError: If ``sample_size`` is negative, or if either input
            dataset contains no rows.
    """
    # Fail fast, before any Spark work is started.
    if sample_size < 0:
        raise ValueError(f"sample_size must be non-negative, got {sample_size}.")

    # Local import keeps this function self-contained; pyspark is already a
    # dependency of this file.
    from pyspark.sql.functions import rand

    # CSV parsing options such as multiLine/escape are per-reader options, so
    # they are set on the resumes read below rather than as session config.
    spark = (
        SparkSession.builder
        .appName("Combine and Sample Datasets with Multiple Columns")
        .config("spark.executor.memory", "4g")
        .config("spark.driver.memory", "4g")
        .getOrCreate()
    )

    print("Loading datasets...")

    # Load the job descriptions dataset
    job_desc_df = spark.read.csv(job_desc_file, header=True, inferSchema=True)

    # Load the resumes dataset with multiline support
    resumes_df = (
        spark.read
        .option("multiLine", "true")
        .option("quote", '"')
        .option("escape", '"')
        .option("header", "true")
        .csv(resumes_file)
    )

    # Count each input exactly once; every count() is a full Spark job.
    job_count = job_desc_df.count()
    resume_count = resumes_df.count()
    if job_count == 0 or resume_count == 0:
        raise ValueError("One of the input datasets is empty. Please check the input files.")

    print(f"Job Description Rows: {job_count}, Resume Rows: {resume_count}")

    # Perform Cartesian product
    print("Performing Cartesian product...")
    combined_df = job_desc_df.crossJoin(resumes_df)

    # A cross join yields exactly job_count * resume_count rows, so the total
    # is known without evaluating the (large) joined DataFrame.
    total_records = job_count * resume_count
    print(f"Total records after Cartesian product: {total_records}")

    if sample_size > total_records:
        print(f"Warning: Sample size {sample_size} exceeds total records {total_records}. Taking all records.")
        sampled_df = combined_df
    else:
        # DataFrame.sample(fraction=...) only returns *approximately* the
        # requested number of rows. A seeded random ordering followed by
        # limit() returns exactly sample_size rows.
        sampled_df = combined_df.orderBy(rand(seed=1)).limit(sample_size)

    print(f"Saving sampled dataset to: {output_file}")
    # Save the sampled dataset to a CSV file (multiLine is a read-side option
    # only, so it is not passed to the writer).
    (
        sampled_df.write
        .option("quote", '"')
        .option("escape", '"')
        .csv(output_file, header=True, mode="overwrite")
    )
    print(f"Sampled dataset saved successfully to: {output_file}")

# Input and output locations
job_desc_file = "/FileStore/tables/jd.csv"
resumes_file = "/FileStore/tables/UpdatedResumeDataSet.csv"
output_file = "/FileStore/tables/sampled_datasets.csv"

# Number of combined rows to keep
sample_size = 5700

# Run the combine-and-sample job with the settings above
combine_and_sample_correctly_with_multiple_columns(
    job_desc_file=job_desc_file,
    resumes_file=resumes_file,
    output_file=output_file,
    sample_size=sample_size,
)

Loading datasets...
Job Description Rows: 5001, Resume Rows: 962
Performing Cartesian product...
Total records after Cartesian product: 4810962
Saving sampled dataset to: /FileStore/tables/sampled_datasets.csv
Sampled dataset saved successfully to: /FileStore/tables/sampled_datasets.csv
