In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import length, mean
import matplotlib.pyplot as plt

In [None]:
# Create the Spark session for this notebook, or reuse one that is already running
spark = (
    SparkSession.builder
    .appName("GenomeDataProcessing")
    .getOrCreate()
)

In [None]:
# Load the (mock) genome dataset from CSV; the first row supplies the column names
genome_df = (
    spark.read
    .option("header", True)
    .csv("../data/genome_data.csv")
)

In [None]:
# Preview the DataFrame: at most 20 rows, long cell values truncated (show()'s defaults)
genome_df.show(n=20, truncate=True)

In [None]:
# Add a SequenceLength column holding the character length of each GenomeSequence value
genome_df = genome_df.withColumn(
    "SequenceLength",
    length("GenomeSequence"),
)

In [None]:
# Calculate the mean sequence length.
# mean() skips null lengths and evaluates to null (None in Python) when there
# are no non-null values at all, e.g. an empty CSV. Formatting None with
# ":.2f" raises TypeError, so that case is reported explicitly instead.
mean_length = genome_df.agg(mean("SequenceLength")).first()[0]
if mean_length is None:
    print("Mean Sequence Length: n/a (no sequences found)")
else:
    print(f"Mean Sequence Length: {mean_length:.2f}")

In [None]:
# Collect sequence lengths to the driver as a plain Python list for the histogram.
# Rows whose GenomeSequence is null have a null SequenceLength; they are dropped
# here because a None entry in the list makes plt.hist fail. Collecting Row
# objects directly also avoids the extra DataFrame -> RDD conversion.
sequence_lengths = [
    row["SequenceLength"]
    for row in genome_df.select("SequenceLength").dropna().collect()
]

In [None]:
# Draw a 20-bin histogram of the collected lengths on the current axes and label it
hist_ax = plt.gca()
hist_ax.hist(sequence_lengths, bins=20, edgecolor='black')
hist_ax.set_title("Distribution of Genome Sequence Lengths")
hist_ax.set_xlabel("Sequence Length")
hist_ax.set_ylabel("Frequency")

In [None]:
# Display the histogram (plt.show() renders all currently open figures).
# NOTE(review): with Jupyter's inline backend a figure is normally rendered at
# the end of the cell that creates it, so this separate cell may show nothing —
# confirm, and consider calling plt.show() in the plotting cell instead.
plt.show()

In [None]:
# Stop the Spark session now that the analysis is finished.
# Cells that use `spark` or `genome_df` cannot be re-run after this until the
# session-creation cell has been executed again.
spark.stop()