In [67]:
import pyspark
from pyspark.sql import SparkSession

In [68]:
import os
import pandas
from pyspark.sql.types import *

In [69]:
# Create a new SparkSession
spark2 = SparkSession.builder.appName('spark-notebook-2').master('spark://spark-master:7077').getOrCreate()

# Create the schema
schema = StructType([
    StructField("0", StringType(), True)])

In [70]:
# The data from covid.csv will be loaded from a local file path
for path, subdirs, files in os.walk('./data/'):
    for name in files:
        if "ecoli" in name and name.endswith(".txt"):
            txtPath = os.path.join(path, name)
            print("Loading data from text file: {}".format(txtPath))
            # Load the ecoli.txt file
            lines_txt = pandas.read_csv(txtPath)

Loading data from text file: ./data/ecoli.txt


In [71]:
lines_sp = spark2.createDataFrame(lines_txt, schema=schema)

In [72]:
# Preview the structure
lines_sp.show(10)

2023-09-13 09:49:12,021 WARN scheduler.TaskSetManager: Stage 0 contains a task of very large size (2405 KiB). The maximum recommended task size is 1000 KiB.
[Stage 0:>                                                          (0 + 1) / 1]

+--------------------+
|                   0|
+--------------------+
|TTCTGAACTGGTTACCT...|
|TATAGGCATAGCGCACA...|
|ATTACCACCACCATCAC...|
|CCCGCACCTGACAGTGC...|
|GTTCGGCGGTACATCAG...|
|AGGCAGGGGCAGGTGGC...|
|AAAAAACCATTAGCGGC...|
|GACGGGACTCGCCGCCG...|
|GCCCAAATAAAACATGT...|
|TGATTTGCCGTGGCGAG...|
+--------------------+
only showing top 10 rows



                                                                                

In [73]:
# Convert Spark DataFrame to an RDD
lines_rdd = lines_sp.rdd

# Define the lengths for patterns (3 and 4)
pattern_lengths = [3, 4]

In [74]:
lines_rdd = lines_rdd.map(lambda row: row[0])

In [75]:
lines_rdd

PythonRDD[11] at RDD at PythonRDD.scala:53

In [76]:
def line_to_grams(line, length):
    ngrams = []
    for i in range(len(line) - length + 1):
        ngrams.append(line[i : i + length])
    return ngrams

pattern_lengths = [3, 4]

In [77]:
# Create an RDD of n-grams for each pattern length
ngrams_rdds = []
for length in pattern_lengths:
    ngrams_rdd = lines_rdd.flatMap(lambda line: line_to_grams(line, length))
    ngrams_rdds.append(ngrams_rdd)

# Calculate the counts of n-grams
count_rdds = []
for ngrams_rdd, length in zip(ngrams_rdds, pattern_lengths):
    count_rdd = ngrams_rdd.map(lambda ngram: (ngram, 1)).reduceByKey(lambda a, b: a + b).map(lambda x: (x[1], x[0]))
    count_rdds.append(count_rdd)

In [78]:
count_rdds # The two rdds created have the data for each pattern length 

[PythonRDD[20] at RDD at PythonRDD.scala:53,
 PythonRDD[21] at RDD at PythonRDD.scala:53]

In [79]:
merged_rdd = count_rdds[0].union(count_rdds[1])

In [80]:
dna_patterns = merged_rdd.sortByKey(ascending=False).map(lambda x: f"{x[1]} {x[0]}")

2023-09-13 09:49:17,731 WARN scheduler.TaskSetManager: Stage 1 contains a task of very large size (2405 KiB). The maximum recommended task size is 1000 KiB.
2023-09-13 09:49:19,898 WARN scheduler.TaskSetManager: Stage 2 contains a task of very large size (2405 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [81]:
dna_patterns.take(10)

['CGC 112398',
 'GCG 111418',
 'TTT 106672',
 'AAA 105850',
 'CAG 101927',
 'CTG 99892',
 'GCA 93338',
 'TGC 92456',
 'GCC 90317',
 'GGC 89538']

In [82]:
# Split each string into two columns (ngram and count) using space as the delimiter
split_rdd = dna_patterns.map(lambda line: line.split())

# Create a DataFrame from the RDD with column names
columns = ["pattern", "count"]
df = spark2.createDataFrame(split_rdd, columns)

In [83]:
df.show(10)

+-------+------+
|pattern| count|
+-------+------+
|    CGC|112398|
|    GCG|111418|
|    TTT|106672|
|    AAA|105850|
|    CAG|101927|
|    CTG| 99892|
|    GCA| 93338|
|    TGC| 92456|
|    GCC| 90317|
|    GGC| 89538|
+-------+------+
only showing top 10 rows



In [84]:
spark2.stop()