In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=f38a5403ac154f4fdfdc698839d6629ebe2aa4b4aad96a5c5fd2d65a5c5118d4
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [3]:
from pyspark.sql import SparkSession

# Get the active Spark session, or start one, under the application name "DNA-Count".
spark = (
    SparkSession.builder
    .appName("DNA-Count")
    .getOrCreate()
)

# Leave the session as the cell's last expression so the notebook displays it.
spark


# DNA Base Count

In [5]:
input_path = "/content/sample.fasta"

# Reuse the SparkContext that the existing SparkSession already holds.
sc = spark.sparkContext

# Load the input file as an RDD with one element per line of text.
records_rdd = sc.textFile(input_path)

# The sample file is small, so bringing every line back to the driver is fine here.
records_rdd.collect()


['>seq1',
 'cGTAaccaataaaaaaacaagcttaacctaattc',
 '>seq2',
 'agcttagTTTGGatctggccgggg',
 '>seq3',
 'gcggatttactcCCCCCAAAAANNaggggagagcccagataaatggagtctgtgcgtccaca',
 'gaattcgcacca',
 'AATAAAACCTCACCCAT',
 'agagcccagaatttactcCCC',
 '>seq4',
 'gcggatttactcaggggagagcccagGGataaatggagtctgtgcgtccaca',
 'gaattcgcacca']

In [9]:
def process_record(fasta_record):
    """Turn one line of a FASTA file into a list of ``(key, 1)`` pairs.

    Leading and trailing whitespace is removed before the line is inspected.

    * A header line (one that starts with ``>``) yields a single pair whose
      key is the header text itself.
    * Any other line yields one pair per character, lower-cased so that
      ``A`` and ``a`` share a key. Whitespace characters are skipped because
      they are not part of the sequence.

    Args:
        fasta_record: One line of text read from the FASTA file.

    Returns:
        A list of ``(str, int)`` tuples, each with the count ``1``. The list
        is empty for a blank or whitespace-only line.
    """
    record = fasta_record.strip()

    if record.startswith(">"):
        return [(record, 1)]

    # One pair per sequence character; the counts are summed per key later.
    return [(base, 1) for base in record.lower() if not base.isspace()]

# Map step: expand every input line into its (key, 1) pairs.
# process_record is passed directly; wrapping it in a lambda adds nothing.
pairs_rdd = records_rdd.flatMap(process_record)
collected_pairs = pairs_rdd.collect()

# Show every collected pair, one per line.
for pair in collected_pairs:
    print(pair)

('>seq1', 1)
('c', 1)
('g', 1)
('t', 1)
('a', 1)
('a', 1)
('c', 1)
('c', 1)
('a', 1)
('a', 1)
('t', 1)
('a', 1)
('a', 1)
('a', 1)
('a', 1)
('a', 1)
('a', 1)
('a', 1)
('c', 1)
('a', 1)
('a', 1)
('g', 1)
('c', 1)
('t', 1)
('t', 1)
('a', 1)
('a', 1)
('c', 1)
('c', 1)
('t', 1)
('a', 1)
('a', 1)
('t', 1)
('t', 1)
('c', 1)
('>seq2', 1)
('a', 1)
('g', 1)
('c', 1)
('t', 1)
('t', 1)
('a', 1)
('g', 1)
('t', 1)
('t', 1)
('t', 1)
('g', 1)
('g', 1)
('a', 1)
('t', 1)
('c', 1)
('t', 1)
('g', 1)
('g', 1)
('c', 1)
('c', 1)
('g', 1)
('g', 1)
('g', 1)
('g', 1)
('>seq3', 1)
('g', 1)
('c', 1)
('g', 1)
('g', 1)
('a', 1)
('t', 1)
('t', 1)
('t', 1)
('a', 1)
('c', 1)
('t', 1)
('c', 1)
('c', 1)
('c', 1)
('c', 1)
('c', 1)
('c', 1)
('a', 1)
('a', 1)
('a', 1)
('a', 1)
('a', 1)
('n', 1)
('n', 1)
('a', 1)
('g', 1)
('g', 1)
('g', 1)
('g', 1)
('a', 1)
('g', 1)
('a', 1)
('g', 1)
('c', 1)
('c', 1)
('c', 1)
('a', 1)
('g', 1)
('a', 1)
('t', 1)
('a', 1)
('a', 1)
('a', 1)
('t', 1)
('g', 1)
('g', 1)
('a', 1)
('g', 1)
('t', 1

In [10]:
# Reduce step: sum the 1s emitted for each key to get that key's frequency.
frequencies_rdd = pairs_rdd.reduceByKey(lambda x, y: x + y)
frequencies_rdd.collect()

[('>seq1', 1),
 ('c', 61),
 ('g', 53),
 ('>seq4', 1),
 ('t', 45),
 ('a', 73),
 ('>seq2', 1),
 ('>seq3', 1),
 ('n', 2)]