In [76]:
import re

# Checks if a given string is a valid RNA sequence.
def is_valid_rna(sequence):
    """Return True when ``sequence`` is made up solely of the bases A, C, G, U.

    The entire string has to match, so the empty string and any string that
    contains another character (lowercase letters included) give False.
    """
    return re.fullmatch(r'^[ACGU]+$', sequence) is not None

# Demo: the first sequence uses only A/C/G/U; the second contains 'T' and 'X',
# which the validator's character class does not accept.
print(is_valid_rna("AUGCAUGCAUGC"))  # Expected output: True
print(is_valid_rna("AUGTXAGCAUGC"))  # Expected output: False


True
False


In [77]:
def nucleotide_count(sequence):
    """Tally how many times each base occurs in an RNA sequence.

    Returns a dict with the keys 'A', 'C', 'G', 'U' (in that order) mapped to
    their occurrence counts. If ``is_valid_rna`` rejects the sequence, an
    error-message string is returned instead of a dict.
    """
    if not is_valid_rna(sequence):
        return "Invalid RNA sequence. Sequence must contain only A, C, G, U."

    # One count per base; the key order mirrors the alphabet string below.
    return {base: sequence.count(base) for base in "ACGU"}

# Demo: three valid sequences; each call prints the per-base count dictionary.
print(nucleotide_count("AUGCAUGCAUGC"))  # Expected output: {'A': 3, 'C': 3, 'G': 3, 'U': 3}
print(nucleotide_count("ACGUACGUACGU"))  # Expected output: {'A': 3, 'C': 3, 'G': 3, 'U': 3}
print(nucleotide_count("AAAACCCGGGUUU"))  # Expected output: {'A': 4, 'C': 3, 'G': 3, 'U': 3}


{'A': 3, 'C': 3, 'G': 3, 'U': 3}
{'A': 3, 'C': 3, 'G': 3, 'U': 3}
{'A': 4, 'C': 3, 'G': 3, 'U': 3}


In [78]:
def find_motifs(sequence, motif):
    """Locate every occurrence of ``motif`` in an RNA sequence.

    ``motif`` is compiled as a regular expression, so a plain string such as
    "AUG" is matched literally and regex syntax is honoured. Occurrences are
    allowed to overlap: "AUA" in "AUAUAUA" is reported at 0, 2 and 4.

    Args:
        sequence: RNA string; must pass ``is_valid_rna``.
        motif: Motif (regular expression) to look for.

    Returns:
        A list of 0-based start indices in ascending order, or an
        error-message string when the sequence is not valid RNA.

    Raises:
        re.error: If ``motif`` is not a valid regular expression.
    """
    if not is_valid_rna(sequence):
        return "Invalid RNA sequence. Sequence must contain only A, C, G, U."

    pattern = re.compile(motif)

    # Try an anchored match at every start offset instead of using finditer():
    # finditer() resumes after the end of the previous hit and would therefore
    # skip occurrences that overlap an earlier one. The range includes
    # len(sequence) so that patterns able to match the empty string keep
    # reporting the end-of-string position, as finditer() did.
    return [
        start
        for start in range(len(sequence) + 1)
        if pattern.match(sequence, start)
    ]

# Test examples (positions are 0-based start indices)
print(find_motifs("AUGCAUGCAUGC", "AUG"))  # Expected output: [0, 4, 8]
print(find_motifs("AUGCAUGCAUGC", "CAU"))  # Expected output: [3, 7]
print(find_motifs("AUGCAUGCAUGCUAG", "AUGC"))  # Expected output: [0, 4, 8]
print(find_motifs("AUGCUGCAUGCAUGCUGCAUGCAUGCUAG", "AUGC"))  # Expected output: [0, 7, 11, 18, 22]


[0, 4, 8]
[3, 7]
[0, 4, 8]
[0, 7, 11, 18, 22]


In [79]:
def complementary_sequence(sequence):
    """Build the base-by-base complement of an RNA sequence.

    A is swapped with U and C with G; the order of the bases is kept (the
    result is not reversed). When ``is_valid_rna`` rejects the input, an
    error-message string is returned instead.
    """
    if not is_valid_rna(sequence):
        return "Invalid RNA sequence. Sequence must contain only A, C, G, U."

    # Translation table pairing each base with its partner: A<->U, C<->G.
    pairing = str.maketrans("AUCG", "UAGC")
    return sequence.translate(pairing)

# Demo: each base is replaced by its partner (A<->U, C<->G); order is preserved.
print(complementary_sequence("AUGCAUGCAUGC"))  # Expected output: UACGUACGUACG
print(complementary_sequence("ACGUACGUACGU"))  # Expected output: UGCAUGCAUGCA
print(complementary_sequence("AAAACCCGGGUUU"))  # Expected output: UUUUGGGCCCAAA


UACGUACGUACG
UGCAUGCAUGCA
UUUUGGGCCCAAA


In [80]:
def gc_content(sequence):
    """Compute the percentage of G and C bases in an RNA sequence.

    Returns the percentage rounded to two decimal places, or an
    error-message string when ``is_valid_rna`` rejects the sequence.
    """
    if not is_valid_rna(sequence):
        return "Invalid RNA sequence. Sequence must contain only A, C, G, U."

    n_bases = len(sequence)
    # Guard against dividing by zero for a zero-length sequence.
    if n_bases == 0:
        return 0.0

    n_gc = sum(sequence.count(base) for base in "GC")
    return round(n_gc / n_bases * 100, 2)

# Test examples (percentages are rounded to two decimal places)
print(gc_content("AUGCAUGCAUGC"))  # Expected output: 50.0
print(gc_content("ACGUACGUACGU"))  # Expected output: 50.0
print(gc_content("AAAACCCGGGUUU"))  # Expected output: 46.15
print(gc_content("GGGGCCCC"))       # Expected output: 100.0
print(gc_content("AUAUAUAU"))       # Expected output: 0.0


50.0
50.0
46.15
100.0
0.0


In [81]:
def is_valid_rna_adv(sequence):
    """Check whether ``sequence`` is RNA that may include ambiguity codes.

    Accepted characters are the four bases A, C, G, U plus the ambiguity
    codes R, Y, W, S, K, M, B, D, H, V and N (uppercase only).

    Args:
        sequence: The string to validate.

    Returns:
        True when the string is non-empty and every character is accepted,
        otherwise False.
    """
    # fullmatch() is required here: with match(), the trailing '$' also
    # succeeds just before a final newline, so "AUGC\n" would be accepted.
    pattern = re.compile(r'^[ACGURYWSKMBDHVN]+$')

    return pattern.fullmatch(sequence) is not None

# Demo: the first string uses only bases and ambiguity codes; the second
# contains 'T', 'X' and 'Z', none of which are in the accepted alphabet.
valid_sequence_with_ambiguity = "AUGCRYSWKMBDHVN"
invalid_sequence = "AUGTXZGCAUGC"

print(is_valid_rna_adv(valid_sequence_with_ambiguity))  # Expected output: True
print(is_valid_rna_adv(invalid_sequence))              # Expected output: False


True
False


In [82]:
# Maps each nucleotide code to the regex fragment matching the concrete bases
# it stands for: plain bases map to themselves, ambiguity codes map to a
# character class (e.g. 'R' -> '[AG]', 'N' -> '[ACGU]').
ambiguity_codes = {
    'A': 'A', 'C': 'C', 'G': 'G', 'U': 'U',
    'R': '[AG]', 'Y': '[CU]', 'S': '[CG]', 'W': '[AU]',
    'K': '[GU]', 'M': '[AC]', 'B': '[CGU]', 'D': '[AGU]',
    'H': '[ACU]', 'V': '[ACG]', 'N': '[ACGU]'
}

def motif_to_regex(motif):
    """Translate a motif written with ambiguity codes into a regex pattern.

    Every character found in ``ambiguity_codes`` is replaced by its regex
    fragment. Any other character is escaped so that it is matched literally
    rather than being interpreted as regex syntax (a '.' would otherwise
    match any base and an unbalanced '(' would raise ``re.error``).

    Args:
        motif: Motif string, e.g. "RY" or "AUG".

    Returns:
        The regex pattern as a string; "" for an empty motif.
    """
    return ''.join(
        ambiguity_codes[char] if char in ambiguity_codes else re.escape(char)
        for char in motif
    )

def find_motifs_with_amb(sequence, motif):
    """Locate every occurrence of a motif that may contain ambiguity codes.

    The motif is converted to a regular expression with ``motif_to_regex``
    and matched against ``sequence``. Occurrences are allowed to overlap:
    "AUA" in "AUAUA" is reported at 0 and 2.

    Args:
        sequence: RNA string; must pass ``is_valid_rna_adv``.
        motif: Motif string, optionally using ambiguity codes.

    Returns:
        A list of 0-based start indices in ascending order, or the string
        "Invalid RNA sequence." when the sequence fails validation.
    """
    if not is_valid_rna_adv(sequence):
        return "Invalid RNA sequence."

    pattern = re.compile(motif_to_regex(motif))

    # Try an anchored match at every start offset instead of using finditer():
    # finditer() resumes after the end of the previous hit and would therefore
    # skip occurrences that overlap an earlier one. The range includes
    # len(sequence) so that an empty motif keeps reporting the end position,
    # as finditer() did.
    return [
        start
        for start in range(len(sequence) + 1)
        if pattern.match(sequence, start)
    ]

# Test examples
# Ambiguity codes in the motif are expanded to the concrete bases they stand
# for (e.g. 'RY' becomes the pattern [AG][CU]), so the hits below are the
# A/C/G/U stretches at the start of the sequence; the literal R, Y, S, N
# letters inside the sequence are not matched by those patterns.
sequence = "AUGCRYSN"
motif = "RY"
positions = find_motifs_with_amb(sequence, motif)
print(f"Motif '{motif}' found at positions: {positions}")  # Expected output: Motif 'RY' found at positions: [0, 2]
print(f"Motif 'AUG' found at positions: {find_motifs_with_amb('AUGCAUGCAUGC', 'AUG')}")  # Expected output: Motif 'AUG' found at positions: [0, 4, 8]
print(f"Motif 'N' found at positions: {find_motifs_with_amb('AUGCRYSN', 'N')}")  # Expected output: Motif 'N' found at positions: [0, 1, 2, 3]
print(f"Motif 'S' found at positions: {find_motifs_with_amb('AUGCRYSN', 'S')}")  # Expected output: Motif 'S' found at positions: [2, 3]
print(f"Motif 'RYS' found at positions: {find_motifs_with_amb('AUGCRYSN', 'RYS')}")  # Expected output: Motif 'RYS' found at positions: [0]


Motif 'RY' found at positions: [0, 2]
Motif 'AUG' found at positions: [0, 4, 8]
Motif 'N' found at positions: [0, 1, 2, 3]
Motif 'S' found at positions: [2, 3]
Motif 'RYS' found at positions: [0]


In [83]:
def complementary_sequence_with_amb(sequence):
    """Build the base-by-base complement of a sequence with ambiguity codes.

    Each character is replaced by its partner (A<->U, C<->G, R<->Y, K<->M,
    B<->V, D<->H; S, W and N map to themselves). Characters without an entry
    are copied through unchanged. No validation is performed and the order
    of the characters is preserved.
    """
    # Two aligned strings: position i of the first pairs with position i of
    # the second.
    partners = dict(zip("AUCGRYSWKMBDHVN", "UAGCYRSWMKVHDBN"))

    swapped = []
    for symbol in sequence:
        swapped.append(partners.get(symbol, symbol))
    return "".join(swapped)

In [84]:
def gc_content_with_amb(sequence):
    """Compute the GC percentage of a sequence that may hold ambiguity codes.

    The characters 'G', 'C' and 'S' are counted towards the numerator; every
    other character only contributes to the length. No validation is
    performed. Returns the percentage rounded to two decimals, or 0.0 for an
    empty sequence.
    """
    gc_like = sum(sequence.count(code) for code in ('G', 'C', 'S'))

    n_symbols = len(sequence)
    # An empty sequence has no content to measure; avoid dividing by zero.
    if n_symbols == 0:
        return 0.0

    return round(gc_like / n_symbols * 100, 2)

In [85]:
def fragment_and_analyze(sequence, fragment_length):
    """Cut a sequence into consecutive fragments and analyse each one.

    Fragments are taken left to right in steps of ``fragment_length``; the
    last fragment may be shorter. For every fragment a dict is produced with
    the keys 'fragment', 'is_valid_rna', 'gc_content' and
    'complementary_sequence'. The last two hold 'N/A' when the fragment
    fails ``is_valid_rna_adv``.

    Returns the list of per-fragment dicts in sequence order.
    """
    def _describe(piece):
        # Only valid fragments are analysed; invalid ones get placeholders.
        ok = is_valid_rna_adv(piece)
        if ok:
            gc_value = gc_content_with_amb(piece)
            complement = complementary_sequence_with_amb(piece)
        else:
            gc_value = complement = 'N/A'
        return {
            'fragment': piece,
            'is_valid_rna': ok,
            'gc_content': gc_value,
            'complementary_sequence': complement,
        }

    starts = range(0, len(sequence), fragment_length)
    return [_describe(sequence[start:start + fragment_length]) for start in starts]

# Demo: split a 24-character sequence into four fragments of 6. The third
# fragment ('RYXNAU') contains 'X', which is outside the accepted alphabet,
# so it is reported as invalid with 'N/A' placeholders.
sequence = "AUGCRYSNAUGCRYXNAUGCRYSN"
fragment_length = 6
analysis_results = fragment_and_analyze(sequence, fragment_length)
for result in analysis_results:
    print(result)


{'fragment': 'AUGCRY', 'is_valid_rna': True, 'gc_content': 33.33, 'complementary_sequence': 'UACGYR'}
{'fragment': 'SNAUGC', 'is_valid_rna': True, 'gc_content': 50.0, 'complementary_sequence': 'SNUACG'}
{'fragment': 'RYXNAU', 'is_valid_rna': False, 'gc_content': 'N/A', 'complementary_sequence': 'N/A'}
{'fragment': 'GCRYSN', 'is_valid_rna': True, 'gc_content': 50.0, 'complementary_sequence': 'CGYRSN'}


# **Unit Tests**

In [86]:
import unittest
class Tests(unittest.TestCase):
 def test_nucleotide_count_valid_sequence(self):
        sequence = "ACGUACGU"
        expected_counts = {'A': 2, 'C': 2, 'G': 2, 'U': 2}
        self.assertEqual(nucleotide_count(sequence), expected_counts)

 def test_nucleotide_count_invalid_sequence(self):
        sequence = "ACGTX"
        expected_message = "Invalid RNA sequence. Sequence must contain only A, C, G, U."
        self.assertEqual(nucleotide_count(sequence), expected_message)
 def test_find_motifs(self):
        sequence = "AUGCAUGCAUGC"
        motif = "AUG"
        expected_positions = [0, 4, 8]
        self.assertEqual(find_motifs(sequence, motif), expected_positions)
 def test_gc_content_basic(self):
        # Test with a basic RNA sequence
        rna_seq = "AUCGCUAGU"
        expected_result = 44.44
        self.assertAlmostEqual(gc_content(rna_seq), expected_result, places=2)

 def test_gc_content_ambiguity_codes(self):
        # Test with an RNA sequence containing ambiguity codes
        rna_seq = "AUCGRYSWN"
        expected_result = 33.33
        self.assertAlmostEqual(gc_content_with_amb(rna_seq), expected_result, places=2)

 def test_complementary_sequence_basic(self):
        # Test with a basic RNA sequence
        rna_seq = "AUCGCUAGU"
        expected_result = "UAGCGAUCA"
        self.assertEqual(complementary_sequence(rna_seq), expected_result)
 def test_complementary_sequence_ambiguity_codes(self):
        # Test with an RNA sequence containing ambiguity codes
        rna_seq = "AUCGRYSWN"
        expected_result = "UAGCYRSWN"
        self.assertEqual(complementary_sequence_with_amb(rna_seq), expected_result)

 def test_basic_valid_rna(self):
        # Test with a basic valid RNA sequence
        rna_seq = "AUGCAUGCAUGC"
        self.assertTrue(is_valid_rna(rna_seq))

 def test_valid_rna_with_ambiguity_codes(self):
        # Test with an RNA sequence containing ambiguity codes
        rna_seq = "AUGCRYSWMNAGUC"
        self.assertTrue(is_valid_rna_adv(rna_seq))

 def test_fragment_and_analyze_basic(self):
        # Test with a basic RNA sequence and fragment length 3
        sequence = "AUGCAUGCAUGC"
        fragment_length = 3
        expected_results = [
            {'fragment': 'AUG', 'is_valid_rna': True, 'gc_content': 33.33, 'complementary_sequence': 'UAC'},
            {'fragment': 'CAU', 'is_valid_rna': True, 'gc_content': 33.33, 'complementary_sequence': 'GUA'},
            {'fragment': 'GCA', 'is_valid_rna': True, 'gc_content': 66.67, 'complementary_sequence': 'CGU'},
            {'fragment': 'UGC', 'is_valid_rna': True, 'gc_content': 66.67, 'complementary_sequence': 'ACG'}
        ]
        self.assertEqual(fragment_and_analyze(sequence, fragment_length), expected_results)

def test_fragment_and_analyze_ambiguity_codes(self):
        # Test with an RNA sequence containing ambiguity codes and fragment length 4
        sequence = "AUGCRYSWMNAGUC"
        fragment_length = 4
        expected_results = [
            {'fragment': 'AUGC', 'is_valid_rna': True, 'gc_content': 50.0, 'complementary_sequence': 'UACG'},
            {'fragment': 'RYSW', 'is_valid_rna': False, 'gc_content': 'N/A', 'complementary_sequence': 'N/A'},
            {'fragment': 'MNAG', 'is_valid_rna': False, 'gc_content': 'N/A', 'complementary_sequence': 'N/A'},
            {'fragment': 'UC', 'is_valid_rna': True, 'gc_content': 50.0, 'complementary_sequence': 'AG'}
        ]
        self.assertEqual(fragment_and_analyze(sequence, fragment_length), expected_results)

# Run the suite inside the notebook: argv=[''] keeps unittest from parsing the
# kernel's own command-line arguments, and exit=False stops it from calling
# sys.exit() once the run has finished.
unittest.main(argv=[''], verbosity=2, exit=False)

test_basic_valid_rna (__main__.Tests) ... ok
test_complementary_sequence_ambiguity_codes (__main__.Tests) ... ok
test_complementary_sequence_basic (__main__.Tests) ... ok
test_find_motifs (__main__.Tests) ... ok
test_fragment_and_analyze_basic (__main__.Tests) ... ok
test_gc_content_ambiguity_codes (__main__.Tests) ... ok
test_gc_content_basic (__main__.Tests) ... ok
test_nucleotide_count_invalid_sequence (__main__.Tests) ... ok
test_nucleotide_count_valid_sequence (__main__.Tests) ... ok
test_valid_rna_with_ambiguity_codes (__main__.Tests) ... ok

----------------------------------------------------------------------
Ran 10 tests in 0.024s

OK


<unittest.main.TestProgram at 0x7d6c8287acb0>