In [6]:
import pprint
from collections import Counter

# Raw strand exactly as supplied; it contains one stray space character.
protein_strand = "ACACGTGTCAGTTGTGCAGTACACGTACGTCAGTCAACTGTGACCAGTTGGTCAGT CAACCAACGTCAGTCAACTGACCACAGTGTCATGACACGTACGT"
# The same strand with every space character removed.
data = "".join(protein_strand.split(" "))

class Strand:
    """Must be supplied with a protein strand as a string upon class creation. Does not modify original strand.
    Class is centered around scanning the protein strands for specific protein segments of the users choice.
    The build in methods return information such as the index a segment was located along with its occurances.
    Read methods for more details"""
    def __init__(self, protein_strand:str) -> None:
        self.strand = protein_strand
        self.occurances = 0
        self.segment_data = {}
        self.patterns()
        
    def search(self, sequence:str)->dict:
        """Scans for a protein segment sequence within within the protein strand"""
        results:dict[str,str] = {'sequence':sequence}
        for num in range(len(self.strand)):
            block = self.strand[num:num+len(sequence)]
            if block == sequence:
                results[self.occurances] = {'start':num,'end':num+len(sequence),'length':len(sequence)}
                self.occurances += 1
        return results
        
    def inverse(self, sequence:str)->str:
        """Swaps values of the four base DNA components to specified opposites.
        Takes in a string, returns a string"""
        result = ''
        # Python3.10 time!! Use it or lose it!
        # while the sequence exists:
        while len(sequence):
            s=''
            # check the first value and match it to its opposite base component
            match sequence[:1]:
                case 'A':
                    s = 'C'
                case 'C':
                    s = 'A'
                case 'G':
                    s = 'T'
                case 'T':
                    s = 'G' 
                case _:
                    pass
            # discard the checked character
            sequence=sequence[1:]
            # put the flipped component into 
            result+=s
        # once all letters are removed from the inital sequence the base case is met
        return result
    
    def patterns(self, length=4)->None:
        """Breaks protein chain into all unique segments of a specified length, as well as how many times each segment
        appears in the protein chain"""
        # A list of all segments of the specified length. (Contains duplicates)
        sequences = [self.strand[n:length+n] for n in range(len(self.strand)) if len(self.strand[n:length+n])==length]
        n=0
        # as long as there are segments left in the list
        while sequences:
            # check if iterator has reached the length of the list. If so reset it to 0 (averages one iteration was completed)
            if n >= len(sequences): n = 0
            # Using the last item of the list as a reference, check if current iterations segment matches is
            if sequences[n] == sequences[-1]:
                # if so check if a segment with the same values has already been stored
                if sequences[n] in self.segment_data.keys():
                    # if so add 1 to that segments counter while removing the found segment from the original list of sequences
                    self.segment_data[sequences.pop(n)]+=1
                else:
                    # if it is a new segment type, create a key for that segment and set its counter to 1
                    self.segment_data[sequences.pop(n)]=1
                # every time a segment is removed from the original list decrement the counter to not skip values (which would increase iterations!)
                n-=1
            n+=1
        # sort the resulting data but how many times each segment occurs (most occuring first)
        self.segment_data = sorted(self.segment_data.items(), key=lambda items:items[1], reverse=True)
        


# ANSWER 1
# Build a Strand from the cleaned-up strand text.
strand1 = Strand(data)

# Look for every occurrence of the segment "AGTC" in the strand.
results1 = strand1.search("AGTC")
# Report how many matches the search counted.
print(f"[{strand1.occurances} found]")
# Pretty-print the per-match start/end/length details.
pprint.pprint(results1)

# ANSWER 2
# A separate Strand instance keeps this search's match count independent of the one above.
strand2 = Strand(data)
# Swap each base of the segment for its opposite (A<->C, G<->T).
inverse = strand2.inverse("CAACTGGT")
# Search the strand for the swapped segment.
results2 = strand2.search(inverse)
# Report the match count and the match details.
print(f"\n[{strand2.occurances} found]")
pprint.pprint(results2)
# Split the strand at every occurrence of the swapped segment; the printed list holds all the remaining pieces.
print(f'Remaining Strand After Split:\n{strand2.strand.split(inverse)}\n')

# ANSWER 3
# Strand.patterns() already ran in the constructor and left segment_data sorted most-frequent first,
# so index 0 is the (segment, count) pair with the highest count.
print(f'Most Occuring Segment: {strand1.segment_data[0]}')

[3 found]
{0: {'end': 35, 'length': 4, 'start': 31},
 1: {'end': 57, 'length': 4, 'start': 53},
 2: {'end': 71, 'length': 4, 'start': 67},
 'sequence': 'AGTC'}

[1 found]
{0: {'end': 50, 'length': 8, 'start': 42}, 'sequence': 'ACCAGTTG'}
Remaining Strand After Split:
['ACACGTGTCAGTTGTGCAGTACACGTACGTCAGTCAACTGTG', 'GTCAGTCAACCAACGTCAGTCAACTGACCACAGTGTCATGACACGTACGT']

Most Occuring Segment: ('GTCA', 8)
