In [50]:
import re
import math
import pytest
import filecmp

In [59]:
class Feature:
    """Read a GenBank-style input file once and keep the parts used later.

    Creating an instance parses ``input_file`` and stores:

    * ``definition`` - text of the DEFINITION line (plus one continuation
      line when the first line does not end with a period),
    * ``list_of_features_info`` - flat list that alternates a feature key
      with a ``[location, explanation]`` pair,
    * ``origin_sequence`` - the sequence lines with digits and whitespace
      removed.

    A section that is missing from the file leaves its attribute empty
    instead of undefined, and a file that ends early is parsed as far as it
    goes instead of leaking ``StopIteration`` out of the constructor.
    """
    def __init__(self, input_file):
        """Store the path and parse the file immediately.

        input_file: path of the file to read.
        """
        self.input_file = input_file
        # lazily created line generator used by __iter__/__next__
        self._line_iter = None
        self.extract_info_from_input_file()

    def make_list_of_features(self, iterator):
        """Fill ``list_of_features_info`` from the lines after FEATURES.

        The result has the shape
        [key1, [location, explanation], key2, [...], ...].
        Reading stops at the first line that has 'ORIGIN' within its first
        20 characters, or when ``iterator`` is exhausted.

        iterator: iterator over the remaining lines of the file.
        Returns the same iterator, positioned after the last line consumed.
        """
        self.list_of_features_info = []
        try:
            line = next(iterator)
            while 'ORIGIN' not in line[:20]:
                # the feature key lives in the first 21 columns
                key = line[:21].strip()
                if key:
                    location = ''
                    # a location that opens "(" may continue on following
                    # lines; collect them until one contains ")"
                    if '(' in line:
                        while ')' not in line[21:80]:
                            location = location + line[21:]
                            line = next(iterator)
                    location = location + line[21:]
                    # the line right after the location is kept as explanation
                    explanation = next(iterator).strip()
                    # key and info are appended together so a feature cut off
                    # by the end of the file never leaves a dangling key
                    self.list_of_features_info.append(key)
                    self.list_of_features_info.append([location, explanation])
                line = next(iterator)
        except StopIteration:
            # file ended before ORIGIN: keep the complete features read so far
            pass
        return iterator

    def make_origin_sequence(self, line, iterator):
        """Build ``origin_sequence`` from the sequence lines.

        line: first sequence line (already read by the caller).
        iterator: iterator over the following lines; reading stops at a line
            that is exactly "//" after stripping, or at the end of input.
        """
        chunks = []
        while line is not None and line != "//":
            # drop the position numbers and every whitespace character
            chunks.append(re.sub(r'[0-9\s]', '', line))
            following = next(iterator, None)
            line = following.strip() if following is not None else None
        self.origin_sequence = ''.join(chunks)
        # When the sequence contains "m" the letters i, f, l, q, p and e are
        # removed (the original comment describes this as removing them
        # "from protein"). NOTE(review): confirm this filtering is intended.
        if "m" in self.origin_sequence:
            self.origin_sequence = re.sub(r'[iflqpe]', '', self.origin_sequence)

    def extract_info_from_input_file(self):
        """Parse ``self.input_file`` in a single pass.

        Sets ``definition``, ``list_of_features_info`` and
        ``origin_sequence``. Parsing ends at a line starting with "//" or at
        the end of the file.
        """
        self.sequence = ""
        # defaults so the getters work even when a section is absent
        self.definition = ""
        self.list_of_features_info = []
        self.origin_sequence = ""
        with open(self.input_file, 'r') as f:
            iterator = iter(f)
            line = next(iterator, None)
            while line is not None and not line.startswith('//'):
                if line.startswith('DEFINITION'):
                    self.definition = line.strip().replace("DEFINITION  ", "")
                    # a definition without a final period continues on the
                    # next line (only one continuation line is read)
                    if not self.definition.endswith("."):
                        continuation = next(iterator, None)
                        if continuation is not None:
                            self.definition = self.definition + " " + continuation.strip()
                elif line.startswith('FEATURES'):
                    iterator = self.make_list_of_features(iterator)
                elif line.startswith('        1'):
                    # first sequence line: the rest of the file is sequence
                    self.make_origin_sequence(line, iterator)
                    break
                line = next(iterator, None)

    def get_origin_sequence(self):
        """Return the origin sequence string."""
        return self.origin_sequence

    def get_definition(self):
        """Return the definition string."""
        return self.definition

    def get_list_of_features_info(self):
        """Return the list of feature keys and their info pairs."""
        return self.list_of_features_info

    def _iter_lines(self):
        """Yield the lines of the input file, closing it when exhausted."""
        with open(self.input_file, 'r') as handle:
            yield from handle

    def __iter__(self, file=None):
        """Restart iteration over the lines of the input file.

        file: unused; accepted only so existing calls that pass an argument
            keep working.
        Returns the instance itself; lines are produced by ``__next__``.
        """
        self._line_iter = self._iter_lines()
        return self

    def __next__(self):
        """Return the next line of the input file.

        Raises StopIteration when there are no more lines.
        """
        line_iter = getattr(self, "_line_iter", None)
        if line_iter is None:
            line_iter = self._line_iter = self._iter_lines()
        return next(line_iter)

def main(input_file="C:/Users/a.jalali/Downloads/Programming 2/Final Assignment-programming 1/CFTR_DNA.gb"):
    """Parse a GenBank file and print its definition.

    input_file: path of the file to parse. The default is the
        machine-specific location used originally; pass another path to run
        this cell elsewhere.
    """
    feature = Feature(input_file)
    print(feature.get_definition())
if __name__ == "__main__":
    main()

Homo sapiens CF transmembrane conductance regulator (CFTR), RefSeqGene (LRG_663) on chromosome 7.


In [52]:
class GenbankParser:
    """Extract the sequence of every feature of a parsed ``Feature`` object.

    Two output formats are offered: "uppercased" (the feature segment in
    upper case, preceded by the lower-case sequence before it) and
    "separated" (only the feature segments). Results are appended to an
    output file that is created when the parser is instantiated.
    """
    def __init__(self, feature: "Feature"):
        """Copy the parsed data from ``feature`` and create the output file.

        feature: the injected ``Feature`` instance to read from.
        """
        self.feature = feature      # <-- dependency is injected
        self.origin = self.feature.get_origin_sequence()
        self.input_file = self.feature.input_file
        self.definition = self.feature.get_definition()
        self.info_list = self.feature.get_list_of_features_info()
        self.output_file = self.create_output_file(self.definition)

    def create_output_file(self, seq):
        """Create (or truncate) the output file and write ``seq`` as its first line.

        The name is the input path without its last "CFTR_" marker and
        without the text from the last "." onward, followed by
        "_features.txt". When the marker or the dot is absent, nothing is
        removed for that part.

        seq: text written on the first line.
        Returns the path of the output file.
        """
        start = self.input_file.rfind("CFTR_")
        stop = self.input_file.rfind(".")
        if stop == -1:
            # no extension: keep the whole name instead of dropping a char
            stop = len(self.input_file)
        if start == -1:
            # no "CFTR_" marker: there is nothing to strip from the name
            stem = self.input_file[:stop]
        else:
            # path + name (DNA/mRNA/protein)
            stem = self.input_file[:start] + self.input_file[start + 5:stop]
        output_file = stem + "_features.txt"
        with open(output_file, "w") as OF:
            OF.write(seq + "\n")
        return output_file

    def complement(self, dna_seq):
        """Return the reverse complement of ``dna_seq``, preserving case.

        Raises KeyError for a character other than a, c, g, t (either case).
        """
        comp_bases = {"a":"t", "c":"g", "t":"a", "g":"c", "A": "T", "T": "A", "C": "G", "G": "C"}
        return ''.join(comp_bases[base] for base in reversed(dna_seq))

    def trim_sequence(self, seq):
        """Append ``seq`` to the output file in lines of at most 60 characters.

        When the length is an exact multiple of 60, one extra empty line is
        written after the sequence.
        """
        width = 60
        with open(self.output_file, "a") as EF:
            for start in range(0, len(seq), width):
                EF.write(seq[start:start + width] + "\n")
            if len(seq) % width == 0:
                EF.write("\n")

    def validate_segment_of_sequence(self, length, *num):
        """Check that no position in ``num`` exceeds ``length``.

        Raises ValueError for the first position greater than ``length``.
        """
        for n in num:
            if n > length:
                raise ValueError(f"{n} is greater than of the length of sequence")

    def extract_join_uppercased(self, location):
        """Build the uppercased sequence for a 'join' location.

        Each start/stop segment is upper-cased; the sequence between
        consecutive segments (and before the first one) stays as it is in
        the origin. Returns "" when the numbers do not form start/stop
        pairs, matching ``extract_join_separated``.
        """
        list_num = [int(num) for num in re.findall(r"\d+", location)]
        if len(list_num) % 2 != 0:
            return ""
        previous_stop = 0
        pieces = []
        for i in range(0, len(list_num), 2):
            start = list_num[i]
            stop = list_num[i + 1]
            # untouched sequence between the previous segment and this one
            pieces.append(self.origin[previous_stop:start - 1])
            pieces.append(self.origin[start - 1:stop].upper())
            previous_stop = stop
        return ''.join(pieces)

    def extract_join_separated(self, location):
        """Concatenate all start/stop segments of a 'join' location.

        Returns "" when the numbers do not form start/stop pairs.
        """
        list_num = [int(num) for num in re.findall(r"\d+", location)]
        if len(list_num) % 2 != 0:
            return ''
        return ''.join(
            self.origin[list_num[i] - 1:list_num[i + 1]]
            for i in range(0, len(list_num), 2)
        )

    def extract_normal_uppercased(self, location):
        """Return the sequence before the segment followed by the upper-cased segment.

        A location with exactly two numbers is a start/stop range; otherwise
        the first number is taken as a single position. Returns "" when the
        location holds no number or a position lies beyond the origin.
        """
        list_num = [int(num) for num in re.findall(r"\d+", location)]
        if not list_num:
            return ""
        try:
            start = list_num[0]
            stop_origin_seq = start - 1
            if len(list_num) == 2:
                stop = list_num[1]
                # the location is validated before slicing
                self.validate_segment_of_sequence(len(self.origin), start, stop, stop_origin_seq)
                main_Sequence = self.origin[start-1:stop]
            else:
                self.validate_segment_of_sequence(len(self.origin), start)
                main_Sequence = self.origin[start-1]
        except ValueError:
            return ""
        return self.origin[0:stop_origin_seq] + main_Sequence.upper()

    def extract_normal_separated(self, location):
        """Return the segment described by ``location`` from the origin.

        A location with exactly two numbers is a start/stop range; otherwise
        the first number is taken as a single position. Returns "" when the
        location holds no number or a position lies beyond the origin.
        """
        list_num = [int(num) for num in re.findall(r"\d+", location)]
        if not list_num:
            return ""
        try:
            start = list_num[0]
            if len(list_num) == 2:
                stop = list_num[1]
                self.validate_segment_of_sequence(len(self.origin), start, stop)
                return self.origin[start-1:stop]
            # validate the position itself: checking start - 1 let
            # start == len(origin) + 1 through and then failed on indexing
            self.validate_segment_of_sequence(len(self.origin), start)
            return self.origin[start-1]
        # if start or stop are out of origin length, nothing is written in file
        except ValueError:
            return ""

    def write_in_file(self, current_object, Final_Sequence):
        """Append one feature to the output file.

        Writes a header line ">key explanation" and then the sequence in
        60-character lines.

        current_object: index of the feature key in ``info_list``.
        Final_Sequence: the extracted sequence of that feature.
        """
        with open(self.output_file, "a") as EF:
            EF.write("\n>" + self.info_list[current_object] + ' ' + self.info_list[current_object+1][1] + "\n")
        self.trim_sequence(Final_Sequence)

    def _extract_features(self, join_extractor, normal_extractor):
        """Shared loop behind both public extract_features_* methods.

        join_extractor: callable used for locations containing 'join'.
        normal_extractor: callable used for every other location.
        Returns the status message reported to the user.
        """
        number_of_objects = len(self.info_list)
        # entries come in (key, info) pairs; stop before a dangling key
        for current_object in range(0, number_of_objects - 1, 2):
            location = self.info_list[current_object + 1][0]
            if 'join' in location:
                Final_Sequence = join_extractor(location)
            # an 'order' location is not extracted
            elif 'order' in location:
                Final_Sequence = ""
            else:
                Final_Sequence = normal_extractor(location)
            if 'complement' in location:
                Final_Sequence = self.complement(Final_Sequence)
            # features without a sequence are left out of the file
            if Final_Sequence != "":
                self.write_in_file(current_object, Final_Sequence)
        # an odd number of entries means a key without its info pair
        if number_of_objects % 2 == 0:
            return "GenBank Features extracted Successfully"
        return "GenBank Features is NOT extracted completely"

    def extract_features_uppercased(self):
        """Write every feature in the uppercased format.

        Returns a status message saying whether all entries of
        ``info_list`` could be processed.
        """
        return self._extract_features(self.extract_join_uppercased,
                                      self.extract_normal_uppercased)

    def extract_features_separated(self):
        """Write every feature in the separated format.

        Returns a status message saying whether all entries of
        ``info_list`` could be processed.
        """
        return self._extract_features(self.extract_join_separated,
                                      self.extract_normal_separated)

In [64]:
def main(input_file="CFTR_DNA.gb"):
    """Parse a GenBank file and write its features in the format the user picks.

    input_file: path of the file to parse (defaults to the file used in
        this notebook).

    The user is asked for "u" (uppercased) or "s" (separated) and asked
    again until one of the two is entered; the status message returned by
    the parser is printed.
    """
    feature = Feature(input_file)
    print(feature.definition)
    gp = GenbankParser(feature)
    # one extractor per accepted answer
    extractors = {
        "u": gp.extract_features_uppercased,
        "s": gp.extract_features_separated,
    }
    # named "choice" so the builtin format() is not shadowed
    choice = input("please enter u for uppercased or s for separate: ")
    while choice not in extractors:
        choice = input("Failure! Please enter u  or s : ")
    print(extractors[choice]())

if __name__ == "__main__":
    main()


Homo sapiens CF transmembrane conductance regulator (CFTR), RefSeqGene (LRG_663) on chromosome 7.
please enter u for uppercased or s for separate: u
GenBank Features extracted Successfully


In [62]:
input_file = "CFTR_DNA.gb"

# Show which type iter() returns for an open text file.
# The with-block closes the handle once the check is done.
with open(input_file, 'r') as f:
    iterator = iter(f)
    print(type(iterator))

<class '_io.TextIOWrapper'>


In [65]:
# shallow=False forces a content comparison; the default (shallow=True) treats
# files with identical os.stat() signatures as equal without reading them.
assert filecmp.cmp("DNA_features.txt", "Main_DNA_extract_Uppercased.txt", shallow=False), "not equal"

In [66]:
# Compare the generated file with the reference line by line; both handles
# are closed by the with-block.
with open("DNA_features.txt") as produced, open("Main_DNA_extract_Uppercased.txt") as expected:
    assert list(produced) == list(expected), "not equal"