In [1]:
class CompressedGene:
    """A nucleotide sequence packed into a single int at two bits per nucleotide.

    Encoding: A -> 0b00, C -> 0b01, G -> 0b10, T -> 0b11. The packed value is
    stored in ``bit_string`` and always carries a leading sentinel ``1`` bit so
    that leading 'A' nucleotides (all-zero bits) are not lost.

    Args:
        gene: Sequence made of the letters A, C, G and T (case-insensitive).

    Raises:
        ValueError: If ``gene`` contains any other character.
    """

    # Lookup tables shared by compression and decompression.
    _NUCLEOTIDE_TO_BITS = {'A': 0b00, 'C': 0b01, 'G': 0b10, 'T': 0b11}
    _BITS_TO_NUCLEOTIDE = 'ACGT'  # index == 2-bit code

    def __init__(self, gene: str) -> None:
        self.__compress(gene)

    def __compress(self, gene: str) -> None:
        """Pack ``gene`` into ``self.bit_string``.

        Raises:
            ValueError: On the first character that is not A, C, G or T.
        """
        bit_string: int = 1  # sentinel bit marking the start of the data
        for nucleotide in gene.upper():
            try:
                code = self._NUCLEOTIDE_TO_BITS[nucleotide]
            except KeyError:
                raise ValueError(f'Invalid Nucleotide {nucleotide}') from None
            # Make room for two more bits, then store this nucleotide's code.
            bit_string = (bit_string << 2) | code
        # Assign once at the end so a failed compression never leaves a
        # half-built value on the instance.
        self.bit_string: int = bit_string

    def decompress(self) -> str:
        """Return the stored sequence as an upper-case string of A/C/G/T."""
        packed = self.bit_string
        # bit_length() - 1 excludes the sentinel bit. Walking the shifts from
        # high to low yields nucleotides in their original order, so no final
        # reversal is needed.
        shifts = reversed(range(0, packed.bit_length() - 1, 2))
        return ''.join(
            self._BITS_TO_NUCLEOTIDE[(packed >> shift) & 0b11] for shift in shifts
        )

    def __str__(self) -> str:
        """Return the decompressed sequence, so str()/print() show the gene."""
        return self.decompress()

    def __string__(self) -> str:
        """Backward-compatible alias of ``__str__`` (the original method name)."""
        return self.decompress()


In [3]:

from sys import getsizeof
# Demo: compare the in-memory size of a gene stored as a str with the size of
# its packed integer form, then confirm the round trip reproduces the input.
original: str = 'TAGGGATTAACCGTTATATATATATAGCCATGGATCGATTATATAGGGATTAACCGTTATATATATATAGCCATGGATCGATTATA' * 100
print(f'Original is of {getsizeof(original)} bytes')

# getsizeof is applied to the packed int itself, not to the wrapper object.
compressed: CompressedGene = CompressedGene(gene=original)
print(f'Compressed is of {getsizeof(compressed.bit_string)} bytes')

# Round-trip check: decompressing must give back exactly the original string.
print(f'Original and decompressed are both same: {original==compressed.decompress()}')




Original is of 8649 bytes
Compressed is of 2320 bytes
Original and decompressed are both same: True
