In [4]:
from Bio.Seq import Seq
from Bio.SeqUtils import gc_fraction
import pandas as pd

with open("human.txt", "r") as file:
    sequences = [line.strip() for line in file if line.strip()] 

df = pd.DataFrame(sequences, columns=['sequence'])
print(df.head(10))

motif = "ATG"
stop_codons = ["TAA", "TAG", "TGA"]

def analyze_sequence(seq_str):
    sequence = Seq(seq_str)

    # 1. Find motif positions
    positions = [
        i + 1
        for i in range(len(sequence) - len(motif) + 1)
        if sequence[i:i+len(motif)] == motif
    ]

    # 2. Calculate GC content
    GC_content = gc_fraction(sequence) * 100  # percentage

    # 3. Identify coding region (first ORF)
    start = seq_str.find("ATG")
    coding_region = ""
    if start != -1:
        for i in range(start + 3, len(seq_str), 3):
            if seq_str[i:i+3] in stop_codons:
                coding_region = seq_str[start:i+3]
                break

    return pd.Series([positions, GC_content, coding_region])

df[['Motif_Positions', 'GC_Content', 'Coding_Region']] = df['sequence'].apply(analyze_sequence)

print(df.head(10))

                                            sequence
0                                    sequence\tclass
1  ATGCCCCAACTAAATACTACCGTATGGCCCACCATAATTACCCCCA...
2  ATGAACGAAAATCTGTTCGCTTCATTCATTGCCCCCACAATCCTAG...
3  ATGTGTGGCATTTGGGCGCTGTTTGGCAGTGATGATTGCCTTTCTG...
4  ATGTGTGGCATTTGGGCGCTGTTTGGCAGTGATGATTGCCTTTCTG...
5  ATGCAACAGCATTTTGAATTTGAATACCAGACCAAAGTGGATGGTG...
6  ATGTGTGGCATTTGGGCGCTGTTTGGCAGTGATGATTGCCTTTCTG...
7  ATGAAGATTGCACACAGAGGTCCAGATGCATTCCGTTTTGAGAATG...
8  ATGCAACAGCATTTTGAATTTGAATACCAGACCAAAGTGGATGGTG...
9  ATGAAGATTGCACACAGAGGTCCAGATGCATTCCGTTTTGAGAATG...
                                            sequence  \
0                                    sequence\tclass   
1  ATGCCCCAACTAAATACTACCGTATGGCCCACCATAATTACCCCCA...   
2  ATGAACGAAAATCTGTTCGCTTCATTCATTGCCCCCACAATCCTAG...   
3  ATGTGTGGCATTTGGGCGCTGTTTGGCAGTGATGATTGCCTTTCTG...   
4  ATGTGTGGCATTTGGGCGCTGTTTGGCAGTGATGATTGCCTTTCTG...   
5  ATGCAACAGCATTTTGAATTTGAATACCAGACCAAAGTGGATGGTG...   
6  ATGTGTGGCATTTGGGCGCTGT