# Week 1: Sequence Analysis in Python

## Overview
This week focuses on implementing fundamental DNA sequence analysis in Python:
- File parsing for FASTA format
- Cleaning DNA sequences
- Counting nucleotides
- GC content calculation
- Transcription (DNA -> RNA)
- Complement and reverse complement
- Start codon detection
- Longest GC stretch



In [17]:

# File loading and parsing
def file_parsing(filepath):
    """Parse a FASTA file into a dictionary of header -> sequence.

    Each line starting with ">" opens a new record; the text after ">"
    (whitespace-stripped) is the header. All following lines up to the next
    header are stripped and concatenated to form that record's sequence.

    Args:
        filepath: Path of the FASTA file to read.

    Returns:
        dict mapping each header string to its joined sequence string.
        If a header occurs more than once, the last record wins. Lines that
        appear before the first header are discarded.

    Raises:
        OSError: if the file cannot be opened.
    """
    fasta_dict = {}
    header = None
    seq = []

    with open(filepath, "r") as file:
        for line in file:
            line = line.strip()
            if line.startswith(">"):
                # Compare with None rather than truthiness: a bare ">" line
                # yields an empty-string header, and that record must not be
                # silently dropped.
                if header is not None:
                    fasta_dict[header] = "".join(seq)
                header = line[1:]
                seq = []
            else:
                seq.append(line)

    # Flush the final record, which has no following header to trigger it.
    if header is not None:
        fasta_dict[header] = "".join(seq)
    return fasta_dict

# seq cleaning 
def clean(seq):
    """Return *seq* upper-cased with every character other than A/T/G/C removed."""
    allowed = frozenset("ATGC")
    kept = filter(allowed.__contains__, seq.upper())
    return "".join(kept)

# Count how many times each nucleotide occurs (uses collections.Counter)
from collections import Counter
def counting(clean_seq):
    """Return a Counter mapping each base in *clean_seq* to its number of occurrences."""
    tally = Counter()
    tally.update(clean_seq)
    return tally

# GC Content determines the stability of DNA
def calculate_gc(clean_seq):
    """Return the GC content of *clean_seq* as a percentage.

    Args:
        clean_seq: Sequence of bases (normally the output of ``clean``).

    Returns:
        Percentage of bases that are G or C, rounded to 2 decimal places.
        An empty sequence returns 0.0 instead of raising ZeroDivisionError.
    """
    total = len(clean_seq)
    # A sequence can be empty after cleaning (e.g. it held no A/T/G/C at all);
    # guard the division below.
    if total == 0:
        return 0.0

    gc = sum(1 for base in clean_seq if base in "GC")
    return round((gc / total) * 100, 2)


# Transcribes DNA -> RNA
def transcription(clean_seq):
    """Return the RNA form of *clean_seq*: every "T" becomes "U", other bases are kept."""
    return "".join("U" if base == "T" else base for base in clean_seq)


# Finds the complement and reverse complement as the DNA structure 
# is anti-parallel
def complmt(clean_seq):
    """Return ``(complement, reverse_complement)`` for *clean_seq*.

    Bases are paired A<->T and G<->C. Any character that is not one of
    A/T/G/C is skipped, so the results can be shorter than the input.
    """
    pairing = {"A": "T", "T": "A", "G": "C", "C": "G"}
    forward = "".join(pairing[base] for base in clean_seq if base in pairing)
    # The reverse complement is the complement strand read back to front.
    backward = forward[::-1]
    return forward, backward


# Finds the index of the first ATG start codon in reading frame 0
# (only positions 0, 3, 6, ... are checked); returns None if there is none
def start_position(clean_seq):
    """Return the index of the first in-frame "ATG" codon, or None.

    The sequence is read as consecutive, non-overlapping triplets starting at
    index 0, so only offsets 0, 3, 6, ... are examined. An "ATG" that occurs
    in another reading frame is not reported.
    """
    # Stopping at len - 2 ensures only complete three-base codons are compared.
    for offset in range(0, len(clean_seq) - 2, 3):
        if clean_seq[offset:offset + 3] == "ATG":
            return offset
    return None


# Finds the longest uninterrupted run of G/C bases; the run ends at any other base (A or T)
def longest_gc_stretch(clean_seq):
    """Return ``(stretch, length)`` for the longest run of consecutive G/C bases.

    When several runs share the maximum length the first one is returned.
    A sequence with no G or C yields ``("", 0)``.
    """
    best_start = 0
    best_len = 0
    run_start = 0  # index where the current G/C run began

    for idx, base in enumerate(clean_seq):
        if base not in "GC":
            # Any other base ends the run; the next candidate starts after it.
            run_start = idx + 1
            continue

        run_len = idx - run_start + 1
        # Strict comparison keeps the earliest run when lengths tie.
        if run_len > best_len:
            best_start = run_start
            best_len = run_len

    return clean_seq[best_start:best_start + best_len], best_len

    
# Driver: load every record from the FASTA file and print each analysis result.
# file_parsing returns a dict of header -> raw sequence string.
sequences= file_parsing("cleaned_sequences.fasta")


for header, seq in sequences.items():
    # Normalise first: every analysis below runs on the upper-cased,
    # A/T/G/C-only sequence.
    cleaned_seq= clean(seq)

    print(f"Header: {header}")
    
    # Per-base occurrence counts (a collections.Counter).
    Base= counting(cleaned_seq)
    print(f"Base Count: {Base}\n")
    
    # GC percentage, rounded to two decimals by calculate_gc.
    gc= calculate_gc(cleaned_seq)
    print(f"GC Content: {gc}%\n")
    
    # RNA transcript: T replaced by U.
    Rna= transcription(cleaned_seq)
    print(f"RNA: {Rna}\n")
    
    # complmt returns a (complement, reverse complement) pair.
    compleme, Reverse = complmt(cleaned_seq)
    print(f"Complement Seq: {compleme}\n")
    print(f"Reverse Complement: {Reverse}\n")
    
    # Index of the first frame-0 ATG codon, or None when there is none.
    Pos= start_position(cleaned_seq)
    print(f"Position of start codon in Sequence: {Pos}\n")
    
    # longest_gc_stretch returns a (stretch, length) tuple, printed as-is.
    Longest_gc= longest_gc_stretch(cleaned_seq)
    print(f"Longest GC Stretch: {Longest_gc}\n")
    
                    
               
                    
                    

Header: Human_sequence
Base Count: Counter({'A': 9, 'T': 8, 'G': 8, 'C': 8})

GC Content: 48.48%

RNA: AUGCUAGCUAGCUAACGAUGCUAGCUAGCUGAC

Complement Seq: TACGATCGATCGATTGCTACGATCGATCGACTG

Reverse Complement: GTCAGCTAGCTAGCATCGTTAGCTAGCTAGCAT

Position of start codon in Sequence: 0

Longest GC Stretch: ('GC', 2)

Header: Mouse_sequence
Base Count: Counter({'T': 10, 'G': 10, 'C': 8, 'A': 8})

GC Content: 50.0%

RNA: UUGCGCGGAUCGUAGCUAGCUAGCUAGCUAAUGCUA

Complement Seq: AACGCGCCTAGCATCGATCGATCGATCGATTACGAT

Reverse Complement: TAGCATTAGCTAGCTAGCTAGCTACGATCCGCGCAA

Position of start codon in Sequence: 30

Longest GC Stretch: ('GCGCGG', 6)

Header: Plant_sequence
Base Count: Counter({'G': 8, 'C': 8, 'T': 8, 'A': 8})

GC Content: 50.0%

RNA: GCUAGCUAGCAUCGAUCGUAUAGCUAGCUAGC

Complement Seq: CGATCGATCGTAGCTAGCATATCGATCGATCG

Reverse Complement: GCTAGCTAGCTATACGATCGATGCTAGCTAGC

Position of start codon in Sequence: None

Longest GC Stretch: ('GC', 2)

