## Nucleotide Composition Analysis 
Implemented functions for FASTA file parsing, sequence cleaning, GC content calculation, and nucleotide frequency counting using Python's `collections.Counter`.

Demonstrated the workflow on a custom DNA dataset and validated outputs for base distribution and GC ratio.

In [28]:
# Use of Counter for the count of nucleotides

from collections import Counter

def fasta_parsing(fasta2txt):
    """Parse a FASTA file into a ``{header: sequence}`` dictionary.

    Args:
        fasta2txt: Path of the FASTA-formatted text file to read.

    Returns:
        dict mapping each header (the text after the leading ``>``) to its
        sequence, with the record's sequence lines joined into one string.
        Lines that appear before the first header are discarded, and a
        repeated header overwrites the earlier record.
    """
    fasta_dict = {}
    header = None
    sequence_lines = []
    # Open the file named by the argument rather than a hard-coded path.
    with open(fasta2txt, "r") as file:
        for line in file:
            line = line.strip()
            if line.startswith('>'):
                # Compare against None so a record with an empty header
                # (a bare ">" line) is still stored.
                if header is not None:
                    fasta_dict[header] = "".join(sequence_lines)
                header = line[1:]
                sequence_lines = []
            else:
                sequence_lines.append(line)

    # Flush the last record, which has no following header to trigger it.
    if header is not None:
        fasta_dict[header] = "".join(sequence_lines)
    return fasta_dict


def clean_sequence(sequence):
    """Upper-case *sequence* and keep only the characters A, T, G and C."""
    valid_bases = frozenset('ATGC')
    return "".join(filter(valid_bases.__contains__, sequence.upper()))


def calculate_gc(fasta_dict):
    """Return the pooled GC percentage of all sequences in *fasta_dict*.

    Each sequence is passed through ``clean_sequence`` first. The result is
    rounded to two decimals; 0 is returned when no bases remain.
    """
    cleaned_records = [clean_sequence(record) for record in fasta_dict.values()]
    total_length = sum(len(record) for record in cleaned_records)
    if not total_length:
        return 0
    gc = sum(record.count('G') + record.count('C') for record in cleaned_records)
    return round(gc/total_length * 100, 2)


def nucleotides(fasta_dict):
    """Count each nucleotide across all cleaned sequences in *fasta_dict*.

    Returns a ``Counter`` keyed by base.
    """
    tally = Counter()
    for record in fasta_dict.values():
        # Counter.update with a string adds one per character.
        tally.update(clean_sequence(record))
    return tally


# Parse the FASTA file into a {header: raw sequence} dictionary.
sequences= fasta_parsing("fasta.txt")

# Report the cleaned sequence and its length for every record.
for header, seq in sequences.items():
    clean_seq= clean_sequence(seq)
    length_of_cleaned_sequence= len(clean_seq)
    # print() joins its two arguments with a space, so the "Length ..." line
    # is emitted with one leading space.
    print(f"header: {header}\nCleaned Sequence: {clean_seq}\n",
    f"Length of cleaned sequence: {length_of_cleaned_sequence}\n")
          
# Statistics pooled over all records.
count_of_each_nucleotide= nucleotides(sequences)
gc_content_percent = calculate_gc(sequences)
print(f"Count of each nucleotides: {count_of_each_nucleotide}")
print(f"GC content percentage: {gc_content_percent}%")
        

header: sequence_1
Cleaned Sequence: ATGCTAGCTAGCTACGATCGATCGTAGCTAGCTAGCATCGATCGATGCTAGCTAGCTAGCATCGATGCTAGCTAGCATCG
 Length of cleaned sequence: 80

header: sequence_2
Cleaned Sequence: GGCATCGATCGATCGATCGATCGGATCGATCGATCGATCGCGATCGATCGATCGATCGATCGGATCGATCGATCGATCGA
 Length of cleaned sequence: 80

Count of each nucleotides: Counter({'G': 43, 'C': 40, 'A': 39, 'T': 38})
GC content percentage: 51.88%


In [27]:
# Use of manual dictionary for the count of nucleotides

def fasta_parsing(fasta2txt):
    """Parse a FASTA file into a ``{header: sequence}`` dictionary.

    Args:
        fasta2txt: Path of the FASTA-formatted text file to read.

    Returns:
        dict mapping each header (the text after the leading ``>``) to its
        sequence, with the record's sequence lines joined into one string.
        Lines that appear before the first header are discarded, and a
        repeated header overwrites the earlier record.
    """
    fasta_dict = {}
    header = None
    sequence_lines = []
    # Open the file named by the argument rather than a hard-coded path.
    with open(fasta2txt, "r") as file:
        for line in file:
            line = line.strip()
            if line.startswith('>'):
                # Compare against None so a record with an empty header
                # (a bare ">" line) is still stored.
                if header is not None:
                    fasta_dict[header] = "".join(sequence_lines)
                header = line[1:]
                sequence_lines = []
            else:
                sequence_lines.append(line)

    # Flush the last record, which has no following header to trigger it.
    if header is not None:
        fasta_dict[header] = "".join(sequence_lines)
    return fasta_dict


def clean_sequence(sequence):
    """Return *sequence* in upper case with all non-A/T/G/C characters dropped."""
    kept = []
    for symbol in sequence.upper():
        if symbol in 'ATGC':
            kept.append(symbol)
    return "".join(kept)


def calculate_gc(fasta_dict):
    """Return the pooled GC percentage of all sequences in *fasta_dict*.

    Args:
        fasta_dict: Mapping of header to raw sequence string; each sequence
            is passed through ``clean_sequence`` before counting.

    Returns:
        G+C count divided by the total cleaned length, as a percentage
        rounded to two decimals. Returns 0 when no A/T/G/C bases are present.
    """
    gc = 0
    total_length = 0
    for seq in fasta_dict.values():
        cleaned = clean_sequence(seq)
        gc += cleaned.count('G') + cleaned.count('C')
        total_length += len(cleaned)
    # Guard the division: an empty mapping or sequences with no valid bases
    # would otherwise raise ZeroDivisionError.
    if total_length == 0:
        return 0
    return round(gc / total_length * 100, 2)


def nucleotides(fasta_dict):
    """Count A, T, C and G across all cleaned sequences in *fasta_dict*.

    Returns a plain dict that always contains the four keys, starting at 0.
    """
    tally = dict.fromkeys('ATCG', 0)
    for record in fasta_dict.values():
        for symbol in clean_sequence(record):
            tally[symbol] = tally[symbol] + 1
    return tally


# Parse the FASTA file into a {header: raw sequence} dictionary.
sequences= fasta_parsing("fasta2.txt")

# Report the cleaned sequence and its length for every record.
for header, seq in sequences.items():
    clean_seq= clean_sequence(seq)
    length_of_cleaned_sequence= len(clean_seq)
    # print() joins its two arguments with a space, so the "Length ..." line
    # is emitted with one leading space.
    print(f"header: {header}\nCleaned Sequence: {clean_seq}\n",
        f"Length of cleaned sequence: {length_of_cleaned_sequence}\n")
          
# Statistics pooled over all records.
count_of_each_nucleotide= nucleotides(sequences)
gc_content_percent = calculate_gc(sequences)
print(f"Count of each nucleotides: {count_of_each_nucleotide}")
print(f"GC content percentage: {gc_content_percent}%")
        

header: sequence_1
Cleaned Sequence: ATGCCGTTATTTAGCTG
 Length of cleaned sequence: 17

header: sequence_2
Cleaned Sequence: CGGCTTAATTAACG
 Length of cleaned sequence: 14

Count of each nucleotides: {'A': 7, 'T': 11, 'C': 6, 'G': 7}
GC content percentage: 41.94%


In [26]:
# Use of frequency for the count of nucleotides

def fasta_parsing(fasta2txt):
    """Parse a FASTA file into a ``{header: sequence}`` dictionary.

    Args:
        fasta2txt: Path of the FASTA-formatted text file to read.

    Returns:
        dict mapping each header (the text after the leading ``>``) to its
        sequence, with the record's sequence lines joined into one string.
        Lines that appear before the first header are discarded, and a
        repeated header overwrites the earlier record.
    """
    fasta_dict = {}
    header = None
    sequence_lines = []
    # Open the file named by the argument rather than a hard-coded path.
    with open(fasta2txt, "r") as file:
        for line in file:
            line = line.strip()
            if line.startswith('>'):
                # Compare against None so a record with an empty header
                # (a bare ">" line) is still stored.
                if header is not None:
                    fasta_dict[header] = "".join(sequence_lines)
                header = line[1:]
                sequence_lines = []
            else:
                sequence_lines.append(line)

    # Flush the last record, which has no following header to trigger it.
    if header is not None:
        fasta_dict[header] = "".join(sequence_lines)
    return fasta_dict


def clean_sequence(sequence):
    """Upper-case *sequence*, then discard everything that is not A, T, G or C."""
    uppercased = sequence.upper()
    return "".join(symbol for symbol in uppercased if symbol in 'ATGC')


def calculate_gc(fasta_dict):
    """Return the pooled GC percentage of all sequences in *fasta_dict*.

    Args:
        fasta_dict: Mapping of header to raw sequence string; each sequence
            is passed through ``clean_sequence`` before counting.

    Returns:
        G+C count divided by the total cleaned length, as a percentage
        rounded to two decimals. Returns 0 when no A/T/G/C bases are present.
    """
    gc = 0
    total_length = 0
    for seq in fasta_dict.values():
        cleaned = clean_sequence(seq)
        gc += cleaned.count('G') + cleaned.count('C')
        total_length += len(cleaned)
    # Guard the division: an empty mapping or sequences with no valid bases
    # would otherwise raise ZeroDivisionError.
    if total_length == 0:
        return 0
    return round(gc / total_length * 100, 2)


def freq(fasta_dict):
    """Count nucleotide frequencies across all sequences in *fasta_dict*.

    Args:
        fasta_dict: Mapping of header to raw sequence string.

    Returns:
        dict mapping each base found in the cleaned sequences to its number
        of occurrences; empty when there are no valid bases.
    """
    frequency = {}
    for seq in fasta_dict.values():
        # Count over the cleaned sequence so case differences and
        # non-A/T/G/C characters do not end up as separate keys.
        for base in clean_sequence(seq):
            frequency[base] = frequency.get(base, 0) + 1
    # Hand the table back; without this the function yields None.
    return frequency

# Parse the FASTA file into a {header: raw sequence} dictionary.
sequences= fasta_parsing("fasta2.txt")

# Report the cleaned sequence and its length for every record.
for header, seq in sequences.items():
    clean_seq= clean_sequence(seq)
    length_of_cleaned_sequence= len(clean_seq)
    # print() joins its two arguments with a space, so the "Length ..." line
    # is emitted with one leading space.
    print(f"header: {header}\nCleaned Sequence: {clean_seq}\n",
        f"Length of cleaned sequence: {length_of_cleaned_sequence}\n")
          
# NOTE(review): this cell defines `freq`, not `nucleotides`; the call below
# resolves to whichever `nucleotides` another cell has defined. Confirm
# whether `freq(sequences)` was the intended call.
count_of_each_nucleotide= nucleotides(sequences)
gc_content_percent = calculate_gc(sequences)
print(f"Count of each nucleotides: {count_of_each_nucleotide}")
print(f"GC content percentage: {gc_content_percent}%")
        

header: sequence_1
Cleaned Sequence: ATGCCGTTATTTAGCTG
 Length of cleaned sequence: 17

header: sequence_2
Cleaned Sequence: CGGCTTAATTAACG
 Length of cleaned sequence: 14

Count of each nucleotides: Counter({'T': 11, 'A': 7, 'G': 7, 'C': 6})
GC content percentage: 41.94%
