In [2]:
from Bio import SeqIO

# Initialize counter for sequences containing "GCGC"
count_gcbc = 0

# Read the FASTA file using Biopython
for record in SeqIO.parse("genes.fa", "fasta"):
    sequence = str(record.seq).upper()
    
    # Check if the sequence contains the substring "GCGC"
    if "GCGC" in sequence:
        count_gcbc += 1

# Output the count of sequences containing "GCGC"
print(f"Number of sequences containing 'GCGC': {count_gcbc}")

Number of sequences containing 'GCGC': 55


In [3]:
with open("genes_length_classification.txt", "w") as output_file:
    # Read the FASTA file using Biopython
    for record in SeqIO.parse("genes.fa", "fasta"):
        gene_name = record.id
        sequence_length = len(record.seq)

        # Classify gene sequence based on its length
        if sequence_length < 300:
            classification = "Short"
        elif 300 <= sequence_length <= 1000:
            classification = "Medium"
        else:
            classification = "Long"
        
        # Write result to the output file
        output_file.write(f"{gene_name}\t{classification}\n")

print("Classification results have been written to 'genes_length_classification.txt'")

Classification results have been written to 'genes_length_classification.txt'


In [4]:
output_file = open("genes_length_classification.txt", "w")
# Read the FASTA file using Biopython
for record in SeqIO.parse("genes.fa", "fasta"):
    gene_name = record.id
    sequence_length = len(record.seq)

    # Classify gene sequence based on its length
    if sequence_length < 300:
        classification = "Short"
    elif 300 <= sequence_length <= 1000:
        classification = "Medium"
    else:
        classification = "Long"
    
    # Write result to the output file
    output_file.write(f"{gene_name}\t{classification}\n")

output_file.close()

print("Classification results have been written to 'genes_length_classification.txt'")

Classification results have been written to 'genes_length_classification.txt'


In [5]:
# Initialize variables
total_genes = 0
longest_gene_length = 0
longest_gene_name = None

# Read the FASTA file using Biopython
for record in SeqIO.parse("genes.fa", "fasta"):
    total_genes += 1
    gene_length = len(record.seq)

    # Check if this is the longest gene
    if gene_length > longest_gene_length:
        longest_gene_length = gene_length
        longest_gene_name = record.id

# Output the total number of genes and the longest gene sequence
print(f"Total number of genes: {total_genes}")
print(f"Longest gene: {longest_gene_name} (Length: {longest_gene_length})")

Total number of genes: 100
Longest gene: B0024.8a (Length: 3849)


In [6]:
# Initialize dictionary to store gene names and lengths
gene_lengths = {}

# Read the FASTA file using Biopython
for record in SeqIO.parse("genes.fa", "fasta"):
    gene_name = record.id
    gene_length = len(record.seq)

    # Add gene name and length to the dictionary
    gene_lengths[gene_name] = gene_length

# Print the dictionary
print("Gene lengths:")
for gene_name, gene_length in gene_lengths.items():
    print(f"{gene_name}: {gene_length}")

Gene lengths:
2L52.1a: 1284
2L52.1b: 663
2RSSE.1a: 1032
2RSSE.1b: 1437
3R5.1a: 648
3R5.1b: 636
4R79.1a: 1035
4R79.1b: 543
4R79.2a: 1188
4R79.2b: 936
6R55.2: 258
AC3.1: 1068
AC3.2: 1578
AC3.3: 1278
AC3.4: 1278
AC3.5a: 3273
AC3.5b: 2952
AC3.6: 978
AC3.7: 1590
AC3.8: 1596
AC3.10: 1056
AC3.12: 285
AC7.1a: 1188
AC7.1b: 1161
AC7.2a: 1680
AC7.2b: 1677
AC7.2c: 1896
AC7.2d: 1650
AC7.3: 711
AC8.3: 1320
AC8.4: 654
AC8.7: 690
AC8.10: 1320
AC8.11: 654
AC8.12: 690
AH6.1: 3414
AH6.2: 990
AH6.3: 693
AH6.4: 996
AH6.5: 1404
AH6.6: 990
AH6.7: 990
AH6.8: 990
AH6.10: 990
AH6.11: 990
AH6.12: 990
AH6.14: 996
AH6.17: 186
AH9.1: 1236
AH9.2: 897
AH9.3: 765
AH9.4: 1095
AH9.6: 801
AH10.1: 1701
AH10.2: 738
AH10.3: 519
AH10.4: 219
B0001.1a: 816
B0001.1b: 498
B0001.1c: 213
B0001.2: 2781
B0001.3a: 1776
B0001.3b: 1728
B0001.3c: 1725
B0001.4a: 747
B0001.5: 1662
B0001.6: 2025
B0001.7a: 2196
B0001.7b: 2208
B0001.7c: 366
B0001.8a: 1473
B0001.8b: 1479
B0001.8c: 216
B0019.1: 2175
B0019.2: 2244
B0024.1: 894
B0024.2: 894
B002

In [7]:
# Find and display the gene with the shortest sequence
shortest_gene_name = min(gene_lengths, key=gene_lengths.get)
shortest_gene_length = gene_lengths[shortest_gene_name]
print(f"Gene with the shortest sequence: {shortest_gene_name} (Length: {shortest_gene_length})")

Gene with the shortest sequence: AH6.17 (Length: 186)


In [8]:
# Define function to calculate GC content
def find_gc_content(gene_sequence):
    gc_count = gene_sequence.count("G") + gene_sequence.count("C")
    gc_content = (gc_count / len(gene_sequence)) * 100
    return gc_content

# Initialize variables for the gene with the highest GC content
highest_gc_content = 0
highest_gc_gene_name = None

# Read the FASTA file using Biopython
for record in SeqIO.parse("genes.fa", "fasta"):
    gene_sequence = str(record.seq).upper()

    # Get the GC content of the gene
    gc_content = find_gc_content(gene_sequence)

    # Check if this gene has the highest GC content
    if gc_content > highest_gc_content:
        highest_gc_content = gc_content
        highest_gc_gene_name = record.id

# Output the gene with the highest GC content and its GC percentage
print(f"Gene with the highest GC content: {highest_gc_gene_name} (GC Content: {highest_gc_content:.2f}%)")

Gene with the highest GC content: 2RSSE.1a (GC Content: 52.23%)


In [9]:
from Bio import SeqIO
import re

# Define the file name
file_name = "Organism_cDNA_FASTA.txt"

# Initialize variables for each question
count_start_atg = 0
count_stop_codons = 0
longest_g_run = 0
longest_g_gene = None
count_poly_a = 0
genes_with_poly_a = []
count_motif_genes = 0
genes_with_motif = []

In [11]:
# Process the FASTA file using Biopython
for record in SeqIO.parse(file_name, "fasta"):
    sequence = str(record.seq).upper()  # Convert sequence to uppercase
    gene_name = record.id  # Gene identifier
if re.search(r"^ATG", sequence):  # Match sequences starting with "ATG"
        count_start_atg += 1
 print(f"Question 1: Genes starting with 'ATG': {count_start_atg}\n")   


IndentationError: unindent does not match any outer indentation level (<string>, line 7)