In [15]:
!pip install biopython



In [17]:
from Bio.Seq import Seq

In [18]:
my_seq = Seq("AGTACACTGGTA")
print(my_seq)
print(my_seq.complement())
print(my_seq.reverse_complement())
print(my_seq.transcribe())
print(my_seq.translate())

AGTACACTGGTA
TCATGTGACCAT
TACCAGTGTACT
AGUACACUGGUA
STLV


In [19]:
from google.colab import files
uploaded =files.upload()

Saving gene.fna to gene.fna


In [20]:
!pip install biopython



In [13]:
from Bio import SeqIO


In [14]:
ecoli_genome_analyse = list(uploaded.keys())[0]

In [22]:
for record in SeqIO.parse(ecoli_genome_analyse, "fasta"):
  print(f"ID: {record.id}")
  print(f"Description: {record.description}")
  print(f"Length: {len(record.seq)}")
  print(f"sequence (first 100 bases): {record.seq[:100]}")

ID: NW_021636346.1:c40256-27400
Description: NW_021636346.1:c40256-27400 LOC114717194 [organism=Prosopis alba] [GeneID=114717194] [chromosome=Un]
Length: 12857
sequence (first 100 bases): TCACGGAGCTTAAACAAGGCCCTCTATAGCCTCTGTCTGCTTACTGTGCCACAAAAACGCCCACCTTCATAACGGCTTCTTGCATAAGCGACTGTGGAAG


In [27]:
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

seq_obj = SeqRecord(Seq("ATGCATGACTA"), id="Test",
description="Example sequence")
SeqIO.write(seq_obj,  "output.fasta", "fasta")

1

In [28]:
dna_seq = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
protein_seq = dna_seq.translate()
print(protein_seq)

MAIVMGR*KGAR*


In [32]:
from Bio import pairwise2
from Bio.pairwise2 import format_alignment

alignments = pairwise2.align.globalxx("GATTACA", "GCATGCU")
for alignment in alignments:
    print(format_alignment(*alignment))

G-ATTA-CA-
| | |  |  
GCA-T-GC-U
  Score=4

G-ATTA-CA-
| ||   |  
GCAT--GC-U
  Score=4

G-ATTACA-
| | |.|  
GCA-TGC-U
  Score=4

G-ATTACA-
| || .|  
GCAT-GC-U
  Score=4

G-ATTACA-
| ||. |  
GCATG-C-U
  Score=4

G-ATTA-CA
| | |  |.
GCA-T-GCU
  Score=4

G-ATTA-CA
| ||   |.
GCAT--GCU
  Score=4

G-ATTACA
| | |.|.
GCA-TGCU
  Score=4

G-ATTACA
| || .|.
GCAT-GCU
  Score=4

G-ATTACA
| ||. |.
GCATG-CU
  Score=4



In [37]:
from Bio import SeqIO
def caluclate_gc(seq):
    g = seq.count("G")
    c = seq.count("C")
    gc_content = 100 * float(g + c) / len(seq)
    return gc_content

for record in SeqIO.parse(ecoli_genome_analyse, "fasta"):
    print(f"{record.id}: {caluclate_gc(record.seq):.2f}% GC")

NW_021636346.1:c40256-27400: 37.58% GC


In [42]:
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

translated_records = []

for record in SeqIO.parse("gene.fna", "fasta"):
    protein_seq = record.seq.translate(to_stop=True)
    new_record = SeqRecord(protein_seq, id=record.id,
description="Translated protein")
    translated_records.append(new_record)
SeqIO.write(translated_records, "translated_proteins.fasta", "fasta")



1

In [44]:
from Bio import Entrez, SeqIO

Entrez.email = "nagendraslv619@gmail.com"
handle = Entrez.efetch(db="nucleotide", id="NM_001301717",
rettype="gb", retmode="text")
record = SeqIO.read(handle, "genbank")

print("Gene ID:", record.id)
print("Description:", record.description)
print("Sequence Length:", len(record.seq))
print("First 50 bp:", record.seq[:50])

Gene ID: NM_001301717.2
Description: Homo sapiens C-C motif chemokine receptor 7 (CCR7), transcript variant 4, mRNA
Sequence Length: 2191
First 50 bp: CTCTAGATGAGTCAGTGGAGGGCGGGTGGAGCGTTGAACCGTGAAGAGTG


In [47]:
from Bio.Seq import Seq

def find_codons(dna_seq):
  start_index = dna_seq.find("ATG")
  stop_codons = ["TAA", "TAG", "TGA"]
  stop_index = -1
  for codon in stop_codons:
    idx = dna_seq.find(codon, start_index)
    if idx != -1 and (stop_index == -1 or idx < stop_index):
      stop_index = idx
  return start_index, stop_index + 3

seq = Seq("AAAGCATGAAACCCCTAGGGTAA")
start, stop = find_codons(str(seq))
print(f"Start codon: {start}, Stop codon at: {stop}")
print("Predicted ORF:", seq[start:stop])

Start codon: 5, Stop codon at: 9
Predicted ORF: ATGA


In [52]:
from Bio import SeqIO

min_length = 300
filtered = [record for record in
SeqIO.parse("gene.fna", "fasta") if len (record.seq)
>= min_length]
SeqIO.write(filtered, "long_sequences.fasta", "fasta")

1

In [55]:
from Bio import SeqIO
import re
motif = re.compile("ATG[ATGC]{3,}TAA")

for record in SeqIO.parse("gene.fna", "fasta"):
    matches = motif.findall(str(record.seq))
    print(f"{record.id}: {len(matches)} motifs found")

NW_021636346.1:c40256-27400: 1 motifs found
