# Protein Bioinformatics - Day 1



In [None]:
import sys
import os

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import Entrez
from Bio.Align import MultipleSeqAlignment as MSA

# To configure: contact e-mail assigned to Bio.Entrez ahead of the NCBI queries below
Entrez.email = "giodalton.gaming.off@gmail.com"

# Hide warnings: the 'ignore' filter silences every warning raised after this point
import warnings
warnings.filterwarnings('ignore')

## Exercise 1
In a DNA sequence, identify the possible genes and their transcribed and translated products. For each translated protein, identify at least the most similar protein with a hit in the Protein Data Bank.

## Exercise 2
From a list of GenBank identifiers, retrieve their corresponding protein sequences in FASTA format. Afterwards, indicate whether the proteins have been experimentally validated and retrieve the related literature.

In [None]:
# Main function
def retrieve_from_genebank(gb_ids, format=None):
    """Fetch one or several GenBank entries and return their translated records.

    Args:
        gb_ids: A single identifier (``str``) or a ``list``/``tuple`` of
            identifiers. A single string is treated as a one-element list.
        format: Optional format name forwarded unchanged to
            ``_retrieve_from_genebank_`` (e.g. ``"fasta"``). The name shadows
            the builtin but is kept because callers pass it by keyword.

    Returns:
        A list holding one ``_retrieve_from_genebank_`` result per identifier,
        in input order.

    Raises:
        ValueError: If ``gb_ids`` is neither a string nor a list/tuple.
    """
    if isinstance(gb_ids, str):
        # Normalise the single-id case instead of recursing into ourselves
        gb_ids = [gb_ids]
    elif not isinstance(gb_ids, (list, tuple)):
        raise ValueError(
            "gb_ids must be a str or a list/tuple of str, "
            f"got {type(gb_ids).__name__}"
        )
    return [_retrieve_from_genebank_(gb_id, format) for gb_id in gb_ids]

# Function to handle single id
def _retrieve_from_genebank_(gb_id, format=None):
    """Fetch a single GenBank nucleotide entry and translate its sequence.

    Args:
        gb_id: Identifier passed to ``Entrez.efetch`` (``nucleotide`` database).
        format: Optional format name; when truthy the protein record is
            returned as the string produced by ``SeqRecord.format(format)``
            instead of the ``SeqRecord`` itself.

    Returns:
        A tuple ``(protein, has_references, references)`` where ``protein`` is
        the translated record (or its formatted string), ``has_references``
        tells whether the entry lists any literature reference, and
        ``references`` is that list (empty when there is none).

    NOTE(review): the whole nucleotide sequence is translated as-is; it is not
    restricted to annotated CDS features. Likewise, the presence of
    references is used as the "validated" signal. Confirm both are intended.
    """
    # Load data from NCBI & read data as GenBank data; the handle is closed
    # even when parsing fails so the connection is not leaked.
    handle = Entrez.efetch(db="nucleotide", id=gb_id, rettype="gb", retmode="text")
    try:
        gb_data = SeqIO.read(handle, 'genbank')
    finally:
        handle.close()

    # Convert DNA Seq to Protein Seq, carrying over the entry's metadata
    prot_rec = SeqRecord(
        gb_data.seq.translate(),
        id=gb_data.id,
        name=gb_data.name,
        description=gb_data.description,
        dbxrefs=gb_data.dbxrefs,
    )

    if format:
        prot_rec = prot_rec.format(format)

    # Entries without literature may lack the 'references' key altogether,
    # so use .get() rather than indexing.
    references = gb_data.annotations.get('references') or []
    return (prot_rec, bool(references), references)

In [None]:
# Demo: fetch two accessions and request each translated record as FASTA text
retrieve_from_genebank(('X14061', 'X15062'), format="fasta")

## Exercise 3
Create a theoretical framework for directed mutagenesis of a user-specified gene in GenBank. The user must be able to decide the mutation position and choose the codon to use. From this codon, create primers with 40% GC content and a minimum length of 25 bp.

## Exercise 4
Starting with a list of gene identifiers saved in a file, perform the MSA of the corresponding proteins. From the MSA, retrieve the regions with higher conservation.

In [None]:
handle = 'gb_ids.txt' # path of the text file listing the gene ids, one per line

def get_ids(handle):
    """Read gene identifiers from a text file, one identifier per line.

    Args:
        handle: Path of the file to read (a path string, despite the name).

    Returns:
        The distinct identifiers in order of first appearance. Surrounding
        whitespace is stripped and blank lines are skipped.

    Raises:
        ValueError: If ``handle`` does not point to an existing file.
    """
    if not os.path.isfile(handle):
        raise ValueError(f"gene id file not found: {handle!r}")

    with open(handle) as f:
        stripped = (line.strip() for line in f)
        # dict.fromkeys de-duplicates while keeping file order (a set would
        # return the ids in arbitrary order); empty lines are not ids.
        return list(dict.fromkeys(gb_id for gb_id in stripped if gb_id))

# Reuse the exercise 2 helper: every result is a (record, validated,
# references) tuple and only the record is needed for the alignment object.
# NOTE(review): MultipleSeqAlignment wraps records that are already aligned
# (equal length); it does not compute an alignment itself. Confirm whether an
# aligner step is missing here.
records = [
    record
    for record, _validated, _references in retrieve_from_genebank(get_ids(handle))
]
align = MSA(records)
print(align)

## Exercise 5
Starting with a gene sequence, provide the genetic analysis of the sequence (GC content, length, taxonomy and codon usage). Also provide the protein sequence and its corresponding characteristics according to Prosite, if any.

In [128]:
def gc_content(seq):
    """Return the GC content of ``seq`` as a percentage in the range 0-100.

    G and C are counted in either case. An empty sequence yields ``0.0``
    rather than raising ``ZeroDivisionError``.
    """
    length = len(seq)
    if not length:
        return 0.0
    # .count() is used per symbol so both str and Seq-like inputs work
    gc_count = sum(seq.count(base) for base in "GCgc")
    return gc_count / length * 100

def seq_stats(seq, source=None, verbose=True):
    """Compute basic statistics of a nucleotide sequence.

    Args:
        seq: Sequence to analyse; must support ``len()`` and whatever
            ``gc_content`` requires.
        source: When ``None`` the statistics are computed from ``seq``. Any
            other value currently leaves every statistic as ``None`` because
            no other source is implemented yet.
        verbose: Print a formatted report when true.

    Returns:
        ``[gc_content, length, taxonomy, codon_usage]``. Taxonomy and codon
        usage are not implemented yet and are always ``None``.
    """
    stats = [None, None, None, None]
    if source is None:
        # Guard the empty sequence so the GC ratio never divides by zero
        stats[0] = gc_content(seq) if len(seq) else 0.0
        stats[1] = len(seq)
        # TODO: stats[2] (taxonomy) and stats[3] (codon usage)

    # pretty print
    if verbose:
        # GC content stays None when a source is given; applying ":.2f" to
        # None would raise TypeError, so fall back to the plain text.
        gc_text = "None" if stats[0] is None else f"{stats[0]:.2f}"
        print(f"> Sequence Stats (Source:{source})\n"
              f"\tGC-content.................{gc_text}\n"
              f"\tLength.....................{stats[1]}\n"
              f"\tTaxonomy...................{stats[2]}\n"
              f"\tCodon Usage................{stats[3]}\n")

    return stats

# Example run without the printed report: returns [GC %, length, taxonomy, codon usage]
seq_stats("ATGCCCGATAGGCTTAAATGAGAGATCGATACAGATAGACCCAATTAAATGAGAGAGATCAGCGCATG", verbose=False)

[42.64705882352941, 68, None, None]

## Exercise 6
Starting from one protein sequence, retrieve the homologs from 5 different species. Perform the MSA and retrieve an identity matrix.

## Exercise 7
From a list of proteins, listed by their UniProt identifiers, retrieve their EC number and sequence. If they share a common enzymatic class at the highest level, perform the MSA and identify the two most distant proteins.

## Exercise 8
From a list of proteins, listed by their UniProt identifiers, retrieve their PDB code and sequence. Only if they belong to different organisms, perform an MSA and print a phylogenetic tree.