In [1]:
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis as PA
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import os
from collections import defaultdict
import pandas as pd
import ipywidgets as widgets
from ipywidgets import *

In [10]:
# Per-residue hydrophobicity lookup keyed by one-letter amino-acid code.
# Glycine is the zero point of the scale; larger values are more hydrophobic.
hydro_dict = {
    # --- the 20 standard amino acids ---
    'A': 0.41,    # Alanine
    'C': 0.49,    # Cysteine
    'D': -0.55,   # Aspartic acid
    'E': -0.31,   # Glutamic acid
    'F': 1.00,    # Phenylalanine
    'G': 0.00,    # Glycine (reference point)
    'H': 0.08,    # Histidine
    'I': 0.99,    # Isoleucine
    'K': -0.23,   # Lysine
    'L': 0.97,    # Leucine
    'M': 0.74,    # Methionine
    'N': -0.28,   # Asparagine
    'P': -0.46,   # Proline
    'Q': -0.10,   # Glutamine
    'R': -0.14,   # Arginine
    'S': -0.05,   # Serine
    'T': 0.13,    # Threonine
    'V': 0.76,    # Valine
    'W': 0.97,    # Tryptophan
    'Y': 0.63,    # Tyrosine
    # --- ambiguity codes: each value is the mean of the two candidates ---
    'B': -0.415,  # Aspartic acid or Asparagine -> mean of D and N
    'J': 0.98,    # Leucine or Isoleucine       -> mean of L and I
    'Z': -0.205,  # Glutamic acid or Glutamine  -> mean of E and Q
}

def hydroScore(sequencestr, scale=None):
    """Return the summed hydrophobicity of a protein sequence.

    Parameters
    ----------
    sequencestr : iterable of str
        One-letter residue codes (typically a plain string). Residues are
        looked up as given first and, if absent, in upper case, so
        lower-case input is accepted.
    scale : mapping, optional
        Residue -> hydrophobicity table. Defaults to the module-level
        ``hydro_dict``; pass another mapping to score with a different scale.

    Returns
    -------
    The sum of the per-residue values rounded to 2 decimal places
    (``0`` for an empty sequence).

    Raises
    ------
    KeyError
        If a residue has no entry in ``scale``. The message names the
        offending character and its 0-based position.
    """
    if scale is None:
        # Looked up at call time so a later re-definition of hydro_dict is honoured.
        scale = hydro_dict
    score = 0
    # Explicit left-to-right accumulation (rather than sum()) keeps the
    # floating-point result identical to the original implementation.
    for position, residue in enumerate(sequencestr):
        value = scale.get(residue)
        if value is None:
            value = scale.get(str(residue).upper())
        if value is None:
            raise KeyError(
                "no hydrophobicity value for residue {!r} at position {}".format(
                    residue, position
                )
            )
        score += value
    return round(score, 2)

Values are relative to glycine (0) and are sourced from Monera et al., J. Pept. Sci. 1: 319-329 (1995).

In [3]:
def listtostr(strlist):
    """Join the strings in ``strlist`` with underscores.

    Parameters
    ----------
    strlist : iterable of str
        The words to join.

    Returns
    -------
    str
        The words separated by ``"_"`` with no trailing separator;
        an empty string when ``strlist`` is empty.
    """
    # str.join avoids the quadratic repeated concatenation and the
    # trailing-separator slice of the hand-rolled loop.
    return "_".join(strlist)
def simplifyname(orgname):
    """Abbreviate the first underscore-separated word of a name to its initial.

    For example ``"Yersinia_pestis"`` becomes ``"Y_pestis"``; every word after
    the first is kept unchanged.

    Parameters
    ----------
    orgname : str
        Underscore-separated name.

    Returns
    -------
    str
        The name with its first word reduced to one character. If the first
        word is empty (empty input or a leading underscore) it stays empty
        instead of raising ``IndexError``.
    """
    # Split once; the original split the same string twice.
    first, *rest = orgname.split("_")
    # [:1] rather than [0] so an empty first word does not raise.
    return "_".join([first[:1]] + rest)

In [12]:
# Column-oriented accumulator: each key becomes a DataFrame column and each
# parsed FASTA record appends one entry to every list.
datadict = {
    'sequence' : [],
    'ID' : [],
    'description' : [],
    'MW' : [],
    'hydrophobicity' : [],
}

datafile = "data/Yersinia_pestis[632].faa"
with open(datafile,"r") as protfile:
    # Parse from the handle opened above. Passing the path again made
    # SeqIO open the file a second time and left `protfile` unused.
    for record in SeqIO.parse(protfile,"fasta"):
        sequence = str(record.seq)
        parameters = PA(sequence)
        datadict['sequence'].append(sequence)
        datadict['ID'].append(record.id)
        datadict['description'].append(record.description)
        # Molecular weight as computed by Biopython's ProteinAnalysis.
        datadict['MW'].append(parameters.molecular_weight())
        # Whole-sequence hydrophobicity sum (see hydroScore).
        datadict['hydrophobicity'].append(hydroScore(sequence))

# Sort ascending by hydrophobicity without mutating in place; the original
# row labels are kept, so df.iloc[...] and df.loc[...] address different rows.
df = pd.DataFrame.from_dict(datadict).sort_values('hydrophobicity')
df

Unnamed: 0,sequence,ID,description,MW,hydrophobicity
392,MNRVQFNHHHHHHPD,sp|Q8D079|LPHI_YERPE,sp|Q8D079|LPHI_YERPE His operon leader peptide...,1943.0746,1.17
2835,MWVVLDRVTPGGHGPEDEDNARNNEDNKPSQNGSNFTNSEIDDRIT...,tr|Q7CJ52|Q7CJ52_YERPE,tr|Q7CJ52|Q7CJ52_YERPE Putative uncharacterize...,8985.5158,3.37
1372,MADLYNPDSDREEYELDDKQLYEQVLQLINKLNQDALNK,tr|Q74PL1|Q74PL1_YERPE,tr|Q74PL1|Q74PL1_YERPE Putative colicin immuni...,4686.0812,4.59
3726,MCGWEAPKKAELTGANGGPIQTSNLTPDEAAEAYRKMMG,tr|Q0WJ28|Q0WJ28_YERPE,tr|Q0WJ28|Q0WJ28_YERPE Putative uncharacterize...,4125.6410,5.38
3187,MPIIAPIPRNKRHQMEKIVHKTADKNHSRHLIA,tr|Q9RI14|Q9RI14_YERPE,tr|Q9RI14|Q9RI14_YERPE YPCD1.70c protein (Frag...,3881.5837,5.55
...,...,...,...,...,...
2219,MNKNLYRIVFNQARGMLMVVADIAASGRAASSPSSGVGHTQRRRVS...,tr|Q7CGD9|Q7CGD9_YERPE,tr|Q7CGD9|Q7CGD9_YERPE Putative adhesin OS=Yer...,338786.8022,667.75
1687,MLNYFRAILISWKWKLSHHTSRPHDVKEKGHPRKIKVVAWITLFFQ...,tr|Q7CFY4|Q7CFY4_YERPE,tr|Q7CFY4|Q7CFY4_YERPE Putative invasin OS=Yer...,308572.7179,696.15
1468,MPNGNEMAGFYIDKLSLSQRLSIVSETYDRVNKNNKKEKLKYSYDD...,tr|Q7CGR6|Q7CGR6_YERPE,tr|Q7CGR6|Q7CGR6_YERPE Putative virulence dete...,353076.3328,699.33
1277,MDNLRFSSAPTADSIDASIAQHYPDCEPVAVIGYACHFPESPDGET...,tr|Q9Z373|Q9Z373_YERPE,tr|Q9Z373|Q9Z373_YERPE HMWP1 nonribosomal pept...,348794.3311,765.77


In [15]:
def sequenceCheck(sequence, window=10):
    """Compute a sliding-window hydrophobicity profile of ``sequence``.

    Every overlapping stretch of ``window`` consecutive residues is scored
    with ``hydroScore``; the window advances one residue at a time.

    Parameters
    ----------
    sequence : str
        Protein sequence in one-letter codes.
    window : int, optional
        Number of residues per window (default 10, the previously
        hard-coded size).

    Returns
    -------
    list
        One score per window start, ``len(sequence) - window + 1`` entries in
        total; an empty list when the sequence is shorter than ``window``.

    Raises
    ------
    ValueError
        If ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be a positive integer")
    # Last index at which a full window still fits; negative for short input,
    # which makes the range below empty.
    last_start = len(sequence) - window
    return [
        hydroScore(sequence[start:start + window])
        for start in range(last_start + 1)
    ]
        
# Example: sliding-window hydrophobicity profile of one hard-coded protein sequence.
# NOTE(review): the saved output of this cell is `KeyError: '1'`, yet the literal
# below appears to contain only letters - the cell was presumably last run against
# a different definition. Re-run on a fresh kernel to confirm.
sequenceCheck('MDSQRNLLLIALLFVSFMIWQAWQVDNNPQPTAQTTQQTTNTATGDKASQAVPGSGQGQLITVKTDVLSLTINTRGGDIEQANLLAYPDTLGSSNTFELLETTPSFVYQAQSGLTGKNGPDNPANGDRPLFEVPQTSFVLADGQDELRIPLTFTSKDGSVFIKTFVLKRNDYAIGVDYHVNNASAAPLELTLFGQLKQSINLPKKRDTGSNNFALQTYRGAAYSSDETKYKKYSFSDIEDKNLDITTKGGWVAMLQQYFATAWIPAANETNTFYSAELGNGLAAIGFKGAPVVIQPGEQKQLSATLWVGPEIQNKMAEIAPHLDLTVDYGWLWFISQPLFKLLKFIHSFVGNWGFSIIVITFIVRGIMYPLTKAQYTSMAKMRLLQPKLAAMRERIGDDKQRMSQEMMALYKAEKVNPLGGCLPLIIQMPIFLALYYMLMSSVELRHAPFILWIHDLSAQDPYYILPILMGITMYFIQKMSPTTVTDPMQQKIMTFMPVIFTVFFLWFPAGLVLYYIVSNLVTILQQQLIYRGLEKRGLHSREKKK')

KeyError: '1'

In [16]:
# Positional lookup: .iloc counts rows in df's current (sorted) order,
# so 3475 is a row position here, not an index label.
df['sequence'].iloc[3475]

'MDSQRNLLLIALLFVSFMIWQAWQVDNNPQPTAQTTQQTTNTATGDKASQAVPGSGQGQLITVKTDVLSLTINTRGGDIEQANLLAYPDTLGSSNTFELLETTPSFVYQAQSGLTGKNGPDNPANGDRPLFEVPQTSFVLADGQDELRIPLTFTSKDGSVFIKTFVLKRNDYAIGVDYHVNNASAAPLELTLFGQLKQSINLPKKRDTGSNNFALQTYRGAAYSSDETKYKKYSFSDIEDKNLDITTKGGWVAMLQQYFATAWIPAANETNTFYSAELGNGLAAIGFKGAPVVIQPGEQKQLSATLWVGPEIQNKMAEIAPHLDLTVDYGWLWFISQPLFKLLKFIHSFVGNWGFSIIVITFIVRGIMYPLTKAQYTSMAKMRLLQPKLAAMRERIGDDKQRMSQEMMALYKAEKVNPLGGCLPLIIQMPIFLALYYMLMSSVELRHAPFILWIHDLSAQDPYYILPILMGITMYFIQKMSPTTVTDPMQQKIMTFMPVIFTVFFLWFPAGLVLYYIVSNLVTILQQQLIYRGLEKRGLHSREKKK'