In [2]:

from collections import Counter
from collections import deque
import csv
import matplotlib.pyplot as plt
import pandas as pd
import plotly
import plotly.graph_objs as go
import plotly.express as px
import numpy as np

In [3]:
# Amino-acid property table, indexed by one-letter code. The code column is
# kept as a regular column as well (drop = False).
aa_df = (
    pd.read_csv("../../data/amino_acid_properties.csv")
    .set_index("1-letter code", drop = False)
)
# Nested mapping: {column name: {one-letter code: value}}.
aa_dict = aa_df.to_dict()


In [4]:
def open_fasta_file(path):
    """Read a FASTA file and return its header and residues.

    Parameters
    ----------
    path : str or path-like
        Location of the FASTA file.

    Returns
    -------
    list
        ``[identifier, sequence]``. ``identifier`` is the header line without
        its leading ``>``, or ``None`` when the file contains no header line.
        ``sequence`` is a list of single-character strings, one per residue,
        in file order.

    Notes
    -----
    The file is treated as a single record: if it contains several header
    lines, the last one is returned as the identifier and the residues of
    all records are concatenated.
    """
    identifier = None                   # stays None for header-less files
    sequence = []
    with open(path) as f:
        for line in f:                  # single pass, no intermediate buffer
            line = line.strip()         # drop the newline and stray surrounding whitespace
            if not line:
                continue                # blank lines carry no residues
            if line.startswith(">"):
                # Remove only the leading marker; a ">" inside the
                # description (e.g. "A->B") is part of the identifier.
                identifier = line[1:]
            else:
                sequence.extend(line)   # one list entry per residue character
    return [identifier, sequence]
    


In [13]:
class Protein:
    """A protein sequence with per-residue metrics and a sliding-window plot.

    Attributes
    ----------
    ID, name : str
        Free-form labels; ``name`` is used in the plot title.
    metrics : dict
        Nested mapping ``{column name: {one-letter code: value}}`` that
        ``create_df`` looks residues up in.
    fasta_file : str
        Path of the FASTA file, or ``"not known"`` until one is supplied.
    AAsequence : iterable of str
        One-letter residue codes, filled in by ``open_fasta_file``.
    identifier : str
        Header of the FASTA record, filled in by ``open_fasta_file``.
    df : pandas.DataFrame
        One row per residue, created by ``create_df``.
    """

    def __init__(self, ID = "not known", 
                name = "not known", 
                metrics = "not known", 
                fasta_file = "not known", 
                AAsequence = "not known",
                identifier = "not known"):
        self.ID = ID
        self.name = name
        self.metrics = metrics
        self.fasta_file = fasta_file
        self.AAsequence = AAsequence
        self.identifier = identifier
    
    def open_fasta_file(self, file = None):
        """Load ``identifier`` and ``AAsequence`` from the protein's FASTA file.

        ``file`` is used only when no path has been stored on the instance
        yet; an already known ``fasta_file`` takes precedence.

        Raises
        ------
        ValueError
            If neither the instance nor the ``file`` argument provides a path.
        """
        if self.fasta_file == "not known":
            if file is None:
                raise ValueError("no FASTA file given: pass `file` or set `fasta_file`")
            self.fasta_file = file
        # Parse the file once and unpack both parts of the result.
        self.identifier, self.AAsequence = open_fasta_file(self.fasta_file)

    def create_df(self):
        """Build ``self.df`` with one row per residue and one column per metric."""
        rows = [
            {key: values[aa] for key, values in self.metrics.items()}
            for aa in self.AAsequence
        ]
        self.df = pd.DataFrame(rows)

    def plot(self, metric = "hydropathy", window_size = 1):
        """Show a trailing moving average of ``metric`` along the sequence.

        Parameters
        ----------
        metric : str
            Column of ``self.df`` to plot. ``"hydropathy"`` is shorthand for
            ``"hydropathy index (Kyte-Doolittle method)"``.
        window_size : int
            Number of most recent residues averaged at each position. The
            first positions average over the residues seen so far.

        Raises
        ------
        ValueError
            If ``window_size`` is smaller than 1.

        Notes
        -----
        ``create_df`` must have been called first. The hover text reads the
        ``"Name"``, ``"3-letter code"`` and ``"1-letter code"`` columns of
        ``self.df``.
        """
        if metric == "hydropathy":
            metric = "hydropathy index (Kyte-Doolittle method)"

        if window_size < 1:
            # An empty window would divide by zero below.
            raise ValueError("window_size must be at least 1")

        layout = {"title": {"text": "{0}, averaging window size: {1}".format(self.name, window_size)},
                "template" : "plotly", 
                "yaxis": {"title": {"text": metric}},
                "xaxis": {"title": {"text": "amino acid position"}}
                }

        # The bounded deque drops the oldest value automatically, so it always
        # holds the last `window_size` values (fewer at the sequence start).
        window = deque([], maxlen=window_size)
        average = []
        for value in self.df[metric]:       # plot the requested metric, not a fixed column
            window.append(value)
            average.append(sum(window)/len(window))

        data = [
            go.Scatter(
                x = self.df.index,
                y = np.array(average),
                hovertext="Name:"+self.df["Name"] + "<br />" +\
                        "abbr.:" + self.df["3-letter code"]+ ", " + self.df["1-letter code"]
            )
        ]

        fig = go.Figure(data = data, layout = layout)
        return fig.show()

        

In [14]:
# Build the receptor from its FASTA record and the amino-acid property table,
# then draw the hydropathy profile at increasing smoothing windows.
Protein_P32249 = Protein(
    ID = "P32249",
    name = "G-protein coupled receptor 183",
    metrics = aa_dict,
)
Protein_P32249.open_fasta_file("P32249.fasta")
Protein_P32249.create_df()
for window_size in (1, 5, 10):
    Protein_P32249.plot(window_size = window_size)