In [27]:
import pandas as pd
import re
import numpy as np
import math
import plotly.graph_objs as go
from plotly.offline import iplot
import plotly.express as px
from plotly.subplots import make_subplots
from sklearn.decomposition import PCA


In [7]:
# Dataset from my paper: for every strain, the train / test / validation
# feature tables are read from the "_40" CSV files under My_paper/.

# EC - 40
EC_train, EC_test, EC_val = map(pd.read_csv, (
    'My_paper/EC_X_train_40.csv',
    'My_paper/EC_X_test_40.csv',
    'My_paper/EC_X_val_40.csv',
))

# SA - 40
SA_train, SA_test, SA_val = map(pd.read_csv, (
    'My_paper/SA_X_train_40.csv',
    'My_paper/SA_X_test_40.csv',
    'My_paper/SA_X_val_40.csv',
))

# PA - 40
PA_train, PA_test, PA_val = map(pd.read_csv, (
    'My_paper/PA_X_train_40.csv',
    'My_paper/PA_X_test_40.csv',
    'My_paper/PA_X_val_40.csv',
))

# Stack the three splits of each strain row-wise (train, then test, then val).
# The original row labels of each split are kept, so index values repeat.
SA_feature_genome = pd.concat([SA_train, SA_test, SA_val], axis='index')
EC_feature_genome = pd.concat([EC_train, EC_test, EC_val], axis='index')
PA_feature_genome = pd.concat([PA_train, PA_test, PA_val], axis='index')


In [14]:
# One MIC-vs-length figure per strain.
# NOTE: the three plotting helpers are defined in the cells that follow this one
# (execution counts 11-13), so those cells have to be executed before this cell.
SA_plot_scatter_MIC(SA_feature_genome, bacteria="Staphylococcus aureus ATCC 25923")
EC_plot_scatter_MIC(EC_feature_genome, bacteria="Escherichia coli ATCC 25922")
PA_plot_scatter_MIC(PA_feature_genome, bacteria="Pseudomonas aeruginosa ATCC 27853")

In [11]:
def SA_plot_scatter_MIC(SA_feature_genome, bacteria):
    """Plot MIC against sequence length for one strain and display it with ``iplot``.

    The figure contains, in this order:

    1. a marker scatter of ``NEW-CONCENTRATION`` (x) versus ``SEQ_LEN`` (y);
    2. a histogram of ``SEQ_LEN`` drawn along the y axis and counted on a
       secondary x axis placed at the top of the plot;
    3. three vertical guide lines at the mean MIC, mean + std and mean - std,
       each labelled with its (3-decimal) x position.

    Parameters
    ----------
    SA_feature_genome : pandas.DataFrame
        Table providing the ``NEW-CONCENTRATION`` and ``SEQ_LEN`` columns.
    bacteria : str
        Strain name; shown in italics at the start of the figure title.

    Returns
    -------
    None
        The figure is only displayed, not returned.
    """
    mic = SA_feature_genome['NEW-CONCENTRATION']
    seq_len = SA_feature_genome['SEQ_LEN']

    # Both statistics are rounded first, so the guide lines sit exactly on the
    # rounded values that their labels show.
    mic_mean = round(mic.mean(), 3)
    mic_std = round(mic.std(), 3)

    def guide_line(x_pos, label):
        # Vertical line spanning y = 5..40; the value label is attached to the
        # lower end point only (the upper point gets an empty string).
        return go.Scatter(
            y=[5, 40],
            x=[x_pos, x_pos],
            mode='lines+markers+text',
            text=[f'{round(x_pos, 3)} ', ''],
            textposition='bottom center',
            textfont=dict(family="sans serif", size=18, color="crimson"),
            name=label,
        )

    points = go.Scatter(
        y=seq_len,
        x=mic,
        mode="markers",
        name='MIC and length distribution',
        marker=dict(color='#E8A317',
                    line=dict(color='MediumPurple', width=0.5)),
    )
    length_counts = go.Histogram(
        y=seq_len,
        xaxis='x2',  # counted on the overlaid top axis, not on the MIC axis
        name='Sequence length counts',
        marker=dict(color='rgba(171, 50, 96, 0.3)'),
    )

    data = [
        points,
        length_counts,
        guide_line(mic_mean, 'Mean'),
        guide_line(mic_mean + mic_std, 'Mean+Std'),
        guide_line(mic_mean - mic_std, 'Mean-Std'),
    ]
    layout = dict(
        # The title now follows the `bacteria` argument instead of a fixed name.
        title={'text': f'<i>{bacteria}</i> MIC and Sequence Length',
               'y': 0.95,
               'x': 0.45,
               'xanchor': 'center',
               'yanchor': 'top'},
        xaxis=dict(title='MIC (Unit:logµM)', ticklen=5, zeroline=False),
        yaxis=dict(title='Sequence Length', ticklen=5, zeroline=False),
        xaxis2=dict(title={'text': "Count"},
                    anchor="y",
                    overlaying="x",
                    side="top"),
    )
    fig = dict(data=data, layout=layout)
    iplot(fig)

In [12]:
def EC_plot_scatter_MIC(EC_feature_genome, bacteria):
    """Plot MIC against sequence length for one strain and display it with ``iplot``.

    The figure contains, in this order:

    1. a marker scatter of ``NEW-CONCENTRATION`` (x) versus ``SEQ_LEN`` (y);
    2. a histogram of ``SEQ_LEN`` drawn along the y axis and counted on a
       secondary x axis placed at the top of the plot;
    3. three vertical guide lines at the mean MIC, mean + std and mean - std,
       each labelled with its (3-decimal) x position.

    Parameters
    ----------
    EC_feature_genome : pandas.DataFrame
        Table providing the ``NEW-CONCENTRATION`` and ``SEQ_LEN`` columns.
    bacteria : str
        Strain name; shown in italics at the start of the figure title.

    Returns
    -------
    None
        The figure is only displayed, not returned.
    """
    mic = EC_feature_genome['NEW-CONCENTRATION']
    seq_len = EC_feature_genome['SEQ_LEN']

    # Both statistics are rounded first, so the guide lines sit exactly on the
    # rounded values that their labels show.
    mic_mean = round(mic.mean(), 3)
    mic_std = round(mic.std(), 3)

    def guide_line(x_pos, label):
        # Vertical line spanning y = 5..40; the value label is attached to the
        # lower end point only (the upper point gets an empty string).
        return go.Scatter(
            y=[5, 40],
            x=[x_pos, x_pos],
            mode='lines+markers+text',
            text=[f'{round(x_pos, 3)} ', ''],
            textposition='bottom center',
            textfont=dict(family="sans serif", size=18, color="crimson"),
            name=label,
        )

    points = go.Scatter(
        y=seq_len,
        x=mic,
        mode="markers",
        name='MIC and length distribution',
        marker=dict(color='#F88158',
                    line=dict(color='#8A2BE2', width=0.5)),
    )
    length_counts = go.Histogram(
        y=seq_len,
        xaxis='x2',  # counted on the overlaid top axis, not on the MIC axis
        name='Sequence length counts',
        marker=dict(color='rgba(0, 128, 64, 0.3)'),
    )

    data = [
        points,
        length_counts,
        guide_line(mic_mean, 'Mean'),
        guide_line(mic_mean + mic_std, 'Mean+Std'),
        guide_line(mic_mean - mic_std, 'Mean-Std'),
    ]
    layout = dict(
        # The title now follows the `bacteria` argument instead of a fixed name.
        title={'text': f'<i>{bacteria}</i> MIC and Sequence Length',
               'y': 0.95,
               'x': 0.45,
               'xanchor': 'center',
               'yanchor': 'top'},
        xaxis=dict(title='MIC (Unit:logµM)', ticklen=5, zeroline=False),
        yaxis=dict(title='Sequence Length', ticklen=5, zeroline=False),
        xaxis2=dict(title={'text': "Count"},
                    anchor="y",
                    overlaying="x",
                    side="top"),
    )
    fig = dict(data=data, layout=layout)
    iplot(fig)

In [13]:
def PA_plot_scatter_MIC(PA_feature_genome, bacteria):
    """Plot MIC against sequence length for one strain and display it with ``iplot``.

    The figure contains, in this order:

    1. a marker scatter of ``NEW-CONCENTRATION`` (x) versus ``SEQ_LEN`` (y);
    2. a histogram of ``SEQ_LEN`` drawn along the y axis and counted on a
       secondary x axis placed at the top of the plot;
    3. three vertical guide lines at the mean MIC, mean + std and mean - std,
       each labelled with its (3-decimal) x position.

    Parameters
    ----------
    PA_feature_genome : pandas.DataFrame
        Table providing the ``NEW-CONCENTRATION`` and ``SEQ_LEN`` columns.
    bacteria : str
        Strain name; shown in italics at the start of the figure title.

    Returns
    -------
    None
        The figure is only displayed, not returned.
    """
    mic = PA_feature_genome['NEW-CONCENTRATION']
    seq_len = PA_feature_genome['SEQ_LEN']

    # Both statistics are rounded first, so the guide lines sit exactly on the
    # rounded values that their labels show.
    mic_mean = round(mic.mean(), 3)
    mic_std = round(mic.std(), 3)

    def guide_line(x_pos, label):
        # Vertical line spanning y = 5..40; the value label is attached to the
        # lower end point only (the upper point gets an empty string).
        return go.Scatter(
            y=[5, 40],
            x=[x_pos, x_pos],
            mode='lines+markers+text',
            text=[f'{round(x_pos, 3)} ', ''],
            textposition='bottom center',
            textfont=dict(family="sans serif", size=18, color="crimson"),
            name=label,
        )

    points = go.Scatter(
        y=seq_len,
        x=mic,
        mode="markers",
        name='MIC and length distribution',
        marker=dict(color='#6CC417',
                    line=dict(color='#8A2BE2', width=0.5)),
    )
    length_counts = go.Histogram(
        y=seq_len,
        xaxis='x2',  # counted on the overlaid top axis, not on the MIC axis
        name='Sequence length counts',
        marker=dict(color='rgba(50, 50, 250, 0.3)'),
    )

    data = [
        points,
        length_counts,
        guide_line(mic_mean, 'Mean'),
        guide_line(mic_mean + mic_std, 'Mean+Std'),
        guide_line(mic_mean - mic_std, 'Mean-Std'),
    ]
    layout = dict(
        # The title now follows the `bacteria` argument instead of a fixed name.
        title={'text': f'<i>{bacteria}</i> MIC and Sequence Length',
               'y': 0.95,
               'x': 0.45,
               'xanchor': 'center',
               'yanchor': 'top'},
        xaxis=dict(title='MIC (Unit:logµM)', ticklen=5, zeroline=False),
        yaxis=dict(title='Sequence Length', ticklen=5, zeroline=False),
        xaxis2=dict(title={'text': "Count"},
                    anchor="y",
                    overlaying="x",
                    side="top"),
    )
    fig = dict(data=data, layout=layout)
    iplot(fig)

In [15]:
# Overlaid sequence-length histograms: one semi-transparent trace per strain.
trace1, trace2, trace3 = (
    go.Histogram(x=frame['SEQ_LEN'], name=legend, marker={'color': rgba})
    for frame, legend, rgba in (
        (SA_feature_genome, '<i>Staphylococcus aureus ATCC 25923</i>', 'rgba(171, 50, 96, 0.5)'),
        (EC_feature_genome, '<i>Escherichia coli ATCC 25922</i>', 'rgba(0, 128, 64, 0.5)'),
        (PA_feature_genome, '<i>Pseudomonas aeruginosa ATCC 27853</i>', 'rgba(50, 50, 250, 0.5)'),
    )
)
data = [trace1, trace2, trace3]
layout = go.Layout(
    title='Distribution length of 3 different strains',
    barmode='overlay',  # bars are drawn on top of each other, not stacked
    xaxis={'title': 'Sequence Length'},
    yaxis={'title': 'Count'},
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [19]:
def con_label(concentration):
    """Bucket one MIC value into a class label.

    Returns 'Higher MIC' for values strictly above 1.8, 'Lower MIC' for values
    strictly below 0.5, and 'Medium MIC' for everything else (both thresholds
    themselves included, as is any value for which neither comparison holds,
    e.g. NaN).
    """
    if concentration < 0.5:
        return 'Lower MIC'
    if concentration > 1.8:
        return 'Higher MIC'
    return 'Medium MIC'
    
# Add (or overwrite) a 'Con_label' column holding the MIC class of every row.
SA_feature_genome['Con_label'] = SA_feature_genome['NEW-CONCENTRATION'].map(con_label)
EC_feature_genome['Con_label'] = EC_feature_genome['NEW-CONCENTRATION'].map(con_label)
PA_feature_genome['Con_label'] = PA_feature_genome['NEW-CONCENTRATION'].map(con_label)

In [20]:
# Number of rows per MIC class, in the same order as `_labels`, for each strain.
_labels = ['Higher MIC', 'Medium MIC', 'Lower MIC']
_SA_counts = [int((SA_feature_genome['Con_label'] == label).sum()) for label in _labels]
_EC_counts = [int((EC_feature_genome['Con_label'] == label).sum()) for label in _labels]
_PA_counts = [int((PA_feature_genome['Con_label'] == label).sum()) for label in _labels]

# Pie traces need 'domain'-type subplot cells: one row, one column per strain.
fig = make_subplots(rows=1, cols=3, specs=[[{'type': 'domain'} for _ in range(3)]])

fig.add_traces(
    [
        go.Pie(labels=_labels, values=counts, name=strain, textinfo='value+percent')
        for strain, counts in (
            ("Staphylococcus aureus ATCC 25923", _SA_counts),
            ("Escherichia coli ATCC 25922", _EC_counts),
            ("Pseudomonas aeruginosa ATCC 27853", _PA_counts),
        )
    ],
    rows=[1, 1, 1],
    cols=[1, 2, 3],
)

# A non-zero `hole` turns every pie into a donut.
fig.update_traces(hole=0.4, hoverinfo="label+percent+name")

fig.update_layout(
    title_text="Classification of MIC",
    # Short strain codes placed at the centre of each donut.
    annotations=[
        dict(text='SA', x=0.125, y=0.5, font_size=20, showarrow=False),
        dict(text='EC', x=0.500, y=0.5, font_size=20, showarrow=False),
        dict(text='PA', x=0.875, y=0.5, font_size=20, showarrow=False),
    ],
    legend={'traceorder': 'normal'},
)

fig.show()

In [18]:
metrics = ['MSE', 'RMSE', 'R²', 'PCC']
# One bar group per model; RMSE is derived from the listed MSE rather than typed in.
fig = go.Figure(data=[
    go.Bar(name=model, x=metrics, y=[mse, math.sqrt(mse), r2, pcc])
    for model, mse, r2, pcc in (
        ('CNN(Pre-trained embeddings)', 0.256, 0.548, 0.740),
        ('Bi-LSTM(Pre-trained embeddings *)', 0.235, 0.584, 0.772),
        ('Ensemble Model(Pre-trained embeddings *)', 0.225, 0.603, 0.781),
    )
])
# Side-by-side bars, with a title and axis captions.
fig.update_layout(
    title_text='Compare the results of different models on <i>E. coli ATCC 25922</i>',
    barmode='group',
    xaxis_title='Performance Metrics',
    yaxis_title='Measure',
)
fig.show()

In [None]:
def plot_pred_real(y_pred, y_test, our_model, y_pred_1, y_test_1, other_model, bacterium_name):
    """Scatter predicted against experimental values for two models and show the figure.

    For each model the figure holds a marker trace (experiment on x, prediction
    on y) and a first-degree least-squares line fitted to those points.  A
    fifth trace is the identity line ``y = x`` spanning the combined range of
    both experimental series.  The figure is displayed with ``iplot``.

    Parameters
    ----------
    y_pred, y_test : sequence of numbers (list, numpy array or pandas Series)
        Predictions and experimental values of the first model (drawn in blue).
    our_model : str
        Legend name of the first model.
    y_pred_1, y_test_1 : sequence of numbers
        Predictions and experimental values of the second model (drawn in red).
    other_model : str
        Legend name of the second model.
    bacterium_name : str
        Shown in italics at the start of the figure title.

    Returns
    -------
    None
    """
    hover = ('<i>INDEX</i>: %{text}'
             '<br><b>Test</b>: %{x}'
             '<br><b>Prediction</b>: %{y}<br>')

    def index_labels(values):
        # The hover template reads %{text}, so every point needs a text entry.
        # pandas objects contribute their own row labels; any other sequence
        # is labelled by position.
        if isinstance(values, (pd.Series, pd.DataFrame)):
            labels = values.index
        else:
            labels = range(len(values))
        return [str(label) for label in labels]

    def point_cloud(observed, predicted, model, colour):
        return go.Scatter(
            x=observed,
            y=predicted,
            text=index_labels(observed),
            hovertemplate=hover,
            mode="markers",
            name='{}'.format(model),
            marker=dict(color=colour),
        )

    def fitted_line(observed, predicted, model, colour):
        # Straight-line fit of prediction on experiment, sampled at 100 points
        # across the experimental range.
        coefficients = np.polyfit(observed, predicted, 1)
        xs = np.linspace(min(observed), max(observed), 100)
        return go.Scatter(
            x=xs,
            y=np.polyval(coefficients, xs),
            mode='lines',
            name='{} regression line'.format(model),
            opacity=0.5,
            marker=dict(color=colour),
        )

    # Identity line: where a prediction equal to the experiment would fall.
    perfect_line = np.linspace(min(min(y_test), min(y_test_1)),
                               max(max(y_test), max(y_test_1)), 100)
    identity = go.Scatter(
        x=perfect_line,
        y=perfect_line,
        mode='lines',
        name='Perfect positive correlation line',
        opacity=0.2,
        marker=dict(color='green'),
    )

    data = [
        point_cloud(y_test, y_pred, our_model, 'blue'),
        point_cloud(y_test_1, y_pred_1, other_model, 'red'),
        fitted_line(y_test, y_pred, our_model, 'blue'),
        fitted_line(y_test_1, y_pred_1, other_model, 'red'),
        identity,
    ]
    layout = dict(
        title='<i>{}</i> Prediction and Experiment MIC Comparison'.format(bacterium_name),
        xaxis=dict(title='Experiment (Unit: log µM)', ticklen=5, zeroline=False),
        yaxis=dict(title='Prediction (Unit: log µM)', ticklen=5, zeroline=False),
    )
    fig = dict(data=data, layout=layout)

    iplot(fig)

In [23]:
#AAC
def _aac_by_mic_level(frame):
    """Per-MIC-class average of feature columns 5..24 of ``frame``.

    The result has one row per selected feature column: 'AAC' holds the column
    name, and 'Higher' / 'Medium' / 'Lower' hold that column's sum over the
    rows of the class divided by the number of rows in the class.
    """
    table = {'AAC': frame.iloc[:, 5:25].columns}
    for column, level in (('Higher', 'Higher MIC'),
                          ('Medium', 'Medium MIC'),
                          ('Lower', 'Lower MIC')):
        members = frame[frame['Con_label'] == level]
        table[column] = members.iloc[:, 5:25].sum() / len(members)
    return pd.DataFrame(table)


SA_AAC = _aac_by_mic_level(SA_feature_genome)
EC_AAC = _aac_by_mic_level(EC_feature_genome)
PA_AAC = _aac_by_mic_level(PA_feature_genome)
#Length
def _length_counts_by_mic_level(frame):
    """Count sequences of each length (6..40) within every MIC class of ``frame``.

    The result has one row per length: 'Length' holds the length, and
    'Higher' / 'Medium' / 'Lower' hold how many rows of that class have that
    ``SEQ_LEN`` (NaN when the class has none).

    The counts are explicitly re-indexed onto the 6..40 grid.  Attaching
    ``np.arange(6, 41)`` positionally to the label-aligned counts only works
    when the lengths present in the data are exactly 6..40; otherwise the
    constructor raises, or rows receive the wrong 'Length' value.
    """
    lengths = np.arange(6, 41)
    table = {'Length': lengths}
    for column, level in (('Higher', 'Higher MIC'),
                          ('Medium', 'Medium MIC'),
                          ('Lower', 'Lower MIC')):
        counts = frame.loc[frame['Con_label'] == level, 'SEQ_LEN'].value_counts().sort_index()
        # Align by label; lengths outside 6..40 are dropped, absent ones become NaN.
        table[column] = counts.reindex(lengths)
    return pd.DataFrame(table)


SA_MIC = _length_counts_by_mic_level(SA_feature_genome)
EC_MIC = _length_counts_by_mic_level(EC_feature_genome)
PA_MIC = _length_counts_by_mic_level(PA_feature_genome)

In [22]:
# Grouped bars of the SA_AAC table: one bar per MIC class for every entry of
# its 'AAC' column.
fig = go.Figure(data=[
    go.Bar(name=legend, x=SA_AAC['AAC'], y=SA_AAC[column])
    for legend, column in (
        ('Higher MIC', 'Higher'),
        ('Medium MIC', 'Medium'),
        ('Lower MIC', 'Lower'),
    )
])
# Layout: the italic tag in the title is now closed (it was '<i>...<i>') and
# the y-axis caption is spelt 'Proportion'.
fig.update_layout(
    title_text='Between Amino Acid and MIC level proportion on <i>S. aureus ATCC 25923</i>',
    barmode='group',
    xaxis_title='Amino Acid',
    yaxis={'tickformat': '.0%', 'title': 'Proportion'},
)
fig.show()

In [25]:
# Grouped bars of the SA_MIC table: per-length counts of each MIC class,
# divided by a fixed total.
# NOTE(review): 2644 is presumably the number of rows in SA_feature_genome -
# confirm, and derive it from the data instead of hard-coding it.
fig = go.Figure(data=[
    go.Bar(name=legend, x=SA_MIC['Length'], y=SA_MIC[column]/2644)
    for legend, column in (
        ('Higher MIC', 'Higher'),
        ('Medium MIC', 'Medium'),
        ('Lower MIC', 'Lower'),
    )
])
# Layout: the italic tag in the title is now closed (it was '<i>...<i>') and
# the y-axis caption is spelt 'Proportion'.
fig.update_layout(
    title_text='Between length and MIC level proportion on <i>S. aureus ATCC 25923</i>',
    barmode='group',
    xaxis_title='Length of AMP sequences',
    yaxis={'tickformat': '.0%', 'title': 'Proportion'},
)
fig.show()

In [34]:
# Adjust the input features here: the column slice below selects what PCA sees.
pca = PCA(n_components=2)
df = PA_feature_genome
components = pca.fit_transform(df.iloc[:, 5:250])
# Reset the row labels so that the projection (indexed 0..n-1) lines up with the
# source rows when the two tables are placed side by side.
df = pd.concat(
    [
        df.reset_index(drop=True),
        pd.DataFrame(components, columns=['PC1', 'PC2']),
    ],
    axis='columns',
)
fig = px.scatter(
    df,
    x='PC1',
    y='PC2',
    color=df['Con_label'],
    title='PCA for <i> P. aeruginosa ATCC 27853 </i> on iFeatures',
)
fig.show()