In [1]:
import json
import os

import dreem
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

In [17]:
## Plot and save coverage and dms signal for all references

# NOTE: machine-specific absolute path; point it at your own samples directory.
samples_dir = '/Users/alberic/Desktop/Pro/RouskinLab/projects/deep_learning/RNA_data_old/DMS/dataset/Justin/samples/'
sample_names = ['Dragui_1_S6_L001', 'Dragui_2_S7_L001']

# Load each sample's JSON output, closing every file handle once it has been read.
sample_data = []
for sample_name in sample_names:
    json_path = os.path.join(samples_dir, sample_name, 'output', sample_name + '.json')
    with open(json_path, 'r') as json_file:
        sample_data.append(json.load(json_file))

# Create a study object to process the data
study = dreem.draw.study.Study(data=sample_data)

for sample in study.df['sample'].unique():
    for ref in study.get_df(sample=sample)['reference'].unique():

        # One output folder per (sample, reference); exist_ok makes re-runs safe.
        save_dir = os.path.join(samples_dir, sample, 'plots', ref)
        os.makedirs(save_dir, exist_ok=True)

        # Base coverage
        result = study.base_coverage(sample=sample, reference=ref)
        result['fig'].write_image(os.path.join(save_dir, 'base_coverage.png'))

        # Mutation fraction (x-axis tick labels are hidden)
        result = study.mutation_fraction(sample=sample, reference=ref)
        result['fig'].update_layout(xaxis = dict(tickmode = 'array', tickvals=[]))
        result['fig'].write_image(os.path.join(save_dir, 'mutation_fraction.png'))


# Keep only references whose minimum coverage is positive.
# NOTE(review): presumably zero-coverage positions are what produce NaN in sub_rate — confirm.
# The plots above are written before this filter, so they include every reference.
study.df = study.df[study.df['min_cov']>0].reset_index(drop=True)

Turning data into a dataframe...
Dragui_1_S6_L001... Dragui_2_S7_L001... Done.
Setting dataframe...
Done.


In [67]:
# Bar plot of the minimum coverage across all references, sorted

# Only the first sample is plotted; its references are ordered from highest to
# lowest minimum coverage.
first_sample = study.df['sample'].unique()[0]
min_cov_sorted = (
    study.df.loc[study.df['sample'] == first_sample, 'min_cov']
    .sort_values(ascending=False)
)

fig = go.Figure()

fig.add_trace(go.Bar(
    # Size x from the plotted values, not from the full dataframe (which spans
    # all samples), so that x and y always have the same length.
    x=np.arange(len(min_cov_sorted)),
    y=min_cov_sorted.to_numpy(),
    name='min_cov'
))
fig.update_layout(title='Minimum coverage across all references',
                  xaxis_title='Reference number', yaxis_title='Minimum coverage')
fig.update_yaxes(type="log")
fig.show()

In [62]:
## Average coverage and dms signal across all references

streched_dms = []
streched_coverage = []
# Common normalized position grid shared by every reference:
# 0 is the first base of a sequence, 1 is its last base.
streched_bases = np.linspace(0, 1, 1000)

# Resample each per-base signal onto the common grid. Each signal's own
# positions are mapped to [0, 1] so sequences of different lengths are
# stretched to the same axis, rather than being clamped to their last value
# beyond their own length.
for sub_rate, cov in zip(study.df['sub_rate'], study.df['cov']):
    streched_dms.append(
        np.interp(streched_bases, np.linspace(0, 1, len(sub_rate)), sub_rate)
    )
    streched_coverage.append(
        np.interp(streched_bases, np.linspace(0, 1, len(cov)), cov)
    )

def plot_average_std(signals, std_scale=0.3):
    """Plot the position-wise mean of `signals` with a shaded spread band.

    Parameters
    ----------
    signals : sequence of equal-length 1-D sequences
        One signal per row; mean and standard deviation are taken along axis 0.
    std_scale : float, optional
        Factor applied to the standard deviation before drawing the band
        (default 0.3, i.e. the band is mean +/- 0.3 * std, not a full std).

    Returns
    -------
    plotly.graph_objects.Figure
        Figure with three traces: the mean line, the lower band edge, and the
        upper band edge filled down to the lower edge.
    """
    average_signal = np.mean(signals, axis=0)
    std_signal = std_scale * np.std(signals, axis=0)
    positions = np.arange(len(average_signal))

    fig = go.Figure()

    # Mean signal, drawn with the default scatter style.
    fig.add_trace(go.Scatter(
        x=positions,
        y=average_signal,
    ))

    # Lower edge of the band; it must be added before the upper edge because
    # 'tonexty' fills towards the previously added trace.
    fig.add_trace(go.Scatter(
        x=positions,
        y=average_signal - std_signal,
        mode='lines',
        line=dict(color='blue', width=0.5),
        fill=None,
    ))
    # Upper edge of the band, filled down to the lower edge.
    fig.add_trace(go.Scatter(
        x=positions,
        y=average_signal + std_signal,
        mode='lines',
        line=dict(color='blue', width=0.5),
        fill='tonexty'
    ))

    # The caller is responsible for layout tweaks and for showing the figure.
    return fig

In [63]:
# Mean DMS signal across references, with the spread band drawn by plot_average_std.
# update_layout returns the figure itself, so the calls can be chained.
fig = plot_average_std(streched_dms).update_layout(
    title='Average Signal with Standard Deviation',
    xaxis_title='Normalized position',
    yaxis_title='DMS signal',
    showlegend=False,
    height=600,
)
fig.show()

In [68]:
# Mean coverage across references on a log-scaled y axis, with the spread band
# drawn by plot_average_std. update_layout / update_yaxes return the figure, so
# the calls are chained.
# Disabled reference line at a coverage of 3000, kept for convenience:
# fig.add_trace(go.Scatter(x=np.arange(len(streched_coverage[0])), y=3000*np.ones(len(streched_coverage[0])), mode='lines', line=dict(color='red', width=0.5)))
fig = (
    plot_average_std(streched_coverage)
    .update_layout(
        title='Average Coverage with Standard Deviation',
        xaxis_title='Normalized position',
        yaxis_title='coverage',
        showlegend=False
    )
    .update_yaxes(type="log")
)
fig.show()

## Additional analysis

In [18]:
# For every reference, count how many consecutive positions at the start of
# sub_rate are exactly 0: this is the index of the first entry that is not 0,
# or the full length when every entry is 0.
n_zeros_begin = [
    next(
        (position for position, rate in enumerate(sub_rate) if rate != 0),
        len(sub_rate),
    )
    for sub_rate in study.df['sub_rate']
]

In [19]:
# Distribution of the number of leading zero-signal bases per reference.
# A title and axis labels are set so the figure can be read on its own; the
# single-series legend carries no information and is hidden.
px.histogram(n_zeros_begin).update_layout(
    title='Number of leading bases with zero DMS signal',
    xaxis_title='Leading bases with zero DMS signal',
    yaxis_title='Number of references',
    showlegend=False,
)

## Findings

- Most references have good coverage: only 2 references have zero coverage in the first sample, and 3 in the second sample.
- In all references, the first two bases have a DMS signal of 0.
- Coverage and DMS signal are lower at the end of the sequences.