In [2]:
import plotly
import pandas as pd
from pathlib import Path
import plotly.express as px
import plotly.io as io

In [74]:
# Summarise the read-depth distribution of every transcript in one sample.
#
# The .hist file is read as header-less, tab-separated text; the column names
# are assigned here.  (Presumably `bedtools coverage -hist` output computed
# against a BED file of CDS exons -- TODO confirm against the upstream pipeline.)
df = pd.read_csv("cov/result_PVAL_71_S7.hist",
                 sep="\t",
                 names=["chrom", "start", "end", "name", "score", "strand","depth","num_bases_at_depth","size_of_feature","pros_of_feature_at_depth"])
# Drop the rows whose first field is the literal "all" (presumably the
# whole-file summary histogram rather than a per-feature row -- TODO confirm).
df = df[df.chrom != "all"].copy()
# Split the feature name into the transcript ID and its '_'-separated details.
# `n` and `expand` are passed by keyword: pandas >= 2.0 rejects positional
# arguments after `pat`, so the former `split(pat, -1, expand=True)` call
# raises TypeError there.  n=-1 (no split limit) is the same value as before.
df[['ID', 'rest']] = df['name'].str.split('_cds_', n=-1, expand=True) # https://stackoverflow.com/a/39358924
# The remainder is expected to hold exactly five '_'-separated fields;
# otherwise this assignment fails with a column-count mismatch.
df[["exon_number", "unknown", "exon_chrom", "exon_start_pos", "exon_strand"]] = df['rest'].str.split('_', n=-1, expand=True)
df = df.drop(["name","rest"], axis = 1)

df = df.astype({
           'chrom':'str',
           'start':'int',
           'end':'int',
           'score':'float',
           'strand':'str',
           'depth':'int',
           'num_bases_at_depth':'int',
           'size_of_feature':'int',
           'pros_of_feature_at_depth':'float',
           'ID':'category',
           "exon_number":'category', 
           "unknown":'category', 
           "exon_chrom":"category", 
           "exon_start_pos":"int", 
           "exon_strand":"category"
          }
         )
# Transcript IDs in order of first appearance; this fixes the row order of
# the summary table built below.
transcripts = df['ID'].unique()

# `ID` is categorical: state `observed=True` explicitly so that only IDs that
# actually occur form groups (get_group results are unchanged, and newer
# pandas no longer warns about the changing default).
grouped = df.groupby("ID", observed=True)

transcripts_list = []

for isof in transcripts:
    c = grouped.get_group(isof)
    # Repeat every histogram row once per base observed at that depth, so the
    # statistics below are taken over bases instead of over histogram bins.
    c = c.loc[c.index.repeat(c.num_bases_at_depth)] # https://stackoverflow.com/a/57009491
    # Key values for the depth column only, as a one-row frame labelled with
    # the transcript ID (describing the single column avoids computing
    # statistics for every other numeric column just to discard them).
    c = c['depth'].describe().to_frame(isof).T
    transcripts_list.append(c)

# Join all key data into one df.  After the per-base expansion, `count` is the
# sum of num_bases_at_depth over the transcript's rows, hence the rename.
all_transcripts = (pd.concat(transcripts_list, axis=0)
                   .rename(columns={'count': 'total_exons_length'})
                   .astype({
                           'total_exons_length':'int',
                           'max':'int',
                           'min':'int'}
                          )                  
                  )
# Called without a path, to_csv returns the CSV text, which the notebook shows
# as this cell's output; nothing is written to disk here.
all_transcripts.to_csv()


In [7]:
# For every *.hist file in cov/: write a per-transcript depth box plot, a bar
# plot of mean depth, and a CSV of per-transcript depth statistics.
sample_covs = Path('cov/')

# Create the output folders up front so that a fresh run does not fail on the
# first write with a missing-directory error.
for sub_dir in ("box", "bar", "metrics"):
    Path("sample_coverages/", sub_dir).mkdir(parents=True, exist_ok=True)

# Loop-invariant schema of the header-less, tab-separated .hist files.
# (Presumably `bedtools coverage -hist` output -- TODO confirm upstream.)
hist_columns = ["chrom", "start", "end", "name", "score", "strand","depth","num_bases_at_depth","size_of_feature","pros_of_feature_at_depth"]
hist_dtypes = {
    'chrom':'str',
    'start':'int',
    'end':'int',
    'score':'float',
    'strand':'str',
    'depth':'int',
    'num_bases_at_depth':'int',
    'size_of_feature':'int',
    'pros_of_feature_at_depth':'float',
    'ID':'category',
    "exon_number":'category',
    "unknown":'category',
    "exon_chrom":"category",
    "exon_start_pos":"int",
    "exon_strand":"category"
}

for path in sorted(sample_covs.glob("*.hist")):
    # Sample name = the text following "results_" in the file stem.  Stems
    # without that marker used to raise IndexError; fall back to the singular
    # "result_" marker and finally to the whole stem instead.
    stem = path.stem
    if "results_" in stem:
        sample_name = stem.split("results_")[1]
    elif "result_" in stem:
        sample_name = stem.split("result_")[1]
    else:
        sample_name = stem
    out_path = "sample_coverages/"
    df = pd.read_csv(path,
                     sep="\t",
                     names=hist_columns)
    # Drop the rows whose first field is the literal "all" (presumably the
    # whole-file summary histogram -- TODO confirm).
    df = df[df.chrom != "all"].copy()
    # Split the feature name into the transcript ID and its '_'-separated
    # details.  `n` and `expand` are passed by keyword: pandas >= 2.0 rejects
    # positional arguments after `pat` (n=-1 is the same value as before).
    df[['ID', 'rest']] = df['name'].str.split('_cds_', n=-1, expand=True) # https://stackoverflow.com/a/39358924
    # The remainder is expected to hold exactly five '_'-separated fields.
    df[["exon_number", "unknown", "exon_chrom", "exon_start_pos", "exon_strand"]] = df['rest'].str.split('_', n=-1, expand=True)
    df = df.drop(["name","rest"], axis = 1)

    df = df.astype(hist_dtypes)
    # Repeat every histogram row once per base observed at that depth, so the
    # plot and the statistics below are taken over bases, not histogram bins.
    # NOTE: this yields one row per base and can be large for big samples.
    df = df.loc[df.index.repeat(df.num_bases_at_depth)] # https://stackoverflow.com/a/57009491
    box_fig = px.box(df, x="ID", y="depth")
    box_fig.write_html(out_path + "box/" + sample_name + "_box" + ".html")
    # Alternative view, currently disabled:
    #vio_fig = px.violin(df, x="ID", y="depth", box=True)

    # Extract a list of all unique NCBI ID:s (order of first appearance; this
    # fixes the row order of the metrics table).
    transcripts = df['ID'].unique()

    # `ID` is categorical: state `observed=True` explicitly so that only IDs
    # that actually occur form groups (get_group results are unchanged).
    grouped = df.groupby("ID", observed=True)

    transcripts_list = []

    for isof in transcripts:
        c = grouped.get_group(isof)
        # Rows were already expanded per base for the whole frame above, so no
        # per-group repeat is needed here.
        # Key values for the depth column only, as a one-row frame labelled
        # with the transcript ID.
        c = c['depth'].describe().to_frame(isof).T
        transcripts_list.append(c)

    # Join all key data into one df.  After the per-base expansion, `count` is
    # the number of bases over the transcript's rows, hence the rename.
    all_transcripts = (pd.concat(transcripts_list, axis=0)
                       .rename(columns={'count': 'total_exons_length'})
                       .astype({
                               'total_exons_length':'int',
                               'max':'int',
                               'min':'int'}
                              )
                      )
    all_transcripts.index.name = "NCBI_ID"
    bar_fig = px.bar(all_transcripts.reset_index(), x='NCBI_ID', y='mean')
    bar_fig.write_html(out_path + "bar/" + sample_name + "_bar" + ".html")
    out_path = Path(out_path + "metrics/" + sample_name + ".csv")
    all_transcripts.to_csv(out_path)