In [1]:
import pandas as pd
import plotly.express as px
import plotly.io as io
import pathlib

In [2]:
# Column name -> dtype for the coverage table, listed in file order.
# Note the keys must have the same contents as in add_ids.awk's header_row variable
column_types = {
    "sample": "str",
    "name": "str",
    "chromosome": "str",
    "start": "int",
    "end": "int",
    "coverage": "int",
    "num_bases_at_depth": "int",
    "length": "int",
    "procent_of_coverage_in_region": "float",
}

# header=0 discards the file's own header row; the names above are used instead.
# The explicit cast afterwards pins every column to its intended datatype.
df = pd.read_csv(
    "chrs/chrs_PVAL_65_S1.tumor_deduped.cov.ids.tsv",
    sep="\t",
    names=list(column_types),
    header=0,
).astype(column_types)

In [3]:
# Display the loaded table to eyeball the parsed columns and values.
df

Unnamed: 0,sample,name,chromosome,start,end,coverage,num_bases_at_depth,length,procent_of_coverage_in_region
0,PVAL_65_S1,45801726-45801846,chr1,45801726,45801846,449,1,120,0.008333
1,PVAL_65_S1,45801726-45801846,chr1,45801726,45801846,449,1,120,0.008333
2,PVAL_65_S1,45801726-45801846,chr1,45801726,45801846,451,1,120,0.008333
3,PVAL_65_S1,45801726-45801846,chr1,45801726,45801846,458,1,120,0.008333
4,PVAL_65_S1,45801726-45801846,chr1,45801726,45801846,459,1,120,0.008333
...,...,...,...,...,...,...,...,...,...
36811,PVAL_65_S1,21991921-21992041,chr9,21991921,21992041,361,8,120,0.066667
36812,PVAL_65_S1,21991921-21992041,chr9,21991921,21992041,362,3,120,0.025000
36813,PVAL_65_S1,21991921-21992041,chr9,21991921,21992041,363,1,120,0.008333
36814,PVAL_65_S1,21991921-21992041,chr9,21991921,21992041,364,1,120,0.008333


In [4]:
# Expand the table to one row per base: a row recording N bases at a given depth is
# repeated N times, which makes averages and other statistics easier to calculate.
# https://stackoverflow.com/a/57009491
#
# First collapse the table back to one row per index label. This keeps the cell safe to
# re-run: on an already expanded table the earlier expansion is undone before repeating,
# so the repetitions never compound. On a fresh run it changes nothing.
# Assumes the freshly loaded table has a unique index (the default index from read_csv).
df = df.loc[~df.index.duplicated(keep="first")]
df = df.loc[df.index.repeat(df["num_bases_at_depth"])]

In [5]:
# Write the row-repeated table to disk so the previous step can be double-checked by hand.
# The output directory is created first: to_csv does not create missing directories and
# would otherwise fail when the notebook is run in a fresh working directory.
pathlib.Path("temp").mkdir(parents=True, exist_ok=True)
df.to_csv("temp/chrs_PVAL_65_S1.tumor_deduped_repeated_cov.csv")

In [6]:
# Partition the table by region name so that each region can be summarised on its own
grouped_names = df.groupby(df["name"])
# Distinct region names; these are looped through later on
region = df["name"].unique()
# Will hold one small DataFrame of coverage metrics per region
region_list = list()
# Number of distinct regions
len(region)

534

In [7]:
# Build one single-row summary table per region: the describe() statistics (count, mean,
# std, min, quartiles, max) of the coverage column, with the region name as the row label.
#
# The list is rebuilt from scratch rather than appended to, so re-running this cell
# cannot leave duplicate entries behind. Only the coverage column is described, since
# that is the only column whose statistics are kept.
region_list = [
    grouped_names.get_group(r)['coverage'].describe().to_frame(r).T
    for r in region
]

In [8]:
# Spot-check the summary statistics computed for the first region.
region_list[0]

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
45801726-45801846,120.0,606.783333,76.295113,449.0,565.75,626.5,671.0,702.0


In [9]:
# Spot-check the summary statistics computed for the second region.
region_list[1]

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
45804226-45804346,120.0,529.858333,48.762724,443.0,485.75,546.5,573.0,592.0


In [11]:
# Stack the per-region summaries into a single table, one row per region.
stacked_metrics = pd.concat(region_list, axis=0)

# describe()'s 'count' is the number of rows in the region; it is exposed here under
# the name total_length_of_region.
stacked_metrics = stacked_metrics.rename(columns={'count': 'total_length_of_region'})

# These three statistics are cast to int; the mean, std and quartiles are left as they are.
integer_columns = {
    'total_length_of_region': 'int',
    'max': 'int',
    'min': 'int',
}
regions_metrics = stacked_metrics.astype(integer_columns)

# Give the index a name so it gets a proper column header in the csv.
regions_metrics.index.name = "ID"
regions_metrics.to_csv("temp/chrs_PVAL_65_S1.tumor_deduped_repeated_cov.metrics.csv")
regions_metrics

Unnamed: 0_level_0,total_length_of_region,mean,std,min,25%,50%,75%,max
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
45801726-45801846,120,606.783333,76.295113,449,565.75,626.5,671.00,702
45804226-45804346,120,529.858333,48.762724,443,485.75,546.5,573.00,592
156789941-156790061,120,358.466667,39.340559,269,330.25,372.5,392.25,412
156792441-156792561,120,460.200000,46.336656,335,433.25,479.0,498.25,509
156794941-156795061,120,549.991667,74.733082,387,504.75,561.0,616.25,652
...,...,...,...,...,...,...,...,...
21981921-21982041,120,324.583333,45.268310,229,286.00,336.5,362.00,381
21984421-21984541,120,271.050000,23.516077,209,254.75,276.0,291.00,302
21986921-21987041,120,359.450000,43.407750,277,320.00,368.5,400.00,418
21989421-21989541,120,473.266667,54.965189,363,432.75,483.5,520.50,560


In [13]:
# Bar chart of the mean coverage per region. The other summary statistics are not
# drawn but are listed in the hover tooltip of each bar.
hover_columns = ["total_length_of_region", "mean", 'std', 'min', '25%', '50%', '75%', 'max', "ID"]

# reset_index() turns the "ID" index into a regular column so it can be used as the x axis.
metrics_table = regions_metrics.reset_index()

bar_fig = px.bar(
    metrics_table,
    x='ID',
    y='mean',
    hover_data=hover_columns,
    title="Sample name: PVAL_65_S1",
)

# https://stackoverflow.com/a/59869358
# Write the bar plot to a standalone html page
bar_fig.write_html("plot.html")

#with open("plot.html", 'a') as f:
#    f.write(bar_fig.to_html(full_html=False, 
#    include_plotlyjs='cdn', 
#    config= {'displaylogo': False}))