In [None]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import plotly.express as px
import csv
import statistics
import umap
import plotly.graph_objs as go
from pandas.api.types import CategoricalDtype

In [None]:
# Sequence class labels for data sets
#
# Reads the tab-separated file "cnames.tsv" (first row = header) and builds:
#   seq_names          - label table in file order
#   seq_names_original - the same table sorted by its integer 'index' column
#   labels             - "ID name" strings ordered by 'index'
#   labels_reordered   - "ID name" strings in file order

# Use a context manager so the file handle is closed once the rows are read;
# newline="" is the mode the csv module documents for reader input.
with open("cnames.tsv", newline="") as names:
    names1 = csv.reader(names, delimiter="\t")
    seq_names = pd.DataFrame(names1)

# Promote the first row to column headers and drop it from the data.
# .copy() makes the result an independent frame, so the column assignments
# below do not operate on a slice of the raw table.
new_header = seq_names.iloc[0]
seq_names = seq_names[1:].copy()
seq_names.columns = new_header

# Combine ID and name into one display label; csv.reader yields strings, so
# 'index' has to be converted before it can be used for numeric sorting.
seq_names['ID'] = seq_names['ID'].astype(str) + " " + seq_names['name'].astype(str)
seq_names['index'] = seq_names['index'].astype(int)

seq_names_original = seq_names.sort_values(by="index")

labels = (seq_names_original["ID"].astype(str)).tolist()
labels_reordered = (seq_names["ID"].astype(str)).tolist()

In [None]:
# Load and Set-up Breast Cancer Dataset
#
# Builds df1: one row per entry of the row-labels file, indexed by its 'name'
# column, holding the remaining label columns followed by the score columns.

# Keep the first 40 score columns; the assignment below requires that `labels`
# has exactly one entry per kept column.
data = pd.DataFrame(np.load("breast_cancer.finemapping.sc_scores.npy")).iloc[:, 0:40]
data.columns = labels
# Reorder the score columns to the file order of the label table.
data = data[labels_reordered]

# Read the row labels inside a context manager so the file handle is closed
# (the original left it open for the lifetime of the kernel).
with open("breast_cancer.finemapping.row_labels.txt", newline="") as row_label_file:
    bc_labels = csv.reader(row_label_file, delimiter="\t")
    bc_labels1 = pd.DataFrame(bc_labels)

# Promote the first row to column headers and drop it from the data.
new_header1 = bc_labels1.iloc[0]
bc_labels1 = bc_labels1[1:]
bc_labels1.columns = new_header1

# reset_index() renumbers the rows from 0 so they line up positionally with
# `data`; it also inserts the old index as the first column, which the
# trailing .iloc[:, 1:] removes again after 'name' has become the index.
bc_labels1 = bc_labels1.reset_index()
df1 = bc_labels1.join(data, how='outer').set_index('name').iloc[:, 1:]
df1.index.name = None



In [None]:
# Get subset of dataframe that matches chosen chromosome
# na=False treats rows with a missing 'chrom' as non-matching, so the boolean
# mask never contains NaN (which boolean indexing rejects).
chrom_mask = df1['chrom'].str.endswith('chr' + str(2), na=False)
# .iloc[:, 1:] drops the first column of the matching rows.
sub_df = df1[chrom_mask].iloc[:, 1:]

# Take average of identical mutations
# Consecutive rows sharing the same index label form one "run"; each run is
# collapsed to the mean of its numeric columns.
row_labels = sub_df.index
name_values = pd.Series(row_labels.to_numpy())
# True at the first row of every run (the first row compares against NaN).
run_starts = (name_values != name_values.shift()).to_numpy()
# numeric_only=True leaves the string label columns out of the average;
# without it recent pandas versions raise TypeError on object columns.
sub_df = (sub_df.groupby(run_starts.cumsum())
                .mean(numeric_only=True)
                .reset_index(drop=True))
# Label each averaged row with the name that opens its run. This equals
# pd.unique(row_labels) when every name forms a single run, and still has the
# right length when a name re-appears later in a separate run.
sub_df.index = name_values[run_starts].to_numpy()

# Rearrange dataframe into three columns: x, y, color
# stack() yields one row per (mutation, profile) pair; reset_index() emits the
# columns in the order mutation, profile, value.
df2 = sub_df.stack().reset_index()
df2.columns = ['mutation', 'profiles', 'vals']
df2 = df2[['mutation', 'vals', 'profiles']]

# Use first 100k datasets
df2_sample = df2.head(int(1e5))

# Get x, y, color
df_xlabel = df2_sample['mutation'].tolist()
df_ylabel = df2_sample['vals'].tolist()
df_color = df2_sample['profiles'].tolist()

In [None]:
# This function shows vis and automatically saves file as html
def scatter_plot_vis(xlabel, ylabel, color, filename="df_scatter.html"):
    """Show a grouped WebGL scatter plot and save it as an HTML file.

    One ``go.Scattergl`` trace is created per distinct value in ``color``;
    each trace is named after the ``str()`` of that value.

    Parameters
    ----------
    xlabel : sequence
        x coordinate of every point.
    ylabel : sequence
        y coordinate of every point.
    color : sequence
        Group label of every point; must be sortable by ``np.unique``.
    filename : str, optional
        Path of the HTML file to write. Defaults to ``"df_scatter.html"``,
        the path the function always used before this parameter existed.

    Returns
    -------
    None
    """
    df = pd.DataFrame({'xlabel': xlabel, 'ylabel': ylabel, 'color': color})

    traces = []
    for uc in np.unique(df['color']):
        # Filter once per group and reuse the result for both axes.
        group = df[df['color'] == uc]
        traces.append(go.Scattergl(
            x=group['xlabel'].tolist(),
            y=group['ylabel'].tolist(),
            marker=dict(
                size=5.0,
                opacity=0.5,
                line=dict(width=0.2),
            ),
            mode='markers',
            name=str(uc),
        ))

    fig = go.Figure(
        data=traces,
        layout=go.Layout(
            title=go.layout.Title(text="Variant effects of chromosome specific mutations"),
            width=1000,
            height=500,
            xaxis=dict(showgrid=False, zeroline=False),
            yaxis=dict(showgrid=False, zeroline=False)
        )
    )

    fig.update_xaxes(title="Mutations")
    fig.update_yaxes(title="Variant Effect Scores")

    fig.show()

    fig.write_html(filename)

In [None]:
# Displays the scatter plot and hands the figure back so the caller can save it under any name
def scatter_plot_vis2(xlabel, ylabel, color):
    """Show a grouped WebGL scatter plot and return the figure.

    Builds one ``go.Scattergl`` trace per distinct value of ``color`` (named
    after ``str()`` of that value), applies the fixed title and axis titles,
    displays the figure, and returns it.

    Parameters
    ----------
    xlabel : sequence
        x coordinate of every point.
    ylabel : sequence
        y coordinate of every point.
    color : sequence
        Group label of every point.

    Returns
    -------
    The ``go.Figure`` that was displayed.
    """
    points = pd.DataFrame({'xlabel': xlabel, 'ylabel': ylabel, 'color': color})

    marker_style = {'size': 5.0, 'opacity': 0.5, 'line': {'width': 0.2}}

    group_traces = []
    for group_key in np.unique(points['color']):
        members = points[points['color'] == group_key]
        group_traces.append(
            go.Scattergl(
                x=members['xlabel'].tolist(),
                y=members['ylabel'].tolist(),
                marker=marker_style,
                mode='markers',
                name=str(group_key),
            )
        )

    axis_style = dict(showgrid=False, zeroline=False)
    plot_layout = go.Layout(
        title=go.layout.Title(text="Variant effects of chromosome specific mutations"),
        width=1000,
        height=500,
        xaxis=dict(axis_style),
        yaxis=dict(axis_style),
    )
    fig = go.Figure(data=group_traces, layout=plot_layout)

    fig.update_xaxes(title="Mutations")
    fig.update_yaxes(title="Variant Effect Scores")

    fig.show()

    return fig

In [None]:
# This function returns fig w/o showing vis, allowing user to choose title, labels, and save file under chosen name
def scatter_plot_vis3(xlabel, ylabel, color, title="title"):
    """Build a grouped WebGL scatter figure without displaying it.

    One ``go.Scattergl`` trace is created per distinct value in ``color``;
    each trace is named after the ``str()`` of that value. No axis titles are
    set and nothing is shown or written; the caller finishes the figure.

    Parameters
    ----------
    xlabel : sequence
        x coordinate of every point.
    ylabel : sequence
        y coordinate of every point.
    color : sequence
        Group label of every point; must be sortable by ``np.unique``.
    title : str, optional
        Figure title. Defaults to the placeholder ``"title"`` that was
        previously hard-coded, so existing three-argument calls are unchanged.

    Returns
    -------
    The constructed ``go.Figure``.
    """
    df = pd.DataFrame({'xlabel': xlabel, 'ylabel': ylabel, 'color': color})

    traces = []
    for uc in np.unique(df['color']):
        # Filter once per group and reuse the result for both axes.
        group = df[df['color'] == uc]
        traces.append(go.Scattergl(
            x=group['xlabel'].tolist(),
            y=group['ylabel'].tolist(),
            marker=dict(
                size=5.0,
                opacity=0.5,
                line=dict(width=0.2),
            ),
            mode='markers',
            name=str(uc),
        ))

    fig = go.Figure(
        data=traces,
        layout=go.Layout(
            title=title,
            width=1000,
            height=500,
            xaxis=dict(showgrid=False, zeroline=False),
            yaxis=dict(showgrid=False, zeroline=False)
        )
    )

    return fig

In [None]:
# Build, show and save one scatter plot per chromosome; i is zero-based, so
# range(1) currently processes chromosome 1 only.
for i in range(1):

    # Get subset of dataframe that matches chosen chromosome
    # na=False treats rows with a missing 'chrom' as non-matching, so the
    # boolean mask never contains NaN (which boolean indexing rejects).
    chrom_mask = df1['chrom'].str.endswith('chr' + str(i+1), na=False)
    # .iloc[:, 1:] drops the first column of the matching rows.
    sub_df = df1[chrom_mask].iloc[:, 1:]

    # Take average of identical mutations
    # Consecutive rows sharing the same index label form one "run"; each run
    # is collapsed to the mean of its numeric columns.
    row_labels = sub_df.index
    name_values = pd.Series(row_labels.to_numpy())
    # True at the first row of every run (the first row compares against NaN).
    run_starts = (name_values != name_values.shift()).to_numpy()
    # numeric_only=True leaves the string label columns out of the average;
    # without it recent pandas versions raise TypeError on object columns.
    sub_df = (sub_df.groupby(run_starts.cumsum())
                    .mean(numeric_only=True)
                    .reset_index(drop=True))
    # Label each averaged row with the name that opens its run. This equals
    # pd.unique(row_labels) when every name forms a single run, and still has
    # the right length when a name re-appears later in a separate run.
    sub_df.index = name_values[run_starts].to_numpy()

    # Rearrange dataframe into three columns: x, y, color
    # stack() yields one row per (mutation, profile) pair; reset_index() emits
    # the columns in the order mutation, profile, value.
    df2 = sub_df.stack().reset_index()
    df2.columns = ['mutation', 'profiles', 'vals']
    df2 = df2[['mutation', 'vals', 'profiles']]

    # Use first 100k datasets
    df2_sample = df2.head(int(1e5))

    # Get x, y, color
    df_xlabel = df2_sample['mutation'].tolist()
    df_ylabel = df2_sample['vals'].tolist()
    df_color = df2_sample['profiles'].tolist()

    fig = scatter_plot_vis3(df_xlabel, df_ylabel, df_color)

    # scatter_plot_vis3 leaves the axis titles unset and uses a placeholder
    # figure title, so they are filled in here.
    fig.update_xaxes(title = "Mutations")
    fig.update_yaxes(title = "Variant Effect Scores")
    fig.update_layout(title = "Variant effects of certain chromosome "  + str(i+1) + " mutations", width = 1000, height = 500)

    fig.show()

    # One output file per chromosome.
    fig.write_html("df_scatter_chr" + str(i+1) + ".html")