Now that we've mastered some basics, let's load the data we requested from the GUI earlier and try some basic statistics.

In [None]:
import pandas as pd

# Path to the first chunk of the dataset that was requested through the GUI.
dataset_path = '../example_GUI_dataset/requested_data---chunk--0.json'

# Parse the JSON export into a DataFrame and print it for a first look.
df = pd.read_json(dataset_path)
print(df)


In [None]:
# The frame holds a lot of data; list the column names to see what is available.
column_names = df.columns
print(column_names)

In [None]:
# Most of the columns are not needed here, so keep only the ones used below.
# Note that this rebinds `df` to the reduced frame.
columns_to_keep = ['log.tpm', 'z', 'model', 'gene', 'tumor_type']
df = df[columns_to_keep]
print(df)

That's better!

TPM: Transcripts Per Million. The sum of all TPM values is the same in every sample, so a TPM value represents a relative expression level that, in principle, is comparable between samples.
Z-score: Calculated for each gene by subtracting the overall average gene abundance from the raw expression, and dividing that result by the standard deviation (SD) of all of the measured counts across all samples, i.e. $z = (x - \mu) / \sigma$.

Let's try plotting one vs the other!

In [None]:
# Plotly Express provides the plotting functions used below.
# If it is not installed yet, uncomment the next two lines and run them once
# (plotly.express ships as part of the `plotly` package):
#import sys
#!{sys.executable} -m pip install plotly
import plotly.express as px

# Restrict the data to a couple of cancer types. Despite its name, `nsclc_df`
# holds both the Colorectal and the NSCLC rows at this point.
tumor_types = ['Colorectal','NSCLC']
is_selected_type = df['tumor_type'].isin(tumor_types)
nsclc_df = df[is_selected_type]

# Scatter the z-score against log.tpm, coloured by tumor type, with a box plot
# along the x margin, a violin plot along the y margin and an OLS trendline.
fig = px.scatter(
    nsclc_df,
    x=nsclc_df['log.tpm'],
    y=nsclc_df['z'],
    color="tumor_type",
    marginal_x="box",
    marginal_y="violin",
    trendline="ols",
    template="simple_white",
)
fig.show()

In [None]:
# How does the expression of one gene relate to the expression of another?
# Compare TP53 with TP53BP2 across models, using the `nsclc_df` selection
# built in an earlier cell.

# Reshape the long table into a matrix: one row per model, one column per
# gene, holding the mean log.tpm of each model/gene pair.
long_df = nsclc_df.reset_index()
values_df = long_df.pivot_table(
    index="model",
    columns="gene",
    values="log.tpm",
    aggfunc='mean',
)
print(values_df)

# Scatter the two genes against each other with marginal distributions
# and an OLS trendline.
fig = px.scatter(
    values_df,
    x=values_df['TP53'],
    y=values_df['TP53BP2'],
    marginal_x="box",
    marginal_y="violin",
    trendline="ols",
    template="simple_white",
)
fig.show()


In [None]:
import plotly.express as px

# Select just a couple of genes and a couple of tumor types.
# Both conditions are combined into a single boolean mask: filtering `df`
# twice in a row and assigning to the same name would let the second filter
# overwrite the first, silently dropping the gene selection.
genes = ['TP53', 'BRCA1', 'BRCA2']
types = ['Colorectal','NSCLC']
is_selected = df['gene'].isin(genes) & df['tumor_type'].isin(types)
nsclc_df = df[is_selected]
print(nsclc_df)

# One violin per gene, split by tumor type, with an inner box plot and every
# individual point drawn; all columns are exposed in the hover tooltip.
fig = px.violin(nsclc_df, y="log.tpm", x="gene", color="tumor_type", box=True, points="all", hover_data=df.columns)
fig.show()

In [None]:
# Basic heatmap of the z-scores.

# Limit the data to NSCLC.
types = ['NSCLC']
is_nsclc = df['tumor_type'].isin(types)
nsclc_df = df[is_nsclc]

# Reshape the long table into a matrix: one row per model, one column per
# gene, holding the mean z of each model/gene pair.
values_df = nsclc_df.reset_index().pivot_table(
    index="model",
    columns="gene",
    values="z",
    aggfunc='mean',
)

# Axis tick labels taken from the matrix itself.
model_labels = list(values_df.index.values)
gene_labels = list(values_df)

fig = px.imshow(
    values_df,
    labels={"x": "gene", "y": "model", "color": "z score"},
    x=gene_labels,
    y=model_labels,
)
# Put the gene labels above the heatmap rather than below it.
fig.update_xaxes(side="top")
# Fixed-size square figure with a titled colour bar.
fig.update_layout(
    title="NSCLC Expression",
    coloraxis_colorbar=dict(title="Z Score"),
    width=700,
    height=700,
    autosize=False,
)
fig.show()

In [None]:
# We can do better: draw the heatmap with the data clustered and dendrograms added.
# If seaborn is not installed yet, uncomment the next two lines and run them once:
#import sys
#!{sys.executable} -m pip install seaborn
import seaborn as sns

sns.set_theme(color_codes=True)

# `values_df` is the model-by-gene matrix built in an earlier cell.
# NOTE(review): clustering presumably needs a matrix without missing values;
# confirm that every model/gene pair is present in `values_df`.
g = sns.clustermap(values_df)