# Toxin heatmap

This notebook generates an interactive heatmap of the presence/absence of toxin outliers using [`plotly`](https://plotly.com/python/).

---

## 0. Install [`arcadia_pycolor`](https://github.com/Arcadia-Science/arcadia-pycolor)
This package distributes Arcadia's colors for use in Python. See more info [here](https://github.com/Arcadia-Science/arcadia-pycolor).

In [None]:
!pip install git+https://github.com/Arcadia-Science/arcadia-pycolor.git#egg=arcadia_pycolor --upgrade

----
## 1. Import necessary packages for plotting

In [None]:
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
import numpy as np
import arcadia_pycolor as apc

---
## 2. Load and process data

In [None]:
# load the outlier table (tab-separated)
# it is a presence/absence matrix: one row per species, one column per UniProt ID,
# plus one extra row ('Toxin_desc') and one extra column ('Species_Cat') holding the category labels
df = pd.read_csv('Matrix_Outliers.tsv', sep='\t')

# two-column species / species-category table
# the 'Toxin_desc' row is left out because it describes the toxins, not a species
species_cat = df.loc[df['Species'] != 'Toxin_desc', ['Species', 'Species_Cat']]

# plotting order of the categories
cat_order = [
    'Snake',
    'Blood worm',
    'Cone snail',
    'Wasp and ant',
    'Centipede',
    'Scorpion',
    'Spider',
    'Tick',
]
# numeric code for each category, counting from 1 in the same order as cat_order
# the codes let individual heatmap cells be colored by the category of their species
cat_vals = np.arange(1, len(cat_order) + 1)

# color list: a light gray first, followed by one Arcadia accent color per category
colors = ['#efefef'] + apc.Palettes['arcadia:AccentAllOrdered'].list[:len(cat_order)]

# builds a discrete (hard-edged) Plotly colorscale with one equal-width bin per color in the list
# in this case, one bin for the background color plus one per category
def make_discrete_color_map(color_list):
    """Build a discrete Plotly colorscale with one equal-width bin per color.

    Plotly colorscales are lists of ``[position, color]`` pairs with positions
    running from 0 to 1. Repeating a position with two different colors creates
    a hard boundary instead of a gradient, so every color is emitted twice:
    once at the lower edge of its bin and once at the upper edge.

    Args:
        color_list: Sequence of colors (anything Plotly accepts as a color),
            ordered from the lowest bin to the highest.

    Returns:
        A list of ``[position, color]`` pairs of length ``2 * len(color_list)``.
        The first position is exactly 0.0 and the last is exactly 1.0.

    Raises:
        ValueError: If ``color_list`` is empty.
    """
    n_colors = len(color_list)
    if n_colors == 0:
        raise ValueError('color_list must contain at least one color')

    color_map = []
    for i, color in enumerate(color_list):
        # compute the edges as i / n rather than accumulating a float step:
        # stepping by 1 / n can overshoot or undershoot the number of bins
        # because of rounding, which would leave bins without a matching color
        color_map.append([i / n_colors, color])
        color_map.append([(i + 1) / n_colors, color])

    return color_map

# build the discrete colorscale from the color list
color_map = make_discrete_color_map(colors)

# lookup from category name to its numeric code
cat_val_dict = dict(zip(cat_order, cat_vals))

# re-order the species table so species are grouped by category, following cat_order
# (species whose category is not listed in cat_order are left out)
# all slices are concatenated in a single call: growing a DataFrame by repeated pd.concat
# re-copies the data on every iteration and depends on how pandas treats the empty seed frame
species_cat_sorted = pd.concat(
    [species_cat[species_cat['Species_Cat'] == cat] for cat in cat_order]
)

# lookup from species name to its category
species_cat_dict = dict(zip(species_cat_sorted['Species'], species_cat_sorted['Species_Cat']))

#######################################################
# Build a lookup of toxin (protein) descriptions keyed by UniProt ID.
# toxin_desc_dict supplies the "Protein Category" line of the heatmap hover text.

# the 'Toxin_desc' row holds one description per UniProt ID column
# the values and the index are named explicitly instead of renaming the default
# 'index' / 0 labels, which only matched when 'Toxin_desc' was the first row of the file
toxin_desc = (
    df[df['Species'] == 'Toxin_desc']
    .drop(columns=['Species', 'Species_Cat'])
    .iloc[0]
    .rename('description')
    .rename_axis('UniProtID')
    .reset_index()
)

# make a dictionary out of that
toxin_desc_dict = dict(zip(toxin_desc['UniProtID'].values, toxin_desc['description'].values))

#######################################################

# keep only the species rows, index them by species name and drop the category column
# the frame is built by chaining on the filtered result rather than modifying it in place,
# which avoids pandas' SettingWithCopyWarning
matrix = (
    df[df['Species'] != 'Toxin_desc']
    .set_index('Species')
    .drop(columns=['Species_Cat'])
)

# transpose so species are on the X axis and UniProt IDs are on the Y axis,
# then order the species columns by category
matrix = matrix.T[species_cat_sorted['Species'].values]

# convert every cell to an integer and multiply each species column by the numeric code
# of that species' category, so that a non-zero cell carries its category code and
# a zero cell stays 0 (assumes the matrix holds 0/1 presence/absence values)
category_codes = [cat_val_dict[species_cat_dict[species]] for species in matrix.columns]
matrix = matrix.astype(int).mul(category_codes, axis='columns')

display(matrix)

---
## 3. Generate the plot

In [None]:
# build the hover text as a list of lists with the same layout as the heatmap:
# one inner list per row (UniProt ID), one entry per column (species)
hovertext = [
    [
        '<b>Taxon:</b> {}<br><b>Species:</b> {}<br><br><b>Protein Category:</b> {}<br><b>UniProt ID:</b> {}'.format(
            species_cat_dict[species], species, toxin_desc_dict[uniprot_id], uniprot_id
        )
        for species in matrix.columns
    ]
    for uniprot_id in matrix.index
]

# make an empty figure
fig1 = go.Figure()

# the colorscale has one bin for cells with value 0 plus one bin per category (codes 1..len(cat_order))
n_color_bins = len(cat_order) + 1
z_max = len(cat_order)
# colorbar labels sit at the center of each equal-width color bin;
# computing them keeps the labels aligned if cat_order changes
colorbar_tickvals = [(i + 0.5) * z_max / n_color_bins for i in range(n_color_bins)]

# add the heatmap to the figure
fig1.add_trace(go.Heatmap(z = matrix, x = matrix.columns, y = matrix.index,
                          colorscale = color_map, # this controls the species - color assignments
                          zmin = 0, # pin the color range so each category code always maps to its own color,
                          zmax = z_max, # even if the highest category has no non-zero cell in the data
                          xgap = 1, # these two values control the amount of space between cells; space was added so we can see the grid
                          ygap = 1,
                          hoverinfo = 'text', # sets the hover-over info to whatever is passed to the "text" argument below
                          text = hovertext, # pulls in the hovertext generated for each cell
                          colorbar = dict(
                              title = dict(text = "<b>Taxon</b>", side = "top"), # nested form; the flat "titleside" attribute is a legacy name
                              tickmode = "array",
                              tickvals = colorbar_tickvals,
                              ticktext = ['Not found'] + cat_order,
                              thickness = 24, # controls the width of the colorbar
                              len = 0.15, # controls the length of the colorbar as a fraction of the total heatmap y length
                              tickfont_size = 12, # sets font of labels to the right of colorbar
                              y = 1, # sets the y position of the top of the color bar...
                              yanchor = 'top', # because we set yanchor to top
                              xpad = 3, # controls space around the legend; this is the minimum value to avoid cutting off the "Wasp and ant" text
                              ypad = 0
                          )
                         ))

# set the size of the figure and make the margins really small
fig1.update_layout(
    width = 865, height = 1550, font_size = 8,
    margin=dict(l=5, r=5, t=5, b=5)
)

# the font sizes here are the maximum possible for this set of dimensions
fig1.update_yaxes(tickfont_size = 9.5)
fig1.update_xaxes(tickangle = -90, tickfont_size = 9.5, side = 'top')

# make the font conform with Arcadia's standards
fig1.update_layout(font=dict(family="Arial"))

# this controls what buttons are in the hover-over menu; most don't make sense for this plot type so I removed them
plot_config = {'modeBarButtonsToRemove': ['zoom', 'pan', 'zoomIn', 'zoomOut', 'autoscale']}

# save the figure
fig1.write_html('Fig4_outliers_toxin_heatmap_interactive.html',
                config=plot_config)

# show the figure using the same plot config parameters as used for saving
fig1.show(config=plot_config)