### _Arabidopsis_ gene sequence expression mapper graph with centroid lens

**Import useful packages / modules**

In [1]:
# import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ML tools
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import MinMaxScaler
from sklearn import ensemble

# For output display
from IPython.display import IFrame

# # If running locally, set current directory as projdir
# projdir = '.'

**For Google Colab**

In [2]:
# # Only if running in Google Colab..!!
# # DO NOT run this cell if running locally - simply comment it out.
# from google.colab import drive
# drive.mount('/content/gdrive/')

# projdir = '/content/gdrive/MyDrive/PlantsAndPython-2021-10-22'
# sys.path.append(projdir)

**Modules from Sourabh's files**

In [3]:
# # import helper_functions
# from helper_functions import loaddata # for data loading
from helper_functions import colorscale_from_matplotlib_cmap # for kmapper color palette

# # import Nicolaou et al. 2011 lense function
# from lenses import fsga_transform

# Kepler Mapper
import kmapper as km

**Import database**

In [9]:
# Sample metadata table (file created 24/Nov/21). Per the author's notes it holds:
#   - tissue types labelled according to the hypothesis-generation group classification
#   - the Euclidean distance of each t-SNE coordinate point to the centroid of its tissue type
metadata = pd.read_csv(filepath_or_buffer="metadata_abovebelowtissues_centroids.csv")

In [10]:
# Gene table (file created 24/Nov/21). Per the author's notes it contains 37,328 genes
# taken from the MapRateFiltered_v1.csv file in the CleanData HPCC directory.
# NOTE(review): the saved output of this cell is a ParserError (line 908 has 37329
# fields where 37328 were expected), so the file or the parsing options need to be
# fixed before `data` can be loaded by this call.
data = pd.read_csv(filepath_or_buffer="37328_genes.csv")

ParserError: Error tokenizing data. C error: Expected 37328 fields in line 908, saw 37329


**Subsetting gene expression columns**

In [8]:
# Collect the column labels of `data`; they are used further down to select the
# columns passed to the mapper (X=df[genes]).
# NOTE(review): every column of `data` is taken as a gene name - confirm the table has
# no non-gene columns. The saved output (1001) was produced at execution count 8,
# before the load cells (counts 9 and 10), so it may not describe 37328_genes.csv.
genes = data.columns.tolist()
len(genes)

1001

**Set factors and factors levels**

In [None]:
# Candidate metadata factors and the level of interest for each one
# (the two lists appear to pair up position by position).
factors = ['Tissue', 'VegetativeRepro', 'AboveBelow', 'Sample Type']
levels = ['Root', 'Root', 'Below', 'knl2 mutant line (flowering buds)']

# Factor/level whose values are shown next to the colouring factor in the tooltips.
# Alternatives that have been tried:
# filter_by_factor, filter_by_level = ('Tissue', 'Root')
# filter_by_factor, filter_by_level = ('VegetativeRepro', 'Root')
# filter_by_factor, filter_by_level = ('AboveBelow', 'Below')
# NOTE(review): 'SampleName' does not appear in `factors` (which lists 'Sample Type');
# confirm which column name the dataframe actually uses.
filter_by_factor = 'SampleName'
filter_by_level = 'knl2 mutant line (flowering buds)'

# Factor/level used to colour the mapper nodes.
# Alternatives that have been tried:
# color_by_factor, color_by_level = ('VegetativeRepro', 'Root')
# color_by_factor, color_by_level = ('AboveBelow', 'Below')
color_by_factor = 'Tissue'
color_by_level = 'Root'

**Initialize a KeplerMapper object**

In [11]:
# Initialize the mapper object. With verbose=1 the saved output of this cell shows
# the line "KeplerMapper(verbose=1)".
mymapper = km.KeplerMapper(verbose=1)

# Define the nerve that is later handed to mymapper.map().
# NOTE(review): min_intersection=1 presumably means two nodes are linked as soon as
# they share one sample - confirm against the kmapper documentation.
nerve = km.GraphNerve(min_intersection=1)

KeplerMapper(verbose=1)


**Define lens**

According to Dan's description: _"take the centroid/median of each tissue cluster, and the lens is the Euclidean distance of each sample to its respective tissue center"_

In [None]:
# Centroid lens: one value per row, read from the "eucl_dist" column (the Euclidean
# distance of a sample to its tissue centroid, per the metadata notes).
# NOTE(review): `df` is not defined in any visible cell of this notebook - confirm
# where it is built (presumably from `metadata` and `data`) before running this cell.
Clens = df.loc[:, "eucl_dist"]
lens_type = 'Centroid'
# plt.plot(Clens) # plot the lens to see how well it represents the data

**Define cover:**

Overlap is given as a percentage and must be between 0 and 100 (the code divides it by 100 before passing it to kmapper). Intervals must be less than 90: try between 25 and 85.

In [None]:
# Cover of the lens: `cubes` intervals with `overlap` percent overlap.
cubes = 100    # number of intervals
overlap = 90   # percent; turned into a fraction for kmapper on the next line
cover = km.cover.Cover(perc_overlap=overlap / 100.0, n_cubes=cubes)

**Define clustering algorithm:**

DBSCAN with default parameters. Metric: correlation distance (1 - correlation) between a pair of gene expression profiles.

In [None]:
# Define clustering algorithm
# DBSCAN is given 'correlation' as its distance metric; no other DBSCAN argument is
# set here, so the library defaults apply to everything else.
# `clust_metric` is kept as a separate name so the metric is set in one place.
clust_metric = 'correlation'
clusterer = DBSCAN(metric=clust_metric)

**Construct the mapper graph:**

Keep an eye on the number of hypercubes, nodes and edges reported by the algorithm. You can change the graph size by changing the cover parameters.

In [None]:
# Create mapper 'graph' with nodes, edges and meta-information.
# Inputs built in earlier cells: the centroid lens (Clens), the cover, the clusterer
# and the nerve. X is the set of `genes` columns of `df`.
# NOTE(review): `df` is not defined in any visible cell of this notebook - confirm
# where it is built before running this cell.
# NOTE(review): per the kmapper docs, precomputed=False means X is raw data rather
# than a distance matrix, and remove_duplicate_nodes=True drops duplicate nodes
# before edges are computed - confirm against the installed kmapper version.
graph = mymapper.map(lens=Clens,
                     X=df[genes],
                     clusterer=clusterer,
                     cover=cover,
                     nerve=nerve,
                     precomputed=False,
                     remove_duplicate_nodes=True)

**Kmapper coloring**

In [None]:
# Node colouring: rows whose color_by_factor value equals color_by_level get 0,
# every other row gets 1. The factor column is stored as a categorical first.

df[color_by_factor] = df[color_by_factor].astype('category')
color_vec = np.asarray([int(val != color_by_level) for val in df[color_by_factor]])
# Colour scale handed to kmapper, built from matplotlib's 'coolwarm' colormap.
cscale = colorscale_from_matplotlib_cmap(plt.get_cmap('coolwarm'))

**Set coloring levels as kmapper tooltips**

In [None]:
# Tooltip text for each row: "(<color_by_factor value>, <filter_by_factor value>)"

temp = [
    '({}, {})'.format(str(color_val), str(tip_val))
    for color_val, tip_val in zip(df[color_by_factor], df[filter_by_factor])
]
df['tooltips'] = temp

**Create and save kmapper graph as html**

In [None]:
# Build the output file name and figure title from the run settings, render the
# mapper graph to HTML, and embed the result in the notebook.
# NOTE(review): `df_name` is not defined in any visible cell of this notebook - it
# must be set (a short label for the dataframe in use) before this cell runs.

# File name for the html output. `overlap` appears here as a percentage.
fname = 'LensType_{}_ColorBy_{}_Tips_{}_Data_{}_Cubes_{}_Overlap_{}_Genes_{}.html'.format(
    lens_type,
    color_by_factor,
    filter_by_factor,
    df_name,
    cubes,
    overlap,
    len(genes))

# Title shown in the html page. `overlap` appears here as a fraction.
figtitle = 'Lens type: {}, Tips {}, Color by {} ({}), Database: {}, intervals {}, overlap {}, genes {}'.format(
    lens_type,
    filter_by_factor,
    # filter_by_level,
    color_by_factor,
    color_by_level,
    df_name,
    cubes,
    overlap/100.0,
    len(genes))

# NOTE(review): hard-coded absolute home directory on the HPCC. The string
# concatenation is valid Python, but the path only exists for that user/machine;
# consider a configurable output directory.
fpath = '/mnt/home/f0103237/' + fname

# Create visualization and save to specified file.
# Tooltips are passed as a NumPy array: kmapper looks them up with positional member
# ids, and a pandas Series with a non-default index would turn that into a
# label-based lookup (wrong rows or a KeyError).
_ = mymapper.visualize(graph,
                       path_html=fpath,
                       title=figtitle,
                       color_values=color_vec,
                       color_function_name=color_by_factor,
                       colorscale=cscale,
                       custom_tooltips=df['tooltips'].to_numpy())

# Load the html output file
# NOTE(review): IFrame `src` is resolved as a URL by the notebook front end; confirm
# that an absolute filesystem path renders in the target Jupyter setup.
IFrame(src=fpath, width=1000, height=800)