In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns
import statistics as st

from sklearn.metrics import pairwise_distances
from scipy.spatial import distance_matrix

#from sklearn import cluster
#from sklearn.preprocessing import StandardScaler
%matplotlib inline


$$
\begin{aligned}
&\begin{array}{|c|c|c|}
\hline
\hline \textbf { Constantes} & \textbf {  Nombre } & \textbf { Tipo }  \\
\hline
\text{Genome ids (list)} &  &  \\
\hline
\text{Family ids (list o dict)} &  & \\
\hline
\text{NP ids}  &  &  \\
\hline
\text{Tamaños de los dataframes} & & \\
\end{array}
\end{aligned}
$$

In [3]:
df1 = pd.read_csv("/home/csar/Proyectos/Posdoc/Proyecto_pos/CsarPos/blast_EF_vs_GenomeDB/pscplos17.blast", sep = "\t", names = ['query', 'subject',
           'pc_identity', 'aln_length', 'mismatches', 'gaps_opened',
           'query_start', 'query_end', 'subject_start', 'subject_end',
           'e_value', 'bitscore'], engine = 'python')

In [4]:
#Función para filtrar por bitscore, devuelve un dataframe con las entradas que tengan 
#bitscore >= threshold
def bitscore_filter(data, threshold):
    data.query("bitscore >= @threshold", inplace = True)
    data.reset_index(drop=True, inplace=True)
    return data
    

In [4]:
DF1 = df1[df1["bitscore"]>=100]
DF1.reset_index(drop=True, inplace=True)
DF1["query"][0]

'3PGA_AMINOACIDS|1|Phosphoglycerate_dehydrogenase_1|Cglu'

In [5]:
def extract_GenomeInfo(data):
    genomas_id = []
    genes_id = []
    genomas_nombres = []
    for x in data:
        x_nombre = x.split("|")[-1]
        x_ID = x.split("|")[1]
        x_gi = x_ID.split(".")[0]+"."+ x_ID.split(".")[1]
        x_gen = x_ID.split(".")[2]
        genomas_nombres.append(x_nombre)
        genomas_id.append(x_gi)
        genes_id.append(x_gen)
    
    genome_info = {"genome_name": genomas_nombres, "id_genome": genomas_id, "id_gene": genes_id}
    genome_info = pd.DataFrame(genome_info, columns = ["genome_name", "id_genome", "id_gene"])
    return genome_info


DF1 = pd.concat([DF1,extract_GenomeInfo(DF1["subject"])], axis=1)
DF1

Unnamed: 0,query,subject,pc_identity,aln_length,mismatches,gaps_opened,query_start,query_end,subject_start,subject_end,e_value,bitscore,genome_name,id_genome,id_gene
0,3PGA_AMINOACIDS|1|Phosphoglycerate_dehydrogena...,gi|6666666.146852.1090|6666666.146852|NC_00293...,79.62,530,108,0,1,530,1,530,0.000000e+00,847.0,CorynebacteriumdiphtheriaeNCTC13129s,6666666.146852,1090
1,3PGA_AMINOACIDS|1|Phosphoglycerate_dehydrogena...,gi|749927.13.1727|749927.13|NC_014318.1|D3phos...,59.59,532,213,2,1,530,1,532,0.000000e+00,620.0,AmycolatopsismediterraneiU32s,749927.13,1727
2,3PGA_AMINOACIDS|1|Phosphoglycerate_dehydrogena...,gi|1156913.7.6305|1156913.7|NC_021252.1|D3phos...,59.20,527,214,1,1,526,1,527,0.000000e+00,613.0,AmycolatopsisorientalisHCCB10007s,1156913.7,6305
3,3PGA_AMINOACIDS|1|Phosphoglycerate_dehydrogena...,gi|6666666.104540.4265|6666666.104540|AOHO01|D...,59.20,527,214,1,1,526,1,527,0.000000e+00,612.0,AmycolatopsisdecaplaninaDSM44594s,6666666.104540,4265
4,3PGA_AMINOACIDS|1|Phosphoglycerate_dehydrogena...,gi|446462.15.6083|446462.15|NC_021252.1|D3phos...,60.34,532,209,2,1,530,1,532,0.000000e+00,595.0,ActinosynnemamirumDSM43827NC0130931s,446462.15,6083
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277,3PGA_AMINOACIDS|2|Phosphoserine_aminotransfera...,gi|6666666.146852.774|6666666.146852|NC_002935...,61.33,362,139,1,1,362,11,371,5.000000e-165,471.0,CorynebacteriumdiphtheriaeNCTC13129s,6666666.146852,774
278,3PGA_AMINOACIDS|2|Phosphoserine_aminotransfera...,gi|6666666.146872.438|6666666.146872|NC_017803...,60.06,363,145,0,1,363,12,374,9.000000e-165,471.0,ActinoplanesspSE50110s,6666666.146872,438
279,3PGA_AMINOACIDS|2|Phosphoserine_aminotransfera...,gi|6666666.111572.2478|6666666.111572|NC_00948...,64.64,362,126,2,1,362,302,661,1.000000e-156,460.0,ClavibactermichiganensissubspmichiganensisNCPP...,6666666.111572,2478
280,3PGA_AMINOACIDS|2|Phosphoserine_aminotransfera...,gi|367928.21.407|367928.21|NC_008618.1|Phospho...,57.30,370,151,2,1,363,11,380,6.000000e-150,433.0,BifidobacteriumadolescentisATCC15703s,367928.21,407


**Función para obtener las semillas y familias de las enzimas**

In [6]:
def extract_fam_seed(data):
    fam_list = []
    seed_list = []
    for x in data:
        x_string = x.split("|")
        x_fam = x_string[1]
        x_enz = x_string[2].split("_")[-1]
        #x_value = "F"+x_fam+"S"+x_enz
        fam_list.append(x_fam)
        seed_list.append(x_enz)
    enzime_info = {"fam": fam_list, "seed": seed_list} #Diccionario para crear dataframe
    enzime_info = pd.DataFrame(enzime_info, columns=["fam", "seed"])
    return enzime_info
DF1 = pd.concat([DF1,extract_fam_seed(DF1["query"])], axis=1)
DF1

Unnamed: 0,query,subject,pc_identity,aln_length,mismatches,gaps_opened,query_start,query_end,subject_start,subject_end,e_value,bitscore,genome_name,id_genome,id_gene,fam,seed
0,3PGA_AMINOACIDS|1|Phosphoglycerate_dehydrogena...,gi|6666666.146852.1090|6666666.146852|NC_00293...,79.62,530,108,0,1,530,1,530,0.000000e+00,847.0,CorynebacteriumdiphtheriaeNCTC13129s,6666666.146852,1090,1,1
1,3PGA_AMINOACIDS|1|Phosphoglycerate_dehydrogena...,gi|749927.13.1727|749927.13|NC_014318.1|D3phos...,59.59,532,213,2,1,530,1,532,0.000000e+00,620.0,AmycolatopsismediterraneiU32s,749927.13,1727,1,1
2,3PGA_AMINOACIDS|1|Phosphoglycerate_dehydrogena...,gi|1156913.7.6305|1156913.7|NC_021252.1|D3phos...,59.20,527,214,1,1,526,1,527,0.000000e+00,613.0,AmycolatopsisorientalisHCCB10007s,1156913.7,6305,1,1
3,3PGA_AMINOACIDS|1|Phosphoglycerate_dehydrogena...,gi|6666666.104540.4265|6666666.104540|AOHO01|D...,59.20,527,214,1,1,526,1,527,0.000000e+00,612.0,AmycolatopsisdecaplaninaDSM44594s,6666666.104540,4265,1,1
4,3PGA_AMINOACIDS|1|Phosphoglycerate_dehydrogena...,gi|446462.15.6083|446462.15|NC_021252.1|D3phos...,60.34,532,209,2,1,530,1,532,0.000000e+00,595.0,ActinosynnemamirumDSM43827NC0130931s,446462.15,6083,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277,3PGA_AMINOACIDS|2|Phosphoserine_aminotransfera...,gi|6666666.146852.774|6666666.146852|NC_002935...,61.33,362,139,1,1,362,11,371,5.000000e-165,471.0,CorynebacteriumdiphtheriaeNCTC13129s,6666666.146852,774,2,3
278,3PGA_AMINOACIDS|2|Phosphoserine_aminotransfera...,gi|6666666.146872.438|6666666.146872|NC_017803...,60.06,363,145,0,1,363,12,374,9.000000e-165,471.0,ActinoplanesspSE50110s,6666666.146872,438,2,3
279,3PGA_AMINOACIDS|2|Phosphoserine_aminotransfera...,gi|6666666.111572.2478|6666666.111572|NC_00948...,64.64,362,126,2,1,362,302,661,1.000000e-156,460.0,ClavibactermichiganensissubspmichiganensisNCPP...,6666666.111572,2478,2,3
280,3PGA_AMINOACIDS|2|Phosphoserine_aminotransfera...,gi|367928.21.407|367928.21|NC_008618.1|Phospho...,57.30,370,151,2,1,363,11,380,6.000000e-150,433.0,BifidobacteriumadolescentisATCC15703s,367928.21,407,2,3


In [7]:
DF1[(DF1["fam"]=="1")&(DF1["genome_name"]=="CatenulisporaacidiphilaDSM44928s")]["id_gene"].drop_duplicates()

6      7934
35     6857
36     3876
42      130
47      635
53     6960
59     5905
62     2729
112    5387
Name: id_gene, dtype: object

**Función para obtener las copias dentro de cada genoma**

Argumento: -data -> DataFrame DF1 que contiene solo aquellos genomas con bitscore >=100.
           -fam_id -> número que identifica a la familia
           -GenomeNames -> Lista con los nombres de los genomas.

Salida: Diccionario con los genomas como claves, y el valor de cada clave es una lista con los id
        de las copias de las semillas de la Familia F dentro del genoma da nombre a la clave. 
        

In [8]:
GenomeNames = DF1["genome_name"]
genome_dict = {}
#GenomeNames
for name in GenomeNames:
    genome_dict[name] = []
genome_dict


{'CorynebacteriumdiphtheriaeNCTC13129s': [],
 'AmycolatopsismediterraneiU32s': [],
 'AmycolatopsisorientalisHCCB10007s': [],
 'AmycolatopsisdecaplaninaDSM44594s': [],
 'ActinosynnemamirumDSM43827NC0130931s': [],
 'Allokutzneriaalbatas': [],
 'CatenulisporaacidiphilaDSM44928s': [],
 'ActinoplanesspSE50110s': [],
 'ArthrobacterarilaitensisRE117s': [],
 'ActinoplanesspN902109s': [],
 'ArthrobacterphenanthrenivoransSphe3s': [],
 'ArthrobacterchlorophenolicusA6s': [],
 'ClavibactermichiganensissubspmichiganensisNCPPB382s': [],
 'ArthrobacteraurescensTC1s': [],
 'CitricoccusspCH26s': [],
 'Bifidobacteriumanimalissubsplactiss': [],
 'BifidobacteriumadolescentisATCC15703s': []}

In [9]:

def gene_copies(data, fam_id):   #dataframe DF1 y número de identificación de la familia
    gene_dict = genome_dict
    L = len(data)
    for i in range(L):
        if ((data["fam"][i] == fam_id) & (data["id_gene"][i] not in gene_dict[data["genome_name"][i]])):
            gene_dict[data["genome_name"][i]].append(data["id_gene"][i])
    
    return gene_dict

gene_copies(DF1, "1")
        
            

{'CorynebacteriumdiphtheriaeNCTC13129s': ['1090'],
 'AmycolatopsismediterraneiU32s': ['1727',
  '1890',
  '8523',
  '2787',
  '7471',
  '1713',
  '3265',
  '5511'],
 'AmycolatopsisorientalisHCCB10007s': ['6305',
  '7552',
  '6102',
  '2764',
  '3546',
  '1496',
  '5908'],
 'AmycolatopsisdecaplaninaDSM44594s': ['4265',
  '4027',
  '1533',
  '4526',
  '2782',
  '6732'],
 'ActinosynnemamirumDSM43827NC0130931s': ['6083', '4214', '6094', '2953'],
 'Allokutzneriaalbatas': ['5461', '1464', '3096'],
 'CatenulisporaacidiphilaDSM44928s': ['7934',
  '6857',
  '3876',
  '130',
  '635',
  '6960',
  '5905',
  '2729',
  '5387'],
 'ActinoplanesspSE50110s': ['7282', '1182', '4085'],
 'ArthrobacterarilaitensisRE117s': ['1460', '435'],
 'ActinoplanesspN902109s': ['6869', '1388', '4410', '3493', '2685'],
 'ArthrobacterphenanthrenivoransSphe3s': ['2378', '608'],
 'ArthrobacterchlorophenolicusA6s': ['2320',
  '89',
  '129',
  '719',
  '191',
  '27',
  '3605',
  '34'],
 'Clavibactermichiganensissubspmichigan

In [10]:
DF1[["fam","genome_name","id_gene"]].drop_duplicates().value_counts(["fam", "genome_name", "id_gene"]).head(20)


fam  genome_name                            id_gene
1    ActinoplanesspN902109s                 1388       1
                                            2685       1
     CatenulisporaacidiphilaDSM44928s       6857       1
                                            635        1
                                            5905       1
                                            5387       1
                                            3876       1
                                            2729       1
                                            130        1
     Bifidobacteriumanimalissubsplactiss    829        1
                                            379        1
     BifidobacteriumadolescentisATCC15703s  325        1
                                            1188       1
                                            1038       1
     ArthrobacterphenanthrenivoransSphe3s   608        1
                                            2378       1
     ArthrobacterchlorophenolicusA6s

In [11]:
DF1[["fam","genome_name","id_gene"]].drop_duplicates().value_counts(["fam", "genome_name"]).to_frame("copy_number")


Unnamed: 0_level_0,Unnamed: 1_level_0,copy_number
fam,genome_name,Unnamed: 2_level_1
1,CatenulisporaacidiphilaDSM44928s,9
1,AmycolatopsismediterraneiU32s,8
1,ArthrobacterchlorophenolicusA6s,8
1,AmycolatopsisorientalisHCCB10007s,7
1,AmycolatopsisdecaplaninaDSM44594s,6
1,ActinoplanesspN902109s,5
1,ArthrobacteraurescensTC1s,5
1,ActinosynnemamirumDSM43827NC0130931s,4
1,Allokutzneriaalbatas,3
1,ActinoplanesspSE50110s,3


In [12]:
DF_copies = DF1[["fam","genome_name","id_gene"]].drop_duplicates().value_counts(["fam", "genome_name"]).to_frame("copy_number").reset_index()
DF_copies
#####################Observaciones###########
#DF_copies muestra el número de copias en cada genoma por familia de enzimas (no por semilla).
#Agregar una columna extra con la lista de los id de los genes que 
#corresponden a las copias que se están contando

Unnamed: 0,fam,genome_name,copy_number
0,1,CatenulisporaacidiphilaDSM44928s,9
1,1,AmycolatopsismediterraneiU32s,8
2,1,ArthrobacterchlorophenolicusA6s,8
3,1,AmycolatopsisorientalisHCCB10007s,7
4,1,AmycolatopsisdecaplaninaDSM44594s,6
5,1,ActinoplanesspN902109s,5
6,1,ArthrobacteraurescensTC1s,5
7,1,ActinosynnemamirumDSM43827NC0130931s,4
8,1,Allokutzneriaalbatas,3
9,1,ActinoplanesspSE50110s,3


**Función para obtener media y desviación estandar del número de copias**

Argumento: dataframe con fam, genome_name, copy_number.

Salida: Una lista con los umbrales para determinar a partir de cuántas copias se considera que hay expansión.
Hay que tomar en cuenta que el elemento 0 de la lista corresponde a la familia 1 (los valores están desplazados).

In [13]:
##Se comentó el umbral en media+std (el artículo dice que habría que poner media+2*std). 

## Ya que la salida de la prueba de evo parece solo utilizar la media. En la prueba se 
## eligen genomas que contengan al menos 5 copias, es este umbral el que utilizaremos. 

def exp_threshold(data):
    F = data["fam"].drop_duplicates().value_counts().sum() #Número de familias distintas
    threshold = []
    for i in range(1,F+1): 
        mean = data[data["fam"]==str(i)]["copy_number"].mean() #Cantidad promedio de copias por genoma
        #std = data[data["fam"]==str(i)]["copy_number"].std() #Desviación estandar del número de copias por genoma
        #threshold.append(mean+std)
        threshold.append(mean)
    return threshold
exp_threshold(DF_copies)

[4.176470588235294, 1.0]

**Filtrar DF_copies para que solo muestre aquellos genomas que pasan el umbral. Es decir, aquellos que 
formaran las familias expandidas**

Argumento: DataFrame DF_copies y la lista de threshold

Salida: Lista con los genomas en los que se observó expansión. 

In [14]:
##Corregir, debe dar un diccionario por cada familia.

def exp_genomes(data, thresholds):
    F = data["fam"].drop_duplicates().value_counts().sum() #Número de familias distintas
    fam_list = list(range(1,F+1))
    L = len(data)
    #exp_genomes = []
    exp_genomes_dic = {}
    for f in fam_list:            ##Se puede optimizar con memoization
        for i in range(L):
            if ((data["fam"][i] == str(f)) & (data["copy_number"][i]>thresholds[f-1])):
                #exp_genomes.append(data["genome_name"][i])
                exp_genomes_dic[data["genome_name"][i]] = data["copy_number"][i]
    return exp_genomes_dic

exp_genomes(DF_copies,exp_threshold(DF_copies))
        
            
            
        
    

{'CatenulisporaacidiphilaDSM44928s': 9,
 'AmycolatopsismediterraneiU32s': 8,
 'ArthrobacterchlorophenolicusA6s': 8,
 'AmycolatopsisorientalisHCCB10007s': 7,
 'AmycolatopsisdecaplaninaDSM44594s': 6,
 'ActinoplanesspN902109s': 5,
 'ArthrobacteraurescensTC1s': 5}

In [15]:
DF_gene_copies = DF_copies[['genome_name','copy_number']][DF_copies["fam"]=='1'].reset_index(drop="True")
DF_gene_copies["fam_2"] = DF_copies[DF_copies["fam"]=='2']["copy_number"].reset_index(drop= "True")
DF_gene_copies.columns = ['genome_name', "fam_1", "fam_2"]

In [16]:
DF_gene_copies

Unnamed: 0,genome_name,fam_1,fam_2
0,CatenulisporaacidiphilaDSM44928s,9,1
1,AmycolatopsismediterraneiU32s,8,1
2,ArthrobacterchlorophenolicusA6s,8,1
3,AmycolatopsisorientalisHCCB10007s,7,1
4,AmycolatopsisdecaplaninaDSM44594s,6,1
5,ActinoplanesspN902109s,5,1
6,ArthrobacteraurescensTC1s,5,1
7,ActinosynnemamirumDSM43827NC0130931s,4,1
8,Allokutzneriaalbatas,3,1
9,ActinoplanesspSE50110s,3,1


In [17]:
cmap, norm = mcolors.from_levels_and_colors([1, 4.9, 10], ['white', 'brown'])
ax = sns.heatmap(DF_heat, fmt = " ", cmap=cmap, annot = True,
            cbar= False,linewidths=0.30, linecolor = "black", square = True, norm=norm)
ax.tick_params(labelbottom=False,labeltop=True)
plt.xticks(
    ticks = [0.3,1.3],
    rotation=45, 
    horizontalalignment='left',
    fontweight='light',
    labels = ["Familia 1", "Familia 2"]
    #fontsize='x-large'  
)
#plt.xticks(ticks = [0.5,1.5], labels = ["Familia 1", "Familia 2"], rotation = 45)

NameError: name 'DF_heat' is not defined

In [18]:
s = DF_gene_copies.style
#cell_hover = {  # for row hover use <tr> instead of <td>
#    'selector': 'td:hover',
#    'props': [('background-color', '#ffffb3')]
#}
#cols_names = {
#    'selector': '.col_heading',
#    'props': 'text-align: center; font-weight: bold;'
#}
#headers = {
#    'selector': 'th:not(.index_name)',
#    'props': 'background-color: #000066; color: white;'
#}
#s.set_table_styles([cell_hover, cols_names])

In [19]:
#DF_gene_copies[DF_gene_copies["fam_1"]>=DF_gene_copies["fam_1"].mean()]
TF_fam_1=['true' if (x> DF_gene_copies["fam_1"].mean()) else 'false' for x in DF_gene_copies["fam_1"] ]
TF_fam_2=['true' if (x> DF_gene_copies["fam_2"].mean()) else 'false' for x in DF_gene_copies["fam_2"] ]
TF_gen_name = ['false'for i in range(0,17)]

In [70]:
def style_tips(function, data):
    data_tips = {}
    cols = list(DF_gene_copies.select_dtypes([np.number]).columns)
    if funtion == "mode":
        tips_dic = {"gen_names": ['false'for i in range(0,len(data))],
                    'TF_'+cols[0]: ['true' if (x> DF_gene_copies[cols[0]].) else 'false' for x in DF_gene_copies[cols[0]]],
                   'TF_'+cols[1]: ['true' if (x> DF_gene_copies[cols[1]].apply(function)) else 'false' for x in DF_gene_copies[cols[1]]]
                    }

In [71]:
style_tips(np.mean, DF_gene_copies)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [69]:
cols = DF_gene_copies.select_dtypes([np.number]).columns
list(cols)

['fam_1', 'fam_2']

In [20]:
TF_gen_name

['false',
 'false',
 'false',
 'false',
 'false',
 'false',
 'false',
 'false',
 'false',
 'false',
 'false',
 'false',
 'false',
 'false',
 'false',
 'false',
 'false']

In [21]:

s.set_table_styles([  # create internal CSS classes
    {'selector': '.true', 'props': 'background-color: brown;'},
    {'selector': '.false', 'props': 'background-color: white;'}
], overwrite=False)
cell_color = pd.DataFrame({"genome_name": TF_gen_name, "fam_1": TF_fam_1, "fam_2": TF_fam_2},
                          index=DF_gene_copies.index
                          )
#cell_color
s.set_td_classes(cell_color).hide(axis="index")

genome_name,fam_1,fam_2
CatenulisporaacidiphilaDSM44928s,9,1
AmycolatopsismediterraneiU32s,8,1
ArthrobacterchlorophenolicusA6s,8,1
AmycolatopsisorientalisHCCB10007s,7,1
AmycolatopsisdecaplaninaDSM44594s,6,1
ActinoplanesspN902109s,5,1
ArthrobacteraurescensTC1s,5,1
ActinosynnemamirumDSM43827NC0130931s,4,1
Allokutzneriaalbatas,3,1
ActinoplanesspSE50110s,3,1


In [22]:
Border =['border-black' for x in DF_gene_copies.index]

#Bor_fam_2=['border-black' for x in DF_gene_copies["fam_2"] ]
#Bor_gen_name = ['border-black' for i in range(0,17)]


In [23]:
s.set_table_styles([  # create internal CSS classes
    {'selector': '.border-black', 'props': 'border: 2px solid;'},
    #{'selector': '.border-green', 'props': 'border: 2px dashed green;'},
], overwrite=False)
cell_border = pd.DataFrame({"genome_name":Border, "fam_1": Border, "fam_2":Border },
                          index=DF_gene_copies.index
                          )
s.set_td_classes(cell_border+cell_color)

genome_name,fam_1,fam_2
CatenulisporaacidiphilaDSM44928s,9,1
AmycolatopsismediterraneiU32s,8,1
ArthrobacterchlorophenolicusA6s,8,1
AmycolatopsisorientalisHCCB10007s,7,1
AmycolatopsisdecaplaninaDSM44594s,6,1
ActinoplanesspN902109s,5,1
ArthrobacteraurescensTC1s,5,1
ActinosynnemamirumDSM43827NC0130931s,4,1
Allokutzneriaalbatas,3,1
ActinoplanesspSE50110s,3,1


In [24]:
cell_border

Unnamed: 0,genome_name,fam_1,fam_2
0,border-black,border-black,border-black
1,border-black,border-black,border-black
2,border-black,border-black,border-black
3,border-black,border-black,border-black
4,border-black,border-black,border-black
5,border-black,border-black,border-black
6,border-black,border-black,border-black
7,border-black,border-black,border-black
8,border-black,border-black,border-black
9,border-black,border-black,border-black


In [25]:
def mean_highlighter(x):
    style_lt = "background-color: white; color: black; font-weight: bold; border: solid; text-align: center; font-size:1.2em;"
    style_gt = "background-color: LightGreen; color: black; font-weight: bold; border: solid; text-align:center;"
    gt_mean = x > x.mean()
    return [style_gt if i else style_lt for i in gt_mean]
    

In [47]:
def median_highlighter(x):
    style_lt = "background-color: white; color: black; font-weight: bold; border: solid; text-align: center; font-size:1.2em;"
    style_gt = "background-color: LightGreen; color: black; font-weight: bold; border: solid; text-align:center;"
    gt_median = x > x.mean()
    return [style_gt if i else style_lt for i in gt_median]

In [62]:
DF_gene_copies["fam_1"].mode()
DF_gene_copies.apply()

0    2
Name: fam_1, dtype: int64

In [64]:
def mode_highlighter(x):
    style_lt = "background-color: white; color: black; font-weight: bold; border: solid; text-align: center; font-size:1.2em;"
    style_gt = "background-color: LightGreen; color: black; font-weight: bold; border: solid; text-align:center;"
    gt_mode = x > st.mode(x)
    return [style_gt if i else style_lt for i in gt_mode]

In [38]:
def std_highlighter(x):
    style_lt = "background-color: white; color: black; font-weight: bold; border: solid; text-align: center; font-size:1.2em;"
    style_gt = "background-color: LightGreen; color: black; font-weight: bold; border: solid; text-align:center;"
    gt_std = x > x.mean() + x.std()
    return [style_gt if i else style_lt for i in gt_std]

In [39]:
def genomes_highlighter(x):
    style_genome = "background-color: white; color: black; font-weight: bold; border: solid; text-align: center; font-size:1.2em;"
    return [style_genome for i in x]


In [40]:
#style_header = "background-color: Aqua; color: black; font-weight: bold; border: solid; text-align: center; font-size:1.2em;"
headers = {
    'selector': 'th.col_heading',
    'props': "background-color: OldLace; color: black; font-weight: bold; border: solid; text-align: center; font-size:1.2em;"
}    

headers2 = [{ 
    'selector' : 'th.col_heading',
    'props':'text-align: center; white-space:nowrap;transform: rotate(270deg); height: 100px; position:relative;'
},
{
    "selector" : 'th:nth-child(2)',
    "props" : 'text-align: center; font-weight: bold; border: solid; background-color: OldLace; '
}]

In [41]:
def apply_highlighter(function, style_highlighter, headers_style):
    return DF_gene_copies.style.apply(function,subset=["fam_1", "fam_2"]).apply(style_highlighter, subset="genome_name").hide(axis="index").set_table_styles([headers_style], overwrite = False)
    

In [65]:
apply_highlighter(mode_highlighter, genomes_highlighter, headers)

genome_name,fam_1,fam_2
CatenulisporaacidiphilaDSM44928s,9,1
AmycolatopsismediterraneiU32s,8,1
ArthrobacterchlorophenolicusA6s,8,1
AmycolatopsisorientalisHCCB10007s,7,1
AmycolatopsisdecaplaninaDSM44594s,6,1
ActinoplanesspN902109s,5,1
ArthrobacteraurescensTC1s,5,1
ActinosynnemamirumDSM43827NC0130931s,4,1
Allokutzneriaalbatas,3,1
ActinoplanesspSE50110s,3,1


In [66]:
DF_gene_copies.style.apply(mean_highlighter,subset=["fam_1", "fam_2"]).apply(genomes_highlighter, subset="genome_name").hide(axis="index").set_table_styles([headers], overwrite = False)
#DF_gene_copies.set_index("genome_name").style.apply(mean_highlighter)

genome_name,fam_1,fam_2
CatenulisporaacidiphilaDSM44928s,9,1
AmycolatopsismediterraneiU32s,8,1
ArthrobacterchlorophenolicusA6s,8,1
AmycolatopsisorientalisHCCB10007s,7,1
AmycolatopsisdecaplaninaDSM44594s,6,1
ActinoplanesspN902109s,5,1
ArthrobacteraurescensTC1s,5,1
ActinosynnemamirumDSM43827NC0130931s,4,1
Allokutzneriaalbatas,3,1
ActinoplanesspSE50110s,3,1


In [29]:
Tips_Gen = [ "" if x=="false" else "expanded" for x in TF_gen_name]
Tips_Fam1 = [ "" if DF_gene_copies["fam_1"][x]<= DF_gene_copies["fam_1"].mean() else gene_copies(DF1, "1")[DF_gene_copies["genome_name"][x]] for x in range(0,len(DF_gene_copies["fam_1"]))]
Tips_Fam2 = [ "" if DF_gene_copies["fam_2"][x]<= DF_gene_copies["fam_2"].mean() else gene_copies(DF1,"2")[DF_gene_copies["genome_name"][x]] for x in range(0,len(DF_gene_copies["fam_2"]))]

In [30]:
Tips_Fam1

[['7934', '6857', '3876', '130', '635', '6960', '5905', '2729', '5387'],
 ['1727', '1890', '8523', '2787', '7471', '1713', '3265', '5511'],
 ['2320', '89', '129', '719', '191', '27', '3605', '34'],
 ['6305', '7552', '6102', '2764', '3546', '1496', '5908'],
 ['4265', '4027', '1533', '4526', '2782', '6732'],
 ['6869', '1388', '4410', '3493', '2685'],
 ['2545', '1979', '4252', '67', '4032'],
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '']

In [31]:
Tips =pd.DataFrame(data = {"genome_name":Tips_Gen, "fam_1": Tips_Fam1, "fam_2": Tips_Fam2}, index=DF_gene_copies.index)

In [32]:
tips_props = "visibility:hidden;position:absolute; z-index: 1;transform: translate(10px, 0px);border: 1px solid #000066; border-radius: 0.5em;background-color: yellow; color: blue; font-size: 1.1em;"
DF_gene_copies.style.apply(mean_highlighter,subset=["fam_1", "fam_2"]).set_tooltips(Tips, props= tips_props ).hide(axis="index").set_table_styles([headers], overwrite=False)

genome_name,fam_1,fam_2
CatenulisporaacidiphilaDSM44928s,9,1
AmycolatopsismediterraneiU32s,8,1
ArthrobacterchlorophenolicusA6s,8,1
AmycolatopsisorientalisHCCB10007s,7,1
AmycolatopsisdecaplaninaDSM44594s,6,1
ActinoplanesspN902109s,5,1
ArthrobacteraurescensTC1s,5,1
ActinosynnemamirumDSM43827NC0130931s,4,1
Allokutzneriaalbatas,3,1
ActinoplanesspSE50110s,3,1


In [74]:
#def apply_highlighter(function, style_highlighter, headers_style):
# funtion: mode_highlighter, mean_highlighter, median_highlighter o 
#std_highlighter (std_highlighter considera #copias > media+std)
# 

apply_highlighter(median_highlighter, genomes_highlighter, headers)

genome_name,fam_1,fam_2
CatenulisporaacidiphilaDSM44928s,9,1
AmycolatopsismediterraneiU32s,8,1
ArthrobacterchlorophenolicusA6s,8,1
AmycolatopsisorientalisHCCB10007s,7,1
AmycolatopsisdecaplaninaDSM44594s,6,1
ActinoplanesspN902109s,5,1
ArthrobacteraurescensTC1s,5,1
ActinosynnemamirumDSM43827NC0130931s,4,1
Allokutzneriaalbatas,3,1
ActinoplanesspSE50110s,3,1
