# Exploration des données de HLA Epitope Registry

#### Librairies

In [1]:
import ipywidgets as widgets
from ipywidgets import interact
import requests
from bs4 import BeautifulSoup
import pandas as pd
from mlxtend.frequent_patterns import apriori
import re
from IPython.display import display, HTML
from ipymol import viewer as pymol


#### Fonctions

In [8]:
def _collect_new_alleles(cell_text, ordered, seen) :
    """Append to *ordered* every allele of *cell_text* not met before (tracked in *seen*)."""
    for allele in cell_text.split() :
        allele = allele.replace(',', '')
        if allele not in seen :
            seen.add(allele)
            ordered.append(allele)


def import_data_from_HLA_Epitope_Registry(database) :
    """Gets the contents of one database of HLA Epitope Registry.

    Parameters
    ----------
    database : str
        The database name ("ABC", "DRB", "DQ"/"DQB+DQA", "DP"/"DPB+DPA")

    Returns
    -------
    (entries, list_of_luminex_alleles, list_of_all_alleles) : (dict of str: list, list of str, list of str)
        entries[eplet] = [ellipro_score, polymorphic_residues, antibody_reactivity, luminex_alleles, all_alleles]
        antibody_reactivity is 0 or 1; when it is 1 the eplet key gets a "c" suffix.
        luminex_alleles and all_alleles are the comma-separated texts read from the page.
        The two returned lists hold each allele once, in order of first appearance.

    Raises
    ------
    requests.HTTPError
        If the registry answers with an HTTP error status.
    requests.Timeout
        If the registry does not answer in time.
    """

    # The widget labels "DQB+DQA" / "DPB+DPA" correspond to the "DQ" / "DP" pages.
    if database == "DQB+DQA" :
        database = "DQ"
    elif database == "DPB+DPA" :
        database = "DP"

    url = "https://www.epregistry.com.br/index/databases/database/"+database+"/"
    # Without a timeout requests can wait forever; without the status check an
    # error page would only surface later as an AttributeError on None.
    page = requests.get(url, timeout=60)
    page.raise_for_status()

    soup = BeautifulSoup(page.content, "html.parser")

    entries = {}
    list_of_luminex_alleles = []
    list_of_all_alleles = []
    # Companion sets give O(1) "already seen?" checks while the lists keep the order.
    seen_luminex_alleles = set()
    seen_all_alleles = set()

    table_rows = soup.find(id="table-result").find("tbody").find_all("tr", recursive=False)
    for row in table_rows :
        cells = row.find_all("td", recursive=False)
        eplet = cells[0].text.strip()
        ellipro_score = cells[1].text.strip()
        polymorphic_residues = cells[2].text.strip()
        if cells[4].text.strip() == '' :
            antibody_reactivity = 0
        else :
            antibody_reactivity = 1
            eplet = eplet+"c"   # mark eplets whose antibody-reactivity cell is filled in
        # The allele lists are read from the modal dialogs embedded in cells 6 and 7.
        luminex_alleles = cells[6].find("div", {"class":"modal-body"}).text.strip()
        all_alleles = cells[7].find("div", {"class":"modal-body"}).text.strip()
        entries[eplet] = [ellipro_score, polymorphic_residues, antibody_reactivity, luminex_alleles, all_alleles]

        _collect_new_alleles(luminex_alleles, list_of_luminex_alleles, seen_luminex_alleles)
        _collect_new_alleles(all_alleles, list_of_all_alleles, seen_all_alleles)

    return (entries, list_of_luminex_alleles, list_of_all_alleles)

def create_matrices(entries, list_of_luminex_alleles, list_of_all_alleles) :
    """Creates 4 DataFrames from the output of import_data_from_HLA_Epitope_Registry().

    Outputs the matrices created by the combinations of confirmed eplets / all eplets and Luminex alleles / all alleles.

    Parameters
    ----------
    entries : dict of str: list
        entries[eplet] = [ellipro_score, polymorphic_residues, antibody_reactivity, luminex_alleles, all_alleles],
        the last two being comma-separated strings of allele names.
    list_of_luminex_alleles : list of str
        Alleles used as columns of the "Luminex" matrices (sorted here).
    list_of_all_alleles : list of str
        Alleles used as columns of the "all alleles" matrices (sorted here).

    Returns
    -------
    (df_all_eplets_vs_Luminex_alleles, df_all_eplets_vs_all_alleles, df_confirmed_eplets_vs_Luminex_alleles, df_confirmed_eplets_vs_all_alleles) :
        tuple of 4 pandas DataFrames
        Binary matrices (filled with zeroes and ones). Eplets as rows, alleles as columns.
    """
    sorted_list_of_luminex_alleles = sorted(list_of_luminex_alleles)
    sorted_list_of_all_alleles = sorted(list_of_all_alleles)

    Luminex_alleles_vs_all_eplets = {}
    Luminex_alleles_vs_confirmed_eplets = {}
    all_alleles_vs_all_eplets = {}
    all_alleles_vs_confirmed_eplets = {}

    for eplet, fields in entries.items() :
        # Sets make each "is this allele carried by the eplet?" check O(1).
        luminex_alleles_of_eplet = {x.strip() for x in fields[3].split(',')}
        all_alleles_of_eplet = {x.strip() for x in fields[4].split(',')}

        luminex_row = [1 if a in luminex_alleles_of_eplet else 0 for a in sorted_list_of_luminex_alleles]
        all_row = [1 if a in all_alleles_of_eplet else 0 for a in sorted_list_of_all_alleles]

        Luminex_alleles_vs_all_eplets[eplet] = luminex_row
        all_alleles_vs_all_eplets[eplet] = all_row

        if fields[2] : # if antibody reactivity
            # A confirmed eplet has the same rows; it is simply listed a second time.
            Luminex_alleles_vs_confirmed_eplets[eplet] = luminex_row
            all_alleles_vs_confirmed_eplets[eplet] = all_row

    # The dicts are keyed by eplet (one column each), hence the transpose to get eplets as rows.
    df_all_eplets_vs_Luminex_alleles = pd.DataFrame(Luminex_alleles_vs_all_eplets, index=sorted_list_of_luminex_alleles).transpose()
    df_all_eplets_vs_all_alleles = pd.DataFrame(all_alleles_vs_all_eplets, index=sorted_list_of_all_alleles).transpose()
    df_confirmed_eplets_vs_Luminex_alleles = pd.DataFrame(Luminex_alleles_vs_confirmed_eplets, index=sorted_list_of_luminex_alleles).transpose()
    df_confirmed_eplets_vs_all_alleles = pd.DataFrame(all_alleles_vs_confirmed_eplets, index=sorted_list_of_all_alleles).transpose()

    return (df_all_eplets_vs_Luminex_alleles, df_all_eplets_vs_all_alleles, df_confirmed_eplets_vs_Luminex_alleles, df_confirmed_eplets_vs_all_alleles)


def split_A_B(df_eplets_vs_alleles) :
    """Split an eplets x alleles matrix into its 'A' and 'B' column groups.

    Columns are selected by name: those containing 'A' go to the first
    DataFrame, those containing 'B' to the second. In each result, rows
    that hold only zeroes are dropped.

    Returns
    -------
    (A, B) : tuple of 2 pandas DataFrames
    """
    per_chain = []
    for letter in ('A', 'B') :
        chain_columns = df_eplets_vs_alleles.filter(regex=letter)
        has_nonzero = (chain_columns != 0).any(axis=1)   # True for rows with at least one non-zero cell
        per_chain.append(chain_columns.loc[has_nonzero])
    return (per_chain[0], per_chain[1])


def get_that_fucking_list(df) :
    """Turn a binary DataFrame into a list of 'baskets'.

    Returns one list per row of *df*, holding the names of the columns
    whose cell equals 1 in that row (in column order).
    """
    column_names = df.columns.values
    return [
        [name for name, cell in zip(column_names, row) if cell == 1]
        for row in df.to_numpy()
    ]

def get_transactions(list_of_transactions_names, list_of_transactions, itemset) :
    """List the names of the transactions that contain every item of *itemset*.

    The i-th name belongs to the i-th transaction. Each matching name is
    rendered with str() and followed by ", ", so a non-empty result ends
    with a trailing ", ".
    """
    matching_names = [
        str(name) + ", "
        for position, name in enumerate(list_of_transactions_names)
        if itemset.issubset(frozenset(list_of_transactions[position]))
    ]
    return "".join(matching_names)

def to_resi_list(eplets_list) :
    """Extract every run of digits from *eplets_list* and join them with '+'.

    The extracted numbers are also printed. Returns an empty string when
    the text contains no digits.
    """
    residue_numbers = re.findall(r'\d+', eplets_list)
    print(residue_numbers)
    return "+".join(str(number) for number in residue_numbers)

In [10]:
# --- Input widgets; all of them start without a selection (value=None),
# --- except the minimum support, which starts at 50. ---

# Which registry database to load.
database = widgets.Dropdown(
    options=["ABC", "DRB", "DQB+DQA", "DPB+DPA"],
    value=None,
    description='Database:',
    disabled=False,
)

# Which chain to analyse.
chain = widgets.RadioButtons(
    options=['B', 'A'],
    value=None,
    description='Chaîne :',
    disabled=False,
)

# Orientation of the matrix handed to the itemset search.
direction = widgets.ToggleButtons(
    options=['Eplets x Allèles', 'Allèles x Eplets'],
    value=None,
    disabled=False,
)

# Confirmed eplets only, or all of them.
eplets = widgets.ToggleButtons(
    options=['Eplets confirmés', 'Tous les éplets'],
    value=None,
    disabled=False,
)

# Luminex alleles only, or all of them.
alleles = widgets.ToggleButtons(
    options=['Allèles Luminex', 'Toutes les allèles'],
    value=None,
    disabled=False,
)

# Minimum support, entered as a percentage.
min_support = widgets.IntText(
    value=50,
    description='Support minimum (en pourcentage) : ',
    disabled=False,
)

# Button that starts the frequent-itemset search.
get_itemsets_button = widgets.Button(
    description='Obtenir les itemsets fréquents',
    disabled=False,
    icon='check',
)


def database_eventhandler(*args):
    """Load the database picked in the dropdown and show a summary in output1.

    Downloads the selected database, stores the four matrices returned by
    create_matrices() in the global `matrixes`, prints the eplet / allele
    counts, then shows the chain selector (DQ / DP databases) or the
    itemset controls (other databases).
    """
    global matrixes

    output1.clear_output()
    output2.clear_output()

    with output1 :
        print("Téléchargement des données depuis HLA Epitope Registry ... \n")
    (entries, list_of_luminex_alleles, list_of_all_alleles) = import_data_from_HLA_Epitope_Registry(database.value)
    # matrixes = (all eplets x Luminex alleles, all eplets x all alleles,
    #             confirmed eplets x Luminex alleles, confirmed eplets x all alleles)
    matrixes = create_matrices(entries, list_of_luminex_alleles, list_of_all_alleles)

    output1.clear_output()
    with output1 :
        # The Luminex allele count must come from a Luminex matrix (index 0);
        # index 3 has all alleles as columns and would repeat the total.
        print( str(len(matrixes[0]))+" éplets (dont "+str(len(matrixes[2]))+ \
          " confirmés) parmi "+ str(len(matrixes[1].columns))+" allèles (dont "+ \
          str(len(matrixes[0].columns))+" allèles Luminex) \n" )

    if database.value in ["DQB+DQA", "DPB+DPA"] :
        # These databases mix two chains: report each one and ask which to analyse.
        (df_A, df_B) = split_A_B(matrixes[1])
        (df_A_c, df_B_c) = split_A_B(matrixes[3])
        with output1 :
            print("Chaîne B : "+ str(len(df_B)) +" éplets (dont "+ str(len(df_B_c))+" confirmés) - Chaîne A : "+ str(len(df_A)) +" éplets (dont "+ str(len(df_A_c))+" confirmés)" )

            display(chain)

    else:
        accordion()
            
def chain_eventhandler(*args) :
    """Observer for the chain selector: shows the itemset controls."""
    accordion()

def accordion(*args) :
    """Display the itemset-search controls, one after the other."""
    controls = (direction, eplets, alleles, min_support, get_itemsets_button)
    for control in controls :
        display(control)

def on_change(change):
    """Colour in PyMOL the residues of the itemset whose table cell was clicked.

    The new value of the observed text widget is expected to be the CSS class
    of the clicked cell, three space-separated words whose second one names
    the row (e.g. "data row3 col1"). Anything else is ignored.
    """
    cls = change['new'].split(' ')
    with output1 :
        print("Here we are.")
        print(cls)
    if len(cls) != 3 :
        return

    place, txtrow, txtcol = cls
    row_match = re.fullmatch(r'row(\d+)', txtrow)
    if row_match is None :
        # Header cells carry "level0" here, not a row number: nothing to colour.
        return

    # The row number is a position in the table as displayed, i.e. sorted by
    # decreasing count; apply the same sort before looking the row up,
    # otherwise the number would be read as a label of the unsorted frame.
    displayed = freq_itemsets.sort_values('count', ascending=False)
    itemset = displayed['itemsets'].iloc[int(row_match.group(1))]

    print(itemset)

    try:
        to_color = "chain B and resi "+str(to_resi_list(str(itemset)))
        print(to_color)
        pymol.color('cyan', to_color)

    except ValueError:
        print("Error")
    
def _select_matrix() :
    """Return the matrix of `matrixes` matching the eplets / alleles / chain widgets."""
    # Layout of `matrixes`:
    #   0: all eplets x Luminex alleles        1: all eplets x all alleles
    #   2: confirmed eplets x Luminex alleles  3: confirmed eplets x all alleles
    position = 2 if eplets.value == 'Eplets confirmés' else 0
    if alleles.value != 'Allèles Luminex' :
        position += 1
    selected = matrixes[position]

    if database.value in ["DQB+DQA", "DPB+DPA"] :
        (A, B) = split_A_B(selected)
        selected = A if chain.value == 'A' else B
    return selected


def get_itemsets_eventhandler(*args) :
    """Compute the frequent itemsets for the current selections and display them.

    Reads the direction / eplets / alleles / min_support widgets, runs apriori
    on the matching matrix, stores the result in the global `freq_itemsets`
    (columns: support, itemsets, count, transactions), displays it as a
    clickable styled table and starts PyMOL with structure 4D8P.
    If one of the selections is still missing, only a message is printed.
    """
    global freq_itemsets

    if direction.value is not None and eplets.value is not None and alleles.value is not None and min_support.value is not None :
        df = _select_matrix()
        if direction.value == 'Allèles x Eplets' :
            df = df.transpose()

        # min_support is entered as a percentage; apriori expects a fraction.
        freq_itemsets = apriori(df, min_support=min_support.value/100, use_colnames=True)
        # Derive the count from the exact support BEFORE rounding it for display;
        # a support rounded to 2 decimals gives wrong counts past 100 transactions.
        freq_itemsets['count'] = freq_itemsets['support'].apply(lambda x : round(len(df)*x))
        freq_itemsets['support'] = freq_itemsets['support'].apply(lambda x : round(x, 2))

        dataset = get_that_fucking_list(df)
        freq_itemsets['transactions'] = freq_itemsets['itemsets'].apply(lambda x : get_transactions(df.index.values, dataset, x))

        # JavaScript: hides the text input whose placeholder is 'undefined' and,
        # on every click on a cell of table #T_table, copies the cell's class
        # name into that input and fires a 'change' event on it.
        script = """
        <script>
        var input
        var xpath = "//input[contains(@placeholder,'undefined')]";

        function addHandlers() {
            input = document.evaluate(xpath, document, null, 
                XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
            input.setAttribute("hidden","");

            var table = document.querySelector("#T_table");
            var headcells = [].slice.call(table.getElementsByTagName("th"));
            var datacells = [].slice.call(table.getElementsByTagName("td"));
            var cells = headcells.concat(datacells);
            for (var i=0; i < cells.length; i++) {
               var createClickHandler = function(cell) {
                 return function() { 
                    input = document.evaluate(xpath, document, null,
                        XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
                    input.value = cell.className; 
                    var event = new Event('change', { bubbles: true });
                    input.dispatchEvent(event);
              }}
              cells[i].onclick = createClickHandler(cells[i]);
            };
        }

        window.onload = setTimeout(addHandlers, 500);
        </script>
        """
        display(HTML(script))

        styled = freq_itemsets[['count', 'itemsets', 'transactions']].sort_values('count', ascending=False) \
            .style.format({"itemsets": lambda x : list(x)})
        # Styler.hide_index() no longer exists in pandas 2.0; hide(axis="index")
        # replaces it. Fall back to the old name on versions without hide().
        if hasattr(styled, "hide") :
            styled = styled.hide(axis="index")
        else :
            styled = styled.hide_index()
        styled = styled.background_gradient(cmap='Blues', subset='count') \
            .set_uuid('table')   # uuid 'table' -> element id "T_table" looked up by the script

        # Text widget the script writes into; on_change() reacts to its value.
        status = widgets.Text(placeholder='undefined',layout={'font-size':'6px'})
        status.observe(on_change,names=['value'])

        table = widgets.Output()
        with table:
            display(styled)

        a = widgets.VBox([status])

        display(table)
        display(a)

        try:
            pymol.start()   # Start PyMOL RPC server
            pymol._add_methods()
            pymol.fetch('4D8P') # Fetch PDB

        except ValueError:
            print("Error")

    else :
        with output1 :
            print("So. Here we are.")
            
            
# Output areas written to and cleared by the event handlers.
output1 = widgets.Output()
output2 = widgets.Output() # tabs

# Hook the handlers to the widgets.
database.observe(database_eventhandler, names='value')
chain.observe(chain_eventhandler, names='value')
get_itemsets_button.on_click(get_itemsets_eventhandler)

# Entry point of the interface: the database dropdown, then the first output area.
display(database)
display(output1)

Dropdown(description='Database:', options=('ABC', 'DRB', 'DQB+DQA', 'DPB+DPA'), value=None)

Output()

In [None]:
        # Draft cell: an indented fragment that is not wrapped in any function here.
        # It picks one of the four matrices from the widget values, optionally
        # transposes it, then runs apriori on it.
        # NOTE(review): `t` (used below to transpose) is not defined anywhere in the
        # visible source -- confirm what was intended before running this cell.
        if eplets.value == 'Eplets confirmés' :
            if alleles.value == 'Allèles Luminex' :
                if database.value in ["DQB+DQA", "DPB+DPA"] :
                    (A, B) = split_A_B(matrixes[2])
                    if chain.value == 'A' :
                        df = A
                    else :
                        df = B
                else :
                    df = matrixes[2]
            else :
                if database.value in ["DQB+DQA", "DPB+DPA"] :
                    (A, B) = split_A_B(matrixes[3])
                    if chain.value == 'A' :
                        df = A
                    else :
                        df = B
                else :
                    df = matrixes[3]
        else :
            if alleles.value == 'Allèles Luminex' :
                if database.value in ["DQB+DQA", "DPB+DPA"] :
                    (A, B) = split_A_B(matrixes[0])
                    if chain.value == 'A' :
                        df = A
                    else :
                        df = B
                else :
                    df = matrixes[0]
            else :
                if database.value in ["DQB+DQA", "DPB+DPA"] :
                    (A, B) = split_A_B(matrixes[1])
                    if chain.value == 'A' :
                        df = A
                    else :
                        df = B
                else :
                    df = matrixes[1]
        if direction.value == 'Allèles x Eplets' :
            df = t(df)
        
        # min_support is divided by 100 before being handed to apriori.
        freq_itemsets = apriori(df, min_support=min_support.value/100, use_colnames=True)
        freq_itemsets['support'] = freq_itemsets['support'].apply(lambda x : round(x, 2))
        # NOTE(review): 'count' is derived from the support already rounded to 2 decimals.
        freq_itemsets['count'] = freq_itemsets['support'].apply(lambda x : round(len(df)*x))

        first_tab_contents = widgets.Output()
        with first_tab_contents :
        
            # NOTE(review): the Styler built below is neither assigned nor passed to display().
            freq_itemsets[['count', 'itemsets']].sort_values('count', ascending=False) \
            .style.format({"itemsets": lambda x : list(x)}) \
            .hide_index() \
            .background_gradient(cmap='Blues', subset='count') 



In [None]:
# Unfinished draft: this handler does not parse as written -- two `elif` branches
# have no body and the last condition is incomplete (`== ,` followed by a bare
# list of labels).
# NOTE(review): `table.value` and the `df_*` names are not defined at module level
# in the visible source; presumably each label was meant to select one of the
# matrices returned by create_matrices() -- TODO confirm before completing.
def table_eventhandler(*args):
        if table.value == 'Eplets confirmés x Allèles Luminex' :
            matrix = df_confirmed_eplets_vs_Luminex_alleles
        elif table.value == 'Eplets confirmés x Toutes les allèles' :
            matrix = df_confirmed_eplets_vs_all_alleles
        elif table.value == 'Tous les éplets x Allèles Luminex' :
        
        elif table.value == 'Tous les éplets x Toutes les allèles' :
            
        elif table.value == , 'Allèles Luminex x Eplets confirmés', 'Toutes les allèles x Eplets confirmés', \
             'Allèles Luminex x Tous les éplets', 'Toutes les allèles x Tous les éplets'

In [None]:
# Start PyMOL through ipymol and load the structure used for the colouring.
# The former dangling `try: import pymol` (no except/finally) was a syntax
# error, and the name it bound was replaced by the import below anyway.
from ipymol import viewer as pymol

pymol.start()   # Start PyMOL RPC server
pymol._add_methods()
pymol.fetch('4D8P') # Fetch PDB
pymol.show_as('cartoon') # Show as cartoon
#pymol.bg_color('white') # Set background color to white




In [None]:
# Restyle the loaded structure, then highlight a hand-picked set of residues.
# The calls act on PyMOL's current state, so their order matters.
pymol.show_as('cartoon')
pymol.set('cartoon_oval_length', 2.5)
pymol.color('red')  # no selection given -- presumably applies to everything loaded; confirm
pymol.color('green', 'chain B and resi 182+46+55+52+45')
# Labels matching the residue numbers selected above: 182S, 46VY, 55R, 52PQ, 45GV
#pymol.util.cbc(first_color=29, legacy=1)