> Investigate whether the pseudo-sequences defined by NetMHCpan/MHCflurry make sense.
> In this notebook we fetch data from the IPD-IMGT/HLA database to get HLA protein sequences.
> We implemented the pseudo-sequences as defined by NetMHCpan, NetMHCpan 2/4, and MHCflurry 2.

## 1. Test whether HLA alleles of the same group have identical pseudo-sequences.

In [1]:
import sys
sys.path.append("..")

In [2]:
from bioscript import IpdImgt
import pandas
import copy

# Client used for every IPD-IMGT lookup in this notebook, and the allele-name
# prefix whose members are inspected first.
ipd_imgt = IpdImgt()
hla_group = "A*02:01"

# Look up the alleles whose name starts with the prefix and walk the result.
hla_alleles = ipd_imgt.get_alleles(starts_with=hla_group)
hla_iterator = ipd_imgt.iter(hla_alleles)

# Each yielded item is deep-copied before it is stored in the list.
hla_all = [copy.deepcopy(allele_record) for allele_record in hla_iterator]

# One row per allele record.
df = pandas.DataFrame(hla_all)
print(df)

    accession            name
0    HLA00005   A*02:01:01:01
1    HLA00006   A*02:01:02:01
2    HLA00966      A*02:01:03
3    HLA01032      A*02:01:04
4    HLA01327      A*02:01:05
..        ...             ...
459  HLA38505     A*02:01:214
460  HLA38634     A*02:01:215
461  HLA38802     A*02:01:216
462  HLA39104  A*02:01:01:250
463  HLA39164  A*02:01:01:249

[464 rows x 2 columns]


In [8]:
from bioscript import PseudoSequence as ps


# Build one pseudo-sequence per accession for each method. The three lists stay
# index-aligned with the rows of `df`; "" marks an allele that was skipped.
MAX_FETCH_ATTEMPTS = 10  # stop retrying an accession after this many failures

pseudo_seq_netmhcpan = list()
pseudo_seq_netmhcpan_4 = list()
pseudo_seq_netmhcpan_4_set = set()
pseudo_seq_mhcflurry_2 = list()
pseudo_seq_mhcflurry_2_set = set()
for accession in list(df.accession):
    # (allele name, P group name, netmhcpan, netmhcpan_4, mhcflurry_2) once converted.
    converted = None
    last_error = None
    for _attempt in range(MAX_FETCH_ATTEMPTS):
        try:
            single_allele = ipd_imgt.get_single_allele(accession)
            # Only confirmed records with a sequence and a WMDA P group are
            # converted; the P group check mirrors hla_to_pseudo_seq and avoids
            # a KeyError that would otherwise be retried forever.
            if (
                "confirmation_status" in single_allele
                and single_allele["confirmation_status"]["confirmed"] is True
                and "sequence" in single_allele
                and "wmda" in single_allele
                and "P_group" in single_allele["wmda"]
            ):
                amino_seq = single_allele["sequence"]["protein"]
                # Compute everything before touching the result lists, so a
                # failure part-way through cannot leave a partial row behind
                # and desynchronise the lists from `df`.
                converted = (
                    single_allele["name"],
                    single_allele["wmda"]["P_group"]["name"],
                    ps.sequence_to_pseudo_sequence(amino_seq, method="netmhcpan"),
                    ps.sequence_to_pseudo_sequence(amino_seq, method="netmhcpan_4"),
                    ps.sequence_to_pseudo_sequence(amino_seq, method="mhcflurry_2"),
                )
            break
        except Exception as error:
            # Remember the failure and try again (bounded by MAX_FETCH_ATTEMPTS).
            last_error = error
    else:
        # Every attempt raised: report it instead of looping forever, and fall
        # through to the placeholder row so the lists keep one entry per row.
        print("Giving up on", accession, "after", MAX_FETCH_ATTEMPTS, "attempts:", repr(last_error))

    if converted is None:
        pseudo_seq_netmhcpan.append("")
        pseudo_seq_netmhcpan_4.append("")
        pseudo_seq_mhcflurry_2.append("")
        continue

    allele_name, p_group_name, ps_netmhcpan, ps_netmhcpan_4, ps_mhcflurry_2 = converted
    pseudo_seq_netmhcpan.append(ps_netmhcpan)
    pseudo_seq_netmhcpan_4.append(ps_netmhcpan_4)
    pseudo_seq_mhcflurry_2.append(ps_mhcflurry_2)

    # Print only the first allele seen for each distinct pseudo-sequence.
    if ps_netmhcpan_4 not in pseudo_seq_netmhcpan_4_set:
        print(allele_name, "\t", "\nnetmhcpan_ps:", ps_netmhcpan_4, "\t", p_group_name)
        pseudo_seq_netmhcpan_4_set.add(ps_netmhcpan_4)
    if ps_mhcflurry_2 not in pseudo_seq_mhcflurry_2_set:
        print(allele_name, "\t", "\nmhcflurry_ps:", ps_mhcflurry_2, "\t", p_group_name)
        pseudo_seq_mhcflurry_2_set.add(ps_mhcflurry_2)

HTTPSConnectionPool(host='www.ebi.ac.uk', port=443): Max retries exceeded with url: /cgi-bin/ipd/api/allele/HLA00005?project=HLA (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:1129)')))
A*02:01:01:01 	 
netmhcpan_ps: YFAMYGEKVAHTHVDTLYVRYHYYTWAVLAYTWY 	 A*02:01P
A*02:01:01:01 	 
mhcflurry_ps: YFAMYGEKVAHTHVDTLYGVRYDHYYTWAVLAYTWYA 	 A*02:01P
A*02:01:03 	 
netmhcpan_ps: RGTIKAHTHVDTLGYQSGDGDKYAAERYTVERNT 	 A*02:01P
A*02:01:03 	 
mhcflurry_ps: RGTIKAHTHVDTLGYQSGMDGDFKYAAERYTVERNTL 	 A*02:01P
A*02:01:23 	 
netmhcpan_ps: QVAHYQSGSTVMYCDSDFGDIAQTLLGER 	 A*02:01P
A*02:01:23 	 
mhcflurry_ps: QVAHYQSGSTVMYCDSDFYGDIKAQTLLGER 	 A*02:01P
A*02:01:43 	 
netmhcpan_ps: QVAHYQSGSTVMYCDSDFGDIAQTLLGERDAHHD 	 A*02:01P
A*02:01:43 	 
mhcflurry_ps: QVAHYQSGSTVMYCDSDFYGDIKAQTLLGERDAHHDQ 	 A*02:01P
HTTPSConnectionPool(host='www.ebi.ac.uk', port=443): Max retries exceeded with url: /cgi-bin/ipd/api/allele/HLA18218?project=HLA (Caused by SSLError(SSLEOFError(8, 'EOF occurred in

In [9]:
# Attach the three pseudo-sequence lists to `df` as columns (one value per row).
# Plain column assignment is used instead of DataFrame.insert so that re-running
# this cell overwrites the columns rather than raising ValueError because the
# column already exists. For the two-column frame built above (accession, name)
# the resulting column order is the same as inserting at positions 2, 3 and 4.
df["pseudo_seq_netmhcpan"] = pseudo_seq_netmhcpan
df["pseudo_seq_netmhcpan_4"] = pseudo_seq_netmhcpan_4
df["pseudo_seq_mhcflurry_2"] = pseudo_seq_mhcflurry_2

In [10]:
# Turn the "" placeholders into missing values, then keep only the rows that
# have a value in every column.
df_blank_as_na = df.replace("", pandas.NA)
df_filtered = df_blank_as_na.dropna()
df_filtered

Unnamed: 0,accession,name,pseudo_seq_netmhcpan,pseudo_seq_netmhcpan_4,pseudo_seq_mhcflurry_2
0,HLA00005,A*02:01:01:01,RLARRSDSQMEAPIEGPWSTRASTDYLESTAATW,YFAMYGEKVAHTHVDTLYVRYHYYTWAVLAYTWY,YFAMYGEKVAHTHVDTLYGVRYDHYYTWAVLAYTWYA
1,HLA00006,A*02:01:02:01,RLARRSDSQMEAPIEGPWSTRASTDYLESTAATW,YFAMYGEKVAHTHVDTLYVRYHYYTWAVLAYTWY,YFAMYGEKVAHTHVDTLYGVRYDHYYTWAVLAYTWYA
2,HLA00966,A*02:01:03,SAYVSMEAPIEGPYWETVLTRRYCLLWAAQTKAE,RGTIKAHTHVDTLGYQSGDGDKYAAERYTVERNT,RGTIKAHTHVDTLGYQSGMDGDFKYAAERYTVERNTL
3,HLA01032,A*02:01:04,RLARRSDSQMEAPIEGPWSTRASTDYLESTAATW,YFAMYGEKVAHTHVDTLYVRYHYYTWAVLAYTWY,YFAMYGEKVAHTHVDTLYGVRYDHYYTWAVLAYTWYA
4,HLA01327,A*02:01:05,RLARRSDSQMEAPIEGPWSTRASTDYLESTAATW,YFAMYGEKVAHTHVDTLYVRYHYYTWAVLAYTWY,YFAMYGEKVAHTHVDTLYGVRYDHYYTWAVLAYTWYA
...,...,...,...,...,...
404,HLA32199,A*02:01:01:207,RLARRSDSQMEAPIEGPWSTRASTDYLESTAATW,YFAMYGEKVAHTHVDTLYVRYHYYTWAVLAYTWY,YFAMYGEKVAHTHVDTLYGVRYDHYYTWAVLAYTWYA
415,HLA32879,A*02:01:01:232,RLARRSDSQMEAPIEGPWSTRASTDYLESTAATW,YFAMYGEKVAHTHVDTLYVRYHYYTWAVLAYTWY,YFAMYGEKVAHTHVDTLYGVRYDHYYTWAVLAYTWYA
418,HLA32920,A*02:01:01:228,RLARRSDSQMEAPIEGPWSTRASTDYLESTAATW,YFAMYGEKVAHTHVDTLYVRYHYYTWAVLAYTWY,YFAMYGEKVAHTHVDTLYGVRYDHYYTWAVLAYTWYA
445,HLA34798,A*02:01:01:237,RLARRSDSQMEAPIEGPWSTRASTDYLESTAATW,YFAMYGEKVAHTHVDTLYVRYHYYTWAVLAYTWY,YFAMYGEKVAHTHVDTLYGVRYDHYYTWAVLAYTWYA


In [11]:
# For each method, count how many alleles share each distinct pseudo-sequence.
for ps_column in ("pseudo_seq_netmhcpan_4", "pseudo_seq_netmhcpan", "pseudo_seq_mhcflurry_2"):
    print(df_filtered.groupby(ps_column).size())

pseudo_seq_netmhcpan_4
QVAHYQSGSTVMYCDSDFGDIAQTLLGER          23
QVAHYQSGSTVMYCDSDFGDIAQTLLGERDAHHD      5
RGTIKAHTHVDTLGYQSGDGDKYAAERYTVERNT      1
YFAMYGEKVAHTHVDTLYVRYHYYTWAVLAYTWY    124
dtype: int64
pseudo_seq_netmhcpan
FTVEWETVKHSHRDLLRYQMGQADKEHARYLCLL     28
RLARRSDSQMEAPIEGPWSTRASTDYLESTAATW    124
SAYVSMEAPIEGPYWETVLTRRYCLLWAAQTKAE      1
dtype: int64
pseudo_seq_mhcflurry_2
QVAHYQSGSTVMYCDSDFYGDIKAQTLLGER           23
QVAHYQSGSTVMYCDSDFYGDIKAQTLLGERDAHHDQ      5
RGTIKAHTHVDTLGYQSGMDGDFKYAAERYTVERNTL      1
YFAMYGEKVAHTHVDTLYGVRYDHYYTWAVLAYTWYA    124
dtype: int64


In [13]:
from bioscript import IpdImgt
from bioscript import PseudoSequence as ps
import pandas
import copy

# Module-level IPD-IMGT client; hla_to_pseudo_seq reads this name as a global.
ipd_imgt = IpdImgt()

def _is_convertible_allele(single_allele):
    """Return True when an allele record has every field the conversion reads."""
    return (
        "confirmation_status" in single_allele
        and single_allele["confirmation_status"]["confirmed"] is True
        and "sequence" in single_allele
        and "wmda" in single_allele
        and "P_group" in single_allele["wmda"]
    )


def hla_to_pseudo_seq(hla_group):
    """Print pseudo-sequence statistics for every allele of one HLA group.

    Looks up the alleles whose name starts with ``hla_group`` through the
    module-level ``ipd_imgt`` client, fetches each allele record, and converts
    its protein sequence with ``ps.sequence_to_pseudo_sequence`` using the
    methods ``"netmhcpan"``, ``"netmhcpan_4"`` and ``"mhcflurry_2"``.

    A record is converted only when it is confirmed, carries a sequence and has
    a WMDA P group; any other record gets empty-string placeholders and is
    dropped from the filtered table.

    Printed output, in order: the allele table; the first allele seen for each
    distinct ``netmhcpan_4`` and ``mhcflurry_2`` pseudo-sequence together with
    its P group name; the filtered table; and the number of alleles per
    distinct pseudo-sequence for each method.

    NOTE(review): errors raised by the client are not retried here; confirm
    whether ``IpdImgt`` retries internally.

    Args:
        hla_group: Allele-name prefix, e.g. ``"A*02:07"``.

    Returns:
        None. All results are printed.
    """
    hla_alleles = ipd_imgt.get_alleles(starts_with=hla_group)
    # Each yielded item is deep-copied before it is stored.
    hla_all = [copy.deepcopy(item) for item in ipd_imgt.iter(hla_alleles)]

    df = pandas.DataFrame(hla_all)
    print(df)
    if df.empty:
        # An empty result builds a frame without an "accession" column, so the
        # loop below would fail with AttributeError; stop with a clear message.
        print("No alleles found for", hla_group)
        return

    # Per-row results, index-aligned with `df`; "" marks a skipped allele.
    pseudo_seq_netmhcpan = list()
    pseudo_seq_netmhcpan_4 = list()
    pseudo_seq_netmhcpan_4_set = set()
    pseudo_seq_mhcflurry_2 = list()
    pseudo_seq_mhcflurry_2_set = set()
    for accession in list(df.accession):
        single_allele = ipd_imgt.get_single_allele(accession)
        if not _is_convertible_allele(single_allele):
            pseudo_seq_netmhcpan.append("")
            pseudo_seq_netmhcpan_4.append("")
            pseudo_seq_mhcflurry_2.append("")
            continue

        amino_seq = single_allele["sequence"]["protein"]
        pseudo_seq_netmhcpan.append(ps.sequence_to_pseudo_sequence(amino_seq, method="netmhcpan"))

        # Print only the first allele seen for each distinct pseudo-sequence.
        ps_netmhcpan_4 = ps.sequence_to_pseudo_sequence(amino_seq, method="netmhcpan_4")
        pseudo_seq_netmhcpan_4.append(ps_netmhcpan_4)
        if ps_netmhcpan_4 not in pseudo_seq_netmhcpan_4_set:
            print(single_allele["name"], "\t", "\nnetmhcpan_ps:", ps_netmhcpan_4, "\t",single_allele["wmda"]["P_group"]["name"])
            pseudo_seq_netmhcpan_4_set.add(ps_netmhcpan_4)

        ps_mhcflurry_2 = ps.sequence_to_pseudo_sequence(amino_seq, method="mhcflurry_2")
        if ps_mhcflurry_2 not in pseudo_seq_mhcflurry_2_set:
            print(single_allele["name"], "\t", "\nmhcflurry_ps:", ps_mhcflurry_2, "\t",single_allele["wmda"]["P_group"]["name"])
            pseudo_seq_mhcflurry_2_set.add(ps_mhcflurry_2)
        pseudo_seq_mhcflurry_2.append(ps_mhcflurry_2)

    df.insert(2, "pseudo_seq_netmhcpan", pseudo_seq_netmhcpan)
    df.insert(3, "pseudo_seq_netmhcpan_4", pseudo_seq_netmhcpan_4)
    df.insert(4, "pseudo_seq_mhcflurry_2", pseudo_seq_mhcflurry_2)

    # Placeholders become missing values so dropna removes the skipped alleles.
    df_filtered = df.replace("", pandas.NA).dropna()
    print(df_filtered)
    print(df_filtered.groupby("pseudo_seq_netmhcpan_4").size())
    print(df_filtered.groupby("pseudo_seq_netmhcpan").size())
    print(df_filtered.groupby("pseudo_seq_mhcflurry_2").size())

In [14]:
# Run the pseudo-sequence comparison for alleles whose names start with "A*02:07".
hla_to_pseudo_seq("A*02:07")

HTTPSConnectionPool(host='www.ebi.ac.uk', port=443): Max retries exceeded with url: /cgi-bin/ipd/api/allele?query=startsWith%28name%2CA%2A02%3A07%29&project=HLA (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:1129)')))
   accession           name
0   HLA00012  A*02:07:01:01
1   HLA07326     A*02:07:02
2   HLA10218     A*02:07:03
3   HLA10811     A*02:07:04
4   HLA10812     A*02:07:05
5   HLA10905     A*02:07:06
6   HLA12413     A*02:07:07
7   HLA13174     A*02:07:08
8   HLA17229     A*02:07:09
9   HLA17578     A*02:07:10
10  HLA17648     A*02:07:11
11  HLA21755     A*02:07:12
12  HLA23210     A*02:07:13
13  HLA25642     A*02:07:14
14  HLA25826  A*02:07:01:02
15  HLA26783     A*02:07:15
16  HLA27707     A*02:07:16
17  HLA28005     A*02:07:17
18  HLA29555     A*02:07:18
19  HLA29578     A*02:07:19
20  HLA30186     A*02:07:20
21  HLA30286  A*02:07:01:03
22  HLA32976  A*02:07:01:06
23  HLA33233  A*02:07:01:05
24  HLA33348     A*02:07:21
25  HLA33357  A*02:

In [None]:
# Run the pseudo-sequence comparison for alleles whose names start with "A*31:01".
hla_to_pseudo_seq("A*31:01")

In [None]:
# Run the pseudo-sequence comparison for alleles whose names start with "B*58:01".
hla_to_pseudo_seq("B*58:01")

In [None]:
# Run the pseudo-sequence comparison for alleles whose names start with "B*15:01".
hla_to_pseudo_seq("B*15:01")

In [None]:
# Run the pseudo-sequence comparison for alleles whose names start with "C*08:01".
hla_to_pseudo_seq("C*08:01")

In [None]:
# Run the pseudo-sequence comparison for alleles whose names start with "C*03:02".
hla_to_pseudo_seq("C*03:02")