> Query whether the pseudo-sequence designed by NetMHCpan/MHCflurry makes sense.
> In this notebook we fetch data from IPD database to get HLA protein sequences.
> We implemented the pseudo-sequence defined by NetMHCpan, NetMHCpan2/4, MHCflurry2 respectively.

## 1. Test if HLA alleles of the same group have identical pseudo-sequence.

In [1]:
import sys
sys.path.append("..")

In [2]:
from bioscript import IpdImgt
import pandas
import copy

ipd_imgt = IpdImgt()

hla_group = "A*02:01"

hla_alleles = ipd_imgt.get_alleles(starts_with=hla_group)
hla_iterator = ipd_imgt.iter(hla_alleles)
hla_all = list()
for item in hla_iterator:
    hla_all.append(copy.deepcopy(item))

df = pandas.DataFrame(hla_all)
print(df)

   accession           name
0   HLA00012  A*02:07:01:01
1   HLA07326     A*02:07:02
2   HLA10218     A*02:07:03
3   HLA10811     A*02:07:04
4   HLA10812     A*02:07:05
5   HLA10905     A*02:07:06
6   HLA12413     A*02:07:07
7   HLA13174     A*02:07:08
8   HLA17229     A*02:07:09
9   HLA17578     A*02:07:10
10  HLA17648     A*02:07:11
11  HLA21755     A*02:07:12
12  HLA23210     A*02:07:13
13  HLA25642     A*02:07:14
14  HLA25826  A*02:07:01:02
15  HLA26783     A*02:07:15
16  HLA27707     A*02:07:16
17  HLA28005     A*02:07:17
18  HLA29555     A*02:07:18
19  HLA29578     A*02:07:19
20  HLA30186     A*02:07:20
21  HLA30286  A*02:07:01:03
22  HLA32976  A*02:07:01:06
23  HLA33233  A*02:07:01:05
24  HLA33348     A*02:07:21
25  HLA33357  A*02:07:01:04
26  HLA35637     A*02:07:22
27  HLA36701     A*02:07:23


In [3]:
from bioscript import PseudoSequence as ps


# Get a list of accession
pseudo_seq_netmhcpan = list()
pseudo_seq_netmhcpan_4 = list()
pseudo_seq_netmhcpan_4_set = set()
pseudo_seq_mhcflurry_2 = list()
pseudo_seq_mhcflurry_2_set = set()
for accession in list(df.accession):
    single_allele = ipd_imgt.get_single_allele(accession)
    if "confirmation_status" in single_allele and single_allele["confirmation_status"]["confirmed"] is True and "sequence" in single_allele:
        amino_seq = single_allele["sequence"]["protein"]
        pseudo_seq_netmhcpan.append(ps.sequence_to_pseudo_sequence(amino_seq, method="netmhcpan"))
        ps_netmhcpan_4 = ps.sequence_to_pseudo_sequence(amino_seq, method="netmhcpan_4")
        pseudo_seq_netmhcpan_4.append(ps_netmhcpan_4)
        if ps_netmhcpan_4 not in pseudo_seq_netmhcpan_4_set:
            print(single_allele["name"], "\t", "\nnetmhcpan_ps:", ps_netmhcpan_4, "\t",single_allele["wmda"]["P_group"]["name"])
            pseudo_seq_netmhcpan_4_set.add(ps_netmhcpan_4)
        ps_mhcflurry_2 = ps.sequence_to_pseudo_sequence(amino_seq, method="mhcflurry_2")
        if ps_mhcflurry_2 not in pseudo_seq_mhcflurry_2_set:
            print(single_allele["name"], "\t", "\nmhcflurry_ps:", ps_mhcflurry_2, "\t",single_allele["wmda"]["P_group"]["name"])
            pseudo_seq_mhcflurry_2_set.add(ps_mhcflurry_2)
        pseudo_seq_mhcflurry_2.append(ps_mhcflurry_2)
    else:
        pseudo_seq_netmhcpan.append("")
        pseudo_seq_netmhcpan_4.append("")
        pseudo_seq_mhcflurry_2.append("")

A*02:07:01:01 	 
netmhcpan_ps: YFAMYGEKVAHTHVDTLYVRCHYYTWAVLAYTWY 	 A*02:07P
A*02:07:01:01 	 
mhcflurry_ps: YFAMYGEKVAHTHVDTLYGVRCDHYYTWAVLAYTWYA 	 A*02:07P


In [4]:
df.insert(2, "pseudo_seq_netmhcpan", pseudo_seq_netmhcpan)
df.insert(3, "pseudo_seq_netmhcpan_4", pseudo_seq_netmhcpan_4)
df.insert(4, "pseudo_seq_mhcflurry_2", pseudo_seq_mhcflurry_2)

In [5]:
df_filtered = df.replace("", pandas.NA).dropna()
df_filtered

Unnamed: 0,accession,name,pseudo_seq_netmhcpan,pseudo_seq_netmhcpan_4,pseudo_seq_mhcflurry_2
0,HLA00012,A*02:07:01:01,RLARRSDSQMEAPIEGPWSTRASTDYLESTAATW,YFAMYGEKVAHTHVDTLYVRCHYYTWAVLAYTWY,YFAMYGEKVAHTHVDTLYGVRCDHYYTWAVLAYTWYA
19,HLA29578,A*02:07:19,RLARRSDSQMEAPIEGPWSTRASTDYLESTAATW,YFAMYGEKVAHTHVDTLYVRCHYYTWAVLAYTWY,YFAMYGEKVAHTHVDTLYGVRCDHYYTWAVLAYTWYA


In [6]:
print(df_filtered.groupby("pseudo_seq_netmhcpan_4").size())
print(df_filtered.groupby("pseudo_seq_netmhcpan").size())
print(df_filtered.groupby("pseudo_seq_mhcflurry_2").size())

pseudo_seq_netmhcpan_4
YFAMYGEKVAHTHVDTLYVRCHYYTWAVLAYTWY    2
dtype: int64
pseudo_seq_netmhcpan
RLARRSDSQMEAPIEGPWSTRASTDYLESTAATW    2
dtype: int64
pseudo_seq_mhcflurry_2
YFAMYGEKVAHTHVDTLYGVRCDHYYTWAVLAYTWYA    2
dtype: int64


In [4]:
from bioscript import IpdImgt
from bioscript import PseudoSequence as ps
import pandas
import copy

ipd_imgt = IpdImgt()

def hla_to_pseudo_seq(hla_group):
    hla_alleles = ipd_imgt.get_alleles(starts_with=hla_group)
    hla_iterator = ipd_imgt.iter(hla_alleles)
    hla_all = list()
    for item in hla_iterator:
        hla_all.append(copy.deepcopy(item))
    
    df = pandas.DataFrame(hla_all)
    print(df)

    # Get a list of accession
    pseudo_seq_netmhcpan = list()
    pseudo_seq_netmhcpan_4 = list()
    pseudo_seq_netmhcpan_4_set = set()
    pseudo_seq_mhcflurry_2 = list()
    pseudo_seq_mhcflurry_2_set = set()
    for accession in list(df.accession):
        single_allele = ipd_imgt.get_single_allele(accession)
        if "confirmation_status" in single_allele and single_allele["confirmation_status"]["confirmed"] is True and "sequence" in single_allele:
            if "wmda" in single_allele:
                if "P_group" in single_allele["wmda"]:
                    amino_seq = single_allele["sequence"]["protein"]
                    pseudo_seq_netmhcpan.append(ps.sequence_to_pseudo_sequence(amino_seq, method="netmhcpan"))
                    ps_netmhcpan_4 = ps.sequence_to_pseudo_sequence(amino_seq, method="netmhcpan_4")
                    pseudo_seq_netmhcpan_4.append(ps_netmhcpan_4)
                    if ps_netmhcpan_4 not in pseudo_seq_netmhcpan_4_set:
                        print(single_allele["name"], "\t", "\nnetmhcpan_ps:", ps_netmhcpan_4, "\t",single_allele["wmda"]["P_group"]["name"])
                        pseudo_seq_netmhcpan_4_set.add(ps_netmhcpan_4)
                    ps_mhcflurry_2 = ps.sequence_to_pseudo_sequence(amino_seq, method="mhcflurry_2")
                    if ps_mhcflurry_2 not in pseudo_seq_mhcflurry_2_set:
                        print(single_allele["name"], "\t", "\nmhcflurry_ps:", ps_mhcflurry_2, "\t",single_allele["wmda"]["P_group"]["name"])
                        pseudo_seq_mhcflurry_2_set.add(ps_mhcflurry_2)
                    pseudo_seq_mhcflurry_2.append(ps_mhcflurry_2)
                else:
                    pseudo_seq_netmhcpan.append("")
                    pseudo_seq_netmhcpan_4.append("")
                    pseudo_seq_mhcflurry_2.append("")
            else:
                pseudo_seq_netmhcpan.append("")
                pseudo_seq_netmhcpan_4.append("")
                pseudo_seq_mhcflurry_2.append("")
        else:
            pseudo_seq_netmhcpan.append("")
            pseudo_seq_netmhcpan_4.append("")
            pseudo_seq_mhcflurry_2.append("")

    df.insert(2, "pseudo_seq_netmhcpan", pseudo_seq_netmhcpan)
    df.insert(3, "pseudo_seq_netmhcpan_4", pseudo_seq_netmhcpan_4)
    df.insert(4, "pseudo_seq_mhcflurry_2", pseudo_seq_mhcflurry_2)

    df_filtered = df.replace("", pandas.NA).dropna()

    print(df_filtered.groupby("pseudo_seq_netmhcpan_4").size())
    print(df_filtered.groupby("pseudo_seq_netmhcpan").size())
    print(df_filtered.groupby("pseudo_seq_mhcflurry_2").size())

In [3]:
hla_to_pseudo_seq("A*02:07")
hla_to_pseudo_seq("A*31:01")
hla_to_pseudo_seq("B*58:01")

   accession           name
0   HLA00012  A*02:07:01:01
1   HLA07326     A*02:07:02
2   HLA10218     A*02:07:03
3   HLA10811     A*02:07:04
4   HLA10812     A*02:07:05
5   HLA10905     A*02:07:06
6   HLA12413     A*02:07:07
7   HLA13174     A*02:07:08
8   HLA17229     A*02:07:09
9   HLA17578     A*02:07:10
10  HLA17648     A*02:07:11
11  HLA21755     A*02:07:12
12  HLA23210     A*02:07:13
13  HLA25642     A*02:07:14
14  HLA25826  A*02:07:01:02
15  HLA26783     A*02:07:15
16  HLA27707     A*02:07:16
17  HLA28005     A*02:07:17
18  HLA29555     A*02:07:18
19  HLA29578     A*02:07:19
20  HLA30186     A*02:07:20
21  HLA30286  A*02:07:01:03
22  HLA32976  A*02:07:01:06
23  HLA33233  A*02:07:01:05
24  HLA33348     A*02:07:21
25  HLA33357  A*02:07:01:04
26  HLA35637     A*02:07:22
27  HLA36701     A*02:07:23
A*02:07:01:01 	 
netmhcpan_ps: YFAMYGEKVAHTHVDTLYVRCHYYTWAVLAYTWY 	 A*02:07P
A*02:07:01:01 	 
mhcflurry_ps: YFAMYGEKVAHTHVDTLYGVRCDHYYTWAVLAYTWYA 	 A*02:07P
pseudo_seq_netmhcpan_4
YFAMYGEK

ValueError: Sequence contains invalid characters

In [5]:
hla_to_pseudo_seq("B*15:01")
hla_to_pseudo_seq("C*03:02")

    accession            name
0    HLA00162   B*15:01:01:01
1    HLA00163  B*15:01:01:02N
2    HLA00164      B*15:01:02
3    HLA00978      B*15:01:03
4    HLA01373      B*15:01:04
..        ...             ...
143  HLA37571      B*15:01:81
144  HLA37886      B*15:01:82
145  HLA38696      B*15:01:83
146  HLA39094      B*15:01:84
147  HLA39217   B*15:01:01:66

[148 rows x 2 columns]
B*15:01:01:01 	 
netmhcpan_ps: YYAMYREISTNTYESNLYLRYDSYTWAEWAYLWY 	 B*15:01P
B*15:01:01:01 	 
mhcflurry_ps: YYAMYREISTNTYESNLYGLRYDDSYTWAEWAYLWYA 	 B*15:01P
B*15:01:09 	 
netmhcpan_ps: QVANYQSGSTLMYCDPDLGDIAQTLLGER 	 B*15:01P
B*15:01:09 	 
mhcflurry_ps: QVANYQSGSTLMYCDPDLSGDINAQTLLGER 	 B*15:01P
B*15:01:11 	 
netmhcpan_ps: QVANYQSGSTLMYCDPDLGDIAQTLLGERDPHHD 	 B*15:01P
B*15:01:11 	 
mhcflurry_ps: QVANYQSGSTLMYCDPDLSGDINAQTLLGERDPHHDQ 	 B*15:01P
HTTPSConnectionPool(host='www.ebi.ac.uk', port=443): Max retries exceeded with url: /cgi-bin/ipd/api/allele/HLA04216?project=HLA (Caused by SSLError(SSLEOFError(8, 'EOF

TypeError: argument of type 'NoneType' is not iterable

In [6]:
hla_to_pseudo_seq("C*08:01")

   accession           name
0   HLA00445  C*08:01:01:01
1   HLA01586     C*08:01:02
2   HLA05786     C*08:01:03
3   HLA06172     C*08:01:04
4   HLA06644     C*08:01:05
5   HLA08157     C*08:01:06
6   HLA08158     C*08:01:07
7   HLA08963     C*08:01:09
8   HLA09834     C*08:01:10
9   HLA10015     C*08:01:11
10  HLA10017     C*08:01:12
11  HLA10582     C*08:01:13
12  HLA11095     C*08:01:14
13  HLA11465     C*08:01:15
14  HLA11571     C*08:01:16
15  HLA11857     C*08:01:17
16  HLA12333     C*08:01:18
17  HLA13099     C*08:01:19
18  HLA17225     C*08:01:20
19  HLA17301  C*08:01:01:02
20  HLA17463     C*08:01:21
21  HLA19794     C*08:01:22
22  HLA20343     C*08:01:24
23  HLA20445     C*08:01:23
24  HLA20782     C*08:01:25
25  HLA20983     C*08:01:26
26  HLA22259     C*08:01:27
27  HLA22486     C*08:01:28
28  HLA23140  C*08:01:01:04
29  HLA23314  C*08:01:01:03
30  HLA23758     C*08:01:29
31  HLA24460  C*08:01:01:06
32  HLA24807  C*08:01:01:05
33  HLA24919  C*08:01:01:07
34  HLA24930  C*08:0