In [1]:
import pandas as pd

In [2]:
from malid import config, helpers
from malid.datamodels import GeneLocus

In [3]:
# Read the CoV-AbDab snapshot CSV from the project's base data directory.
cov_abdab_csv_path = config.paths.base_data_dir / "CoV-AbDab_130623.csv"
cov_abdab_all = pd.read_csv(cov_abdab_csv_path)
# Show (rows, columns) of the raw table.
cov_abdab_all.shape

(12536, 25)

# Negatives

In [4]:
# Seed the negative set with every entry that lists at least one target it does
# not bind; take an independent copy so the raw table is never modified.
has_non_binding_info = cov_abdab_all["Doesn't Bind to"].notna()
cov_abdab_negatives = cov_abdab_all[has_non_binding_info].copy()
cov_abdab_negatives.shape

(2816, 25)

In [5]:
# note: there seem to be a lot of "doesn't bind to sars-cov2" but still "neutralising vs sars-cov2" entries, so we screen those out.
# A negative must (1) be reported as not binding some SARS-CoV-2 target, and
# (2) have no SARS-CoV-2 target listed under either "Binds to" or "Neutralising Vs".
# Matching is a case-insensitive literal substring test for "sars-cov2".
negative_doesnt_bind_cov2 = (
    cov_abdab_negatives["Doesn't Bind to"]
    .str.lower()
    .str.contains("sars-cov2", regex=False)
)
negative_binds_cov2 = (
    cov_abdab_negatives["Binds to"]
    .fillna("")  # missing binding info counts as "no SARS-CoV-2 binding reported"
    .str.lower()
    .str.contains("sars-cov2", regex=False)
)
negative_neutralises_cov2 = (
    cov_abdab_negatives["Neutralising Vs"]
    .fillna("")
    .str.lower()
    .str.contains("sars-cov2", regex=False)
)
cov_abdab_negatives = cov_abdab_negatives[
    negative_doesnt_bind_cov2 & ~negative_binds_cov2 & ~negative_neutralises_cov2
]
cov_abdab_negatives.shape

(622, 25)

In [6]:
# Most of the remaining entries don't have any positive binding information
cov_abdab_negatives["Binds to"].isna().value_counts()

True     584
False     38
Name: Binds to, dtype: int64

In [7]:
# What do these remaining entries bind to, if listed?
# The pattern ",|;" is longer than one character, so pandas treats it as a regular
# expression: each cell is split on commas or semicolons, then every target is
# counted on its own row. Tokens are not stripped or normalised, so qualifiers
# such as "(weak)" are tallied as separate values.
cov_abdab_negatives["Binds to"].str.split(",|;").explode().value_counts()

SARS-CoV1           28
SARS-CoV1 (weak)     3
RmYN02               2
Rf1                  2
WIV-1                1
LyRa3                1
Rs4081               1
Yun11                1
RmYN02 (weak)        1
Rf1 (weak)           1
Rs4081 (weak)        1
SARS-CoV             1
MERS-CoV             1
Name: Binds to, dtype: int64

In [8]:
# What don't they bind to?
# Cells are split on commas or semicolons (regex ",|;") and each listed target
# is counted individually; tokens are not normalised before counting.
cov_abdab_negatives["Doesn't Bind to"].str.split(",|;").explode().value_counts()

SARS-CoV2_WT             622
MERS-CoV                  12
SARS-CoV2_Delta            7
SARS-CoV2_Omicron-BA1      7
SARS-CoV2_Omicron-BA2      5
RsSTT200                   5
Pang17                     5
RaTG13                     5
SARS-CoV1                  5
SHC014                     5
C028                       5
SARS-CoV2_Beta             5
BM4831                     5
BtKY72                     5
Khosta2                    5
WIV-1                      4
LyRA3                      4
Yun11                      4
Rs4081                     3
229E                       2
HKU1                       2
NL63                       2
Rf1                        2
SARS-CoV2_Gamma            2
RmYN02                     2
OC43                       2
Name: Doesn't Bind to, dtype: int64

In [9]:
# Tally the species annotation that follows the "(" in each heavy-chain V gene
# label (the closing ")" is left attached to the value).
negative_v_gene_species = cov_abdab_negatives["Heavy V Gene"].str.split("(").str.get(1)
negative_v_gene_species.value_counts()

Human)     616
Mouse)       5
Alpaca)      1
Name: Heavy V Gene, dtype: int64

In [10]:
cov_abdab_negatives["Heavy V Gene"].isna().value_counts()

False    622
Name: Heavy V Gene, dtype: int64

In [11]:
# Discard entries that lack a heavy-chain V gene call.
cov_abdab_negatives = cov_abdab_negatives.dropna(subset=["Heavy V Gene"])

In [12]:
# Keep only entries whose heavy-chain V gene label carries the literal "(Human)" tag.
negative_has_human_v_gene = cov_abdab_negatives["Heavy V Gene"].str.contains(
    "(Human)", regex=False
)
cov_abdab_negatives = cov_abdab_negatives.loc[negative_has_human_v_gene]
cov_abdab_negatives.shape

(616, 25)

In [13]:
cov_abdab_negatives["Heavy J Gene"].value_counts()

IGHJ4 (Human)    297
IGHJ6 (Human)    158
IGHJ3 (Human)     68
IGHJ5 (Human)     52
ND                16
IGHJ1 (Human)     13
IGHJ2 (Human)     12
Name: Heavy J Gene, dtype: int64

In [14]:
cov_abdab_negatives["Heavy J Gene"].isna().value_counts()

False    616
Name: Heavy J Gene, dtype: int64

In [15]:
# Exclude entries whose heavy-chain J gene is the placeholder "ND"
# (presumably "not determined" -- confirm against the CoV-AbDab documentation).
negative_has_j_gene_call = cov_abdab_negatives["Heavy J Gene"].ne("ND")
cov_abdab_negatives = cov_abdab_negatives.loc[negative_has_j_gene_call]
cov_abdab_negatives.shape

(600, 25)

In [16]:
# Discard entries that lack a heavy-chain J gene call.
cov_abdab_negatives = cov_abdab_negatives.dropna(subset=["Heavy J Gene"])

In [17]:
cov_abdab_negatives["Heavy J Gene"].value_counts()

IGHJ4 (Human)    297
IGHJ6 (Human)    158
IGHJ3 (Human)     68
IGHJ5 (Human)     52
IGHJ1 (Human)     13
IGHJ2 (Human)     12
Name: Heavy J Gene, dtype: int64

In [18]:
cov_abdab_negatives["Heavy V Gene"].value_counts()

IGHV3-23 (Human)      49
IGHV3-30 (Human)      48
IGHV1-69 (Human)      45
IGHV4-39 (Human)      43
IGHV5-51 (Human)      27
IGHV3-9 (Human)       26
IGHV1-18 (Human)      24
IGHV2-5 (Human)       23
IGHV1-46 (Human)      19
IGHV3-7 (Human)       19
IGHV4-59 (Human)      19
IGHV3-33 (Human)      19
IGHV3-30-3 (Human)    18
IGHV4-34 (Human)      18
IGHV3-21 (Human)      17
IGHV1-8 (Human)       16
IGHV3-48 (Human)      15
IGHV1-2 (Human)       15
IGHV7-4-1 (Human)     11
IGHV4-4 (Human)       11
IGHV3-11 (Human)      11
IGHV3-15 (Human)      11
IGHV4-31 (Human)      11
IGHV2-70 (Human)      11
IGHV3-53 (Human)       9
IGHV4-30-4 (Human)     8
IGHV1-3 (Human)        6
IGHV3-74 (Human)       5
IGHV4-38-2 (Human)     5
IGHV5-10-1 (Human)     4
IGHV2-26 (Human)       4
IGHV6-1 (Human)        3
IGHV3-49 (Human)       3
IGHV3-13 (Human)       3
IGHV1-58 (Human)       3
IGHV3-43 (Human)       3
IGHV1-24 (Human)       3
IGHV3-66 (Human)       2
IGHV3-73 (Human)       2
IGHV4-61 (Human)       2


In [19]:
# Reduce labels such as "IGHV3-23 (Human)" to the bare gene name "IGHV3-23":
# keep the text before the first "(" and trim surrounding whitespace.
negative_v_gene_labels = cov_abdab_negatives["Heavy V Gene"]
cov_abdab_negatives["Heavy V Gene"] = (
    negative_v_gene_labels.str.partition("(")[0].str.strip()
)

In [20]:
cov_abdab_negatives["Heavy V Gene"]

3160     IGHV3-23
3172     IGHV1-69
3190     IGHV4-34
3192     IGHV1-69
3193      IGHV1-2
           ...   
8663     IGHV1-69
8920     IGHV3-11
8923     IGHV4-59
9936     IGHV3-30
10120    IGHV4-59
Name: Heavy V Gene, Length: 600, dtype: object

In [21]:
cov_abdab_negatives["Heavy J Gene"].value_counts()

IGHJ4 (Human)    297
IGHJ6 (Human)    158
IGHJ3 (Human)     68
IGHJ5 (Human)     52
IGHJ1 (Human)     13
IGHJ2 (Human)     12
Name: Heavy J Gene, dtype: int64

In [22]:
# Reduce labels such as "IGHJ4 (Human)" to the bare gene name "IGHJ4":
# keep the text before the first "(" and trim surrounding whitespace.
negative_j_gene_labels = cov_abdab_negatives["Heavy J Gene"]
cov_abdab_negatives["Heavy J Gene"] = (
    negative_j_gene_labels.str.partition("(")[0].str.strip()
)

In [23]:
cov_abdab_negatives["Heavy J Gene"]

3160     IGHJ4
3172     IGHJ6
3190     IGHJ1
3192     IGHJ4
3193     IGHJ3
         ...  
8663     IGHJ6
8920     IGHJ4
8923     IGHJ4
9936     IGHJ6
10120    IGHJ4
Name: Heavy J Gene, Length: 600, dtype: object

In [24]:
# Exclude entries whose CDRH3 sequence is the placeholder "ND".
negative_has_cdrh3 = cov_abdab_negatives["CDRH3"].ne("ND")
cov_abdab_negatives = cov_abdab_negatives.loc[negative_has_cdrh3]
cov_abdab_negatives.shape

(600, 25)

In [25]:
cov_abdab_negatives["CDRH3"]

3160          AKDKFFGIEGPTYFDY
3172        ARLRLPRHTVAPHYGMDV
3190          ATVPWRSGPRGGYFKL
3192     ARQTLPAAIPKMRWLLTEFAY
3193              AREGWELHAFDI
                 ...          
8663         ARVQDIVIVPAVYGMDV
8920             ARMGPYGSGSFDY
8923      ARHPSSIFRGTVFTPYYFDY
9936      VTQRDNSRDYFPHYFHDMDV
10120            ARSGSYGDRTFDH
Name: CDRH3, Length: 600, dtype: object

In [26]:
cov_abdab_negatives["Ab or Nb"].value_counts()

Ab    600
Name: Ab or Nb, dtype: int64

In [27]:
cov_abdab_negatives["VHorVHH"].isna().value_counts()

False    600
Name: VHorVHH, dtype: int64

In [28]:
(cov_abdab_negatives["VHorVHH"] != "ND").value_counts()

True    600
Name: VHorVHH, dtype: int64

In [29]:
# Exclude entries whose heavy-chain variable-region sequence is the placeholder "ND".
negative_has_vh_sequence = cov_abdab_negatives["VHorVHH"].ne("ND")
cov_abdab_negatives = cov_abdab_negatives.loc[negative_has_vh_sequence]
cov_abdab_negatives.shape

(600, 25)

In [30]:
cov_abdab_negatives.columns

Index(['Name', 'Ab or Nb', 'Binds to', 'Doesn't Bind to', 'Neutralising Vs',
       'Not Neutralising Vs', 'Protein + Epitope', 'Origin', 'VHorVHH', 'VL',
       'Heavy V Gene', 'Heavy J Gene', 'Light V Gene', 'Light J Gene', 'CDRH3',
       'CDRL3', 'Structures', 'ABB Homology Model (if no structure)',
       'Sources', 'Date Added', 'Last Updated', 'Update Description',
       'Notes/Following Up?', 'Unnamed: 23', 'Unnamed: 24'],
      dtype='object')

In [31]:
cov_abdab_negatives["Origin"].value_counts()

B-cells; SARS-CoV2 Human Patient       584
B-cells; SARS-CoV1 Human Patient        15
B-cells; SARS-CoV2_WT Human Patient      1
Name: Origin, dtype: int64

In [32]:
# Confirm all are of human origin.
# na=False makes a missing Origin count as "not human". Without it,
# str.contains yields NaN for missing values and .all() does not treat NaN as a
# failure, so entries with no recorded origin would pass this check unnoticed.
negative_origin_is_human = cov_abdab_negatives["Origin"].str.contains(
    "Human", na=False
)
assert negative_origin_is_human.all(), (
    "Negative entries with a missing or non-human Origin: "
    f"{cov_abdab_negatives.loc[~negative_origin_is_human, 'Origin'].unique().tolist()}"
)

In [33]:
cov_abdab_negatives.shape

(600, 25)

In [34]:
cov_abdab_negatives["Protein + Epitope"].value_counts()

S; Unk        584
S; non-RBD     12
S; NTD          2
S; RBD          1
S; S2           1
Name: Protein + Epitope, dtype: int64

In [35]:
cov_abdab_negatives["Binds to"].value_counts()

SARS-CoV1           12
SARS-CoV1 (weak)     3
SARS-CoV             1
Name: Binds to, dtype: int64

In [36]:
cov_abdab_negatives["Doesn't Bind to"].value_counts()

SARS-CoV2_WT                                                          584
SARS-CoV2_WT;MERS-CoV                                                  12
SARS-CoV2_WT;SARS-CoV2_Gamma;SARS-CoV2_Delta;SARS-CoV2_Omicron-BA1      2
229E;HKU1;NL63;OC43;SARS-CoV2_WT                                        2
Name: Doesn't Bind to, dtype: int64

In [37]:
cov_abdab_negatives["Not Neutralising Vs"].value_counts()

SARS-CoV2_WT;SARS-CoV2_Gamma    2
SARS-CoV2_WT                    1
SARS-CoV2_WT;SARS-CoV           1
Name: Not Neutralising Vs, dtype: int64

In [38]:
cov_abdab_negatives["Neutralising Vs"].value_counts()

SARS-CoV1    1
Name: Neutralising Vs, dtype: int64

# Positives

In [39]:
# Positive set: entries reported to bind OR neutralise some SARS-CoV-2 target,
# with no SARS-CoV-2 target listed under "Doesn't Bind to".
# Matching is a case-insensitive literal substring test for "sars-cov2".
cov_abdab_positives = cov_abdab_all.copy()
positive_binds_cov2 = (
    cov_abdab_positives["Binds to"]
    .fillna("")
    .str.lower()
    .str.contains("sars-cov2", regex=False)
)
positive_neutralises_cov2 = (
    cov_abdab_positives["Neutralising Vs"]
    .fillna("")
    .str.lower()
    .str.contains("sars-cov2", regex=False)
)
# avoid entries where the binding was selective for a particular strain of SARS-CoV-2:
positive_misses_some_cov2 = (
    cov_abdab_positives["Doesn't Bind to"]
    .fillna("")
    .str.lower()
    .str.contains("sars-cov2", regex=False)
)
cov_abdab_positives = cov_abdab_positives[
    (positive_binds_cov2 | positive_neutralises_cov2) & ~positive_misses_some_cov2
]
cov_abdab_positives.shape

(10605, 25)

In [40]:
cov_abdab_positives["Binds to"].value_counts()

SARS-CoV2_WT                                                                                                                                                                                                                                          6253
SARS-CoV1;SARS-CoV2_WT                                                                                                                                                                                                                                 379
SARS-CoV2_WT;SARS-CoV1                                                                                                                                                                                                                                 273
SARS-CoV2_Omicron-BA2                                                                                                                                                                                                                                  

In [41]:
# Check how many positives have binding information. A missing value is possible
# here because an entry can qualify through "Neutralising Vs" alone.
cov_abdab_positives["Binds to"].isna().value_counts()

False    10605
Name: Binds to, dtype: int64

In [42]:
# neutralizing information is present for only about half
cov_abdab_positives["Neutralising Vs"].isna().value_counts()

False    5333
True     5272
Name: Neutralising Vs, dtype: int64

In [43]:
# What do these sequences bind to, if listed?
# Cells are split on commas or semicolons (regex ",|;") and each target is counted individually.
cov_abdab_positives["Binds to"].str.split(",|;").explode().value_counts()

SARS-CoV2_WT                9670
SARS-CoV2_Omicron-BA2       2192
SARS-CoV2_Omicron-BA1       2135
SARS-CoV1                   1379
SARS-CoV2_Omicron-BA2.75    1331
                            ... 
Rf1-2004 (weak)                1
HKU3                           1
Civet007-2004                  1
A021                           1
SARS-CoV_Gamma (weak)          1
Name: Binds to, Length: 221, dtype: int64

In [44]:
# What do these sequences neutralize, if listed?
# Cells are split on commas or semicolons (regex ",|;") and each target is counted individually.
cov_abdab_positives["Neutralising Vs"].str.split(",|;").explode().value_counts()

SARS-CoV2_WT                4095
SARS-CoV2_Omicron-BA1       1375
SARS-CoV2_Omicron-BA2       1214
SARS-CoV2_Omicron-BA2.75     922
SARS-CoV2_WT (weak)          802
                            ... 
HKU3                           1
LYRa11                         1
CS24                           1
Civet007-2004                  1
SARS-CoV2_Omicron (weak)       1
Name: Neutralising Vs, Length: 216, dtype: int64

In [45]:
# # remove weak binders
# cov_abdab_positives = cov_abdab_positives[
#     ~cov_abdab_positives["Binds to"].str.lower().apply(lambda s: "sars-cov2_wt (weak)" in s)
# ]
# cov_abdab_positives.shape

In [46]:
cov_abdab_positives["Binds to"].value_counts()

SARS-CoV2_WT                                                                                                                                                                                                                                          6253
SARS-CoV1;SARS-CoV2_WT                                                                                                                                                                                                                                 379
SARS-CoV2_WT;SARS-CoV1                                                                                                                                                                                                                                 273
SARS-CoV2_Omicron-BA2                                                                                                                                                                                                                                  

In [47]:
# cov_abdab_positives.dropna(subset=["Neutralising Vs"], inplace=True)
# cov_abdab_positives.shape

In [48]:
cov_abdab_positives["Neutralising Vs"].isna().value_counts()

False    5333
True     5272
Name: Neutralising Vs, dtype: int64

In [49]:
# cov_abdab_positives = cov_abdab_positives[
#     cov_abdab_positives["Neutralising Vs"].str.lower().apply(lambda s: "sars-cov2" in s)
# ]
# cov_abdab_positives.shape

In [50]:
cov_abdab_positives["Neutralising Vs"].value_counts()

SARS-CoV2_WT                                                                                                                                                                                                                                              2166
SARS-CoV2_WT (weak)                                                                                                                                                                                                                                        460
SARS-CoV2_WT;SARS-CoV2_Omicron-BA1                                                                                                                                                                                                                         138
SARS-CoV2_WT;SARS-CoV2_Omicron-BA1;SARS-CoV2_Omicron-BA2;SARS-CoV2_Omicron-BA2.75                                                                                                                                                          

In [51]:
# Tally the species annotation that follows the "(" in each heavy-chain V gene
# label (the closing ")" is left attached to the value).
positive_v_gene_species = cov_abdab_positives["Heavy V Gene"].str.split("(").str.get(1)
positive_v_gene_species.value_counts()

Human)     9676
Alpaca)     671
Mouse)      218
Rhesus)       2
Name: Heavy V Gene, dtype: int64

In [52]:
cov_abdab_positives["Heavy V Gene"].isna().value_counts()

False    10605
Name: Heavy V Gene, dtype: int64

In [53]:
# Discard entries that lack a heavy-chain V gene call.
cov_abdab_positives = cov_abdab_positives.dropna(subset=["Heavy V Gene"])

In [54]:
# Keep only entries whose heavy-chain V gene label carries the literal "(Human)" tag.
positive_has_human_v_gene = cov_abdab_positives["Heavy V Gene"].str.contains(
    "(Human)", regex=False
)
cov_abdab_positives = cov_abdab_positives.loc[positive_has_human_v_gene]
cov_abdab_positives.shape

(9676, 25)

In [55]:
cov_abdab_positives["Heavy J Gene"].value_counts()

IGHJ4 (Human)    3875
IGHJ6 (Human)    2310
IGHJ3 (Human)    1277
IGHJ5 (Human)    1234
IGHJ2 (Human)     395
ND                312
IGHJ1 (Human)     273
Name: Heavy J Gene, dtype: int64

In [56]:
cov_abdab_positives["Heavy J Gene"].isna().value_counts()

False    9676
Name: Heavy J Gene, dtype: int64

In [57]:
# Exclude entries whose heavy-chain J gene is the placeholder "ND"
# (presumably "not determined" -- confirm against the CoV-AbDab documentation).
positive_has_j_gene_call = cov_abdab_positives["Heavy J Gene"].ne("ND")
cov_abdab_positives = cov_abdab_positives.loc[positive_has_j_gene_call]
cov_abdab_positives.shape

(9364, 25)

In [58]:
# Discard entries that lack a heavy-chain J gene call.
cov_abdab_positives = cov_abdab_positives.dropna(subset=["Heavy J Gene"])

In [59]:
cov_abdab_positives["Heavy J Gene"].value_counts()

IGHJ4 (Human)    3875
IGHJ6 (Human)    2310
IGHJ3 (Human)    1277
IGHJ5 (Human)    1234
IGHJ2 (Human)     395
IGHJ1 (Human)     273
Name: Heavy J Gene, dtype: int64

In [60]:
cov_abdab_positives["Heavy V Gene"].value_counts()

IGHV3-30 (Human)      1115
IGHV1-69 (Human)       984
IGHV3-53 (Human)       508
IGHV1-46 (Human)       411
IGHV3-23 (Human)       400
IGHV3-9 (Human)        397
IGHV5-51 (Human)       376
IGHV3-33 (Human)       364
IGHV3-30-3 (Human)     350
IGHV3-66 (Human)       341
IGHV4-39 (Human)       339
IGHV4-59 (Human)       276
IGHV1-18 (Human)       261
IGHV3-13 (Human)       244
IGHV4-4 (Human)        216
IGHV1-2 (Human)        215
IGHV3-21 (Human)       201
IGHV4-31 (Human)       174
IGHV2-5 (Human)        171
IGHV3-7 (Human)        170
IGHV3-48 (Human)       159
IGHV4-34 (Human)       155
IGHV1-58 (Human)       131
IGHV3-15 (Human)       124
IGHV2-70 (Human)       116
IGHV1-8 (Human)        114
IGHV1-24 (Human)       109
IGHV4-61 (Human)       101
IGHV7-4-1 (Human)       98
IGHV3-11 (Human)        92
IGHV4-30-4 (Human)      66
IGHV5-10-1 (Human)      64
IGHV3-49 (Human)        62
IGHV3-64D (Human)       62
IGHV1-3 (Human)         56
IGHV3-20 (Human)        43
IGHV3-74 (Human)        42
I

In [61]:
# Reduce labels such as "IGHV3-53 (Human)" to the bare gene name "IGHV3-53":
# split once at the first "(", keep the left part, and trim whitespace.
positive_v_gene_labels = cov_abdab_positives["Heavy V Gene"]
cov_abdab_positives["Heavy V Gene"] = (
    positive_v_gene_labels.str.split("(", n=1).str.get(0).str.strip()
)

In [62]:
cov_abdab_positives["Heavy V Gene"]

0        IGHV3-53
1        IGHV3-11
2        IGHV3-30
3        IGHV3-53
4        IGHV3-53
           ...   
12531    IGHV3-21
12532    IGHV1-58
12533    IGHV3-11
12534    IGHV3-64
12535    IGHV1-18
Name: Heavy V Gene, Length: 9364, dtype: object

In [63]:
cov_abdab_positives["Heavy J Gene"].value_counts()

IGHJ4 (Human)    3875
IGHJ6 (Human)    2310
IGHJ3 (Human)    1277
IGHJ5 (Human)    1234
IGHJ2 (Human)     395
IGHJ1 (Human)     273
Name: Heavy J Gene, dtype: int64

In [64]:
# Reduce labels such as "IGHJ4 (Human)" to the bare gene name "IGHJ4":
# split once at the first "(", keep the left part, and trim whitespace.
positive_j_gene_labels = cov_abdab_positives["Heavy J Gene"]
cov_abdab_positives["Heavy J Gene"] = (
    positive_j_gene_labels.str.split("(", n=1).str.get(0).str.strip()
)

In [65]:
cov_abdab_positives["Heavy J Gene"]

0        IGHJ4
1        IGHJ4
2        IGHJ4
3        IGHJ3
4        IGHJ4
         ...  
12531    IGHJ5
12532    IGHJ2
12533    IGHJ1
12534    IGHJ2
12535    IGHJ6
Name: Heavy J Gene, Length: 9364, dtype: object

In [66]:
# Exclude entries whose CDRH3 sequence is the placeholder "ND".
positive_has_cdrh3 = cov_abdab_positives["CDRH3"].ne("ND")
cov_abdab_positives = cov_abdab_positives.loc[positive_has_cdrh3]
cov_abdab_positives.shape

(9364, 25)

In [67]:
cov_abdab_positives["CDRH3"]

0               ARDLVVYGLDY
1           AREFDLTKIIMVPPY
2           ARDSSGWHWGVPFDY
3             VRGSGGIHDAFDI
4              ARVDPRYEGFDY
                ...        
12531      ARDRFGEFIYPGALDI
12532         AAMVRGGWWYFDL
12533     ASDPGGLSELAAEYFHH
12534        ARESTEVTHWYFDL
12535    ARQLLFFGDLSGDNGMDV
Name: CDRH3, Length: 9364, dtype: object

In [68]:
cov_abdab_positives["Ab or Nb"].value_counts()

Ab    9338
Nb      26
Name: Ab or Nb, dtype: int64

In [69]:
cov_abdab_positives["VHorVHH"].isna().value_counts()

False    9364
Name: VHorVHH, dtype: int64

In [70]:
(cov_abdab_positives["VHorVHH"] != "ND").value_counts()

True     9277
False      87
Name: VHorVHH, dtype: int64

In [71]:
# Exclude entries whose heavy-chain variable-region sequence is the placeholder "ND".
positive_has_vh_sequence = cov_abdab_positives["VHorVHH"].ne("ND")
cov_abdab_positives = cov_abdab_positives.loc[positive_has_vh_sequence]
cov_abdab_positives.shape

(9277, 25)

In [72]:
cov_abdab_positives.columns

Index(['Name', 'Ab or Nb', 'Binds to', 'Doesn't Bind to', 'Neutralising Vs',
       'Not Neutralising Vs', 'Protein + Epitope', 'Origin', 'VHorVHH', 'VL',
       'Heavy V Gene', 'Heavy J Gene', 'Light V Gene', 'Light J Gene', 'CDRH3',
       'CDRL3', 'Structures', 'ABB Homology Model (if no structure)',
       'Sources', 'Date Added', 'Last Updated', 'Update Description',
       'Notes/Following Up?', 'Unnamed: 23', 'Unnamed: 24'],
      dtype='object')

In [73]:
cov_abdab_positives["Origin"].value_counts()

B-cells; SARS-CoV2 Human Patient                     2293
B-cells; SARS-CoV2_WT Human Patient                  2259
B-cells (SARS-CoV2 Human Patient and/or Vaccinee)     758
BA.2 convalescents                                    751
B-cells; SARS-CoV2_WT Vaccinee                        516
                                                     ... 
Engineered from CR3041                                  1
Engineered from CR3042                                  1
Engineered from CR3043                                  1
Engineered from CR3045                                  1
Engineered from CR3032                                  1
Name: Origin, Length: 112, dtype: int64

In [74]:
cov_abdab_positives["Origin"].value_counts().head(n=25)

B-cells; SARS-CoV2 Human Patient                             2293
B-cells; SARS-CoV2_WT Human Patient                          2259
B-cells (SARS-CoV2 Human Patient and/or Vaccinee)             758
BA.2 convalescents                                            751
B-cells; SARS-CoV2_WT Vaccinee                                516
SARS convalescents                                            475
B-Cells, SARS-CoV2 double vaccinated human                    285
BA.5 convalescents                                            269
B-cells; SARS-CoV1 Human Patient                              248
B-cells (SARS-CoV2 Human Patient+Vaccinee)                    216
BA.1 convalescents                                            187
Semi-synthetic Human Fab Library                              116
B-cells (SARS-CoV2 Human Patient/Vaccinee)                     90
B-cells; SARS-CoV2 Vaccinee (ChAdOx1 then mRNA-1273)           72
Phage Display (Ab, human, non-immune)                          59
B-cells; S

In [75]:
cov_abdab_positives["Origin"].unique()

array(['B-cells; SARS-CoV2_WT Convalescent Patients',
       'B-cells; SARS-CoV2_WT Vaccinee (BBIBP-CoV)',
       'B-cells; SARS-CoV2_WT Vaccinee', 'TBC',
       'Immunised Rhesus Macaques',
       'B-cells; SARS-CoV2 Convalescent Patients',
       'B-cells; SARS-CoV2 Vaccinee',
       'B-cells; SARS-CoV2_WT Human Patient',
       'B-cells; SARS-CoV2 Vaccinee (Omicron BA2 Breakthrough)',
       'Phage Display Library (Ab, Human, Non-immune)',
       'Immunised Transgenic Mouse',
       'Phage Display Library (Ab, Human, Immune - SARS-CoV2)',
       'Phage Display Library (Ab, Human, Naive)',
       'B-cells; SARS-CoV2 Human Patient',
       'B-cells; SARS-CoV2_Omicron Human Patient',
       'B-cells; SARS-CoV2 Human Vaccinee (Moderna)',
       'B-cells; SARS-CoV2 Human Vaccinee (Pfizer)',
       'B-cells; SARS-CoV2 Human Vaccinee', nan,
       'B-cells; SARS-CoV2_Omicron-BA1 Breakthrough Infection',
       'B-cells; SARS-CoV2 Human Patients',
       'Phage Display (Ab, human, immune - 

In [76]:
# Test "keep human origin only" filter:
# list the distinct Origin labels that the rule would retain. A label is kept
# when it mentions at least one human-source term and none of the excluded
# terms (matching is case-insensitive; missing origins are treated as "").
human_source_terms = ("human", "patient", "vaccinee", "breakthrough infection")
excluded_source_terms = ("humanised", "phage display", "synthetic")
[
    origin
    for origin in cov_abdab_positives["Origin"].fillna("").unique()
    if any(term in origin.lower() for term in human_source_terms)
    and not any(term in origin.lower() for term in excluded_source_terms)
]

['B-cells; SARS-CoV2_WT Convalescent Patients',
 'B-cells; SARS-CoV2_WT Vaccinee (BBIBP-CoV)',
 'B-cells; SARS-CoV2_WT Vaccinee',
 'B-cells; SARS-CoV2 Convalescent Patients',
 'B-cells; SARS-CoV2 Vaccinee',
 'B-cells; SARS-CoV2_WT Human Patient',
 'B-cells; SARS-CoV2 Vaccinee (Omicron BA2 Breakthrough)',
 'B-cells; SARS-CoV2 Human Patient',
 'B-cells; SARS-CoV2_Omicron Human Patient',
 'B-cells; SARS-CoV2 Human Vaccinee (Moderna)',
 'B-cells; SARS-CoV2 Human Vaccinee (Pfizer)',
 'B-cells; SARS-CoV2 Human Vaccinee',
 'B-cells; SARS-CoV2_Omicron-BA1 Breakthrough Infection',
 'B-cells; SARS-CoV2 Human Patients',
 'B-cells; SARS-CoV2 Human Vaccinee + Multiple Infection',
 'B-cells; Triple SARS-CoV2 Vaccinated Human',
 'B-cells; Unvaccinated SARS-CoV2_WT Human Patient',
 'B-Cells, SARS-CoV2 double vaccinated human',
 'B-cells; SARS-CoV2_Gamma Human Patient',
 'B-cells; SARS-CoV2 Vaccinee (2 x ChAdOx1)',
 'B-cells; SARS-CoV2 Vaccinee (ChAdOx1 then mRNA-1273)',
 'B-cells; SARS-CoV1 Human Pati

In [77]:
# rejects:
# list the distinct Origin labels the "keep human origin only" rule would drop,
# i.e. labels with no human-source term, or with any excluded term.
rejected_origins = []
for origin in cov_abdab_positives["Origin"].fillna("").unique():
    origin_lower = origin.lower()
    mentions_human_source = (
        "human" in origin_lower
        or "patient" in origin_lower
        or "vaccinee" in origin_lower
        or "breakthrough infection" in origin_lower
    )
    mentions_excluded_source = (
        "humanised" in origin_lower
        or "phage display" in origin_lower
        or "synthetic" in origin_lower
    )
    if not mentions_human_source or mentions_excluded_source:
        rejected_origins.append(origin)
rejected_origins

['TBC',
 'Immunised Rhesus Macaques',
 'Phage Display Library (Ab, Human, Non-immune)',
 'Immunised Transgenic Mouse',
 'Phage Display Library (Ab, Human, Immune - SARS-CoV2)',
 'Phage Display Library (Ab, Human, Naive)',
 '',
 'Phage Display (Ab, human, immune - SARS-CoV2)',
 'Humanised from Immunised Mouse',
 'ND',
 'Phage Display (Ab, human, non-immune)',
 'Immunised Humanised Mouse',
 'Transgenic Mouse',
 'Phage Library Engineered from SARS-CoV-1 binder',
 'Immunised mouse (TC-mAb)',
 'Immunised mouse (RenMab)',
 'Transgenic Mouse (H2L2)',
 'Phage Display (Ab, human, immune - CoV2_WT)',
 'HIV-1 induced Ab',
 'Phage Display (sdAbs from human VH)',
 'Phage Display (Humanised sdAbs, immune - CoV2)',
 'Engineered from ADI-55689',
 'Engineered from ADG-2',
 'Engineered from ADI-56046',
 'Phage Display (Ab, based on trastuzumab VH)',
 'Phage Display (sdAb, non-immune, human) + Engineering',
 'Phage Display (single-domain, human, non-immune)',
 'Computational Design',
 'Transgenic Mouse (

In [78]:
cov_abdab_positives.shape

(9277, 25)

In [79]:
# Apply "keep human origin only" filter:
# keep rows whose Origin (case-insensitive; missing treated as "") mentions a
# human-source term and none of the excluded terms.
positive_origin_lower = cov_abdab_positives["Origin"].fillna("").str.lower()
origin_mentions_human_source = positive_origin_lower.str.contains(
    "human|patient|vaccinee|breakthrough infection"
)
origin_mentions_excluded_source = positive_origin_lower.str.contains(
    "humanised|phage display|synthetic"
)
cov_abdab_positives = cov_abdab_positives[
    origin_mentions_human_source & ~origin_mentions_excluded_source
]
cov_abdab_positives.shape

(7142, 25)

In [80]:
cov_abdab_positives["Origin"].value_counts()

B-cells; SARS-CoV2 Human Patient                                      2293
B-cells; SARS-CoV2_WT Human Patient                                   2259
B-cells (SARS-CoV2 Human Patient and/or Vaccinee)                      758
B-cells; SARS-CoV2_WT Vaccinee                                         516
B-Cells, SARS-CoV2 double vaccinated human                             285
B-cells; SARS-CoV1 Human Patient                                       248
B-cells (SARS-CoV2 Human Patient+Vaccinee)                             216
B-cells (SARS-CoV2 Human Patient/Vaccinee)                              90
B-cells; SARS-CoV2 Vaccinee (ChAdOx1 then mRNA-1273)                    72
B-cells; SARS-CoV2_Gamma Human Patient                                  47
B-cells; SARS-CoV2 Vaccinee (2 x ChAdOx1)                               45
B-cells; SARS-CoV2_Beta Human Patient                                   42
B-cells; SARS-CoV2_WT Convalescent Patients                             42
B-cells; SARS-CoV2_WT Vac

In [81]:
cov_abdab_positives["Protein + Epitope"].value_counts()

S; RBD                                     4270
S; Unk                                     1754
S; NTD                                      374
S; non-RBD                                  330
S; S2                                       275
N                                            38
S; non-S1                                    33
S; S1 non-RBD                                26
S; S2 Stem Helix                             10
S; S2' Cleavage Site/Fusion Peptide NTD       4
Unknown                                       4
S; S2 Fusion Peptide                          4
S; S1                                         4
S; Stem Helix                                 3
S; S1/S2                                      2
S; NTD-SD1                                    2
TBC                                           2
S; RBD/non-RBD                                2
S; SD1                                        1
S: NTD                                        1
S: RBD                                  

In [82]:
cov_abdab_positives["Binds to"].value_counts()

SARS-CoV2_WT                                                                                                                                                                                                                                          4511
SARS-CoV1;SARS-CoV2_WT                                                                                                                                                                                                                                 359
SARS-CoV2_WT;SARS-CoV1                                                                                                                                                                                                                                 228
SARS-CoV2_WT;SARS-CoV2_Omicron-BA1;SARS-CoV2_Beta;SARS-CoV2_Delta;SARS-CoV2_Omicron-BA2;SARS-CoV1                                                                                                                                                      

In [83]:
cov_abdab_positives["Doesn't Bind to"].value_counts()

SARS-CoV1                                                                     627
229E;HKU1;NL63;OC43                                                            70
SARS-CoV1;MERS-CoV;HKU1;OC43                                                   50
229E;HKU1;NL63;OC43;SARS-CoV1                                                  49
SARS-CoV1;MERS-CoV;OC43;HKU1;NL63                                              46
SARS-CoV1;MERS-CoV                                                             43
MERS-CoV;HKU1;OC43                                                             40
MERS-CoV                                                                       39
OC43                                                                           36
OC43;HKU1                                                                      34
229E;NL63;OC43                                                                 29
SARS-CoV1_Omicron-BA2                                                          16
GX/P2V/2017     

In [84]:
# Notice we have not required positives to be neutralizing against SARS-CoV-2.
cov_abdab_positives["Not Neutralising Vs"].value_counts()

SARS-CoV2_WT                                                                                                                                                                                             971
SARS-CoV1;SARS-CoV2_WT                                                                                                                                                                                   264
SARS-CoV1                                                                                                                                                                                                219
SARS-CoV1;SARS-CoV2_Omicron-XBB                                                                                                                                                                          134
SARS-CoV2_WT;SARS-CoV2_Gamma                                                                                                                                                        

# Combine positives and negatives, then export

In [85]:
# Columns carried into the export, in output order.
export_columns = [
    "CDRH3",
    "Heavy J Gene",
    "Heavy V Gene",
    "VHorVHH",
    "Binds to",
    "Doesn't Bind to",
    "Neutralising Vs",
    "Not Neutralising Vs",
    "Protein + Epitope",
    "Origin",
    "Sources",
    "Status",
]
# Label each set, stack positives on top of negatives, then keep the export columns.
labelled_sets = [
    cov_abdab_positives.assign(Status="Positive"),
    cov_abdab_negatives.assign(Status="Negative"),
]
cov_abdab_export = pd.concat(labelled_sets, axis=0)
cov_abdab_export = cov_abdab_export[export_columns]
# Rename the gene columns to the short names used in the export.
cov_abdab_export = cov_abdab_export.rename(
    columns={"Heavy J Gene": "j_gene", "Heavy V Gene": "v_gene"}
)
# Replace the original CoV-AbDab row labels with a fresh 0..n-1 index.
cov_abdab_export = cov_abdab_export.reset_index(drop=True)
cov_abdab_export

Unnamed: 0,CDRH3,j_gene,v_gene,VHorVHH,Binds to,Doesn't Bind to,Neutralising Vs,Not Neutralising Vs,Protein + Epitope,Origin,Sources,Status
0,ARDLVVYGLDY,IGHJ4,IGHV3-53,EVQLVESGGGLIQPGGSLRLSCAASGFIVSRNYMNWVRQAPGKGLE...,SARS-CoV2_WT;SARS-CoV2_Alpha;SARS-CoV2_Beta;SA...,,SARS-CoV2_WT;SARS-CoV2_Alpha;SARS-CoV2_Beta;SA...,,S; RBD,B-cells; SARS-CoV2_WT Convalescent Patients,"Kiyomi Shitaoka et al., 2023 (https://www.natu...",Positive
1,AREFDLTKIIMVPPY,IGHJ4,IGHV3-11,QVQMVESGGGLVRPGGSLRLSCAASGFTFSDYYMSWIRQAPGKGLE...,SARS-CoV2_WT;SARS-CoV2_Alpha;SARS-CoV2_Delta;S...,,SARS-CoV2_WT;SARS-CoV2_Alpha;SARS-CoV2_Delta;S...,SARS-CoV2_Beta;SARS-CoV2_Omicron-BA1;SARS-CoV2...,S; RBD,B-cells; SARS-CoV2_WT Convalescent Patients,"Kiyomi Shitaoka et al., 2023 (https://www.natu...",Positive
2,ARDSSGWHWGVPFDY,IGHJ4,IGHV3-30,QVQLVESGGGVVQPGRSLRLSCASSGFTFSTYHMHWVRQPPGKGLE...,SARS-CoV2_WT;SARS-CoV2_Alpha;SARS-CoV2_Beta;SA...,,SARS-CoV2_WT;SARS-CoV2_Alpha;SARS-CoV2_Beta;SA...,,S; RBD,B-cells; SARS-CoV2_WT Vaccinee (BBIBP-CoV),"Yubin Liu et al., 2023 (https://www.nature.com...",Positive
3,VRGSGGIHDAFDI,IGHJ3,IGHV3-53,EVQLVESGGGLIQPGGSLRLSCAVSGFTVSRMSWVRQAPGKGLECV...,SARS-CoV2_WT;SARS-CoV2_Alpha;SARS-CoV2_Beta;SA...,,SARS-CoV2_WT;SARS-CoV2_Alpha;SARS-CoV2_Beta;SA...,SARS-CoV2_Omicron-BA2.75.2;SARS-CoV2-Omicron-B...,S; RBD,B-cells; SARS-CoV2_WT Vaccinee (BBIBP-CoV),"Yubin Liu et al., 2023 (https://www.nature.com...",Positive
4,ARVDPRYEGFDY,IGHJ4,IGHV3-53,EVQLVESGGGLIQPGGSLRLSCAASEFIVSRNYMSWVRQAPGKGLE...,SARS-CoV2_WT;SARS-CoV2_Alpha;SARS-CoV2_Beta;SA...,,SARS-CoV2_WT;SARS-CoV2_Alpha;SARS-CoV2_Beta;SA...,SARS-CoV2_Omicron-BA1;SARS-CoV2_Omicron-BA2;SA...,S; RBD,B-cells; SARS-CoV2_WT Vaccinee (BBIBP-CoV),"Yubin Liu et al., 2023 (https://www.nature.com...",Positive
...,...,...,...,...,...,...,...,...,...,...,...,...
7737,ARVQDIVIVPAVYGMDV,IGHJ6,IGHV1-69,QVQLVQSGAEVKKPGSSVKVSCKTSGGTFVNFNSYAISWVRQAPGQ...,SARS-CoV1,SARS-CoV2_WT;MERS-CoV,,,S; non-RBD,B-cells; SARS-CoV1 Human Patient,"Dapeng Li et al., 2021 (https://www.biorxiv.or...",Negative
7738,ARMGPYGSGSFDY,IGHJ4,IGHV3-11,EVQLVESGGGVVNPGGSLRLSCAGSGFTFSDYYMGWIRQAPGKGLE...,SARS-CoV1 (weak),229E;HKU1;NL63;OC43;SARS-CoV2_WT,,,S; Unk,B-cells; SARS-CoV1 Human Patient,"Anna Wec et al., 2020 (https://science.science...",Negative
7739,ARHPSSIFRGTVFTPYYFDY,IGHJ4,IGHV4-59,QVQLQESGPGLVKPSETLSLTCTVSGGSVSDTPFYWGWIRQPPGKG...,SARS-CoV1 (weak),229E;HKU1;NL63;OC43;SARS-CoV2_WT,,,S; Unk,B-cells; SARS-CoV1 Human Patient,"Anna Wec et al., 2020 (https://science.science...",Negative
7740,VTQRDNSRDYFPHYFHDMDV,IGHJ6,IGHV3-30,QAQLVESGGALVQPGRSLRLSCAASGFTFRNYAMHWVRQAPATGLQ...,SARS-CoV1,SARS-CoV2_WT,SARS-CoV1,SARS-CoV2_WT,S; RBD,B-cells; SARS-CoV1 Human Patient,"Dora Pinto et al., 2020 (https://www.nature.co...",Negative


In [86]:
# Tally how many combined rows carry each Status label.
cov_abdab_export.loc[:, "Status"].value_counts()

Positive    7142
Negative     600
Name: Status, dtype: int64

The `CDRH3` column already has the leading `C` and trailing `W` residues removed, which is consistent with our internal data.

In [87]:
# compute cdr3_seq_aa_q_trim (cleaned CDRH3) and cdr3_aa_sequence_trim_len (its length)

In [88]:
# Clean the CDR3 strings: delete every ".", "-", " " and "*" character in one
# pass, then trim any remaining surrounding whitespace and upper-case.
cov_abdab_export["cdr3_seq_aa_q_trim"] = (
    cov_abdab_export["CDRH3"]
    .str.replace(r"[.\- *]", "", regex=True)
    .str.strip()
    .str.upper()
)
# Length of the cleaned CDR3 sequence.
cleaned_cdr3_lengths = cov_abdab_export["cdr3_seq_aa_q_trim"].str.len()
cov_abdab_export["cdr3_aa_sequence_trim_len"] = cleaned_cdr3_lengths
del cleaned_cdr3_lengths  # keep the notebook namespace unchanged
cov_abdab_export

Unnamed: 0,CDRH3,j_gene,v_gene,VHorVHH,Binds to,Doesn't Bind to,Neutralising Vs,Not Neutralising Vs,Protein + Epitope,Origin,Sources,Status,cdr3_seq_aa_q_trim,cdr3_aa_sequence_trim_len
0,ARDLVVYGLDY,IGHJ4,IGHV3-53,EVQLVESGGGLIQPGGSLRLSCAASGFIVSRNYMNWVRQAPGKGLE...,SARS-CoV2_WT;SARS-CoV2_Alpha;SARS-CoV2_Beta;SA...,,SARS-CoV2_WT;SARS-CoV2_Alpha;SARS-CoV2_Beta;SA...,,S; RBD,B-cells; SARS-CoV2_WT Convalescent Patients,"Kiyomi Shitaoka et al., 2023 (https://www.natu...",Positive,ARDLVVYGLDY,11
1,AREFDLTKIIMVPPY,IGHJ4,IGHV3-11,QVQMVESGGGLVRPGGSLRLSCAASGFTFSDYYMSWIRQAPGKGLE...,SARS-CoV2_WT;SARS-CoV2_Alpha;SARS-CoV2_Delta;S...,,SARS-CoV2_WT;SARS-CoV2_Alpha;SARS-CoV2_Delta;S...,SARS-CoV2_Beta;SARS-CoV2_Omicron-BA1;SARS-CoV2...,S; RBD,B-cells; SARS-CoV2_WT Convalescent Patients,"Kiyomi Shitaoka et al., 2023 (https://www.natu...",Positive,AREFDLTKIIMVPPY,15
2,ARDSSGWHWGVPFDY,IGHJ4,IGHV3-30,QVQLVESGGGVVQPGRSLRLSCASSGFTFSTYHMHWVRQPPGKGLE...,SARS-CoV2_WT;SARS-CoV2_Alpha;SARS-CoV2_Beta;SA...,,SARS-CoV2_WT;SARS-CoV2_Alpha;SARS-CoV2_Beta;SA...,,S; RBD,B-cells; SARS-CoV2_WT Vaccinee (BBIBP-CoV),"Yubin Liu et al., 2023 (https://www.nature.com...",Positive,ARDSSGWHWGVPFDY,15
3,VRGSGGIHDAFDI,IGHJ3,IGHV3-53,EVQLVESGGGLIQPGGSLRLSCAVSGFTVSRMSWVRQAPGKGLECV...,SARS-CoV2_WT;SARS-CoV2_Alpha;SARS-CoV2_Beta;SA...,,SARS-CoV2_WT;SARS-CoV2_Alpha;SARS-CoV2_Beta;SA...,SARS-CoV2_Omicron-BA2.75.2;SARS-CoV2-Omicron-B...,S; RBD,B-cells; SARS-CoV2_WT Vaccinee (BBIBP-CoV),"Yubin Liu et al., 2023 (https://www.nature.com...",Positive,VRGSGGIHDAFDI,13
4,ARVDPRYEGFDY,IGHJ4,IGHV3-53,EVQLVESGGGLIQPGGSLRLSCAASEFIVSRNYMSWVRQAPGKGLE...,SARS-CoV2_WT;SARS-CoV2_Alpha;SARS-CoV2_Beta;SA...,,SARS-CoV2_WT;SARS-CoV2_Alpha;SARS-CoV2_Beta;SA...,SARS-CoV2_Omicron-BA1;SARS-CoV2_Omicron-BA2;SA...,S; RBD,B-cells; SARS-CoV2_WT Vaccinee (BBIBP-CoV),"Yubin Liu et al., 2023 (https://www.nature.com...",Positive,ARVDPRYEGFDY,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7737,ARVQDIVIVPAVYGMDV,IGHJ6,IGHV1-69,QVQLVQSGAEVKKPGSSVKVSCKTSGGTFVNFNSYAISWVRQAPGQ...,SARS-CoV1,SARS-CoV2_WT;MERS-CoV,,,S; non-RBD,B-cells; SARS-CoV1 Human Patient,"Dapeng Li et al., 2021 (https://www.biorxiv.or...",Negative,ARVQDIVIVPAVYGMDV,17
7738,ARMGPYGSGSFDY,IGHJ4,IGHV3-11,EVQLVESGGGVVNPGGSLRLSCAGSGFTFSDYYMGWIRQAPGKGLE...,SARS-CoV1 (weak),229E;HKU1;NL63;OC43;SARS-CoV2_WT,,,S; Unk,B-cells; SARS-CoV1 Human Patient,"Anna Wec et al., 2020 (https://science.science...",Negative,ARMGPYGSGSFDY,13
7739,ARHPSSIFRGTVFTPYYFDY,IGHJ4,IGHV4-59,QVQLQESGPGLVKPSETLSLTCTVSGGSVSDTPFYWGWIRQPPGKG...,SARS-CoV1 (weak),229E;HKU1;NL63;OC43;SARS-CoV2_WT,,,S; Unk,B-cells; SARS-CoV1 Human Patient,"Anna Wec et al., 2020 (https://science.science...",Negative,ARHPSSIFRGTVFTPYYFDY,20
7740,VTQRDNSRDYFPHYFHDMDV,IGHJ6,IGHV3-30,QAQLVESGGALVQPGRSLRLSCAASGFTFRNYAMHWVRQAPATGLQ...,SARS-CoV1,SARS-CoV2_WT,SARS-CoV1,SARS-CoV2_WT,S; RBD,B-cells; SARS-CoV1 Human Patient,"Dora Pinto et al., 2020 (https://www.nature.co...",Negative,VTQRDNSRDYFPHYFHDMDV,20


In [89]:
# Collapse repeated entries: rows sharing the same V gene, J gene, cleaned CDR3
# sequence and Positive/Negative status are reduced to their first occurrence.
cov_abdab_export = cov_abdab_export.drop_duplicates(
    subset=["v_gene", "j_gene", "cdr3_seq_aa_q_trim", "Status"],
    keep="first",
)
cov_abdab_export

Unnamed: 0,CDRH3,j_gene,v_gene,VHorVHH,Binds to,Doesn't Bind to,Neutralising Vs,Not Neutralising Vs,Protein + Epitope,Origin,Sources,Status,cdr3_seq_aa_q_trim,cdr3_aa_sequence_trim_len
0,ARDLVVYGLDY,IGHJ4,IGHV3-53,EVQLVESGGGLIQPGGSLRLSCAASGFIVSRNYMNWVRQAPGKGLE...,SARS-CoV2_WT;SARS-CoV2_Alpha;SARS-CoV2_Beta;SA...,,SARS-CoV2_WT;SARS-CoV2_Alpha;SARS-CoV2_Beta;SA...,,S; RBD,B-cells; SARS-CoV2_WT Convalescent Patients,"Kiyomi Shitaoka et al., 2023 (https://www.natu...",Positive,ARDLVVYGLDY,11
1,AREFDLTKIIMVPPY,IGHJ4,IGHV3-11,QVQMVESGGGLVRPGGSLRLSCAASGFTFSDYYMSWIRQAPGKGLE...,SARS-CoV2_WT;SARS-CoV2_Alpha;SARS-CoV2_Delta;S...,,SARS-CoV2_WT;SARS-CoV2_Alpha;SARS-CoV2_Delta;S...,SARS-CoV2_Beta;SARS-CoV2_Omicron-BA1;SARS-CoV2...,S; RBD,B-cells; SARS-CoV2_WT Convalescent Patients,"Kiyomi Shitaoka et al., 2023 (https://www.natu...",Positive,AREFDLTKIIMVPPY,15
2,ARDSSGWHWGVPFDY,IGHJ4,IGHV3-30,QVQLVESGGGVVQPGRSLRLSCASSGFTFSTYHMHWVRQPPGKGLE...,SARS-CoV2_WT;SARS-CoV2_Alpha;SARS-CoV2_Beta;SA...,,SARS-CoV2_WT;SARS-CoV2_Alpha;SARS-CoV2_Beta;SA...,,S; RBD,B-cells; SARS-CoV2_WT Vaccinee (BBIBP-CoV),"Yubin Liu et al., 2023 (https://www.nature.com...",Positive,ARDSSGWHWGVPFDY,15
3,VRGSGGIHDAFDI,IGHJ3,IGHV3-53,EVQLVESGGGLIQPGGSLRLSCAVSGFTVSRMSWVRQAPGKGLECV...,SARS-CoV2_WT;SARS-CoV2_Alpha;SARS-CoV2_Beta;SA...,,SARS-CoV2_WT;SARS-CoV2_Alpha;SARS-CoV2_Beta;SA...,SARS-CoV2_Omicron-BA2.75.2;SARS-CoV2-Omicron-B...,S; RBD,B-cells; SARS-CoV2_WT Vaccinee (BBIBP-CoV),"Yubin Liu et al., 2023 (https://www.nature.com...",Positive,VRGSGGIHDAFDI,13
4,ARVDPRYEGFDY,IGHJ4,IGHV3-53,EVQLVESGGGLIQPGGSLRLSCAASEFIVSRNYMSWVRQAPGKGLE...,SARS-CoV2_WT;SARS-CoV2_Alpha;SARS-CoV2_Beta;SA...,,SARS-CoV2_WT;SARS-CoV2_Alpha;SARS-CoV2_Beta;SA...,SARS-CoV2_Omicron-BA1;SARS-CoV2_Omicron-BA2;SA...,S; RBD,B-cells; SARS-CoV2_WT Vaccinee (BBIBP-CoV),"Yubin Liu et al., 2023 (https://www.nature.com...",Positive,ARVDPRYEGFDY,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7737,ARVQDIVIVPAVYGMDV,IGHJ6,IGHV1-69,QVQLVQSGAEVKKPGSSVKVSCKTSGGTFVNFNSYAISWVRQAPGQ...,SARS-CoV1,SARS-CoV2_WT;MERS-CoV,,,S; non-RBD,B-cells; SARS-CoV1 Human Patient,"Dapeng Li et al., 2021 (https://www.biorxiv.or...",Negative,ARVQDIVIVPAVYGMDV,17
7738,ARMGPYGSGSFDY,IGHJ4,IGHV3-11,EVQLVESGGGVVNPGGSLRLSCAGSGFTFSDYYMGWIRQAPGKGLE...,SARS-CoV1 (weak),229E;HKU1;NL63;OC43;SARS-CoV2_WT,,,S; Unk,B-cells; SARS-CoV1 Human Patient,"Anna Wec et al., 2020 (https://science.science...",Negative,ARMGPYGSGSFDY,13
7739,ARHPSSIFRGTVFTPYYFDY,IGHJ4,IGHV4-59,QVQLQESGPGLVKPSETLSLTCTVSGGSVSDTPFYWGWIRQPPGKG...,SARS-CoV1 (weak),229E;HKU1;NL63;OC43;SARS-CoV2_WT,,,S; Unk,B-cells; SARS-CoV1 Human Patient,"Anna Wec et al., 2020 (https://science.science...",Negative,ARHPSSIFRGTVFTPYYFDY,20
7740,VTQRDNSRDYFPHYFHDMDV,IGHJ6,IGHV3-30,QAQLVESGGALVQPGRSLRLSCAASGFTFRNYAMHWVRQAPATGLQ...,SARS-CoV1,SARS-CoV2_WT,SARS-CoV1,SARS-CoV2_WT,S; RBD,B-cells; SARS-CoV1 Human Patient,"Dora Pinto et al., 2020 (https://www.nature.co...",Negative,VTQRDNSRDYFPHYFHDMDV,20


In [90]:
# However, there are some identical sequences with both positive and negative status!
# These may be due to sequence differences outside the CDR3 (elsewhere in VHorVHH).
# Count rows per (V gene, J gene, CDR3) group and show the groups whose size is not 1.
(
    cov_abdab_export.groupby(["v_gene", "j_gene", "cdr3_seq_aa_q_trim"])
    .size()
    .loc[lambda group_sizes: group_sizes != 1]
)

v_gene     j_gene  cdr3_seq_aa_q_trim
IGHV1-69   IGHJ4   ARERYSSTWTADFDY       2
IGHV3-11   IGHJ4   ARMGPYGSGSFDY         2
IGHV3-30   IGHJ6   AKSRGGNYYYGMDV        2
IGHV3-33   IGHJ4   ARGSGSGDY             2
IGHV3-53   IGHJ4   ARDHGGLRFDY           2
           IGHJ6   ARWYLVYGDNSGDYGMDV    2
IGHV4-34   IGHJ3   ARGNMIVVAFSAFDI       2
           IGHJ4   ARIEYSYGRRGLDY        2
IGHV4-39   IGHJ4   ATALVVTAAYF           2
           IGHJ5   ARPLGDYGDNVRRSWFDP    2
IGHV4-59   IGHJ3   ARGFDI                2
IGHV7-4-1  IGHJ4   ARDPEDYNFWSGYYVDY     2
dtype: int64

In [91]:
# Drop these conflicting sequences: keep a row only when its
# (V gene, J gene, CDR3) group carries exactly one distinct Status.
# transform("nunique") broadcasts each group's distinct-Status count back onto
# its rows using the built-in reduction, so no Python lambda runs per group.
# NOTE(review): assumes the grouping columns contain no missing values - confirm.
# The shape is printed before and after to show how many rows were removed.
print(cov_abdab_export.shape)
cov_abdab_export = cov_abdab_export[
    cov_abdab_export.groupby(["v_gene", "j_gene", "cdr3_seq_aa_q_trim"])["Status"]
    .transform("nunique")
    .eq(1)
]
print(cov_abdab_export.shape)

(7153, 14)
(7129, 14)


In [92]:
# Sanity check: no (V gene, J gene, CDR3) combination may occur more than once.
assert not cov_abdab_export.duplicated(
    subset=["v_gene", "j_gene", "cdr3_seq_aa_q_trim"], keep=False
).any()

In [93]:
# Count the remaining rows per distinct value of the Sources column.
cov_abdab_export.loc[:, "Sources"].value_counts()

Wooseob Kim et al., 2022 (https://www.nature.com/articles/s41586-022-04527-1)                                                                                                                                                                                                                                                                                                                                                                                        2018
Yunlong Cao et al. 2022 (https://www.nature.com/articles/s41586-022-04980-y#Sec4) and Yunlong Cao et al., 2022 (https://www.biorxiv.org/content/10.1101/2022.09.15.507787v2)                                                                                                                                                                                                                                                                                          758
Alice Cho et al., 2021 (https://www.nature.com/articles/s41586-021-04060-7)         

In [94]:
# Write the filtered table as a tab-separated file, leaving out the "Sources"
# column. `index=False` (a real boolean, as `to_csv` documents) omits the row
# index from the file.
cov_abdab_export.drop(columns=["Sources"]).to_csv(
    config.paths.base_data_dir / "CoV-AbDab_130623.filtered.tsv",
    sep="\t",
    index=False,
)