In [1]:
import os, sys
from pathlib import Path
sys.path.append(str(Path(os.getcwd()).parent) + "/immunogenicity-utilities/")

In [2]:
import pandas as pd
import ImmunogenicityUtilities as IU
import EpletUtilities as EU
import swifter

In [3]:
alleleTypeList_dict = ["A","B","DRB1"]
alleleTypeList = ["A","B", "DR"]
allele_conversion_dict = IU.make_allele_conversion_dict('/home/ddoffei/immunogenicity-utilities/Dictionaries/HLAConversionDataDictionary.csv')
allele_mapping_dict = IU.make_allele_mapping_dict('/home/ddoffei/immunogenicity-utilities/Dictionaries/HLAConversionDataDictionary.csv')
master_dict = IU.make_haplo_frequency_dict("/home/ddoffei/immunogenicity-utilities/Dictionaries/freq_dic/",alleleTypeList_dict)

In [4]:
dataset_deceased=pd.read_csv('cand_tx_ddonor_rec.csv', encoding="latin1", low_memory=False)

In [5]:
dataset_deceased = dataset_deceased[[
 'DON_RACE',
 'DON_A1',
 'DON_A2',
 'DON_B1',
 'DON_B2',
 'DON_DR1',
 'DON_DR2',
 'PERS_ID',
 'CAN_RACE',
 'REC_A1',
 'REC_A2',
 'REC_B1',
 'REC_B2',
 'REC_DR1',
 'REC_DR2']].dropna().reset_index(drop=True)

In [6]:
dataset_deceased.head(5)

Unnamed: 0,DON_RACE,DON_A1,DON_A2,DON_B1,DON_B2,DON_DR1,DON_DR2,PERS_ID,CAN_RACE,REC_A1,REC_A2,REC_B1,REC_B2,REC_DR1,REC_DR2
0,8.0,1.0,11.0,35.0,60.0,1.0,8.0,3759020.0,8.0,11.0,68.0,35.0,51.0,1.0,13.0
1,8.0,1.0,2.0,8.0,98.0,17.0,11.0,3198556.0,8.0,1.0,2.0,8.0,44.0,11.0,17.0
2,8.0,24.0,68.0,60.0,62.0,4.0,13.0,3147238.0,16.0,31.0,32.0,14.0,61.0,4.0,13.0
3,8.0,1.0,2.0,7.0,8.0,17.0,4.0,2599447.0,16.0,203.0,68.0,75.0,39.0,8.0,12.0
4,16.0,30.0,74.0,8.0,63.0,17.0,7.0,3257192.0,16.0,1.0,30.0,49.0,58.0,17.0,15.0


In [7]:
dataset_deceased.isna().sum()

DON_RACE    0
DON_A1      0
DON_A2      0
DON_B1      0
DON_B2      0
DON_DR1     0
DON_DR2     0
PERS_ID     0
CAN_RACE    0
REC_A1      0
REC_A2      0
REC_B1      0
REC_B2      0
REC_DR1     0
REC_DR2     0
dtype: int64

In [8]:
dataset_deceased['DON_RACE'].unique()

array([   8.,   16., 2000.,   64.,   32.,  128.,   40.,   24.,   80.,
         56.,   72.,  192.,  104.,  152.,   96.,   48.,   88.,  144.,
        136.,  200.,  224.,  160.,  248.,  112.,  120., 1024.])

In [9]:
def race_label(race):
    if race == 8:
        return "White"
    elif race == 16:
        return "Black or African American"
    elif race == 32:
        return "American Indian or Alaska Native"
    elif race == 64:
        return "Asian"
    elif race == 128:
        return "Native Hawaiian or Other Pacific Islander"
    elif race == 256:
        return "Arab or Middle Eastern"
    elif race == 512:
        return "Indian Sub-continent"
    elif race == 1024:
        return "Unknown (for Donor Referral only)"
    elif race == 2000:
        return "Hispanic/Latino"
    else:
        return "Multi-Racial"

In [10]:
# Apply race_label function to DON_RACE column
dataset_deceased['DON_RACE'] = dataset_deceased['DON_RACE'].apply(race_label)

In [11]:
# Apply race_label function to CAN_RACE column
dataset_deceased['CAN_RACE'] = dataset_deceased['CAN_RACE'].apply(race_label)

In [12]:
dataset_deceased.head(5)

Unnamed: 0,DON_RACE,DON_A1,DON_A2,DON_B1,DON_B2,DON_DR1,DON_DR2,PERS_ID,CAN_RACE,REC_A1,REC_A2,REC_B1,REC_B2,REC_DR1,REC_DR2
0,White,1.0,11.0,35.0,60.0,1.0,8.0,3759020.0,White,11.0,68.0,35.0,51.0,1.0,13.0
1,White,1.0,2.0,8.0,98.0,17.0,11.0,3198556.0,White,1.0,2.0,8.0,44.0,11.0,17.0
2,White,24.0,68.0,60.0,62.0,4.0,13.0,3147238.0,Black or African American,31.0,32.0,14.0,61.0,4.0,13.0
3,White,1.0,2.0,7.0,8.0,17.0,4.0,2599447.0,Black or African American,203.0,68.0,75.0,39.0,8.0,12.0
4,Black or African American,30.0,74.0,8.0,63.0,17.0,7.0,3257192.0,Black or African American,1.0,30.0,49.0,58.0,17.0,15.0


## HLA conversion to high resolution

In [13]:
#convert all donor HLA to high resolution
#observed the race input is case sensitive, eg if white instead of White is entered conversion does not occur. can we improve this to not be case sensitive in the future?
dataset_deceased[["DON_A1_HR","DON_A2_HR","DON_B1_HR","DON_B2_HR","DON_DR1_HR","DON_DR2_HR"]] = dataset_deceased.swifter.apply(lambda row: IU.get_high_resolution(row[0],row[1:7],alleleTypeList,allele_mapping_dict,allele_conversion_dict,master_dict),axis=1,result_type="expand")

Dask Apply:   0%|          | 0/72 [00:00<?, ?it/s]

In [14]:
dataset_deceased.head()

Unnamed: 0,DON_RACE,DON_A1,DON_A2,DON_B1,DON_B2,DON_DR1,DON_DR2,PERS_ID,CAN_RACE,REC_A1,...,REC_B1,REC_B2,REC_DR1,REC_DR2,DON_A1_HR,DON_A2_HR,DON_B1_HR,DON_B2_HR,DON_DR1_HR,DON_DR2_HR
0,White,1.0,11.0,35.0,60.0,1.0,8.0,3759020.0,White,11.0,...,35.0,51.0,1.0,13.0,A*01:01,A*11:01,B*40:01,B*35:01,DRB1*08:01,DRB1*01:01
1,White,1.0,2.0,8.0,98.0,17.0,11.0,3198556.0,White,1.0,...,8.0,44.0,11.0,17.0,,,,,,
2,White,24.0,68.0,60.0,62.0,4.0,13.0,3147238.0,Black or African American,31.0,...,14.0,61.0,4.0,13.0,A*24:02,A*68:01,B*15:01,B*40:01,DRB1*13:01,DRB1*04:01
3,White,1.0,2.0,7.0,8.0,17.0,4.0,2599447.0,Black or African American,203.0,...,75.0,39.0,8.0,12.0,A*01:01,A*02:01,B*08:01,B*07:02,DRB1*03:01,DRB1*04:01
4,Black or African American,30.0,74.0,8.0,63.0,17.0,7.0,3257192.0,Black or African American,1.0,...,49.0,58.0,17.0,15.0,A*30:02,A*74:01,B*08:01,B*15:16,DRB1*03:01,DRB1*07:01


In [15]:
#convert all receipient HLA to high resolution
dataset_deceased[["REC_A1_HR","REC_A2_HR","REC_B1_HR","REC_B2_HR","REC_DR1_HR","REC_DR2_HR"]] = dataset_deceased.swifter.apply(lambda row: IU.get_high_resolution(row[8],row[9:15],alleleTypeList,allele_mapping_dict,allele_conversion_dict,master_dict),axis=1,result_type="expand")

Dask Apply:   0%|          | 0/72 [00:00<?, ?it/s]

In [17]:
dataset_deceased.head()

Unnamed: 0,DON_RACE,DON_A1,DON_A2,DON_B1,DON_B2,DON_DR1,DON_DR2,PERS_ID,CAN_RACE,REC_A1,...,DON_B1_HR,DON_B2_HR,DON_DR1_HR,DON_DR2_HR,REC_A1_HR,REC_A2_HR,REC_B1_HR,REC_B2_HR,REC_DR1_HR,REC_DR2_HR
0,White,1.0,11.0,35.0,60.0,1.0,8.0,3759020.0,White,11.0,...,B*40:01,B*35:01,DRB1*08:01,DRB1*01:01,A*11:01,A*68:01,B*35:01,B*51:01,DRB1*01:01,DRB1*13:01
1,White,1.0,2.0,8.0,98.0,17.0,11.0,3198556.0,White,1.0,...,,,,,A*01:01,A*02:01,B*08:01,B*44:02,DRB1*03:01,DRB1*11:01
2,White,24.0,68.0,60.0,62.0,4.0,13.0,3147238.0,Black or African American,31.0,...,B*15:01,B*40:01,DRB1*13:01,DRB1*04:01,A*31:01,A*32:01,B*14:01,B*40:02,DRB1*04:05,DRB1*13:01
3,White,1.0,2.0,7.0,8.0,17.0,4.0,2599447.0,Black or African American,203.0,...,B*08:01,B*07:02,DRB1*03:01,DRB1*04:01,A*02:03,A*68:02,B*15:02,B*39:10,DRB1*12:02,DRB1*08:04
4,Black or African American,30.0,74.0,8.0,63.0,17.0,7.0,3257192.0,Black or African American,1.0,...,B*08:01,B*15:16,DRB1*03:01,DRB1*07:01,A*01:02,A*30:02,B*49:01,B*58:01,DRB1*15:03,DRB1*03:01


In [18]:
dataset_deceased.isna().sum()

DON_RACE          0
DON_A1            0
DON_A2            0
DON_B1            0
DON_B2            0
DON_DR1           0
DON_DR2           0
PERS_ID           0
CAN_RACE          0
REC_A1            0
REC_A2            0
REC_B1            0
REC_B2            0
REC_DR1           0
REC_DR2           0
DON_A1_HR     50667
DON_A2_HR     50667
DON_B1_HR     50667
DON_B2_HR     50667
DON_DR1_HR    50667
DON_DR2_HR    50667
REC_A1_HR     50948
REC_A2_HR     50948
REC_B1_HR     50948
REC_B2_HR     50948
REC_DR1_HR    50948
REC_DR2_HR    50948
dtype: int64

In [19]:
dataset_deceased = dataset_deceased.dropna(axis=0, subset=['DON_A1_HR'])
dataset_deceased = dataset_deceased.dropna(axis=0, subset=['REC_A1_HR'])

### immunogenicity calculation

In [20]:
aa_df = pd.read_csv("/home/ddoffei/immunogenicity-utilities/Dictionaries/AA_Code.csv")
aa_dict = IU.make_amino_acid_dict(aa_df)
classIseqdf = pd.read_csv("/home/ddoffei/immunogenicity-utilities/Dictionaries/Aligned_Seq.csv")
classIseq = IU.make_aligned_seq_dict(classIseqdf,aa_dict)
classIIseq = IU.make_allele_sequence_dict(['DPA1', 'DPB1', 'DQA1', 'DRB1345', 'DQB1'],"/home/ddoffei/immunogenicity-utilities/Dictionaries/Class_Two_Seq/",aa_dict)

In [21]:
dataset_deceased.iloc[0,15:19]

DON_A1_HR    A*01:01
DON_A2_HR    A*11:01
DON_B1_HR    B*40:01
DON_B2_HR    B*35:01
Name: 0, dtype: object

In [22]:
dataset_deceased.iloc[0,21:25]

REC_A1_HR    A*11:01
REC_A2_HR    A*68:01
REC_B1_HR    B*35:01
REC_B2_HR    B*51:01
Name: 0, dtype: object

In [23]:
dataset_deceased[["AB_HMS","AB_EMS","AB_AMS"]] = pd.DataFrame(list(dataset_deceased.swifter.apply(lambda row: IU.typeICalcs(row[15:19],row[21:25],classIseq),axis=1)))

Dask Apply:   0%|          | 0/72 [00:00<?, ?it/s]

In [24]:
dataset_deceased.iloc[0,19:21]

DON_DR1_HR    DRB1*08:01
DON_DR2_HR    DRB1*01:01
Name: 0, dtype: object

In [25]:
dataset_deceased.iloc[0,25:27]

REC_DR1_HR    DRB1*01:01
REC_DR2_HR    DRB1*13:01
Name: 0, dtype: object

In [26]:
dataset_deceased[['DR_HMS', 'DR_EMS', 'DR_AMS', 'DR_alphaHMS', 'DR_alphaEMS', 'DR_alphaAMS', 'DR_betaHMS',
       'DR_betaEMS', 'DR_betaAMS']] = pd.DataFrame(list(dataset_deceased.swifter.apply(lambda row: IU.typeIICalcs(row[19:21],row[25:27],classIIseq),axis=1)))

Dask Apply:   0%|          | 0/72 [00:00<?, ?it/s]

In [27]:
dataset_deceased.head()

Unnamed: 0,DON_RACE,DON_A1,DON_A2,DON_B1,DON_B2,DON_DR1,DON_DR2,PERS_ID,CAN_RACE,REC_A1,...,AB_AMS,DR_HMS,DR_EMS,DR_AMS,DR_alphaHMS,DR_alphaEMS,DR_alphaAMS,DR_betaHMS,DR_betaEMS,DR_betaAMS
0,White,1.0,11.0,35.0,60.0,1.0,8.0,3759020.0,White,11.0,...,5.75,5.1,2.805,3.5,0.0,0.0,0.0,5.1,2.805,3.5
2,White,24.0,68.0,60.0,62.0,4.0,13.0,3147238.0,Black or African American,31.0,...,10.0,15.9,19.025,13.0,0.0,0.0,0.0,15.9,19.025,13.0
3,White,1.0,2.0,7.0,8.0,17.0,4.0,2599447.0,Black or African American,203.0,...,6.25,13.1,16.9,9.5,0.0,0.0,0.0,13.1,16.9,9.5
4,Black or African American,30.0,74.0,8.0,63.0,17.0,7.0,3257192.0,Black or African American,1.0,...,5.0,12.9,15.045,8.5,0.0,0.0,0.0,12.9,15.045,8.5
5,White,2.0,11.0,51.0,55.0,4.0,11.0,2619730.0,White,3.0,...,5.25,8.1,5.72,7.0,0.0,0.0,0.0,8.1,5.72,7.0


In [28]:
dataset_deceased['Avg_EMS'] = dataset_deceased[['AB_EMS', 'DR_EMS']].mean(axis=1)
dataset_deceased['Avg_HMS'] = dataset_deceased[['AB_HMS', 'DR_HMS']].mean(axis=1)
dataset_deceased['Avg_AMS'] = dataset_deceased[['AB_AMS', 'DR_AMS']].mean(axis=1)

In [29]:
dataset_deceased.head()

Unnamed: 0,DON_RACE,DON_A1,DON_A2,DON_B1,DON_B2,DON_DR1,DON_DR2,PERS_ID,CAN_RACE,REC_A1,...,DR_AMS,DR_alphaHMS,DR_alphaEMS,DR_alphaAMS,DR_betaHMS,DR_betaEMS,DR_betaAMS,Avg_EMS,Avg_HMS,Avg_AMS
0,White,1.0,11.0,35.0,60.0,1.0,8.0,3759020.0,White,11.0,...,3.5,0.0,0.0,0.0,5.1,2.805,3.5,4.64375,6.05,4.625
2,White,24.0,68.0,60.0,62.0,4.0,13.0,3147238.0,Black or African American,31.0,...,13.0,0.0,0.0,0.0,15.9,19.025,13.0,18.835,17.7,11.5
3,White,1.0,2.0,7.0,8.0,17.0,4.0,2599447.0,Black or African American,203.0,...,9.5,0.0,0.0,0.0,13.1,16.9,9.5,11.52125,10.175,7.875
4,Black or African American,30.0,74.0,8.0,63.0,17.0,7.0,3257192.0,Black or African American,1.0,...,8.5,0.0,0.0,0.0,12.9,15.045,8.5,10.80625,10.0875,6.75
5,White,2.0,11.0,51.0,55.0,4.0,11.0,2619730.0,White,3.0,...,7.0,0.0,0.0,0.0,8.1,5.72,7.0,6.6075,8.825,6.125


In [30]:
dataset_deceased.isna().sum()

DON_RACE           0
DON_A1             0
DON_A2             0
DON_B1             0
DON_B2             0
DON_DR1            0
DON_DR2            0
PERS_ID            0
CAN_RACE           0
REC_A1             0
REC_A2             0
REC_B1             0
REC_B2             0
REC_DR1            0
REC_DR2            0
DON_A1_HR          0
DON_A2_HR          0
DON_B1_HR          0
DON_B2_HR          0
DON_DR1_HR         0
DON_DR2_HR         0
REC_A1_HR          0
REC_A2_HR          0
REC_B1_HR          0
REC_B2_HR          0
REC_DR1_HR         0
REC_DR2_HR         0
AB_HMS         51653
AB_EMS         51653
AB_AMS         51653
DR_HMS         51653
DR_EMS         51653
DR_AMS         51653
DR_alphaHMS    51653
DR_alphaEMS    51653
DR_alphaAMS    51653
DR_betaHMS     51653
DR_betaEMS     51653
DR_betaAMS     51653
Avg_EMS        51653
Avg_HMS        51653
Avg_AMS        51653
dtype: int64

### Eplet Load Calculations

In [31]:
melt_Df = pd.read_csv("/home/ddoffei/immunogenicity-utilities/Dictionaries/Eplet_Allele_ALL_Drop_NA_Melt.csv")
eplet_dict, eplet_type_dict = EU.build_dictionaries(melt_Df)

In [32]:
dataset_deceased[["DON_EPLETS","REC_EPLETS"]] = pd.DataFrame(list(dataset_deceased.swifter.apply(lambda row: EU.compare_eplets(row[['DON_A1_HR', 'DON_A2_HR', 'DON_B1_HR', 'DON_B2_HR', 'DON_DR1_HR',
       'DON_DR2_HR']],row[['REC_A1_HR',
       'REC_A2_HR', 'REC_B1_HR', 'REC_B2_HR', 'REC_DR1_HR', 'REC_DR2_HR']],eplet_dict)  ,axis=1)))

Dask Apply:   0%|          | 0/72 [00:00<?, ?it/s]

In [33]:
dataset_deceased.head()

Unnamed: 0,DON_RACE,DON_A1,DON_A2,DON_B1,DON_B2,DON_DR1,DON_DR2,PERS_ID,CAN_RACE,REC_A1,...,DR_alphaEMS,DR_alphaAMS,DR_betaHMS,DR_betaEMS,DR_betaAMS,Avg_EMS,Avg_HMS,Avg_AMS,DON_EPLETS,REC_EPLETS
0,White,1.0,11.0,35.0,60.0,1.0,8.0,3759020.0,White,11.0,...,0.0,0.0,5.1,2.805,3.5,4.64375,6.05,4.625,"{152V, 31F, 189S, 151H, 193PI, 85V, 144K, 180V...","{152V, 163T, 31F, 151H, 150AH, 105S, 85V, 144K..."
2,White,24.0,68.0,60.0,62.0,4.0,13.0,3147238.0,Black or African American,31.0,...,0.0,0.0,15.9,19.025,13.0,18.835,17.7,11.5,"{31F, 193PI, 144K, 77S, 71K, 62RN, 60Y, 181T, ...","{31F, 193PI, 144K, 77S, 62RN, 60Y, 156WA, 77T,..."
3,White,1.0,2.0,7.0,8.0,17.0,4.0,2599447.0,Black or African American,203.0,...,0.0,0.0,13.1,16.9,9.5,11.52125,10.175,7.875,"{152V, 163T, 69AA, 31F, 193PI, 45EE, 66I, 180V...","{142M, 152V, 69AA, 31F, 151H, 193PI, 70Q, 85V,..."
4,Black or African American,30.0,74.0,8.0,63.0,17.0,7.0,3257192.0,Black or African American,1.0,...,0.0,0.0,12.9,15.045,8.5,10.80625,10.0875,6.75,"{31F, 193PI, 144K, 77S, 62RN, 60Y, 181T, 16H, ...","{31F, 193PI, 70R, 144K, 62RN, 60Y, 181T, 16H, ..."
5,White,2.0,11.0,51.0,55.0,4.0,11.0,2619730.0,White,3.0,...,0.0,0.0,8.1,5.72,7.0,6.6075,8.825,6.125,"{31F, 193PI, 144K, 77S, 71K, 62RN, 60Y, 181T, ...","{31F, 193PI, 71K, 62RN, 60Y, 181T, 16H, rp67LK..."


In [35]:
dataset_deceased = dataset_deceased.dropna(axis=0, subset=['AB_HMS'])

In [36]:
dataset_deceased[["AbvMismatch" , "OthMismatch" , "Abv_Desc", "Oth_Desc"]] = dataset_deceased.swifter.apply(lambda row:EU.get_Eplet_Type(row["DON_EPLETS"],row["REC_EPLETS"],eplet_type_dict) ,axis=1,result_type="expand")

Pandas Apply:   0%|          | 0/81004 [00:00<?, ?it/s]

In [37]:
dataset_deceased.head()

Unnamed: 0,DON_RACE,DON_A1,DON_A2,DON_B1,DON_B2,DON_DR1,DON_DR2,PERS_ID,CAN_RACE,REC_A1,...,DR_betaAMS,Avg_EMS,Avg_HMS,Avg_AMS,DON_EPLETS,REC_EPLETS,AbvMismatch,OthMismatch,Abv_Desc,Oth_Desc
0,White,1.0,11.0,35.0,60.0,1.0,8.0,3759020.0,White,11.0,...,3.5,4.64375,6.05,4.625,"{152V, 31F, 189S, 151H, 193PI, 85V, 144K, 180V...","{152V, 163T, 31F, 151H, 150AH, 105S, 85V, 144K...",19,19,"[138MI, 180E, 76ESN, 144KR, 45KE, 44KM, 163EW,...","[189S, 193PI, 77S, 77NGT, 67F, 156R, 32Y, 74L,..."
2,White,24.0,68.0,60.0,62.0,4.0,13.0,3147238.0,Black or African American,31.0,...,13.0,18.835,17.7,11.5,"{31F, 193PI, 144K, 77S, 71K, 62RN, 60Y, 181T, ...","{31F, 193PI, 144K, 77S, 62RN, 60Y, 156WA, 77T,...",21,19,"[144KR, 74R, 163EW, 70QT, 65QIA, 11STS, 104A, ...","[71K, 181T, 16H, rp67LK, 37N, 152A, 270L, rq26..."
3,White,1.0,2.0,7.0,8.0,17.0,4.0,2599447.0,Black or African American,203.0,...,9.5,11.52125,10.175,7.875,"{152V, 163T, 69AA, 31F, 193PI, 45EE, 66I, 180V...","{142M, 152V, 69AA, 31F, 151H, 193PI, 70Q, 85V,...",16,19,"[180E, 25Q, 98E, 76ESN, 57V, 71TTS, 62RR, 104A...","[163T, 45EE, 77S, 62RN, 184A, 77D, 44RM, rp37F..."
4,Black or African American,30.0,74.0,8.0,63.0,17.0,7.0,3257192.0,Black or African American,1.0,...,8.5,10.80625,10.0875,6.75,"{31F, 193PI, 144K, 77S, 62RN, 60Y, 181T, 16H, ...","{31F, 193PI, 70R, 144K, 62RN, 60Y, 181T, 16H, ...",17,11,"[163RW, 65QIA, 62G, 107W, 104A, 47F, 142TKH, 1...","[77S, 152A, 74A, 12M, 156QA, rp58E, 66K, rq74A..."
5,White,2.0,11.0,51.0,55.0,4.0,11.0,2619730.0,White,3.0,...,7.0,6.6075,8.825,6.125,"{31F, 193PI, 144K, 77S, 71K, 62RN, 60Y, 181T, ...","{31F, 193PI, 71K, 62RN, 60Y, 181T, 16H, rp67LK...",11,19,"[144K, 37L, 16Y, 57S, 57V, 142TKH, 144KHA, 76E...","[77S, 156WA, rq57V, 30H, 149AH, 151H, 45EE, 71..."


### Adding data points according to specifications

### basic survival analysis using scikit-survival and lifelines libraries