In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
from Bio import SeqIO
import datasets

In [2]:
# Tab-separated input table (read with sep='\t' further down).
data_path = '../../data/PI_DataSet.tsv'

# Output roots, given relative to the notebook's working directory.
dataset_root, results_root = '../../datasets/', '../../results/'

In [3]:
# Seeded generator (legacy RandomState API) so that random draws are reproducible.
shuffle_stream = np.random.RandomState(1234)

In [4]:
# Read the tab-separated table and append `id`, a string copy of `SeqID`.
df = pd.read_csv(data_path, sep='\t').assign(
    id=lambda frame: frame['SeqID'].map(str)
)
df.head()

Unnamed: 0,SeqID,FPV,ATV,IDV,LPV,NFV,SQV,TPV,DRV,P1,...,P92,P93,P94,P95,P96,P97,P98,P99,CompMutList,id
0,12861,0.4,,0.5,,7.1,0.5,,,-,...,-,-,-,-,-,-,-,-,"D30N, R57G, N88D",12861
1,12862,0.8,,1.2,,24.7,0.9,,,-,...,-,-,-,-,-,-,-,-,"D30N, M46I, R57G, L63P, N88D",12862
2,12863,3.0,,2.8,,2.2,1.0,,,-,...,-,-,-,-,-,-,-,-,"M46I, R57G, L63P, V82T, I84V",12863
3,12864,4.4,,3.9,,3.6,1.7,,,-,...,-,-,-,-,-,-,-,-,"L10R, M46I, R57G, L63P, V82T, I84V",12864
4,12865,3.6,,3.6,,6.2,9.0,,,-,...,-,-,-,-,-,-,-,-,"L10I, R57G, L63P, A71V, I84V, L90M",12865


## Cleaning

First, we need to convert the "difference from reference" format back into a full amino-acid sequence.
In that format a `-` means the position matches the reference, so using the UniProt reference we can fill the missing residues back in.

In [5]:
# Reference protease sequence; its residues are paired one-to-one with the
# position columns below.
# REF: https://hivdb.stanford.edu/pages/documentPage/consensus_amino_acid_sequences.html
pr_seq = 'PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKMIGGIGGFIKVRQYDQILIEICGHKAIGTVLVGPTPVNIIGRNLLTQIGCTLNF'

# Position column names P1 .. P99.
seq_cols = ['P%d' % pos for pos in range(1, 100)]

# Per-column replacement table: '-' becomes that column's reference residue,
# '*' becomes an empty string (so it vanishes from the joined sequence).
# NOTE(review): any other cell value is concatenated unchanged - confirm that
# no further placeholder symbols occur in the data.
rep_dict = {
    column: {'-': ref_residue, '*': ''}
    for column, ref_residue in zip(seq_cols, pr_seq)
}


def make_seq(row):
    """Join one row's position columns, in `seq_cols` order, into one string.

    Missing values contribute an empty string.
    """
    ordered = row.reindex(seq_cols)
    return ''.join(ordered.fillna(''))


seq_ser = df[seq_cols].replace(rep_dict).apply(make_seq, axis=1)
df['sequence'] = seq_ser
df.head()

Unnamed: 0,SeqID,FPV,ATV,IDV,LPV,NFV,SQV,TPV,DRV,P1,...,P93,P94,P95,P96,P97,P98,P99,CompMutList,id,sequence
0,12861,0.4,,0.5,,7.1,0.5,,,-,...,-,-,-,-,-,-,-,"D30N, R57G, N88D",12861,PQITLWQRPLVTIKIGGQLKEALLDTGADNTVLEEMNLPGRWKPKM...
1,12862,0.8,,1.2,,24.7,0.9,,,-,...,-,-,-,-,-,-,-,"D30N, M46I, R57G, L63P, N88D",12862,PQITLWQRPLVTIKIGGQLKEALLDTGADNTVLEEMNLPGRWKPKI...
2,12863,3.0,,2.8,,2.2,1.0,,,-,...,-,-,-,-,-,-,-,"M46I, R57G, L63P, V82T, I84V",12863,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKI...
3,12864,4.4,,3.9,,3.6,1.7,,,-,...,-,-,-,-,-,-,-,"L10R, M46I, R57G, L63P, V82T, I84V",12864,PQITLWQRPRVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKI...
4,12865,3.6,,3.6,,6.2,9.0,,,-,...,-,-,-,-,-,-,-,"L10I, R57G, L63P, A71V, I84V, L90M",12865,PQITLWQRPIVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...


We also need to account for the sparseness of the data: not every sequence has a measurement for every drug.
FPV, IDV, NFV, and SQV are the subset of drugs with the highest mutual coverage.
We'll use only those for downstream predictions.

In [6]:
# Drugs kept as prediction targets.
wanted = ['FPV', 'IDV', 'NFV', 'SQV']

# Discard, in place, every row that lacks a value for any one of those drugs.
df.dropna(axis=0, how='any', subset=wanted, inplace=True)

In [7]:
# Resistance threshold: fold increase over wild type (WT).
cutoff = 4

# Compare against `cutoff` instead of repeating the literal, so this summary
# cannot drift from the dataset-creation cell, which also thresholds on `cutoff`.
resist = df[wanted] > cutoff

# NOTE: despite its name, MULTI is True when at least one of the selected
# drugs exceeds the cutoff (same result as `resist.sum(axis=1) > 0`).
resist['MULTI'] = resist.any(axis=1)

# Number of resistant rows per column.
resist.sum()

FPV      623
IDV      793
NFV      930
SQV      666
MULTI    967
dtype: int64

## Dataset Creation

In [9]:
# BibTeX entry passed to `datasets.DatasetInfo` as the citation.
# TODO: the entry key (`huggingface:dataset`) looks like a template placeholder;
# replace it with a real citation if one exists.
_CITATION = """\
@InProceedings{huggingface:dataset,
title = {HIV Protease Drug Resistance Prediction Dataset},
author={Will Dampier
},
year={2021}
}
"""

# TODO: still empty - add a link to an official homepage for the dataset.
_HOMEPAGE = ""

# TODO: still empty - add the licence for the dataset once it is known.
_LICENSE = ""

# Free-text description passed to `datasets.DatasetInfo`.
# NOTE(review): the text has grammatical slips ("constructed the" -> "constructed
# from the", "was labeled" -> "were labeled"). It is left untouched here because
# it is stored with the dataset; fix it deliberately if the metadata may change.
_DESCRIPTION = """\
This dataset was constructed the Stanford HIV Drug Resistance Database. 
https://hivdb.stanford.edu/pages/genopheno.dataset.html
The sequences were interpolated from the protease high-quality dataset.
Sequences with >4-fold increased resistance relative to wild-type was labeled as True.
"""


In [10]:
# Schema of the exported dataset: the sequence and its id as strings, one
# boolean resistance label per drug, and an integer fold index.
features = datasets.Features({
    'sequence': datasets.Value('string'),
    'id': datasets.Value('string'),
    'FPV': datasets.Value('bool'),
    'IDV': datasets.Value('bool'),
    'NFV': datasets.Value('bool'),
    'SQV': datasets.Value('bool'),
    'fold': datasets.Value('int32'),
})

# Assign every row to one of `n_folds` folds, drawn uniformly at random.
# NOTE: this consumes the shared `shuffle_stream`; re-running the cell without
# re-creating the generator yields a different fold assignment.
n_folds = 5
training_folds = shuffle_stream.randint(0, n_folds, size=len(df))
df['fold'] = training_folds

info = datasets.DatasetInfo(
    description=_DESCRIPTION,
    features=features,
    homepage=_HOMEPAGE,
    license=_LICENSE,
    citation=_CITATION,
)

# Boolean labels: True where the measured value exceeds `cutoff`.
processed_df = df[wanted] > cutoff
processed_df['id'] = df['id']
processed_df['fold'] = df['fold']
processed_df['sequence'] = df['sequence']

# `processed_df` inherits `df`'s index, which is no longer a plain 0..n-1 range
# once rows have been dropped. Depending on the installed `datasets`/pyarrow
# version, such an index can be carried over as an extra `__index_level_0__`
# column that is absent from `features`, so hand over a copy with a fresh
# RangeIndex instead.
dset = datasets.Dataset.from_pandas(
    processed_df.reset_index(drop=True),
    info=info,
    features=features,
)
dset.save_to_disk(dataset_root + 'PR_resist')