In [29]:
from Bio import GenBank
from pathlib import Path
from tqdm.notebook import tqdm
import pandas as pd

## Loading dataset

In [30]:
# Directory whose *.csv files are read by the loading cell.
DATA_PATH = '/data/hiv'
# Accumulator for the combined frame; stays None until the first file is loaded.
data = None

In [31]:
# Read every CSV under DATA_PATH and stack them into one frame.
# Sorting makes the row order reproducible; glob order depends on the filesystem.
filenames = sorted(Path(DATA_PATH).glob("*.csv"))
if not filenames:
    raise FileNotFoundError(f"No CSV files found in {DATA_PATH!r}")

# Collect the frames first and concatenate once: DataFrame.append was removed
# in pandas 2.0 and re-copied all accumulated rows on every iteration.
# Rebuilding the result from scratch also keeps this cell idempotent, so
# re-running it no longer stacks the same files a second time.
frames = [pd.read_csv(filename) for filename in tqdm(filenames, 'Loading files')]
data = pd.concat(frames)  # keeps each file's own row labels, as append did
data.shape

Loading files:   0%|          | 0/88 [00:00<?, ?it/s]

(983029, 23)

In [32]:
# Use the accession identifiers as row labels and drop the 'Unnamed: 0'
# column; 'accession' itself moves from the columns into the index.
data = data.set_index('accession').drop(columns=['Unnamed: 0'])

In [33]:
# Preview the combined frame (pandas renders a truncated view of rows and columns).
data

Unnamed: 0_level_0,length,sequence,pol_pro,pol_loc,env_pro,env_loc,gag_pro,gag_loc,vpr_pro,vpr_loc,...,vif_loc,tat_pro,tat_loc,rev_pro,rev_loc,vpu_pro,vpu_loc,nef_pro,nef_loc,country
accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MZ736147,851,AATGTAACAGAAAATTTTAACATGTGGAAAAATAATATGGCAGAGC...,,,NVTENFNMWKNNMAEQMQEDVISLWDQSLKPCVKLTPLCVVLNCTN...,<1..>851,,,,,...,,,,,,,,,,China
MZ736148,849,AATGTAACAGAAAATTTTAACATGTGGAAAAATAATATGGCAGAGC...,,,NVTENFNMWKNNMAEQMQEDVISLWDQSLKPCVKLTPLCVVLNCTN...,<1..>849,,,,,...,,,,,,,,,,China
MZ736149,851,AATGTAACAGAAAATTTTAACATGTGGAAAAATAATATGGCAGAGC...,,,NVTENFNMWKNNMAEQMQEDVISLWDQSLKPCVKLTPLCVVLNCTN...,<1..>851,,,,,...,,,,,,,,,,China
MZ736150,851,AATGTAACAGAAAATTTTAACATGTGGAAAAATAATATGGCAGAGC...,,,NVTENFNMWKNNMAEQMEEDVISLWDQSLKPCVKLTPLCVVLNCTN...,<1..>851,,,,,...,,,,,,,,,,China
MZ736151,849,AATGTAACAGAAAATTTTAACATGTGGAAAAATAATATGGCAGAGC...,,,NVTENFNMWKNNMAEQMQEDVISLWDQSLKPCVKLTPLCVVLNCTN...,<1..>849,,,,,...,,,,,,,,,,China
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MW145174,376,GTCTGCTACACCCTATGTGCCAGCACGGAATGGAGGATGAACACAG...,,,,,,,,,...,,,,,,,,,,
MW145175,393,ACTGTCTGCTACACCCTGTGTGCCAGCATGGAATGGGTGATCAGCA...,,,,,,,,,...,,,,,,,,,,
MW145176,313,TTGTTACACCCTGTCTGCCAGCATGGAATGGATGATGATCACAGAG...,,,,,,,,,...,,,,,,,,,,
MW145177,417,ATTACTGGAGCTACTGATGAGAGACACAGCCTGTTACACCCGATAT...,,,,,,,,,...,,,,,,,,,,


In [34]:
# Checkpoint the combined frame to HDF5 (mode='w' overwrites any existing file).
# The object-dtype columns cannot be mapped to C types, so PyTables pickles
# them; that is the source of the performance warning in the cell output.
data.to_hdf('out/data01.h5', key='data', mode='w')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->Index(['sequence', 'pol_pro', 'pol_loc', 'env_pro', 'env_loc', 'gag_pro',
       'gag_loc', 'vpr_pro', 'vpr_loc', 'vif_pro', 'vif_loc', 'tat_pro',
       'tat_loc', 'rev_pro', 'rev_loc', 'vpu_pro', 'vpu_loc', 'nef_pro',
       'nef_loc', 'country'],
      dtype='object')]

  pytables.to_hdf(


## Cutting gene regions out of the sequences

In [115]:
# Reload the checkpoint stored in 'out/data01.h5'.
data=pd.read_hdf('out/data01.h5', key='data')
# Column names as a Series so they can be filtered with the .str accessor.
columns = pd.Series(data.columns)

In [150]:
# Reduce every *_loc column to a bare comma-separated list of positions by
# removing the location decorations and turning the '..' range separator
# into a comma. Missing locations are left untouched.
loc_columns = columns[columns.str.contains('_loc$')]
for loc_col in tqdm(loc_columns):
    present = data[loc_col].notna()
    cleaned = (
        data.loc[present, loc_col]
        .str.replace('[<>]', '', regex=True)
        .str.replace('order(', '', regex=False)
        .str.replace('join(', '', regex=False)
        .str.replace('complement(', '', regex=False)
        .str.replace(')', '', regex=False)
        .str.replace('..', ',', regex=False)
    )
    data.loc[present, loc_col] = cleaned

  0%|          | 0/9 [00:00<?, ?it/s]

In [157]:
def _cut_region(sequence, location):
    """Return the part of ``sequence`` described by a cleaned location string.

    ``location`` is a comma-separated list of 1-based positions, as left by
    the location clean-up cell (e.g. ``'1,851'``). The first number is the
    start and the last number is the end; a location with a single number
    runs to the end of the sequence.

    NOTE(review): a multi-segment location is returned as one span from the
    first start to the last end; the segments are not concatenated and nothing
    is reverse-complemented. Confirm this is what downstream analysis expects.

    Raises ValueError if any position is not an integer.
    """
    positions = [int(part) for part in location.split(',')]
    start = positions[0]
    end = positions[-1] if len(positions) > 1 else len(sequence)
    # Positions are 1-based and inclusive while Python slices are 0-based and
    # end-exclusive, so only the start is shifted. Shifting the end as well
    # would drop the last base of every region.
    return sequence[start - 1:end]


# Derive one column per gene holding the nucleotide region named by <gene>_loc.
for col in tqdm(columns[columns.str.contains('_loc$')]):
    gene = col[:-len('_loc')]

    # Already derived (e.g. by an earlier run of this cell): keep it as is.
    if gene in data.columns:
        continue

    # Build the whole column in one positional pass instead of per-row .loc
    # lookups and writes; rows without a location get None.
    data[gene] = [
        _cut_region(sequence, location) if pd.notna(location) else None
        for sequence, location in zip(
            data['sequence'], tqdm(data[col], f'Processing {col}')
        )
    ]

  0%|          | 0/9 [00:00<?, ?it/s]

Processing nef_loc: 0it [00:00, ?it/s]

In [158]:
# Show only the rows that have an extracted nef region.
data['nef'].dropna()

accession
DQ007901    ATGGGTGGCAAGTGGTCAAAAAGTAGTATGGTTGGATGGCCTAATG...
DQ007902    ATGGGTGGATGGCCTAGTGTAAGGGAAAGAATGAGACGAGCTGAGC...
DQ007903    ATGGGTGGCAAGTGGTCAAAATGTAGTAAGGTTGGATGGCCTAGTG...
DQ011165    ATGGGGGGCAAATGGTCAAAAAGAAGCATAGTTGGATGGGATAATG...
DQ011166    ATGGGGGGCAAGTGGTCCAAACGCAGCATAGTTGGATGGTCTGAGG...
                                  ...                        
MK169864    ATGGGAGGCAAGTGGTCAAAAGGTAGTATAGGTGGATGGCCTAAGG...
MK169865    ATGGGAGGCAAGTGGTCAAAAGGTAGTATAGTTGGATGGCCTAAGG...
MK169891    ATGGGAGGCAAGTGGTCAAAAAGTAGTATAGTTGGATGGCCTAAGG...
MK169892    ATGGGAGGCAAGTGGTCAAAAGGTAGTATAGTTGGATGGCCTAAGG...
MW145181    ATGGGGGGCAAGTGGTCAAAACGCAGCAGAATTGAATGGCCTGCTA...
Name: nef, Length: 69525, dtype: object

In [154]:
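# Reset helper: the cutting cell skips any gene that already has a column,
# so uncomment this to drop 'nef' before re-running that cell.
#data = data.drop(columns=['nef'])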

In [159]:
# List the columns to check that a column was added for each gene.
data.columns

Index(['length', 'sequence', 'pol_pro', 'pol_loc', 'env_pro', 'env_loc',
       'gag_pro', 'gag_loc', 'vpr_pro', 'vpr_loc', 'vif_pro', 'vif_loc',
       'tat_pro', 'tat_loc', 'rev_pro', 'rev_loc', 'vpu_pro', 'vpu_loc',
       'nef_pro', 'nef_loc', 'country', 'pol', 'env', 'gag', 'vpr', 'vif',
       'tat', 'rev', 'vpu', 'nef'],
      dtype='object')

In [160]:
# Export the frame, including the extracted gene columns; the row index is
# written as the first CSV column (to_csv default).
data.to_csv('out/data02.csv')