In [1]:
import os
import pandas as pd
from tqdm import tqdm

In [2]:
class Seq:
    """A single sequence record built from three raw text fields.

    Attributes:
        name: The name field with any leading '>' characters removed.
        desc: The text found between the first '(' of the description and
            the next ')' (or the next '(' if that comes first).
        seq: The sequence converted to lower case.
    """

    def __init__(self, name, description, sequence):
        """Normalise the raw fields and store them on the instance.

        Raises:
            IndexError: If ``description`` contains no '(' character.
        """
        self.name = name.lstrip('>')
        # Piece between the first and second '(' (or to the end of the
        # string); indexing fails when there is no '(' at all.
        after_open = description.split('(')[1]
        # Keep only what precedes the closing parenthesis, if any.
        inside, _, _ = after_open.partition(')')
        self.desc = inside
        self.seq = sequence.lower()


def read_seq(file):
    """Parse one sequence file into a ``Seq`` record.

    The first line of the file is used as the name, the second line as the
    description, and every remaining line is concatenated (without
    newlines) to form the sequence.

    Args:
        file: Path of the text file to read.

    Returns:
        Seq: The record built from the file's contents.

    Raises:
        IndexError: If the file has fewer than two lines.
    """
    with open(file, 'r') as handle:
        lines = [line.rstrip('\n') for line in handle]
    header = lines[0]
    description = lines[1]
    residues = ''.join(lines[2:])
    return Seq(header, description, residues)


def load_sequences(folder):
    """Read every sequence file in a folder into a DataFrame.

    Entries whose names start with '.' are skipped. Each remaining entry is
    parsed with ``read_seq`` and contributes one row.

    Args:
        folder: Directory containing the sequence files.

    Returns:
        pandas.DataFrame: One row per file with columns ``Name``,
        ``CDSjoin`` and ``Sequence``, in the order given by ``os.listdir``.
    """
    seq_files = [
        os.path.join(folder, entry)
        for entry in os.listdir(folder)
        if not entry.startswith('.')
    ]
    # The progress bar wraps the file reads, which is where the time goes;
    # building the row tuples afterwards is effectively instantaneous.
    records = [read_seq(path) for path in tqdm(seq_files)]
    rows = [(rec.name, rec.desc, rec.seq) for rec in records]
    return pd.DataFrame(rows, columns=['Name', 'CDSjoin', 'Sequence'])

def load_data():
    """Load the training and testing tables from their HDF5 files.

    Returns:
        tuple: ``(training_set, testing_set)``, each the object returned by
        ``pd.read_hdf`` for the corresponding file.
    """
    # Each split must come from its own file: reading testing_set.h5 for
    # both would make the training data identical to the evaluation data.
    # NOTE(review): assumes the training split is stored as training_set.h5
    # next to testing_set.h5 — confirm the file name on disk.
    training_set = pd.read_hdf('../Data_files/features&labels/training_set.h5')
    testing_set = pd.read_hdf('../Data_files/features&labels/testing_set.h5')
    return (training_set, testing_set)

def Summary(tr1, ts1, tr2, ts2):
    """Build a two-row overview of donor / pseudo counts for two datasets.

    Args:
        tr1: Training table of the first dataset; must have an ``IsDonor``
            column whose entries compare equal to 1 (donor) or 0 (pseudo).
        ts1: Testing table of the first dataset (same requirement).
        tr2: Training table of the second dataset (same requirement).
        ts2: Testing table of the second dataset (same requirement).

    Returns:
        pandas.DataFrame: Indexed by ``'Dataset 1'`` / ``'Dataset 2'`` with
        the per-split donor and pseudo counts, the total number of samples,
        the training donor-to-pseudo ratio as text, and a fixed ``Type``
        label (``'Unbalanced'`` for the first row, ``'Balanced'`` for the
        second).
    """

    def _class_counts(frame):
        # Summing boolean masks gives 0 when a class is absent, whereas
        # indexing value_counts() by label raises KeyError in that case.
        labels = frame['IsDonor']
        return (labels == 1).sum(), (labels == 0).sum()

    tr1_donor, tr1_pseudo = _class_counts(tr1)
    tr2_donor, tr2_pseudo = _class_counts(tr2)
    ts1_donor, ts1_pseudo = _class_counts(ts1)
    ts2_donor, ts2_pseudo = _class_counts(ts2)

    summary = pd.DataFrame()
    summary['nDonor (Training)'] = [tr1_donor, tr2_donor]
    summary['nPseudo (Training)'] = [tr1_pseudo, tr2_pseudo]
    summary['nDonor (Testing)'] = [ts1_donor, ts2_donor]
    summary['nPseudo (Testing)'] = [ts1_pseudo, ts2_pseudo]
    summary['nSample'] = [
        ts1.shape[0] + tr1.shape[0],
        ts2.shape[0] + tr2.shape[0],
    ]
    # The ratio column is derived from the training counts only.
    summary['nDonor : nPSeudo'] = (
        summary['nDonor (Training)'].apply(str)
        + ' : '
        + summary['nPseudo (Training)'].apply(str)
    )
    summary['Type'] = ['Unbalanced', 'Balanced']
    summary.index = ['Dataset 1', 'Dataset 2']
    return summary