### Import needed libraries. Load needed objects.

In [6]:
# Ad hoc library
from sbhandler import * 
def get_cov(df): return int((len(df.columns)-4)/2)

# Data Analysis
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Machine Learning Framework
import torch
import torch.nn as nn
import torch.nn.functional as F

# Store and Load Objects
from pickle import load, dump  

In [7]:
# Assembly Matrices (pd.DataFrame objects)
# RFP_LIST = load( open('pickle/v1rfp.pickle', 'rb'))
BFP_LIST = load( open('pickle/v1bfp.pickle', 'rb'))

# Features to generate for Extended Logistic Regression Model
- Type (2):      &emsp;     &emsp;       &emsp; &emsp; &emsp; &emsp;  isSNP, isInsertion, (isDeletion ~ implied by other two)
- Length (1):    &emsp;  &emsp;  &emsp; &emsp;      &emsp;   &nbsp;        isSingle, (isMulti)
- Near another Indel (1):   &emsp;  isNear, (isFar)
- Neighbors (1):        &emsp;   &emsp; &ensp; &nbsp;   &emsp;   isConcord, (isDiscord)
- Observed Error Rate [5]:  &nbsp;    Map Phred Q Score to SNP/Indel (1-epsilon) for the Nucleotide *-OR-* (epsilon * conditional error)
    - Multiply instances
- Count [5]: &emsp; &emsp; &emsp; &emsp; &emsp; &ensp; Explicit read count of each nuc. 

### Add Column: Variation Type 

In [20]:
def get_cov(df): return int((len(df.columns)-4)/2)

def pd_reads_truncater(df, list_reads):
    # Removes meaningless indels that only arise from start, end alignment
    ret_list = []

    for read in list_reads:
        nan_indel_mask = read[read != 45]
        first_valid = nan_indel_mask.first_valid_index()
        last_valid = nan_indel_mask.last_valid_index()
        ret_list.append( (first_valid, last_valid) )
        # Replace all 45's before and after the idx tuple with 1
        if first_valid is not None and first_valid!= df.index[0]:
            read.loc[:first_valid] = 1 # start and excluding
        if last_valid is not None and last_valid != df.index[-1]:
            read.loc[last_valid+1:] = 1 # past and end
            
    return ret_list

def add_variation_type(df, cov):
    """Adds Variation Type: Assumes `is_indel` column exists

    Filters (conditional) by at least a single 45 instance in a row.
    
    If: ground truth is a 45 (-) -> It's an insertion.
    Else: It's a deletion.
    Args:
        df (DataFrame): RFP or BFP
        cov (int): coverage; no. of reads
        Return: adds `is_snip` and `is_ins` boolean column
    """
    df['is_ins'] = np.where( df[['target', 'is_indel']].eq([45, True], axis=1).all(axis=1), True, False)

    def check_row_variation(df, cov):
        left = df.iloc[:, 1:cov+1]
        right = df.iloc[:, 0]
        left, right = left.align(right, axis=1, copy=False)
        return np.where( (left != right).any(axis=1)), True, False)
    
    df['is_snp'] = np.where( df['is_indel'] == False, check_row_variation(df, cov), False)
    # Double conditional
    # First... check if its an indel -> immediately goes to False
    # Then... check if the target and at least one read disagree ->


In [32]:
test = BFP_LIST[0]
left = test.iloc[:, 1:4]
right = test.iloc[:, 0]
left, right = left.align(right, axis=1, copy=False)
np.where( (left!= right).any(axis=1), True, False)

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

### Add Length Type

In [21]:
def isSingleSNP(df, idx):
    pass

def isSingleDel(df, cov, idx): # Checks if read has only one indel.
    # Assumes it's a del row.
    def check_del_left(df, idx, cov):
        if idx == 0: return False
        return (df.iloc[idx-1, 1:cov+1] == 45).any()
    def check_del_right(df, idx, cov):
        if idx == len(df) - 1: return False
        return ((df.iloc[idx+1, 1:cov+1] == 45).any())

    try:
        df.iloc[idx, -1] = True if ( check_del_right(df, idx, cov) and check_del_left(df, idx, cov)) else False
    except:
        print("is single index uh oh ")
    # isMulti if a single 45 is present left or right.
    
def isSingleIns(df, idx): # Checks if target has only one indel.
    def check_right(df, idx):
        if idx == len(df) - 1: return False
        return (df.iloc[idx+1, 0] == 45)
    def check_left(df, idx):
        if idx == 0: return False
        return (df.iloc[idx-1, 0] == 45)
    # print(idx)
    df.iloc[idx, -1] = True if (check_right(df, idx) and check_left(df, idx)) else False

In [22]:
def add_variation_length(df, cov):
    df['is_single'] = pd.Series([True for _ in range(len(df))])

    dels = df.iloc[np.where(( df[['is_indel', 'is_ins']] == [True, False] ).all(axis=1) == True)[0], :]
    to_check = dels.index.to_list()
    if not to_check == []:
        print(to_check)
        for i in to_check:
            isSingleDel( df, cov, i )

    ins = df.iloc[np.where((df[['is_indel', 'is_ins']] == [True, True]).all(axis=1) == True)[0], :]
    to_check = ins.index.to_list()
    if not to_check:
        for i in to_check:
            isSingleIns(df, i)

# Add Engineered Features to RFP and BFP pd.DataFrame objects

In [23]:
def add_features(df, cov, indel_parse=False, rfp=False):
    if rfp:
        print('RFP-specific parsing currently inactivated.')
    # if rfp:
    #     focus = [i for i in range(1, cov+1)]
    #     focus.append(2*cov+2) # contig?
    #     nan_mask = df.iloc[:, focus][df != 45]
    
    if indel_parse:
        nan_mask = df.iloc[:, 0:cov+1][df!=45]
        df['is_indel'] = np.where(nan_mask.count(axis = 1) == cov+1, False, True) #cov + 1 because targ/contig included
        
    # isSNP, isIns, isDel
    add_variation_type(df, cov)
    
    # isSingle, isMulti
    add_variation_length(df, cov)


In [24]:
def get_progress(n, each=False):
    if each:
        print(n, end="\r")
    else:
        if (n % 3551 == 0): print(f'{n / 355104:.0%}', end='\r') 

In [25]:
for n, bfp in enumerate(BFP_LIST):
    if bfp.empty: continue
    get_progress(n, each=True)
    add_features(bfp, get_cov(bfp))

0

TypeError: unhashable type: 'Series'

In [7]:
# Function to facilitate feature-type filtering -> will be used to grab specific rows for training
def pd_filter(df, columns, conditions, all=True):
    columns_to_filter_by = df[columns]
    if all:
        return df.iloc[np.where( (columns_to_filter_by == conditions).all(axis=1))[0]]
    else:
        return df.iloc[np.where( (columns_to_filter_by == conditions).any(axis=1))[0]]

# Data Preprocessing

### Dataset Parameters

In [15]:
TOTAL_ASSEMBLY_MATRICES = 355104
TOTAL_SAMPLES = TOTAL_ASSEMBLY_MATRICES * 50

PERCENT_RED = 0.4
PERCENT_BLUE = 0.6

PERCENT_Z = 0.3
PERCENT_Y = 0.5
PERCENT_REGULAR = 0.2

# To change as more features are added
SAMPLE_SHAPE = (14)
LABEL_SHAPE = (5)

COUNT_MAPPING = {65 : 0, 67 : 0, 71 : 2, 84: 3, 45 : 4,}

PROB_MAPPING = dict(zip([65, 67, 71, 84, 45], [5, 6, 7, 8, 9]))

CATEGORY_MAPPING = dict(zip(['is_snp', 'is_ins', 'is_single', 'is_rfp'], [0, 1, 2, 3]))

In [8]:
# Query by Quality Score... then query by ground truth and read.
CEM_INDELS = load( open('pickle/cem_indels_tensor.pickle', 'rb'))
CEM_SNIPS = load( open('pickle/cem_snips_tensor.pickle', 'rb'))

QADJ_INDELS = 0
QADJ_SNIPS = 0

In [9]:
# PHRED to ERROR RATE
def convert_to_epsilon(nuc, zero_shifted_quality, indel=True):
    database = QADJ_INDELS if indel else QADJ_SNIPS
    return database[nuc][zero_shifted_quality]
    
# PHRED to CONDITIONAL SEQ ERROR
def convert_to_cond_seq_error(nuc, ground_truth, zero_shifted_quality, indel=True):
    database = CEM_INDELS if indel else CEM_SNIPS
    cond_seq_error = database[zero_shifted_quality][PROB_MAPPING[nuc], PROB_MAPPING[ground_truth]]
    return cond_seq_error * convert_to_epsilon(nuc, zero_shifted_quality, indel=indel)

In [17]:
def single_df_run(df, all_samples, all_labels, reads_max, rfp=False, enforce_cov = False, cov_min=4):
    if df.empty: return 
    cov = get_cov(df)
    if enforce_cov and (cov < cov_min): return

    reads_idxs = [i for i in range(1, cov+1)]
    
    if cov > 4: 
        reads_to_use = np.random.sample(reads_idxs, reads_max)
    else:
        reads_to_use = reads_idxs
    # reads = [f'read{n}' for n in reads_to_use]
    
    # Get SNPs only
    snps = pd_filter(df, 'is_snp', True)
    
    # Get Y conflicts only
    ins = pd_filter(df, 'is_ins', True)
    dels = pd_filter(df, ['is_indel', 'is_ins'], [True, False])
    
    # Get regulars?
    def extend_data(s, l):
        all_samples.extend(s)
        all_labels.extend(l)
    
    # Actually update it
    s, l = sample_label_generator(df, snps, cov, reads_to_use, all_samples, all_labels, snps=True)
    extend_data(s, l)
    
    s, l = sample_label_generator(df, ins, cov, reads_to_use, ins=True)
    extend_data(s, l)
    
    s, l = sample_label_generator(df, dels, cov, reads_to_use)
    extend_data(s, l)

def sample_label_generator(df, filtered_data, cov, reads_to_use, rfp=False, snps=False, ins=False):
    samples = []
    labels = []
    
    for i in range(len(filtered_data)):
        sample = torch.empty(SAMPLE_SHAPE)
        label = torch.zeros(LABEL_SHAPE)
        count = np.zeros(5)
        prob = np.ones(5)
        
        if snps:
            category = np.array([1, 0, 0, 1]) if rfp else np.array([1, 0, 0, 0])
        elif ins:
            category = np.array([0, 1, 0, 1]) if rfp else np.array([0, 1, 0, 0])
        else:
            category = np.array([0, 0, 0, 1]) if rfp else np.array([0, 0, 0, 0])
        
        # Row by row...
        ground = df['target'].iloc[i]
        truth = df['contig'].iloc[i] if rfp else ground
            
        for r in reads_to_use:
            nuc = df[f'read{r}'].iloc[i]
            quality = df[f'q{r}'].iloc[i]
            if nuc == ground:
                prob[PROB_MAPPING.get(nuc)] *= (1 - convert_to_epsilon('snp', nuc, quality) )
            else: 
                prob[PROB_MAPPING.get(nuc)] *= convert_to_epsilon('snp', nuc, quality) * convert_to_cond_seq_error('snp', nuc, ground, quality)
            count[PROB_MAPPING.get(nuc)] += 1
        if df['is_single'].iloc[i] is True:
            category[CATEGORY_MAPPING.get('is_single')] = 1
            
        # Update sample, label
        sample[0:5] = count
        sample[5:10] = prob
        sample[10:14] = category
        label[PROB_MAPPING.get(truth)] = 1
        samples.append(sample)
        labels.append(label)
        
    return samples, labels

def get_data(df_list, rfp=False, enforce_cov=False, cov_min=4, reads_max=3, multiple_sizes=False, multiple_sampling=False, random_select=False):
    all_samples = []
    all_labels = []
    
    percent_of_total = PERCENT_BLUE
    if rfp: percent_of_total = PERCENT_RED
    
    size = percent_of_total * TOTAL_ASSEMBLY_MATRICES
    one_percent = 0.01 * size
    
    max_z = PERCENT_Z * TOTAL_SAMPLES
    max_y = PERCENT_Y * TOTAL_SAMPLES
    max_reg = PERCENT_REGULAR * TOTAL_SAMPLES
    
    count_z, count_y, count_reg = 0, 0, 0
    
    if random_select:
        random_idxs = np.random.permutation(TOTAL_ASSEMBLY_MATRICES)
        
        for n, idx in enumerate(random_idxs):
            if (n % one_percent == 0): print(f'{n / size:.0%}', end='\r') 
            df = df_list[idx]
            single_df_run(df, all_samples, all_labels, reads_max)
            
    return all_samples, all_labels

In [19]:
samples, labels = get_data(BFP_LIST, random_select=True)

0%

KeyError: 'is_snp'

In [None]:
def test_pd_update_method(x):
    sample = torch.empty(SAMPLE_SHAPE)
    label = torch.zeros(LABEL_SHAPE)
    count = np.zeros(5)
    prob = np.ones(5)
    category = np.array([1, 0, 0, 1]) if rfp else np.array([1, 0, 0, 0])
    
    # Row by row...
    ground = df['target']
    truth = df['contig']
    for r in reads_to_use:
        nuc = df[f'read{r}']
    quality = df[f'q{r}']
    if nuc == ground:
        prob[PROB_MAPPING.get(nuc)] *= (1 - convert_to_epsilon('snp', nuc, quality) )
    else: 
        prob[PROB_MAPPING.get(nuc)] *= convert_to_epsilon('snp', nuc, quality) * convert_to_cond_seq_error('snp', nuc, ground, quality)
    count[PROB_MAPPING.get(nuc)] += 1
    if df['is_single'] is True:
        category[CATEGORY_MAPPING.get('is_single')] = 1
    return [sample, label]