### Import needed libraries. Load needed objects.

In [1]:
# Ad hoc library
from sbhandler import * 

# Data Analysis
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Machine Learning Framework
import torch
import torch.nn as nn
import torch.nn.functional as F

# Store and Load Objects
from pickle import load, dump  

In [2]:
# Assembly Matrices (pd.DataFrame objects)
RFP_LIST = load( open('pickle/v1rfp.pickle', 'rb'))
# BFP_LIST = load( open('pickle/v1bfp.pickle', 'rb'))

# Features to generate for Extended Logistic Regression Model
- Type (2):      &emsp;     &emsp;       &emsp; &emsp; &emsp; &emsp;  isSNP, isInsertion, (isDeletion ~ implied by other two)
- Length (1):    &emsp;  &emsp;  &emsp; &emsp;      &emsp;   &nbsp;        isSingle, (isMulti)
- Near another Indel (1):   &emsp;  isNear, (isFar)
- Neighbors (1):        &emsp;   &emsp; &ensp; &nbsp;   &emsp;   isConcord, (isDiscord)
- Observed Error Rate [5]:  &nbsp;    Map Phred Q Score to SNP/Indel (1-epsilon) for the Nucleotide *-OR-* (epsilon * conditional error)
    - Multiply instances
- Count [5]: &emsp; &emsp; &emsp; &emsp; &emsp; &ensp; Explicit read count of each nuc. 

### Add Column: Variation Type 

In [45]:
def get_cov(df): return int((len(df.columns)-4)/2)

def pd_reads_truncater(df, list_reads):
    # Removes meaningless indels that only arise from start, end alignment
    ret_list = []

    for read in list_reads:
        nan_indel_mask = read[read != 45]
        first_valid = nan_indel_mask.first_valid_index()
        last_valid = nan_indel_mask.last_valid_index()
        ret_list.append( (first_valid, last_valid) )
        # Replace all 45's before and after the idx tuple with 1
        if first_valid is not None and first_valid!= df.index[0]:
            read.loc[:first_valid] = 1 # start and excluding
        if last_valid is not None and last_valid != df.index[-1]:
            read.loc[last_valid+1:] = 1 # past and end
            
    return ret_list

def add_variation_type(df, cov, indel_parse=False, rfp=False):
    """Underlying Mechanism
    Filters (conditional) by at least a single 45 instance in a row.
    
    If: ground truth is a 45 (-) -> It's an insertion.
    Else: It's a deletion.
    """
    # Assumes: Reads already truncated and starting and ending useless indels replaced with 1
    if rfp:
        print('RFP-specific parsing currently inactivated.')
    # if rfp:
    #     focus = [i for i in range(1, cov+1)]
    #     focus.append(2*cov+2) # contig?
    #     nan_mask = df.iloc[:, focus][df != 45]
    if indel_parse:
        nan_mask = df.iloc[:, 0:cov+1][df!=45]
        df['is_indel'] = np.where(nan_mask.count(axis = 1) == cov+1, False, True) #cov + 1 because targ/contig included
        
    #    
    # Define indel type and if some nucleotide variation from present in row.
    #
    df['is_ins'] = np.where( df[['target', 'is_indel']].eq([45, True], axis=1).all(axis=1), True, False)

    def check_row_variation(df, cov):
        return np.where( (df.iloc[:, 1:cov+1] != df.iloc[:, 0]).any(axis=1), True, False)
    df['is_snp'] = np.where( df['is_indel'] == False, check_row_variation(df, cov), False)
        # Double conditional
        # First... check if its an indel -> immediately goes to False
        # Then... check if the target and at least one read disagree ->
    
    #
    # Check for singluar or multi...
    #
    df['is_single'] = [True for _ in range(len(df))]

    dels = df.iloc[np.where((df[['is_indel', 'is_ins']] == [True, False]).all(axis=1) == True)[0], :]
    to_check = dels.index.to_list()
    for i in to_check:
        isSingleDel(df, cov, i)

    ins = df.iloc[np.where((df[['is_indel', 'is_ins']] == [True, True]).all(axis=1) == True)[0], :]
    to_check = ins.index.to_list()
    for i in to_check:
        isSingleIns(df, i)

In [47]:
def pd_filter(df, columns, conditions, all=True):
    columns_to_filter_by = df[columns]
    if all:
        return df.iloc[np.where( (columns_to_filter_by == conditions).all(axis=1))[0]]
    else:
        return df.iloc[np.where( (columns_to_filter_by == conditions).any(axis=1))[0]]

test = RFP_LIST[0]
add_variation_type(test, get_cov(test))
test

  df['is_snp'] = np.where( df['is_indel'] == False, np.where( (df.iloc[:, 1:cov+1] != df.iloc[:, 0]).any(axis=1), True, False), False)


Unnamed: 0_level_0,target,read1,read2,read3,q1,q2,q3,changes,contig,mutations,is_indel,is_ins,is_single,is_snp
03R-----1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
780,65,65,65,65,126,126,126,32,65,32,False,False,True,True
781,84,84,84,84,126,126,126,32,84,32,False,False,True,True
782,71,71,71,71,126,126,126,32,71,32,False,False,True,True
783,71,71,71,71,126,126,126,32,71,32,False,False,True,True
784,84,84,84,84,126,126,126,32,84,32,False,False,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1477,65,65,65,65,126,126,126,32,65,32,False,False,True,True
1478,65,65,65,65,126,126,126,32,65,32,False,False,True,True
1479,84,84,84,84,126,126,126,32,84,32,False,False,True,True
1480,65,65,65,65,126,126,126,32,65,32,False,False,True,True


### Add Length Type

In [44]:
def isSingleSNP(df, idx):
    pass

def isSingleDel(df, cov, idx): # Checks if read has only one indel.
    # Assumes it's a del row.
    df['is_single'].iloc[idx] = np.where((df[1:cov+1].iloc[idx-1] == 45 and df[1:cov+1].iloc[idx+1] == 45).any(axis = 1), False, True)
    # isMulti if a single 45 is present left or right.
    
def isSingleIns(df, idx): # Checks if target has only one indel.
    df['is_single'].iloc[idx] == np.where( (df['target'].iloc[idx+1] == 45 and df['target'].iloc[idx-1] == 45).any(axis=1), False, True)

In [24]:
df = RFP_LIST[0]

True