# Libraries & Code Setup

In [4]:
import os
import itertools 
import Levenshtein as lv


# Directory layout, resolved from the current working directory:
# the project root is taken to be two levels above it.
PROJ_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(os.curdir)))

# Input catalogs and locally generated data both live directly under the root.
DATA_DIR, LOCAL_DATA = (
    os.path.join(PROJ_ROOT, folder) for folder in ('catalogs', 'local_data')
)

# Echo the resolved catalog directory as a sanity check.
print(DATA_DIR)
# DATA_FILE_NAME = 'HG001.strdust.vcf' # set file name here

c:\Users\garyh\Anshutz\Dashnow_Lab\projects\TR-Benchmarking\catalogs


## Function definitions

### Helpers

In [28]:
def compareString(seq1, seq2):
    """Return the Levenshtein distance between two records' sequence fields.

    Both arguments must expose a ``clean_line`` sequence. The element at
    index 3 of each is printed side by side and then handed to
    ``lv.distance``, whose result is returned unchanged.
    """
    # for now only the element at index 3 of each record is compared
    left, right = (record.clean_line[3] for record in (seq1, seq2))
    print(f"{left},    {right}")
    return lv.distance(left, right)

In [None]:
class FileInfo:
    """Cursor over an open, line-oriented source that exposes one line at a time.

    Attributes:
        file_obj: the iterator / file object that lines are pulled from.
        name: base name of the source, filled in by ``setName`` (``None`` until then).
        cur_line: the raw line the cursor currently sits on ("" if the source was empty).
        pause: when True, ``nextLine`` leaves the cursor where it is.
        end_state: True once the source has no more lines to give.
        clean_line: tab-split fields of the current line with the computed end
            position appended (``None`` until ``formatLine`` has run).
        pos_info: ``(field 0, start, end)`` for the current line
            (``None`` until ``formatLine`` has run).
    """

    def __init__(self, file):
        """Wrap ``file`` (an iterator of lines) and load its first line."""
        self.file_obj = file
        self.name = None
        self.pause = False
        self.end_state = False
        # populated by formatLine(); defined here so they always exist
        self.clean_line = None
        self.pos_info = None

        # an empty source is reported through end_state, the same way nextLine
        # reports exhaustion, instead of leaking StopIteration from the constructor
        first_line = next(file, None)
        if first_line is None:
            self.cur_line = ""
            self.end_state = True
        else:
            self.cur_line = first_line


    def formatLine(self):
        """Parse ``cur_line`` into ``clean_line`` and ``pos_info``.

        The line is split on tabs; field 1 is read as the integer start
        position and the length of field 3 gives the span, so the end position
        is ``start + len(field 3) - 1``.

        Raises:
            ValueError: if the line has fewer than four tab-separated fields or
                field 1 is not an integer.
        """
        # split line string into list
        flist = self.cur_line.strip().split("\t")
        if len(flist) < 4:
            raise ValueError(
                f"expected at least 4 tab-separated fields, got {len(flist)}: {self.cur_line!r}"
            )

        # calculate the end position and add it to the end of the row list
        pos = int(flist[1])
        ref_len = len(flist[3])
        end_pos = pos + ref_len - 1
        flist.append(end_pos)

        self.clean_line = flist
        self.pos_info = (flist[0], pos, end_pos)


    def nextLine(self):
        """Advance to the next line unless the cursor is paused or already exhausted.

        On exhaustion ``end_state`` becomes True and ``cur_line`` keeps the
        last line that was read.
        """
        if self.end_state or self.pause:
            return

        try:
            self.cur_line = next(self.file_obj)
        # catch exception that signals end of file
        except StopIteration:
            self.end_state = True


    def setName(self):
        """Store the base name of the wrapped file in ``self.name`` and return it.

        Everything up to and including the last path separator is dropped; both
        ``\\`` and ``/`` are treated as separators. A source without a ``name``
        attribute yields an empty string.
        """
        raw_name = str(getattr(self.file_obj, "name", ""))
        self.name = raw_name.replace("\\", "/").rsplit("/", 1)[-1]
        return self.name

### Main logic

In [None]:
def mainloop(bed_file, file1, file2, max_records=20):
    """Step through a reference file and two comparison files together and log sequence differences.

    The three inputs are opened from ``DATA_DIR``. For every step, the current
    line of each file is parsed, the reference sequence is compared against the
    sequence of each other file with ``compareString``, and one text record is
    appended to ``comp_data.txt`` inside ``LOCAL_DATA``.

    Args:
        bed_file: file name of the reference (index 0 in the comparison).
        file1: file name of the first file compared against the reference.
        file2: file name of the second file compared against the reference.
        max_records: upper bound on the number of records written; ``None``
            removes the bound. Defaults to 20.

    Returns:
        None. Results are written to the output file only.

    NOTE(review): alignment is still rudimentary. A file is only held back when
    its start position is ahead of the reference; the first field of
    ``pos_info`` is not compared, and a file that is behind the reference is
    not advanced to catch up.
    """
    # the output folder may not exist yet on a fresh checkout
    os.makedirs(LOCAL_DATA, exist_ok=True)

    with open(os.path.join(DATA_DIR, bed_file), 'r') as ref, \
            open(os.path.join(DATA_DIR, file1), 'r') as f1, \
            open(os.path.join(DATA_DIR, file2), 'r') as f2, \
            open(os.path.join(LOCAL_DATA, "comp_data.txt"), "w") as output_file:
        # index 0 is always the reference; every other entry is compared against it
        file_list = [FileInfo(ref), FileInfo(f1), FileInfo(f2)]
        reference = file_list[0]

        # move each non-reference file past its leading "#" metadata lines
        for fi_obj in file_list[1:]:
            # the end_state check keeps this from spinning forever on a file that
            # contains nothing but metadata (nextLine leaves cur_line untouched at EOF)
            while fi_obj.cur_line.startswith("#") and not fi_obj.end_state:
                fi_obj.nextLine()

        records_written = 0

        # stop as soon as any file has run out of lines, so an exhausted file's
        # last line is never processed a second time
        while not any(obj.end_state for obj in file_list):
            if max_records is not None and records_written >= max_records:
                break

            out_str = ""

            for i, fi_obj in enumerate(file_list):
                fi_obj.formatLine()

                if i == 0:
                    out_str += (f"BED Ref ({fi_obj.pos_info}): {fi_obj.clean_line[3]}\n")

                else:
                    print(f"File[{i}]{fi_obj.pos_info[1]}:{reference.pos_info[1]} - {fi_obj.pos_info[1] > reference.pos_info[1]}") # DEBUGGING - REMOVE FOR LAUNCH

                    # hold this file on its current line while it is ahead of the reference
                    fi_obj.pause = fi_obj.pos_info[1] > reference.pos_info[1]

                    # run comparisons
                    difference = compareString(reference, fi_obj)
                    out_str += (f"File[{i}] Ref diff score: {difference} | ({fi_obj.pos_info}): {fi_obj.clean_line[3]}\n")

                # no-op for a paused file
                fi_obj.nextLine()

            output_file.write(out_str + "\n")
            records_written += 1






# NOTE: mainloop has no return statement, so strdust_df is bound to None here;
# the comparison records are written to comp_data.txt under LOCAL_DATA instead.
strdust_df = mainloop("test-isolated-vc-catalog.strglr.bed", "HG001.strdust.vcf", "HG001.strkit.vcf")


File[1]10627:10627 - False
AGGCGCGCCGCGCCGGCGCAGGCGCAGAG,    AAGGCGCGCCGCGCCGGCGCAGGCGCAGAGAGGCGCGCCGCGCCGGCGCAGGCGCAGAGAGGCGCGCCGCGCCGGCGCAGGCGCAGAGAGGCGCGCCGCGCCGGCGCAGGCGCAGAGAGGCGCGCCGCGCCGGCGCAGGCGCAGAGAGGCGCGCCGCGCCGGCGCAGGCGCAGAC
File[2]10623:10627 - False
AGGCGCGCCGCGCCGGCGCAGGCGCAGAG,    TGCAAAGGCGCGCCGCGCCGGCGCAGGCGCAGAGAGGCGCGCCGCGCCGGCGCAGGCGCAGAGAGGCGCGCCGCGCCGGCGCAGGCGCAGAGAGGCGCGCCGCGCCGGCGCAGGCGCAGAGAGGCGCGCCGCGCCGGCGCAGGCGCAGAGAGGCGCGCCGCGCCGGCGCAGGCGCAGA
File[1]103260:103260 - False
TAAAA,    GTAAAATAAAATAAAATAAAAA
File[2]103260:103260 - False
TAAAA,    GTAAAATAAAATAAAATAAAA
File[1]432733:432733 - False
GACAAACACGTGGGTACATGGAGGGGAACAACACACACCAGGGCCTCTCAGGGGGACAGGGGGTAGGAGACCATCAG,    AGACAAACACGTGGGTACATGGAGGGGAACAACACACACCAGGGCCTCTCAGGGGGACAGGGGGTAGGAGACCATCAGGACAAACACGTGGGTACATGGAGGGGAACAACACACACCAGGGCCTCTCAGGGGGACAGGGGGTAGGAGACCATCAGGACAAACACGTGGGTACATGGAGGGGAACAACACACACCAGGGCCTCTCAGGGGGACAGGGGGTAGGAGACCATCAGGACAAACACGTGGGTACATGGAGGGGAACAACACACACCAGGGCCTCTCAGGGGGAC

# Resources

- Processing a very large text file line by line: https://stackoverflow.com/questions/16669428/process-very-large-20gb-text-file-line-by-line
- Reading a VCF into a DataFrame: https://gist.github.com/dceoy/99d976a2c01e7f0ba1c813778f9db744