# Libraries & Code Setup

In [3]:
import os
import pandas as pd


PROJ_ROOT = os.path.dirname(os.path.abspath(''))
DATA_DIR = os.path.join(PROJ_ROOT, 'data')
# DATA_FILE_NAME = 'HG001.strdust.vcf' # set file name here

In [4]:
def loadToDF(file_name):

    data_path = os.path.join(DATA_DIR, file_name) 

    with open(data_path, 'r') as f:
        file_lines=[]
        for i, line in enumerate(f):
            if line.startswith("#CHROM"):
                # save starting line for data
                data_line_start = i + 1
                # remove new line characters then split line into list of column names
                col_names = line.strip("\n").split("\t")

                col_names.append("ENDPOS")
                #print(col_names)
                df = pd.DataFrame(columns=col_names)

            # if not metadata, add to list
            if not line.startswith("#"):
                # have sub index for data lines
                data_i = i - data_line_start    

                #split line by tabs into a list
                line_list = line.split("\t") 
                pos = int(line_list[1])
                ref_len = len(line_list[3])

                end_pos = pos + ref_len - 1

                line_list.append(end_pos)

                # then load list into df as new row         
                df.loc[data_i] = line_list

    #print(df)

    #print(type(file_lines))
    #print(file_lines)

    return df

In [5]:
strdust_df = loadToDF("HG001.strdust.vcf")
strkit_df = loadToDF("HG001.strkit.vcf")

strdust_df.head()

Unnamed: 0,#CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,HG001.PAW79146.haplotagged,ENDPOS
0,chr1,10627,.,AAGGCGCGCCGCGCCGGCGCAGGCGCAGAGAGGCGCGCCGCGCCGG...,CGCCGCGC,.,.,"END=10800;STDEV=0,56",GT:RB:FRB:SUP:SC:PS,"1|1:-165,-165:8,8:17,21:24,24:10150\n",10801
1,chr1,103260,.,GTAAAATAAAATAAAATAAAAA,AAAAGTAAAATAAAATAAAAT,.,.,"END=103280;STDEV=0,.",GT:RB:FRB:SUP:SC:PS,"1|.:1,.:21,.:8,0:63,.:93819\n",103281
2,chr1,432733,.,AGACAAACACGTGGGTACATGGAGGGGAACAACACACACCAGGGCC...,.,.,.,"END=433131;STDEV=.,.",GT:RB:FRB:SUP:SC,".|.:.,.:.,.:0,0:.,.\n",433132
3,chr1,434350,.,AGACAAACACGTGGATACATGGAGGGGAACAACACACACCAGGGCC...,.,.,.,"END=434825;STDEV=.,.",GT:RB:FRB:SUP:SC,".|.:.,.:.,.:0,0:.,.\n",434826
4,chr1,668334,.,GCAAACACGTGGATACATGGAGGGGAACAACACACACCAGGGCCTC...,GAGCAAACACGTGGATACATGGAGGGGAACAACACACACCAGGGCC...,.,.,"END=668486;STDEV=1,0",GT:RB:FRB:SUP:SC:PS,"1|0:10,1:162,153:15,4:486,459:588037\n",668487


In [6]:
strkit_df.head()

Unnamed: 0,#CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,hg001,ENDPOS
0,chr1,10623,locus1,TGCAAAGGCGCGCCGCGCCGGCGCAGGCGCAGAGAGGCGCGCCGCG...,CGCAGGCGCAGAGAGGCGCGCCGCGCCGGCGCAGGCGCAGAGAGGC...,.,.,VT=str;MOTIF=AGGCGCGCCGCGCCGGCGCAGGCGCAGAG;REF...,GT:PM:DP:MMAS:DPS:AD:MC:MCCI:ANCL:CONS:MCRL,"1/2:dist:45:1.92941:45:37,8:9,12:9-9,9-12:5,12...",10800
1,chr1,103260,locus2,GTAAAATAAAATAAAATAAAA,.,.,.,VT=str;MOTIF=TAAAA;REFMC=4;BED_START=103260;BE...,GT:PM:DP:MMAS:DPS:AD:MC:MCCI:ANCL:CONS:MCRL,"0/0:dist:14:1.99911:14:14:4,4:4-4,4-4:1:.:4x14\n",103280
2,chr1,668334,locus5,GCAAACACGTGGATACATGGAGGGGAACAACACACACCAGGGCCTC...,.,.,.,VT=str;MOTIF=CAAACACGTGGATACATGGAGGGGAACAACACA...,GT:PM:DP:MMAS:DPS:AD:MC:MCCI:ANCL:CONS:MCRL,"0/0:dist:30:1.92977:30:30:2,2:2-2,2-2:1:.:2x30\n",668486
3,chr1,862069,locus6,AGATGGATGATGGATGGATG,.,.,.,VT=str;MOTIF=GATG;REFMC=5;BED_START=862076;BED...,GT:PM:DP:MMAS:DPS:AD:MC:MCCI:ANCL:CONS:MCRL,"0/0:dist:29:1.88007:29:29:5,5:5-5,5-5:1:.:5x29\n",862088
4,chr1,905012,locus7,TGCGGGGGAGGCTGTTGGGGACGTTCGTGGCGGGGGAGGCTGTTGG...,.,.,.,VT=str;MOTIF=GCGGGGGAGGCTGTTGGGGACGTTCGTG;REFM...,GT:PM:DP:MMAS:DPS:AD:MC:MCCI:ANCL:CONS:MCRL,"0/0:dist:26:1.74823:26:26:4,4:4-4,4-4:1:.:4x26\n",905120


In [7]:
strdust_df[strdust_df["POS"] == "10627"]["REF"]


0    AAGGCGCGCCGCGCCGGCGCAGGCGCAGAGAGGCGCGCCGCGCCGG...
Name: REF, dtype: object

In [None]:
def compareString(df_list):
    of = open(os.path.join(DATA_DIR, "comp_data.txt"), "w")

    # loop to len - 1 because it is grabbing i+1
    for i in range(len(df_list) - 1):
        #for row in df_list[i]:
        cur_df1 = df_list[i]
        cur_df2 = df_list[i+1]
        offset = 0
        for j in range(5):
            cur_row1 = cur_df1.iloc[j]
            cur_row2 = cur_df2.iloc[j+offset]

            cur_start1 = int(cur_row1["POS"])
            cur_start2 = int(cur_row2["POS"])
            cur_end1 = int(cur_row1["ENDPOS"])
            cur_end2 = int(cur_row2["ENDPOS"])

            out_str = "\n"

            # check for alignment 
            if cur_start1 > cur_start2 & cur_start1 < cur_end2:
                out_str += "Comparison 1 position < Comp 2 position"

                # get the difference between the 2 positions, and then -1 to compensate for 0 index
                pos_offset = cur_start1 - cur_start2 - 1

                comp_section1 = cur_row1["REF"]
                comp_section2 = cur_row2["REF"][pos_offset:]

            elif cur_start1 < cur_start2 & cur_end1 > cur_start2:
                out_str += "Comparison 1 position < Comp 2 position"

                # get the difference between the 2 positions, and then -1 to compensate for 0 index
                pos_offset = cur_start2 - cur_start1 - 1
                print(pos_offset)

                comp_section1 = cur_row1["REF"][pos_offset:]
                comp_section2 = cur_row2["REF"]

            elif cur_end1 < cur_start2:
                # move current row 1 to the next index 
                pass
            elif cur_start1 > cur_end2:
                # move the current row 2 to the next index
                pass
            else:
                out_str += "Comparison Positions equal"

                comp_section1 = cur_row1["REF"]
                comp_section2 = cur_row2["REF"]
            
            
            # append more info to output string that displays the sequence and it's positions
            out_str += f"\nComparing Seq starting at {cur_start1}:\nString1:\n End Pos:{cur_row1["ENDPOS"]}\n {comp_section1}\nString2:\n End Pos:{cur_row2["ENDPOS"]}\n {comp_section2}\n\n"  
            of.write(out_str)

    of.close()


In [25]:
compareString([strdust_df, strkit_df])

# Resources

File bunching: https://stackoverflow.com/questions/16669428/process-very-large-20gb-text-file-line-by-line
reading in vcf to dataframe: https://gist.github.com/dceoy/99d976a2c01e7f0ba1c813778f9db744