In [None]:
import argparse
import pysam
from pathlib import Path
import pandas as pd
import copy

In [None]:
# Query names of the reads that were manually identified as inversions.
# Every name has the form '<run>/<numeric id>/ccs'; the (run, id) pairs below
# are listed in the same order as the resulting names.
_RUN_S4 = 'm84127_240712_175808_s4'
_RUN_S2 = 'm84127_240802_194716_s2'
_inversion_read_ids = (
    (_RUN_S4, 246483519),
    (_RUN_S4, 165154071),
    (_RUN_S4, 41227253),
    (_RUN_S2, 230297831),
    (_RUN_S2, 91030239),
    (_RUN_S4, 45553173),
    (_RUN_S4, 69734245),
    (_RUN_S4, 25760525),
    (_RUN_S4, 200609304),
    (_RUN_S4, 212337140),
    (_RUN_S4, 60817797),
    (_RUN_S4, 223613241),
    (_RUN_S4, 246221764),
    (_RUN_S4, 5771352),
)
inversion_reads = ['{}/{}/ccs'.format(run, read_id) for run, read_id in _inversion_read_ids]

In [None]:
# Coordinates of the exonic regions in c9orf72, as (start, end) pairs.
# NOTE(review): the "IMPORTANT LOCATIONS" cell further down reassigns
# ex1_coord, ex2_coord and between with their starts shifted by -1, so on a
# top-to-bottom run those later values are the ones the functions see.
_exon1_start, _exon1_end = 27573431, 27573481
_exon2_start, _exon2_end = 27573709, 27573866
ex1_coord = (_exon1_start, _exon1_end)
ex2_coord = (_exon2_start, _exon2_end)
# the stretch between the end of exon 1 and the start of exon 2
between = (_exon1_end, _exon2_start)

#For the region between the start of exon 1 and the end of exon 2, extract the deletion and insertion coordinates.
#input:
#     read: the aligned read being examined
#     l: the current location in the reference
#     l_read: the location in the read where it enters exon 1 (not necessarily the same as the reference location)
#     segment_index: index of the CIGAR tuple we are currently at

the_read = None
def check_exonic_overlaps(read, l, l_read, segment_index, region_end=None):
    """Collect deletion and insertion intervals from a read's CIGAR.

    Walks ``read.cigartuples`` starting at ``segment_index`` and records every
    deletion (op 2) and insertion (op 1) until the reference position has
    moved past ``region_end``.

    Parameters
    ----------
    read : object exposing ``cigartuples`` as a list of ``(op, length)`` pairs
        (or ``None``, which is treated as an empty CIGAR).
    l : int
        Reference position at which the CIGAR tuple ``segment_index`` starts.
    l_read : int
        Position inside the read at that point. Accepted for compatibility
        with existing callers; it does not influence the result.
    segment_index : int
        Index of the first CIGAR tuple to examine.
    region_end : int, optional
        Reference coordinate after which the walk stops. Defaults to the end
        of exon 2 (``ex2_coord[1]``), which is what the function always used.

    Returns
    -------
    (read_del_lst, read_ins_lst) : tuple of two lists of ``(start, end)``
        Deletions span ``(l, l + length)`` on the reference. Insertions are
        reported as ``(l, l + length)`` as well, i.e. the reference position
        of the insertion plus the inserted length.

    Raises
    ------
    ValueError
        If a CIGAR operation code outside 0-8 is encountered.
    """
    if region_end is None:
        region_end = ex2_coord[1]

    read_del_lst = []
    read_ins_lst = []
    for op, length in (read.cigartuples or [])[segment_index:]:
        # the walk is finished once the reference position is past the region
        if l > region_end:
            break
        if op == 1:  # Insertion: recorded, the reference position stays put
            read_ins_lst.append((l, l + length))
        elif op == 2:  # Deletion: recorded, then the reference position advances
            read_del_lst.append((l, l + length))
            l += length
        elif op in (0, 3, 7, 8):  # match / skipped region / equal / mismatch
            l += length
        elif op == 4:  # Soft clipping
            # Advances the reference position on purpose: find_deletion_locs
            # starts a read with a leading soft clip that many bases before
            # reference_start, so the two adjustments cancel out.
            l += length
        elif op in (5, 6):  # Hard clipping / padding: nothing moves
            pass
        else:
            raise ValueError(f"Unknown CIGAR operation code {op}")

    return read_del_lst, read_ins_lst

def find_deletion_locs(inbam):
    """Collect per-read deletion and insertion intervals around exon 1 / exon 2.

    Iterates over the reads returned by ``inbam.fetch("chr9", 27557179, 27580459)``
    and keeps those that start before exon 1 (``reference_start < ex1_coord[0]``)
    or whose query name is listed in ``inversion_reads``. For each kept read the
    CIGAR is walked until the first tuple that reaches past the start of exon 1;
    from that tuple on, ``check_exonic_overlaps`` gathers the events.

    Parameters
    ----------
    inbam : opened alignment file exposing ``fetch(contig, start, stop)``.

    Returns
    -------
    (deletions, insertions) : two parallel lists with one entry per processed
        read, each entry being that read's list of ``(start, end)`` intervals.

    Raises
    ------
    ValueError
        If a CIGAR operation code outside 0-8 is encountered.

    Notes
    -----
    The file is deliberately NOT closed here: del_types iterates over the same
    handle again after this function returns.

    NOTE(review): a kept read whose CIGAR never reaches the start of exon 1
    contributes no entry at all, while del_types builds its name column from
    every read that passes the filter above; such a read would make the two
    lengths disagree. Confirm this cannot happen with the input data.
    """
    deletions = []
    insertions = []

    for read in inbam.fetch("chr9", 27557179, 27580459):
        if not (read.reference_start < ex1_coord[0] or read.query_name in inversion_reads):
            continue

        cigar = read.cigartuples

        # where we are in the reference; a leading soft clip is "rewound" so
        # that consuming it below brings us back to reference_start
        l = read.reference_start
        if cigar[0][0] == 4:
            l -= cigar[0][1]

        # the bases traversed - where we are in the read
        l_read = 0

        for segment_index, (op, length) in enumerate(cigar):
            # this tuple reaches into the exonic region: hand over from here
            if l + length > ex1_coord[0]:
                deletion_lst, insertion_lst = check_exonic_overlaps(read, l, l_read, segment_index)
                deletions.append(deletion_lst)
                insertions.append(insertion_lst)
                break

            if op in (0, 7, 8):  # alignment match / sequence equal / sequence mismatch
                l_read += length
                l += length
            elif op == 1:  # Insertion: only the read position moves
                l_read += length
            elif op in (2, 3):  # Deletion / skipped region: only the reference moves
                l += length
            elif op == 4:  # Soft clipping: see the rewind of l above
                l_read += length
                l += length
            elif op in (5, 6):  # Hard clipping / padding: nothing moves
                pass
            else:
                raise ValueError(f"Unknown CIGAR operation code {op}")

    return deletions, insertions

In [None]:
'''IMPORTANT LOCATIONS IN THE FILE'''
def _shift_start(first, last):
    """Return (first - 1, last): the same interval with its start lowered by one."""
    return (first - 1, last)

# exons and the stretch between them
ex1_coord = _shift_start(27573431, 27573481)
ex2_coord = _shift_start(27573709, 27573866)
between = _shift_start(27573481, 27573709)

# guide locations (c plus the g4 / g11 / g12 / g13 guides) and the repeat
c_loc = _shift_start(27573507, 27573527)
g4_loc = _shift_start(27573648, 27573668)
g11_loc = _shift_start(27573547, 27573567)
g12_loc = _shift_start(27573546, 27573566)
g13_loc = _shift_start(27573545, 27573565)
repeat = _shift_start(27573528, 27573547)

def del_types(inbam, group = 'None'):
    """Classify every deletion and insertion of the analysed reads of one BAM file.

    Parameters
    ----------
    inbam : opened alignment file; passed to find_deletion_locs and then
        fetched a second time to collect the read names.
    group : str
        Sample label. Its suffix ('g4', 'g11', 'g12' or 'g13') selects the
        right-guide interval; any other suffix leaves it at (0, 0).

    Returns
    -------
    (read_data, totals)
        ``read_data`` is a DataFrame with one row per read: 'group', 'name'
        and one count column per event class. ``totals`` is the 8-tuple
        (intronic excisions, ex1 excisions, ex2 excisions, ex1+ex2 excisions,
        intronic deletions, ex1 deletions, ex2 deletions, nothing), where
        ``nothing`` is 1 when no read was analysed and 0 otherwise.

    Raises
    ------
    ValueError
        If the number of read names does not match the number of analysed reads.
    """
    deletions_lst, insertions_lst = find_deletion_locs(inbam) #adding an insertion list!

    #the left guide in this case was the c (same interval as c_loc)
    lg_loc = (27573507-1,27573527)
    rg_loc = (0,0)
    for suffix, loc in (('g4', g4_loc), ('g11', g11_loc), ('g12', g12_loc), ('g13', g13_loc)):
        if group.endswith(suffix):
            rg_loc = loc
            break

    def _endpoint_inside(loc, event):
        # one of the event's two endpoints falls inside the guide interval
        return (loc[0] < event[0] <= loc[1]) or (loc[0] < event[1] <= loc[1])

    def _covers_boundary(loc, event):
        # the event stretches across one of the guide interval's boundaries
        return ((event[0] <= loc[0] and event[1] >= loc[0])
                or (event[0] <= loc[1] and event[1] >= loc[1]))

    num_reads = len(deletions_lst)

    #excision lists
    exision_intronic_lst = [0] * num_reads
    exision_ex1_lst = [0] * num_reads
    exision_ex2_lst = [0] * num_reads
    exision_ex1_ex2_lst = [0] * num_reads

    #deletion lists
    intronic_del_lst = [0] * num_reads #not including gRNA loc
    ex1_del_lst = [0] * num_reads
    ex2_del_lst = [0] * num_reads
    guide_del_lst = [0] * num_reads #deletion at either grna target site

    #insertion lists
    intronic_ins_lst = [0] * num_reads #not including gRNA loc
    ex1_ins_lst = [0] * num_reads
    ex2_ins_lst = [0] * num_reads
    guide_ins_lst = [0] * num_reads #insertion at either grna target site

    nothing = 1 if num_reads == 0 else 0

    for index, read_deletions in enumerate(deletions_lst):
        for deletion in read_deletions:
            start, end = deletion
            #the deletion is intronic (1)
            if between[0] < start and end < between[1]:
                #spans the repeat region: a flag, not a count
                if start <= repeat[0] and end >= repeat[1]:
                    exision_intronic_lst[index] = 1
                elif (_endpoint_inside(rg_loc, deletion) or _endpoint_inside(lg_loc, deletion)
                      or _covers_boundary(rg_loc, deletion) or _covers_boundary(lg_loc, deletion)):
                    guide_del_lst[index] += 1
                else:
                    intronic_del_lst[index] += 1
            #deletion slices out some of the exon
            #spans both exons (3)
            elif start < ex1_coord[1] and end > ex2_coord[0]:
                exision_ex1_ex2_lst[index] += 1
            #excision which overlaps with ex1, but not ex2 (4)
            elif start <= ex1_coord[1] and end >= repeat[1]:
                exision_ex1_lst[index] += 1
            #overlaps with ex2 (not ex1) (5)
            elif start <= repeat[0] and end >= ex2_coord[0]:
                exision_ex2_lst[index] += 1
            #deletion which overlaps with exon 1 (but doesn't lead to an excision) (6)
            elif start < ex1_coord[1]:
                ex1_del_lst[index] += 1
            #deletion which overlaps with exon 2 (but doesn't lead to an excision) (7)
            #Tested on the deletion's END (like the insertion branch below):
            #testing the start skipped deletions that begin in the intron and
            #run into exon 2, which then matched no branch at all.
            elif end > ex2_coord[0]:
                ex2_del_lst[index] += 1

    for index, read_insertions in enumerate(insertions_lst):
        for insertion in read_insertions:
            #the insertion is completely intronic
            if between[0] < insertion[0] and insertion[1] < between[1]:
                if _endpoint_inside(rg_loc, insertion) or _endpoint_inside(lg_loc, insertion):
                    guide_ins_lst[index] += 1
                else:
                    intronic_ins_lst[index] += 1
            #the insertion overlaps with exon 1
            elif insertion[0] < ex1_coord[1]:
                ex1_ins_lst[index] += 1
            #the insertion overlaps with exon 2
            elif insertion[1] > ex2_coord[0]:
                ex2_ins_lst[index] += 1

    # Names are collected with the same read filter find_deletion_locs applies,
    # so the two must line up one-to-one; fail loudly if they do not.
    names = [read.query_name
             for read in inbam.fetch("chr9", 27557179, 27580459)
             if read.reference_start < ex1_coord[0] or read.query_name in inversion_reads]
    if len(names) != num_reads:
        raise ValueError(f"{group}: {len(names)} read names but {num_reads} analysed reads")

    read_data = pd.DataFrame()
    read_data['group'] = [group] * num_reads
    read_data['name'] = names
    read_data['Intronic Excision?'] = exision_intronic_lst
    read_data['excision which overlaps with ex1'] = exision_ex1_lst
    read_data['excision which overlaps with ex2'] = exision_ex2_lst
    read_data['excision which overlaps with ex1 and ex2'] = exision_ex1_ex2_lst
    read_data['intronic deletions'] = intronic_del_lst
    read_data['guide site deletion'] = guide_del_lst
    read_data["deletions in ex1"] = ex1_del_lst
    read_data["deletions in ex2"] = ex2_del_lst
    read_data["insertions in ex1"] = ex1_ins_lst
    read_data["insertions in ex2"] = ex2_ins_lst
    read_data['intronic insertions'] = intronic_ins_lst
    read_data['guide site insertion'] = guide_ins_lst

    return read_data, (sum(exision_intronic_lst),
                       sum(exision_ex1_lst),
                       sum(exision_ex2_lst),
                       sum(exision_ex1_ex2_lst),
                       sum(intronic_del_lst),
                       sum(ex1_del_lst),
                       sum(ex2_del_lst),
                       nothing)

In [None]:
#identifies the deletion types (and insertions) in each of the sample files

def _del_types_for_bam(bam_path, group):
    """Open one BAM file, run del_types on it and close the file again."""
    with pysam.AlignmentFile(bam_path, "rb") as bam:
        return del_types(bam, group)

structure = {'g4': None,
             'g11': None,
             'g12': None,
             'g13': None}

# Create separate copies of the structure
c9p4_mutant = copy.deepcopy(structure)
c9p5_mutant = copy.deepcopy(structure)
c9p8_mutant = copy.deepcopy(structure)
c9p9_mutant = copy.deepcopy(structure)

c9p4_wt = copy.deepcopy(structure)
c9p5_wt = copy.deepcopy(structure)
c9p8_wt = copy.deepcopy(structure)
c9p9_wt = copy.deepcopy(structure)

#___________c9p4______________
c9p4_mut_rd_g4, c9p4_mutant['g4'] = _del_types_for_bam('c9p4_samples_merged/MUTANT_Biosample_1.fofn.pbmm2.repeats.bam%.bam.merged.bam', 'c9p4_mut_g4')
c9p4_mut_rd_g11, c9p4_mutant['g11'] = _del_types_for_bam('c9p4_samples_merged/MUTANT_Biosample_2.fofn.pbmm2.repeats.bam%.bam.merged.bam', 'c9p4_mut_g11')
c9p4_mut_rd_g12, c9p4_mutant['g12'] = _del_types_for_bam('c9p4_samples_merged/MUTANT_Biosample_3.fofn.pbmm2.repeats.bam%.bam.merged.bam', 'c9p4_mut_g12')
c9p4_mut_rd_g13, c9p4_mutant['g13'] = _del_types_for_bam('c9p4_samples_merged/MUTANT_Biosample_4.fofn.pbmm2.repeats.bam%.bam.merged.bam', 'c9p4_mut_g13')

c9p4_wt_rd_g4, c9p4_wt['g4'] = _del_types_for_bam('c9p4_samples_merged/WT_Biosample_1.fofn.pbmm2.repeats.bam%.bam.merged.bam', 'c9p4_wt_g4')
c9p4_wt_rd_g11, c9p4_wt['g11'] = _del_types_for_bam('c9p4_samples_merged/WT_Biosample_2.fofn.pbmm2.repeats.bam%.bam.merged.bam', 'c9p4_wt_g11')
c9p4_wt_rd_g12, c9p4_wt['g12'] = _del_types_for_bam('c9p4_samples_merged/WT_Biosample_3.fofn.pbmm2.repeats.bam%.bam.merged.bam', 'c9p4_wt_g12')
c9p4_wt_rd_g13, c9p4_wt['g13'] = _del_types_for_bam('c9p4_samples_merged/WT_Biosample_4.fofn.pbmm2.repeats.bam%.bam.merged.bam', 'c9p4_wt_g13')

#___________c9p5________________
c9p5_mut_rd_g4, c9p5_mutant['g4'] = _del_types_for_bam('c9p5_samples_merged/MUTANT_Biosample_5.fofn.pbmm2.repeats.bam%.bam.merged.bam', 'c9p5_mut_g4')
c9p5_mut_rd_g11, c9p5_mutant['g11'] = _del_types_for_bam('c9p5_samples_merged/MUTANT_Biosample_6.fofn.pbmm2.repeats.bam%.bam.merged.bam', 'c9p5_mut_g11')
c9p5_mut_rd_g12, c9p5_mutant['g12'] = _del_types_for_bam('c9p5_samples_merged/MUTANT_Biosample_7.fofn.pbmm2.repeats.bam%.bam.merged.bam', 'c9p5_mut_g12')
c9p5_mut_rd_g13, c9p5_mutant['g13'] = _del_types_for_bam('c9p5_samples_merged/MUTANT_Biosample_8.fofn.pbmm2.repeats.bam%.bam.merged.bam', 'c9p5_mut_g13')

c9p5_wt_rd_g4, c9p5_wt['g4'] = _del_types_for_bam('c9p5_samples_merged/WT_Biosample_5.fofn.pbmm2.repeats.bam%.bam.merged.bam', 'c9p5_wt_g4')
c9p5_wt_rd_g11, c9p5_wt['g11'] = _del_types_for_bam('c9p5_samples_merged/WT_Biosample_6.fofn.pbmm2.repeats.bam%.bam.merged.bam', 'c9p5_wt_g11')
c9p5_wt_rd_g12, c9p5_wt['g12'] = _del_types_for_bam('c9p5_samples_merged/WT_Biosample_7.fofn.pbmm2.repeats.bam%.bam.merged.bam', 'c9p5_wt_g12')
c9p5_wt_rd_g13, c9p5_wt['g13'] = _del_types_for_bam('c9p5_samples_merged/WT_Biosample_8.fofn.pbmm2.repeats.bam%.bam.merged.bam', 'c9p5_wt_g13')

#_________c9p8__________________
c9p8_mut_rd_g4, c9p8_mutant['g4'] = _del_types_for_bam('c9p8_samples_merged/MUTANT_Biosample_9.fofn.pbmm2.repeats.bam%.bam.merged.bam', 'c9p8_mut_g4')
c9p8_mut_rd_g11, c9p8_mutant['g11'] = _del_types_for_bam('c9p8_samples_merged/MUTANT_Biosample_10.fofn.pbmm2.repeats.bam%.bam.merged.bam', 'c9p8_mut_g11')
c9p8_mut_rd_g12, c9p8_mutant['g12'] = _del_types_for_bam('c9p8_samples_merged/MUTANT_Biosample_11.fofn.pbmm2.repeats.bam%.bam.merged.bam', 'c9p8_mut_g12')
c9p8_mut_rd_g13, c9p8_mutant['g13'] = _del_types_for_bam('c9p8_samples_merged/MUTANT_Biosample_12.fofn.pbmm2.repeats.bam%.bam.merged.bam', 'c9p8_mut_g13')

c9p8_wt_rd_g4, c9p8_wt['g4'] = _del_types_for_bam('c9p8_samples_merged/WT_Biosample_9.fofn.pbmm2.repeats.bam%.bam.merged.bam', 'c9p8_wt_g4')
c9p8_wt_rd_g11, c9p8_wt['g11'] = _del_types_for_bam('c9p8_samples_merged/WT_Biosample_10.fofn.pbmm2.repeats.bam%.bam.merged.bam', 'c9p8_wt_g11')
c9p8_wt_rd_g12, c9p8_wt['g12'] = _del_types_for_bam('c9p8_samples_merged/WT_Biosample_11.fofn.pbmm2.repeats.bam%.bam.merged.bam', 'c9p8_wt_g12')
c9p8_wt_rd_g13, c9p8_wt['g13'] = _del_types_for_bam('c9p8_samples_merged/WT_Biosample_12.fofn.pbmm2.repeats.bam%.bam.merged.bam', 'c9p8_wt_g13')

#_______c9p9____________________
c9p9_mut_rd_g4, c9p9_mutant['g4'] = _del_types_for_bam('c9p9_samples_merged/bam_1_13_MUTANT_sort.bam', 'c9p9_mut_g4')
c9p9_mut_rd_g11, c9p9_mutant['g11'] = _del_types_for_bam('c9p9_samples_merged/bam_2_14_MUTANT_sort.bam', 'c9p9_mut_g11')
c9p9_mut_rd_g12, c9p9_mutant['g12'] = _del_types_for_bam('c9p9_samples_merged/bam_3_15_MUTANT_sort.bam', 'c9p9_mut_g12')
c9p9_mut_rd_g13, c9p9_mutant['g13'] = _del_types_for_bam('c9p9_samples_merged/bam_4_16_MUTANT_sort.bam', 'c9p9_mut_g13')

c9p9_wt_rd_g4, c9p9_wt['g4'] = _del_types_for_bam('c9p9_samples_merged/bam_1_13_WT_sort.bam', 'c9p9_wt_g4')
c9p9_wt_rd_g11, c9p9_wt['g11'] = _del_types_for_bam('c9p9_samples_merged/bam_2_14_WT_sort.bam', 'c9p9_wt_g11')
c9p9_wt_rd_g12, c9p9_wt['g12'] = _del_types_for_bam('c9p9_samples_merged/bam_3_15_WT_sort.bam', 'c9p9_wt_g12')
c9p9_wt_rd_g13, c9p9_wt['g13'] = _del_types_for_bam('c9p9_samples_merged/bam_4_16_WT_sort.bam', 'c9p9_wt_g13')



In [None]:
#creates a combined table with all the per-sample read tables
#(ignore_index gives every read its own row label instead of repeating 0..n per sample)
read_table = pd.concat([c9p4_mut_rd_g4,
                        c9p4_mut_rd_g11,
                        c9p4_mut_rd_g12,
                        c9p4_mut_rd_g13,
                        c9p4_wt_rd_g4,
                        c9p4_wt_rd_g11,
                        c9p4_wt_rd_g12,
                        c9p4_wt_rd_g13,
                        c9p5_mut_rd_g4,
                        c9p5_mut_rd_g11,
                        c9p5_mut_rd_g12,
                        c9p5_mut_rd_g13,
                        c9p5_wt_rd_g4,
                        c9p5_wt_rd_g11,
                        c9p5_wt_rd_g12,
                        c9p5_wt_rd_g13,
                        c9p8_mut_rd_g4,
                        c9p8_mut_rd_g11,
                        c9p8_mut_rd_g12,
                        c9p8_mut_rd_g13,
                        c9p8_wt_rd_g4,
                        c9p8_wt_rd_g11,
                        c9p8_wt_rd_g12,
                        c9p8_wt_rd_g13,
                        c9p9_mut_rd_g4,
                        c9p9_mut_rd_g11,
                        c9p9_mut_rd_g12,
                        c9p9_mut_rd_g13,
                        c9p9_wt_rd_g4,
                        c9p9_wt_rd_g11,
                        c9p9_wt_rd_g12,
                        c9p9_wt_rd_g13],
                       ignore_index=True)

# 1 for the manually identified inversion reads, 0 for every other read
read_table['inversion'] = read_table['name'].isin(inversion_reads).astype(int)
# Save the DataFrame to a CSV file
read_table.to_csv('detailed_read_events.csv', index=False)
# The per-group summary only sums the event counts; the read names are left
# out so they are not string-concatenated into the summary file.
read_table.drop(columns='name').groupby('group').sum().reset_index().to_csv('detailed_read_events_summary.csv', index=False)
read_table


#a different way of categorizing the reads

# Desired excision (HRE deletion with no disruption of exons)
# Undesired excision type I (HRE deletion extending into either exon)
# Undesired excision type II (HRE deletion with indel at either exon)
# Indel type I (no HRE deletion with indel at either exon)
# Indel type II (no HRE deletion with indel at either gRNA target site)
# Indel type III (no HRE deletion with indel outside of exons or target sites)
# Indel type IV (two or more indels?)
# Inversion (with or without any indels)
# Wild-type (unedited sequence)
#now grouping the reads into these categories - making the "mutually exclusive read table"
# The mutually exclusive categories, in the column order of the output table.
_mut_exc_categories = ['Desired excision',
                       'Undesired excision type I',
                       'Undesired excision type II',
                       'Indel type I',
                       'Indel type II',
                       'Indel type III',
                       'Indel Type IV',
                       'Inversion',
                       'Wild-type']


def _classify_read(row):
    """Return the single category (one of _mut_exc_categories) for one read_table row."""
    exon_excision = bool(row['excision which overlaps with ex1']
                         or row['excision which overlaps with ex2']
                         or row['excision which overlaps with ex1 and ex2'])
    exon_indel = bool(row['deletions in ex1'] or row['deletions in ex2']
                      or row['insertions in ex1'] or row['insertions in ex2'])
    intronic_indel = bool(row['intronic deletions'] or row['intronic insertions'])
    guide_indel = bool(row['guide site insertion'] or row['guide site deletion'])

    #inversion
    if row['inversion']:
        return 'Inversion'
    #undesired excision type I (overlaps with an exon)
    if exon_excision:
        return 'Undesired excision type I'
    #excision of the repeat: undesired type II if there is also an exonic indel, desired otherwise
    if row['Intronic Excision?']:
        return 'Undesired excision type II' if exon_indel else 'Desired excision'
    #no excision, but an indel in one of the exons (only)
    if exon_indel and not intronic_indel and not guide_indel:
        return 'Indel type I'
    #no excision, intronic indel only (NOT at the guide sites and not in the exons).
    #Each indel kind is grouped explicitly: written as `a or b and not c` the
    #"not" parts only applied to b, so reads with several indel kinds slipped
    #into type III / type II instead of type IV.
    if intronic_indel and not guide_indel and not exon_indel:
        return 'Indel type III'
    #alternatively, JUST a guide site indel
    if guide_indel and not intronic_indel and not exon_indel:
        return 'Indel type II'
    #nothing at all was recorded for this read (excisions and inversions were ruled out above)
    if not (exon_indel or intronic_indel or guide_indel):
        return 'Wild-type'
    #more than one kind of indel
    return 'Indel Type IV'


#iterating through all the reads and building one one-hot row per read, depending on its identity
_mut_exc_records = []
for _, row in read_table.iterrows():
    category = _classify_read(row)
    record = {'Group': row['group'], 'Name': row['name']}
    record.update({name: int(name == category) for name in _mut_exc_categories})
    _mut_exc_records.append(record)

# Build the table in one go instead of growing it row by row with pd.concat
mut_exc_read_table = pd.DataFrame(_mut_exc_records, columns=['Group', 'Name'] + _mut_exc_categories)


# Per-read tables: the detailed event table and its categorical breakdown
for _frame, _csv_path in ((read_table, "massive_read_table.csv"),
                          (mut_exc_read_table, "bankole_categorical_breakdown_reads.csv")):
    _frame.to_csv(_csv_path, index = False)

# Per-group totals of the categorical breakdown; the summed "Name" column is discarded
groupedby_group = (
    mut_exc_read_table
    .groupby('Group')
    .sum()
    .reset_index()
    .drop(columns = "Name")
)
groupedby_group.to_csv("bankole_categorical_breakdown_summary.csv", index=False)

In [None]:
import os
import matplotlib.pyplot as plt

# Running counter behind generate_filename; re-running this cell resets it to 0,
# after which previously written files get overwritten.
global_value = 0
# Function to generate filenames
def generate_filename(directory='images', extension='png'):
    """Return the next numbered file path inside ``directory``.

    Creates ``directory`` if needed, increments the module-level counter
    ``global_value`` and returns ``<directory>/file_<counter>.<extension>``.

    Parameters
    ----------
    directory : str
        Folder the file will live in; created (with parents) when missing.
    extension : str
        File extension without the leading dot.

    Returns
    -------
    str
        The generated path. Nothing is written to it here.
    """
    global global_value  # Declare that we are using the global variable
    # exist_ok avoids the check-then-create race of testing os.path.exists first
    os.makedirs(directory, exist_ok=True)
    global_value += 1  # Increment the global value
    return os.path.join(directory, f'file_{global_value}.{extension}')


#Coarse-grained view of the reads: desired excision, undesired excision,
#inversion, indel and wild type (indels might also be sequencing errors).
corse_categories = ['Desired Excison', 'Undesired Excision', 'Indel Extending into Exonic Region']

# NOTE(review): 'Indel type I' and 'Indel Type IV' feed none of the coarse
# columns, so reads in those categories count towards no coarse bar - confirm
# this is intended.
read_table_corse = pd.DataFrame({
    'Group': mut_exc_read_table['Group'],
    'Name': mut_exc_read_table['Name'],
    'Desired excision': mut_exc_read_table['Desired excision'],
    #both kinds of undesired excision
    'Undesired Excision': (mut_exc_read_table['Undesired excision type I']
                           + mut_exc_read_table['Undesired excision type II']),
    #inversions (which were identified manually)
    'Inversion': mut_exc_read_table['Inversion'],
    #indels which are in the guide region
    'Indel': mut_exc_read_table['Indel type II'],
    #wild type, together with the intronic-only indels
    'Wildtype': mut_exc_read_table['Wild-type'] + mut_exc_read_table['Indel type III'],
})

# overall number of reads per coarse category
for _coarse_col in ('Desired excision', 'Undesired Excision', 'Inversion', 'Indel', 'Wildtype'):
    print(sum(read_table_corse[_coarse_col]))

display(read_table_corse)
read_table_corse.to_csv("coarse_breakdown_reads.csv", index=False)
read_table_corse_by_group = read_table_corse.groupby('Group').sum().reset_index()
read_table_corse_by_group.to_csv("coarse_breakdown_summary.csv", index=False)


def _add_group_labels(frame):
    """Add 'Cell Line', 'Guide' and 'Allele' columns sliced out of frame['Group'] (in place)."""
    # NOTE(review): the last three characters of a label such as 'c9p4_mut_g4'
    # are '_g4', so that guide is labelled '_g4' rather than 'g4'.
    group_names = list(frame['Group'])
    frame['Cell Line'] = [label[:4] for label in group_names]
    frame['Guide'] = [label[-3:] for label in group_names]
    frame['Allele'] = [label[5:8] for label in group_names]


def _rows_matching(frame, column, value):
    """Rows of frame whose `column` equals `value`."""
    return frame[frame[column] == value]


# a separate copy of the per-group totals that additionally carries the labels
grouped_sum = read_table_corse_by_group.copy()
_add_group_labels(grouped_sum)
_add_group_labels(mut_exc_read_table)

#separating by cell line
c9p4_corse = _rows_matching(grouped_sum, 'Cell Line', 'c9p4')
c9p5_course = _rows_matching(grouped_sum, 'Cell Line', 'c9p5')
c9p8_course = _rows_matching(grouped_sum, 'Cell Line', 'c9p8')
c9p9_course = _rows_matching(grouped_sum, 'Cell Line', 'c9p9')

#separating by allele type
c9p4_corse_wt = _rows_matching(c9p4_corse, 'Allele', 'wt_')
c9p5_corse_wt = _rows_matching(c9p5_course, 'Allele', 'wt_')
c9p8_corse_wt = _rows_matching(c9p8_course, 'Allele', 'wt_')
c9p9_corse_wt = _rows_matching(c9p9_course, 'Allele', 'wt_')

c9p4_corse_mut = _rows_matching(c9p4_corse, 'Allele', 'mut')
c9p5_corse_mut = _rows_matching(c9p5_course, 'Allele', 'mut')
c9p8_corse_mut = _rows_matching(c9p8_course, 'Allele', 'mut')
c9p9_corse_mut = _rows_matching(c9p9_course, 'Allele', 'mut')


def corse_stacked_bar(tbl, column_names, title = None, ylabel = 'Number of Reads', bars = 'Guide', y_lim = (0,1.1)):
    """Draw, save and show one stacked bar chart.

    Parameters
    ----------
    tbl : DataFrame
        One row per bar; must contain the ``bars`` column and every column
        listed in ``column_names``.
    column_names : list of str
        Columns stacked on top of each other, bottom segment first.
    title : str, optional
        Figure title.
    ylabel : str
        Label of the y axis.
    bars : str
        Column whose values label the bars; also used as the x-axis label.
    y_lim : tuple
        (lower, upper) limits of the y axis.

    The figure is written to the path returned by generate_filename() and
    then displayed.
    """
    colors = [
        '#d0f0c0',  # Pale Green
        '#a3d9a5',  # Light Moss Green
        '#7cc57f',  # Light Olive Green
        '#66cdaa',  # Medium Aquamarine
        '#3cb371',  # Medium Sea Green
        '#2e8b57',  # Sea Green
        '#228b22',  # Forest Green
        '#006400',  # Dark Green
        '#004d00'   # Very Dark Green
    ]

    bar_labels = tbl[bars]
    fig, ax = plt.subplots()

    bottoms = [0] * len(bar_labels)
    for i, column_name in enumerate(column_names):
        heights = tbl[column_name]
        # the palette is cycled so more segments than colours cannot raise IndexError
        ax.bar(bar_labels, heights, bottom=bottoms, color=colors[i % len(colors)],
               label=column_name, edgecolor='black', linewidth=1.6)
        # Update bottoms to include the height of the newly plotted segment
        bottoms = [b + s for b, s in zip(bottoms, heights)]
    ax.set_ylim(y_lim)

    ax.legend(column_names,
              loc = 'upper right',
              frameon=False,
              bbox_to_anchor = (1.8,1))

    ax.set_title(title)
    # label the axis with the column actually plotted (it used to say 'Guide' even for bars='Allele')
    ax.set_xlabel(bars)
    ax.set_ylabel(ylabel)

    # Save and show the plot; the legend sits outside the axes, so the saved
    # bounding box is expanded to keep it in the image.
    filename = generate_filename()
    fig.savefig(filename, bbox_inches='tight')
    plt.show()
    plt.close(fig)


# The nine mutually exclusive categories, stacked in this order.
all_numeric_cols = ['Desired excision',
                    'Undesired excision type I',
                    'Undesired excision type II',
                    'Indel type I',
                    'Indel type II',
                    'Indel type III',
                    'Indel Type IV',
                    'Inversion',
                    'Wild-type']


def _detailed_row_fractions(counts):
    """Divide each row's category counts by the row total; keep the label columns."""
    fractions = counts[all_numeric_cols].div(counts[all_numeric_cols].sum(axis=1), axis=0)
    for label_col in ('Cell Line', 'Guide', 'Allele'):
        fractions[label_col] = counts[label_col]
    return fractions


for cell_line in mut_exc_read_table['Cell Line'].unique():
    #extract the table for the cell line of interest
    tbl = mut_exc_read_table[mut_exc_read_table['Cell Line'] == cell_line]
    # Sum only the category counts. 'Guide' and 'Allele' come straight from the
    # groupby keys instead of being sliced back out of a string-summed 'Group'
    # column (the reset_index() result used to be discarded).
    tbl = tbl.groupby(['Guide', 'Allele'])[all_numeric_cols].sum().reset_index()
    tbl['Cell Line'] = cell_line

    tbl_wt = tbl[tbl['Allele'] == 'wt_']
    tbl_mut = tbl[tbl['Allele'] == 'mut']

    tbl_wt_norm = _detailed_row_fractions(tbl_wt) #normalizing the wt tbl
    tbl_mut_norm = _detailed_row_fractions(tbl_mut) #normalizing the mut tbl

    corse_stacked_bar(tbl_wt, all_numeric_cols, title = str(cell_line) + 'wt_', y_lim = (0,25))
    corse_stacked_bar(tbl_mut, all_numeric_cols, title = str(cell_line) + 'mut', y_lim = (0,25))
    corse_stacked_bar(tbl_wt_norm, all_numeric_cols, title = str(cell_line) + 'wt_')
    corse_stacked_bar(tbl_mut_norm, all_numeric_cols, title = str(cell_line) + 'mut')

numeric_cols = ['Desired excision', 'Undesired Excision', 'Inversion', 'Indel', 'Wildtype']


def _coarse_row_fractions(counts):
    """Divide each row's coarse counts by the row total; keep 'Group' and 'Guide'."""
    fractions = counts[numeric_cols].div(counts[numeric_cols].sum(axis=1), axis = 0)
    fractions["Group"] = counts["Group"]
    fractions["Guide"] = counts["Guide"]
    return fractions


# raw read counts: all wild-type alleles first, then all mutant alleles
for _counts, _title in ((c9p4_corse_wt, 'c9p4 WT'),
                        (c9p5_corse_wt, 'c9p5 WT'),
                        (c9p8_corse_wt, 'c9p8 WT'),
                        (c9p9_corse_wt, 'c9p9 WT'),
                        (c9p4_corse_mut, 'c9p4 MUT'),
                        (c9p5_corse_mut, 'c9p5 MUT'),
                        (c9p8_corse_mut, 'c9p8 MUT'),
                        (c9p9_corse_mut, 'c9p9 MUT')):
    corse_stacked_bar(_counts, numeric_cols, title = _title, y_lim = (0,25))

# per-guide fractions of the wild-type alleles
c9p4_corse_wt_norm = _coarse_row_fractions(c9p4_corse_wt)
c9p5_corse_wt_norm = _coarse_row_fractions(c9p5_corse_wt)
c9p8_corse_wt_norm = _coarse_row_fractions(c9p8_corse_wt)
c9p9_corse_wt_norm = _coarse_row_fractions(c9p9_corse_wt)

# per-guide fractions of the mutant alleles - each computed from its own MUT
# table (c9p5, c9p8 and c9p9 used to be normalized from the WT tables)
c9p4_corse_mut_norm = _coarse_row_fractions(c9p4_corse_mut)
c9p5_corse_mut_norm = _coarse_row_fractions(c9p5_corse_mut)
c9p8_corse_mut_norm = _coarse_row_fractions(c9p8_corse_mut)
c9p9_corse_mut_norm = _coarse_row_fractions(c9p9_corse_mut)

for _fractions, _title in ((c9p4_corse_wt_norm, 'c9p4 WT'),
                           (c9p5_corse_wt_norm, 'c9p5 WT'),
                           (c9p8_corse_wt_norm, 'c9p8 WT'),
                           (c9p9_corse_wt_norm, 'c9p9 WT'),
                           (c9p4_corse_mut_norm, 'c9p4 MUT'),
                           (c9p5_corse_mut_norm, 'c9p5 MUT'),
                           (c9p8_corse_mut_norm, 'c9p8 MUT'),
                           (c9p9_corse_mut_norm, 'c9p9 MUT')):
    corse_stacked_bar(_fractions, numeric_cols, title = _title)

#grouping them all together even more!
# Sum grouped_sum over everything except the guide, so there is one row per Guide.
all_outcome_cols = ['Desired excision', 'Undesired Excision', 'Inversion', 'Indel', 'Wildtype']
combined_stacked_bar_all = grouped_sum.groupby('Guide').sum().reset_index()
# FIX: display() was called without an argument and therefore showed nothing.
display(combined_stacked_bar_all)

corse_stacked_bar(combined_stacked_bar_all, all_outcome_cols, 'All Conditions', y_lim = (0,100))

# Divide every outcome column by the row total to get per-guide fractions.
row_sums = combined_stacked_bar_all[all_outcome_cols].sum(axis=1)
combined_stacked_bar_all_norm = combined_stacked_bar_all[all_outcome_cols].div(row_sums, axis=0)
combined_stacked_bar_all_norm['Guide'] = combined_stacked_bar_all['Guide']
# NOTE(review): 'Allele' here is whatever .sum() produced for that column after grouping by
# Guide only (for string labels presumably a concatenation) - confirm it is still wanted.
combined_stacked_bar_all_norm['Allele'] = combined_stacked_bar_all['Allele']
corse_stacked_bar(combined_stacked_bar_all_norm, numeric_cols, ylabel = '% occurence', title = 'Editing outcome Across Alleles and Patient Line', y_lim = (0,1.1))

# One pair of plots per cell line, with one bar per allele: first the summed values,
# then the same rows divided by their row total.
# The (Cell Line, Allele) aggregation does not depend on the loop variable, so it is
# computed once instead of on every iteration.
mut_wt_tbl = grouped_sum.groupby(['Cell Line', 'Allele']).sum().reset_index()
for cell_line in grouped_sum['Cell Line'].unique():
    # Rows of the aggregate that belong to this cell line (filtered once, reused below).
    cell_line_tbl = mut_wt_tbl[mut_wt_tbl['Cell Line'] == cell_line]
    corse_stacked_bar(cell_line_tbl,
                                 numeric_cols, bars = 'Allele', title = cell_line, y_lim = (0,80))

    row_sums = cell_line_tbl[['Desired excision', 'Undesired Excision', 'Inversion', 'Indel', 'Wildtype']].sum(axis=1)
    mut_wt_tbl_norm = cell_line_tbl[numeric_cols].div(row_sums, axis=0)
    mut_wt_tbl_norm['Allele'] = cell_line_tbl['Allele']

    corse_stacked_bar(mut_wt_tbl_norm,
                                 numeric_cols, bars = 'Allele', title = cell_line, y_lim = (0,1.1))


In [None]:
# Per-guide editing outcomes, split by allele and normalised to row fractions.
allele_outcome_cols = ['Desired excision', 'Undesired Excision', 'Inversion', 'Indel', 'Wildtype']

# NOTE(review): this frame is never populated in this cell; it is kept only so that any
# later reference to the name still resolves.
combined_stacked_bar_all_norm_mut = pd.DataFrame()

grouped_sum_mut = grouped_sum[grouped_sum['Allele'] == 'mut']
grouped_sum_wt = grouped_sum[grouped_sum['Allele'] == 'wt_']
combined_stacked_bar_all_mut = grouped_sum_mut.groupby('Guide').sum().reset_index()
combined_stacked_bar_all_wt = grouped_sum_wt.groupby('Guide').sum().reset_index()

row_sums_mut = combined_stacked_bar_all_mut[allele_outcome_cols].sum(axis=1)
row_sums_wt = combined_stacked_bar_all_wt[allele_outcome_cols].sum(axis=1)

combined_stacked_bar_all_mut_norm = combined_stacked_bar_all_mut[allele_outcome_cols].div(row_sums_mut, axis=0)
combined_stacked_bar_all_wt_norm = combined_stacked_bar_all_wt[allele_outcome_cols].div(row_sums_wt, axis=0)
# FIX: take the Guide labels from the same per-allele table the fractions were computed
# from. They were previously copied from the all-allele table, which is aligned by row
# index and can mislabel rows (or give NaN) if the guide sets differ.
combined_stacked_bar_all_mut_norm['Guide'] = combined_stacked_bar_all_mut['Guide']
combined_stacked_bar_all_wt_norm['Guide'] = combined_stacked_bar_all_wt['Guide']

corse_stacked_bar(combined_stacked_bar_all_mut_norm, numeric_cols, ylabel = '% occurence', title = 'Editing outcome across Patient Lines (mut)', y_lim = (0,1.1))
corse_stacked_bar(combined_stacked_bar_all_wt_norm, numeric_cols, ylabel = '% occurence', title = 'Editing outcome Across Alleles and Patient Line (wt)', y_lim = (0,1.1))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


def make_bar_chart_combined(tuple_lst_1, tuple_lst_2, tuple_lst_3, title):
    """Print the element-wise mean and standard deviation of three equally shaped inputs.

    Args:
        tuple_lst_1, tuple_lst_2, tuple_lst_3: array-likes of identical shape (np.stack
            raises ValueError otherwise).
        title: currently unused; kept for interface compatibility.

    Returns:
        None. The statistics are only printed.
    """
    # Convert lists of tuples into numpy arrays
    array_1 = np.array(tuple_lst_1)
    array_2 = np.array(tuple_lst_2)
    array_3 = np.array(tuple_lst_3)

    # Stack along a new leading axis so that axis 0 indexes the three inputs
    stacked_arrays = np.stack([array_1, array_2, array_3], axis=0)

    # FIX: reduce over axis 0 only. Without `axis`, np.mean/np.std collapse the whole stack
    # to one scalar instead of giving one value per element across the three inputs.
    avg_tuples = np.mean(stacked_arrays, axis=0)
    sd_tuples = np.std(stacked_arrays, axis=0)

    # Print averages and standard deviations
    print("Average Tuples:\n", avg_tuples)
    print("Standard Deviation Tuples:\n", sd_tuples)

def make_bar_chart(tuple_lst, title):
    """Draw one stacked bar per guide from the first eight values stored for each guide.

    Args:
        tuple_lst: mapping with keys 'g4', 'g11', 'g12', 'g13'; each value is indexed at
            positions 0-7, one per entry of `categories` (presumably read counts, going by
            the y-axis label - confirm against the caller).
        title: text used as the plot title.
    """
    #might just do the inversions manually....,
    groups = ['g4', 'g11', 'g12', 'g13']
    categories = ['Intronic Excision',
                  "Excision with overlap in Exon1",
                  "Excision with overlap in Exon2",
                  "Excision with overlap in Exon1 and Exon2",
                  "Intronic Deletion",
                  "Deletion in Exon1",
                  "Deletion in Exon2",
                  "No Deletions"]

    # Green palette, one shade per category (light to dark).
    colors = ['#d0f0c0', '#90ee90', '#3cb371', '#00fa9a',
              '#2e8b57', '#228b22', '#556b2f', '#006400']

    # stacked[i][j] is the value of category i for guide j.
    stacked = [[tuple_lst[guide][cat_idx] for guide in groups]
               for cat_idx in range(len(categories))]

    # Running top of each guide's bar; every new layer starts where the previous one ended.
    bottoms = [0] * len(groups)
    for layer_no, (layer, shade) in enumerate(zip(stacked, colors), start=1):
        plt.bar(groups, layer, bottom=bottoms, color=shade,
                label=f'Segment {layer_no}', edgecolor='black', linewidth=1.4)
        bottoms = [base + height for base, height in zip(bottoms, layer)]

    plt.xlabel('Guide')
    plt.ylabel('Number of Reads')
    plt.title(title)

    # The legend is relabelled with the category names and placed outside the axes.
    plt.legend(categories,
               loc='upper right',
               frameon=False,
               bbox_to_anchor=(1.8, 1))

    plt.xticks(rotation=45)
    plt.show()



# make_bar_chart(c9p4_mutant, 'c9p4_mutant')
# make_bar_chart(c9p4_wt, 'c9p4_wildtype')

# make_bar_chart(c9p8_mutant, 'c9p8_mutant')
# make_bar_chart(c9p8_wt, 'c9p8_wildtype')

# make_bar_chart(c9p5_mutant, 'c9p5_mutant')
# make_bar_chart(c9p5_wt, 'c9p5_wildtype')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


def make_bar_chart_combined(tuple_lst_1, tuple_lst_2, tuple_lst_3, title):
    """Print the element-wise mean and standard deviation of three equally shaped inputs.

    NOTE(review): a function with this exact name is also defined in an earlier cell; this
    later definition is the one in effect once both cells have run.

    Args:
        tuple_lst_1, tuple_lst_2, tuple_lst_3: array-likes of identical shape (np.stack
            raises ValueError otherwise).
        title: currently unused; kept for interface compatibility.

    Returns:
        None. The statistics are only printed.
    """
    # Convert lists of tuples into numpy arrays
    array_1 = np.array(tuple_lst_1)
    array_2 = np.array(tuple_lst_2)
    array_3 = np.array(tuple_lst_3)

    # Stack along a new leading axis so that axis 0 indexes the three inputs
    stacked_arrays = np.stack([array_1, array_2, array_3], axis=0)

    # FIX: reduce over axis 0 only. Without `axis`, np.mean/np.std collapse the whole stack
    # to one scalar instead of giving one value per element across the three inputs.
    avg_tuples = np.mean(stacked_arrays, axis=0)
    sd_tuples = np.std(stacked_arrays, axis=0)

    # Print averages and standard deviations
    print("Average Tuples:\n", avg_tuples)
    print("Standard Deviation Tuples:\n", sd_tuples)
    
#normalizing by the total number of reads (which isn't a lot).....
def make_bar_chart(tuple_lst, title):
    """Draw one stacked bar per guide, each category shown as a percentage of that guide's total.

    NOTE(review): a function with this exact name is also defined in an earlier cell (raw
    values, green palette); this later definition is the one in effect once both cells ran.

    Args:
        tuple_lst: mapping with keys 'g4', 'g11', 'g12', 'g13'; each value is indexed at
            positions 0-7, one per entry of `categories` (presumably read counts - confirm
            against the caller).
        title: text used as the plot title.
    """
    #might just do the inversions manually....,
    groups = ['g4', 'g11', 'g12', 'g13']
    categories = ['Intronic Excision',
                  "Excision with overlap in Exon1",
                  "Excision with overlap in Exon2",
                 "Excision with overlap in Exon1 and Exon2",
                 "Intronic Deletion",
                 "Deletion in Exon1",
                 "Deletion in Exon2",
                 "No Deletions"]

    # Pinkish palette, one colour per category. (The green palette that used to be defined
    # first was overwritten before use and has been removed.)
    colors = [
        '#FFC0CB',  # Soft Pink
        '#FFB6C1',  # Light Pink
        '#FF69B4',  # Pink
        '#FF1493',  # Hot Pink
        '#FF3385',  # Deep Pink
        '#C71585',  # Medium Violet Red
        '#DB7093',  # Pale Violet Red
        '#DA70D6'   # Orchid
    ]

    n_categories = len(categories)

    # FIX: the per-guide totals were hard-coded to 1, so nothing was normalised although
    # the axis is labelled "Percent of Reads". Use the sum of the plotted categories so
    # each bar adds up to 100.
    totals = {guide: sum(tuple_lst[guide][i] for i in range(n_categories)) for guide in groups}

    def _percent(guide, cat_idx):
        # A guide with no values at all gets an empty bar instead of a ZeroDivisionError.
        if not totals[guide]:
            return 0
        return 100 * tuple_lst[guide][cat_idx] / totals[guide]

    # stacked[i][j] is the percentage of category i for guide j.
    stacked = [[_percent(guide, cat_idx) for guide in groups] for cat_idx in range(n_categories)]

    # Initialize bottom heights
    bottoms = [0] * len(groups)

    # Plotting the stacked bar graph
    for i in range(len(stacked)):
        plt.bar(groups, stacked[i], bottom=bottoms, color=colors[i], label=f'Segment {i+1}', edgecolor = 'black', linewidth = 1.4)
        # Update bottoms to include the height of the newly plotted segment
        bottoms = [b + s for b, s in zip(bottoms, stacked[i])]

    # Adding labels and title
    plt.xlabel('Guide')
    plt.ylabel('Percent of Reads')
    plt.title(title)

    # Adding a legend (relabelled with the category names, placed outside the axes)
    plt.legend(categories,
              loc = 'upper right',
              frameon=False,
              bbox_to_anchor = (1.8,1))

    # Display the graph
    plt.xticks(rotation=45)
    plt.show()


# Draw one chart per (patient line, allele) table with make_bar_chart; two of the input
# tables are also printed as a sanity check before plotting.
print(c9p4_mutant)

make_bar_chart(c9p4_mutant, 'c9p4_mutant')
make_bar_chart(c9p4_wt, 'c9p4_wildtype')

make_bar_chart(c9p8_mutant, 'c9p8_mutant')
print(c9p8_wt)
make_bar_chart(c9p8_wt, 'c9p8_wildtype')

make_bar_chart(c9p5_mutant, 'c9p5_mutant')
make_bar_chart(c9p5_wt, 'c9p5_wildtype')


# TODO: exclude the reversions from the "No Deletions" category - they need to be sifted
# through independently.
# Empty placeholders; nothing in this cell fills them.
c9p4_reversions = []
c9p5_reversions = []
c9p8_reversions = []

c9p4_shifts = []
c9p5_shifts = []
c9p8_shifts = []

In [None]:
def find_interval(inbam, pos1, pos2, contig="chr9", region_start=27557179, region_end=27580459):
    """Collect, for each read, the read bases aligned to the reference interval [pos1, pos2).

    Given the interval of a known heterozygous variant, this returns the nucleotides each
    read carries there, which allows the phasing to be verified (e.g. after splitting a BAM
    into WT and MUT, a SNP known to sit on one allele should show up in only one of them).

    Args:
        inbam: object with a pysam-style ``fetch(contig, start, end)`` that yields reads
            exposing ``reference_start``, ``cigartuples`` and ``query_sequence``.
        pos1: interval start on the reference, in the same coordinate system as
            ``read.reference_start``.
        pos2: interval end on the reference (exclusive).
        contig, region_start, region_end: region passed to ``fetch``; the defaults are the
            values that used to be hard-coded.

    Returns:
        A list of strings, one per read that has at least one aligned base inside the
        interval. Reference positions deleted or skipped in a read contribute no bases.
        Reads with no CIGAR, no sequence, or no aligned base in the interval are left out.

    Raises:
        ValueError: on a CIGAR operation code that is not 0-8.
    """
    sequences = []

    for read in inbam.fetch(contig, region_start, region_end):
        cigar = read.cigartuples
        query = read.query_sequence
        # Reads without alignment details or stored sequence cannot be inspected.
        if not cigar or query is None:
            continue

        # the bases traversed - where we are in the read
        l_read = 0
        # where we are in the reference; reference_start already excludes soft clipping,
        # so soft clips below only advance the read offset
        l = read.reference_start

        pieces = []
        for op, length in cigar:
            if l >= pos2:
                # Everything from here on lies past the interval.
                break

            if op in (0, 7, 8):  # match / sequence equal / mismatch: consume read and reference
                # Only these operations pair read bases with reference positions, so only
                # they may contribute. (Testing `l + length > pos1` for every op type used
                # to return inserted or clipped bases, or bases from reads that start past
                # the interval.)
                overlap_start = max(l, pos1)
                overlap_end = min(l + length, pos2)
                if overlap_start < overlap_end:
                    offset = l_read + (overlap_start - l)
                    pieces.append(query[offset:offset + (overlap_end - overlap_start)])
                l_read += length
                l += length
            elif op in (1, 4):  # insertion / soft clip: consume read only
                l_read += length
            elif op in (2, 3):  # deletion / skipped region: consume reference only
                l += length
            elif op in (5, 6):  # hard clip / padding: consume neither
                pass
            else:
                raise ValueError(f"Unknown CIGAR operation code {op}")

        if pieces:
            sequences.append("".join(pieces))
    return sequences

In [None]:
#verifying the phasing for wt and mutant
c9p8_g13_wt = pysam.AlignmentFile("c9p8_samples_merged/WT_Biosample_12.fofn.pbmm2.repeats.bam%.bam.merged.bam", "rb" )
# FIX: this handle previously opened the WT file a second time, so the "mutant" check was
# run on WT reads.
# NOTE(review): the MUTANT_ file name is inferred from the WT_/MUTANT_ naming used for the
# other merged BAMs in this notebook - confirm it exists for Biosample 12.
c9p8_g13_mutant = pysam.AlignmentFile("c9p8_samples_merged/MUTANT_Biosample_12.fofn.pbmm2.repeats.bam%.bam.merged.bam", "rb" )
# Keep both results; they used to share one name, so the WT result was overwritten.
# NOTE(review): the two calls use different start coordinates (27574803 vs 27574804) -
# confirm whether that is intentional.
result_wt = find_interval(c9p8_g13_wt, 27574803, 27574806)
result_mutant = find_interval(c9p8_g13_mutant, 27574804, 27574806)
# `result` keeps its previous meaning (outcome of the last call) for any later cell.
result = result_mutant

In [None]:
#another categorization method
#many of the indels are actually sequencing errors
#settled on categorizing into desired excisions, undesired excisions (exonic overlap + intronic), inversions and a neither category
#table to be used in PRISM plotting
corse_outcome_cols = ['Desired excision', 'Undesired excision type', 'Inversion', 'No Excision or Inversion']

# Collect one record per read and build the frame once at the end. Growing the frame with
# pd.concat inside the loop copied the whole table on every iteration.
corse_records = []
for _, row in read_table.iterrows():
    # Each read gets exactly one category; the order of the checks is the priority.
    if row['inversion']:
        outcome = 'Inversion'
    # undesired excision type I (overlaps with an exon)
    elif row['excision which overlaps with ex1'] or row['excision which overlaps with ex2'] or row['excision which overlaps with ex1 and ex2']:
        outcome = 'Undesired excision type'
    elif row['Intronic Excision?']:
        outcome = 'Desired excision'
    else:
        outcome = 'No Excision or Inversion'

    record = {'Group': row['group'], 'Name': row['name']}
    # One-hot encode the chosen category across the outcome columns.
    for col in corse_outcome_cols:
        record[col] = 1 if col == outcome else 0
    corse_records.append(record)

# Passing `columns` keeps the column layout even when read_table has no rows.
corse_tbl = pd.DataFrame(corse_records, columns=['Group', 'Name'] + corse_outcome_cols)

In [None]:
# Per-group totals of the one-hot outcome columns. The 'Name' column holds read names;
# summing it would concatenate them into one long string per group, so it is dropped first.
corse_tbl.drop(columns='Name').groupby('Group').sum()

In [None]:
# Rows whose Group label contains "mut" (case-insensitive); a missing Group counts as no match.
mut_rows = corse_tbl.loc[corse_tbl['Group'].str.contains('mut', case=False, na=False)]
mut_rows

In [None]:
# Rows whose Group label contains "wt" (case-insensitive); a missing Group counts as no match.
wt_rows = corse_tbl.loc[corse_tbl['Group'].str.contains('wt', case=False, na=False)]
wt_rows

In [None]:
# Open the merged c9p8 MUTANT Biosample 9 alignment file in "rb" mode.
# The handle is not closed in this cell.
c9p8_al = pysam.AlignmentFile('c9p8_samples_merged/MUTANT_Biosample_9.fofn.pbmm2.repeats.bam%.bam.merged.bam', "rb")

In [None]:
import pandas as pd

#summation over reads - for use in final table
# 'Name' holds read names; summing it would concatenate them into one long string per
# group, so it is dropped before aggregating.
grouped = mut_rows.drop(columns='Name').groupby('Group').sum()
n = 3  # number of trailing characters of the group name used as the sort key
# Sort on a temporary helper column and drop it again, leaving `grouped` itself untouched
# so the cell can be re-run safely.
grouped_sorted = (
    grouped
    .assign(GroupLastChars=grouped.index.str[-n:])
    .sort_values(by='GroupLastChars')
    .drop(columns=['GroupLastChars'])
)
display(grouped_sorted)
