In [None]:
# Clear this cell's output area; wait=True defers the clear until new output arrives.
from IPython.display import clear_output
clear_output(wait=True)
# Run the `%reset -f` magic programmatically: the interactive namespace is wiped
# without a confirmation prompt, so every name used below must be (re)defined
# by the cells that follow.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.instance().run_line_magic('reset', '-f')

In [None]:
import argparse
import pysam
from pathlib import Path
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import seaborn as sns

# NOTE(review): `strandedness` is not read by any cell visible here — confirm before removing.
strandedness = []
# Handle on the hg38 reference FASTA (path is relative to the working directory).
# NOTE(review): `fasta` is not used by any cell visible here and is never closed.
fasta = pysam.FastaFile("hg38.fa")
# Suppresses every Python warning for the rest of the session.
# NOTE(review): this also hides pandas warnings; consider narrowing the filter.
warnings.filterwarnings("ignore")

In [None]:
import pysam
from pathlib import Path

# BED files listing the candidate guide sites, one per condition.
altc, alt7, rex4, rexc = (
    f'{guide}_circle_seq_3_output.bed' for guide in ('altc', 'alt7', 'rex4', 'rexc')
)


def _bam_dir(condition, subdir):
    """Return HG38_NEW/<condition>_HG38_NEW_FILTER_MPQ20/<subdir> as a Path."""
    return Path('HG38_NEW', f'{condition}_HG38_NEW_FILTER_MPQ20', subdir)


# Directories holding the per-region BAMs of the edited replicates.
ALT7_1_BAMs, ALT7_2_BAMs = _bam_dir('ALT7', 'rep1'), _bam_dir('ALT7', 'rep2')
ALTC_1_BAMs, ALTC_2_BAMs = _bam_dir('ALTC', 'rep1'), _bam_dir('ALTC', 'rep2')
REXC_1_BAMs, REXC_2_BAMs = _bam_dir('REXC', 'rep1'), _bam_dir('REXC', 'rep2')
# The REX4 replicates are named 2 and 3 but are stored in the rep1 / rep2 folders.
REX4_2_BAMs, REX4_3_BAMs = _bam_dir('REX4', 'rep1'), _bam_dir('REX4', 'rep2')

# Control ('nunc') directories, one per condition
# (presumably the no-nucleofection samples — confirm).
ALTC_NO_NUNC = _bam_dir('ALTC', 'nunc')
REX4_NO_NUNC = _bam_dir('REX4', 'nunc')
ALT7_NO_NUNC = _bam_dir('ALT7', 'nunc')
REXC_NO_NUNC = _bam_dir('REXC', 'nunc')

def read_bam_files_to_dict(directory):
    """Map every ``*.bam`` file name in ``directory`` to its resolved path.

    Args:
        directory: ``pathlib.Path`` of the folder to scan (not recursive).

    Returns:
        dict: ``{file name: resolved Path}``, inserted in ascending file-name
        order. No BAM file is opened here; callers open the paths themselves.
    """
    bam_paths = list(directory.glob('*.bam'))
    bam_paths.sort(key=lambda path: path.name)
    return {path.name: path.resolve() for path in bam_paths}

# Index the per-region BAM files of every directory as {file name: resolved path}.

# ALTC edited replicates
bam_files_AltC_1 = read_bam_files_to_dict(ALTC_1_BAMs)
bam_files_AltC_2 = read_bam_files_to_dict(ALTC_2_BAMs)

# ALT7 edited replicates
bam_files_Alt7_1 = read_bam_files_to_dict(ALT7_1_BAMs)
bam_files_Alt7_2 = read_bam_files_to_dict(ALT7_2_BAMs)

# REX4 edited replicates (named 2 and 3)
bam_files_REX4_2 = read_bam_files_to_dict(REX4_2_BAMs)
bam_files_REX4_3 = read_bam_files_to_dict(REX4_3_BAMs)

# REXC edited replicates
bam_files_REXC_1 = read_bam_files_to_dict(REXC_1_BAMs)
bam_files_REXC_2 = read_bam_files_to_dict(REXC_2_BAMs)

# Control ('nunc') BAMs, one dict per condition
bam_files_NO_NUNC_ALTC = read_bam_files_to_dict(ALTC_NO_NUNC)
bam_files_NO_NUNC_ALT7 = read_bam_files_to_dict(ALT7_NO_NUNC)
bam_files_NO_NUNC_REX4 = read_bam_files_to_dict(REX4_NO_NUNC)
bam_files_NO_NUNC_REXC = read_bam_files_to_dict(REXC_NO_NUNC)


# Edited BAMs to analyse: condition -> [replicate dict, replicate dict]
all_bams = {'ALTC': [bam_files_AltC_1, bam_files_AltC_2],
            'ALT7': [bam_files_Alt7_1, bam_files_Alt7_2],
            'REX4': [bam_files_REX4_2, bam_files_REX4_3],
            'REXC': [bam_files_REXC_1, bam_files_REXC_2]}

# Control BAMs: condition -> dict of per-region BAM paths
all_controls = {'ALTC':bam_files_NO_NUNC_ALTC,
                'ALT7':bam_files_NO_NUNC_ALT7,
                'REX4':bam_files_NO_NUNC_REX4,
                'REXC':bam_files_NO_NUNC_REXC}

# Tab-separated site tables (one row per candidate location), read without a header row.
# NOTE(review): ALTC is read with seven column names (extra 'name') while the other three
# use six — confirm the BED files really differ. If a file has more fields than names,
# pandas may use the leading field(s) as the index, which would shift 'chrom'/'start'/'end'.
# The trailing [:] is a full slice of the freshly read frame.
all_tsvs = {'ALTC':pd.read_csv(altc, sep='\t',  names=['chrom','start','end','name','na','na2','REGION'])[:],
                'ALT7':pd.read_csv(alt7, sep='\t',  names=['chrom','start','end','na','na2','REGION'])[:],
                'REX4':pd.read_csv(rex4, sep='\t',  names=['chrom','start','end','na','na2','REGION'])[:],
                'REXC':pd.read_csv(rexc, sep='\t',  names=['chrom','start','end','na','na2','REGION'])[:]}

# Overwrite REGION with the synthetic file name 'REGION_<row position>.bam' so each row
# can be matched against the BAM file names used as keys in the dicts above.
for condition in all_tsvs:
    all_tsvs[condition]['REGION'] = ['REGION_' + str(i) + '.bam' for i in np.arange(0,  all_tsvs[condition]['REGION'].shape[0])]




In [None]:
# NOTE(review): module-level placeholder; no cell visible here reads or reassigns it.
the_read = None

def find_deletion_locs(inbam, chrom, start_loc, end_loc,strand):
    """Collect, read by read, the deletions and insertions around one target interval.

    Each read returned by ``inbam.fetch(chrom, start_loc, end_loc)`` is walked
    along its CIGAR while tracking the current reference position.

    Deletions (CIGAR op 2) covering ``[pos, pos + length)`` are recorded as:

    * ``"deletions extending passed region"`` when the deletion strictly spans
      ``start_loc`` or ``end_loc``;
    * ``"deletions"`` when its start or its end lies within
      ``[start_loc, end_loc]`` (inclusive).

    Insertions (CIGAR op 1) are recorded once when either ``pos`` or
    ``pos + length`` lies strictly inside ``(start_loc, end_loc)``.

    Changes relative to the previous implementation:

    * an unmapped read (or one without a CIGAR) is skipped instead of aborting
      the scan of all remaining reads;
    * events are kept for every read, including reads whose alignment ends at
      or before ``end_loc`` (previously they were only kept when the read ran
      past ``end_loc``);
    * an insertion that satisfies both window tests is recorded once, not twice;
    * soft clips never advance the reference position (they consume query bases
      only), which also handles a hard clip preceding the soft clip.

    Args:
        inbam: open alignment file exposing ``fetch(chrom, start, end)``, e.g. a
            ``pysam.AlignmentFile``. The caller owns it and must close it.
        chrom: reference name of the target interval.
        start_loc: interval start, in the coordinate system of
            ``read.reference_start``.
        end_loc: interval end, same coordinate system.
        strand: ``'+'``, ``'-'`` or ``'unknown'``; all three are classified
            identically. Any other value disables deletion recording
            (insertions are still recorded), as before.

    Returns:
        tuple: ``(deletions, insertions, indels)``. Each is a list with one
        entry per read that had at least one event of that kind; every entry is
        a list of ``[label, length, (pos, pos + length)]`` records for deletions
        and insertions, and ``["indels", signed_length]`` records for indels
        (negative for deletions, positive for insertions).

    Raises:
        ValueError: if a CIGAR operation code outside 0-8 is encountered.
    """
    # CIGAR operation codes as exposed by pysam's ``cigartuples``.
    OP_MATCH, OP_INS, OP_DEL, OP_SKIP, OP_SOFT, OP_HARD, OP_PAD, OP_EQ, OP_DIFF = range(9)
    known_ops = frozenset(range(9))
    ref_consuming_ops = frozenset((OP_MATCH, OP_DEL, OP_SKIP, OP_EQ, OP_DIFF))

    # Only these strand values ever produced deletion records.
    track_deletions = strand in ('-', '+', 'unknown')

    deletions = []
    insertions = []
    indels = []

    for read in inbam.fetch(chrom, start_loc, end_loc):
        cigar = read.cigartuples
        if read.is_unmapped or not cigar:
            # Nothing to walk for this read; keep scanning the others.
            continue

        deletion_lst = []
        insertion_lst = []
        indel_lst = []

        # Current position on the reference while walking the CIGAR.
        pos = read.reference_start

        for op, length in cigar:
            if op not in known_ops:
                raise ValueError(f"Unknown CIGAR operation code {op}")

            op_end = pos + length

            if op == OP_DEL and track_deletions:
                if pos < start_loc < op_end or pos < end_loc < op_end:
                    label = "deletions extending passed region"
                elif start_loc <= pos <= end_loc or start_loc <= op_end <= end_loc:
                    label = "deletions"
                else:
                    label = None
                if label is not None:
                    deletion_lst.append([label, length, (pos, op_end)])
                    indel_lst.append(["indels", -1 * length])
            elif op == OP_INS:
                if start_loc < op_end < end_loc or start_loc < pos < end_loc:
                    insertion_lst.append(["insertions", length, (pos, op_end)])
                    indel_lst.append(["indels", length])

            if op in ref_consuming_ops:
                pos = op_end

            # Past the interval: no later operation can satisfy any test above.
            if pos > end_loc:
                break

        # Flush once per read, whether or not the read ran past end_loc.
        if deletion_lst:
            deletions.append(deletion_lst)
        if insertion_lst:
            insertions.append(insertion_lst)
        if indel_lst:
            indels.append(indel_lst)

    return deletions, insertions, indels


def construct_df(locations, bams, del_lst, ins_lst, indel_lst):
    """Annotate a copy of the site table with read counts and indel summaries.

    For every region in ``bams`` the matching row of ``locations`` (matched on
    the ``REGION`` column) receives:

    * ``total_reads`` / ``forward_reads`` / ``reverse_reads``: reads returned by
      ``fetch(chrom, start, end)`` on that region's BAM, counted in one pass;
    * stringified per-read lists of deletion / insertion lengths and
      coordinates, plus the number of reads carrying each event type.
      ``'deletion lengths'`` / ``'deletion coords'`` include deletions labelled
      as extending past the region, because the label filter is a substring
      match on ``'deletions'``;
    * the ``'% reads with an ...'`` columns, which hold the plain ratio
      ``reads with event / total_reads`` (not multiplied by 100), or ``None``
      when the region has no reads.

    Rows whose region has no entry in ``bams`` are left without values in the
    added columns.

    Args:
        locations: DataFrame with at least ``REGION``, ``chrom``, ``start`` and
            ``end`` columns. It is copied, never modified.
        bams: dict mapping region name to the path of that region's BAM file.
        del_lst: dict mapping region name to the ``deletions`` list produced by
            ``find_deletion_locs``.
        ins_lst: same, for the ``insertions`` list.
        indel_lst: same, for the ``indels`` list.

    Returns:
        pandas.DataFrame: the annotated copy of ``locations``.

    Raises:
        IndexError: if a region in ``bams`` has no row in ``locations``.
        KeyError: if a region in ``bams`` is missing from one of the event dicts.
    """
    locations = locations.copy()

    for processed, (region, bam_path) in enumerate(bams.items()):
        # Coarse progress indicator.
        if processed % 1000 == 0:
            print(processed)

        # One boolean scan per region, reused for the lookup and every assignment.
        row_mask = locations['REGION'] == region
        chrom = locations.loc[row_mask, 'chrom'].iloc[0]
        start = int(locations.loc[row_mask, 'start'].iloc[0])
        end = int(locations.loc[row_mask, 'end'].iloc[0])

        deletions = del_lst[region]
        insertions = ins_lst[region]
        indels = indel_lst[region]

        # One inner list per read that carried at least one event.
        deletion_lengths = [
            [deletion[1] for deletion in read if 'deletions' in deletion[0]]
            for read in deletions
        ]
        deletion_coords = [
            [deletion[2] for deletion in read if 'deletions' in deletion[0]]
            for read in deletions
        ]
        indel_lengths = [
            [indel[1] for indel in read if 'indel' in indel[0]]
            for read in indels
        ]
        insertion_lengths = [
            [insertion[1] for insertion in read if 'insertions' in insertion[0]]
            for read in insertions
        ]
        insertion_coords = [
            [insertion[2] for insertion in read if 'insertions' in insertion[0]]
            for read in insertions
        ]

        # Flat lists: one element per deletion that crosses a region boundary.
        extending_lengths = [
            deletion[1] for read in deletions for deletion in read if 'extending' in deletion[0]
        ]
        extending_coords = [
            deletion[2] for read in deletions for deletion in read if 'extending' in deletion[0]
        ]

        num_deletions = len(deletion_lengths)
        num_insertions = len(insertion_coords)
        num_indels = len(indel_lengths)
        num_extending = len(extending_coords)

        # Single pass over the region; the context manager closes the file even on error.
        total_reads = forward_reads = reverse_reads = 0
        with pysam.AlignmentFile(bam_path, "rb") as inbam:
            for read in inbam.fetch(chrom, start, end):
                total_reads += 1
                if read.is_reverse:
                    reverse_reads += 1
                else:
                    forward_reads += 1

        if total_reads != 0:
            frac_deletion = num_deletions / total_reads
            frac_insertion = num_insertions / total_reads
            frac_indel = num_indels / total_reads
        else:
            frac_deletion = frac_insertion = frac_indel = None

        # Insertion order of this dict fixes the order of the added columns.
        updates = {
            'total_reads': total_reads,
            'forward_reads': forward_reads,
            'reverse_reads': reverse_reads,
            'deletion lengths': str(deletion_lengths),
            'deletion coords': str(deletion_coords),
            'number of reads with deletion(s)': str(num_deletions),
            'insertion lengths': str(insertion_lengths),
            'insertion coords': str(insertion_coords),
            'number of reads with insertion(s)': str(num_insertions),
            'number of reads with indels': str(num_indels),
            'deletion extending past region lengths': str(extending_lengths),
            'deletion extending past region coords': str(extending_coords),
            'number of reads with deletion extending past region': str(num_extending),
            '% reads with an deletion': frac_deletion,
            '% reads with an insertion': frac_insertion,
            '% reads with an indel': frac_indel,
        }
        for column, value in updates.items():
            locations.loc[row_mask, column] = value

    return locations

In [None]:
from Bio import pairwise2
from Bio.Seq import Seq
import logging

# Strand is not determined for any site: every table carries the placeholder 'unknown'.
for condition in ('ALTC', 'ALT7', 'REXC', 'REX4'):
    condition_tsv = all_tsvs[condition]
    condition_tsv['strandedness'] = 'unknown'


In [None]:
# Result holders: condition -> [replicate 0 dict, replicate 1 dict], each keyed by
# region BAM file name and filled with the lists returned by find_deletion_locs.
all_del_dict = {'ALTC':[{},{}],'ALT7':[{},{}],'REXC':[{},{}],'REX4':[{},{}]}
all_ins_dict = {'ALTC':[{},{}],'ALT7':[{},{}],'REXC':[{},{}],'REX4':[{},{}]}
all_ind_dict = {'ALTC':[{},{}],'ALT7':[{},{}],'REXC':[{},{}],'REX4':[{},{}]}

# Strand is not inferred; find_deletion_locs classifies 'unknown' like '+'.
strand = 'unknown'

# Walk every condition, replicate and per-region BAM, and store the events found.
for condition in ['REXC', 'REX4', 'ALTC', 'ALT7']:
    tsv = all_tsvs[condition]
    for replicate_num, replicate in enumerate(all_bams[condition]):
        for count, (bam, bam_path) in enumerate(replicate.items()):
            # Coarse progress indicator.
            if count % 1000 == 0:
                print(count)

            # Single lookup of the site row (IndexError if the region is not in the table).
            site = tsv.loc[tsv['REGION'] == bam].iloc[0]
            chrom, start, end = site['chrom'], int(site['start']), int(site['end'])

            # The context manager closes the BAM even if the scan raises.
            with pysam.AlignmentFile(bam_path, "rb") as bam_file:
                region_dels, region_ins, region_indels = find_deletion_locs(
                    bam_file, chrom, start, end, strand)

            all_del_dict[condition][replicate_num][bam] = region_dels
            all_ins_dict[condition][replicate_num][bam] = region_ins
            all_ind_dict[condition][replicate_num][bam] = region_indels

In [None]:
# Result holders for the control samples: condition -> {region BAM file name: events}.
all_del_dict_wt = {'ALTC':{},'ALT7':{},'REXC':{},'REX4':{}}
all_ins_dict_wt = {'ALTC':{},'ALT7':{},'REXC':{},'REX4':{}}
all_ind_dict_wt = {'ALTC':{},'ALT7':{},'REXC':{},'REX4':{}}

# Defined here explicitly: this cell previously relied on `strand` leaking out of
# the edited-replicate loop, so it failed if that cell had not run first.
strand = 'unknown'

for condition in ["ALTC", "REXC", "REX4", "ALT7"]:
    tsv = all_tsvs[condition]

    for count, (bam, bam_path) in enumerate(all_controls[condition].items()):
        # Coarse progress indicator.
        if count % 1000 == 0:
            print(condition)
            print(count)

        # Single lookup of the site row (IndexError if the region is not in the table).
        site = tsv.loc[tsv['REGION'] == bam].iloc[0]
        chrom, start, end = site['chrom'], int(site['start']), int(site['end'])

        # The context manager closes the BAM even if the scan raises.
        with pysam.AlignmentFile(bam_path, "rb") as bam_file:
            region_dels, region_ins, region_indels = find_deletion_locs(
                bam_file, chrom, start, end, strand)

        all_del_dict_wt[condition][bam] = region_dels
        all_ins_dict_wt[condition][bam] = region_ins
        all_ind_dict_wt[condition][bam] = region_indels

In [None]:
# Result tables: two edited replicates per condition, one control table per condition.
edited_dfs = {name: [None, None] for name in ('ALTC', 'ALT7', 'REXC', 'REX4')}
control_dfs = dict.fromkeys(('ALTC', 'ALT7', 'REXC', 'REX4'))

# Build the annotated table of every edited replicate.
for condition in ('ALTC', 'ALT7', 'REXC', 'REX4'):
    for replicate in range(2):
        edited_dfs[condition][replicate] = construct_df(
            all_tsvs[condition],
            all_bams[condition][replicate],
            all_del_dict[condition][replicate],
            all_ins_dict[condition][replicate],
            all_ind_dict[condition][replicate],
        )

In [None]:
# Build the annotated table of the control sample of every condition.
for condition in ('ALTC', 'ALT7', 'REXC', 'REX4'):
    control_dfs[condition] = construct_df(
        locations=all_tsvs[condition],
        bams=all_controls[condition],
        del_lst=all_del_dict_wt[condition],
        ins_lst=all_ins_dict_wt[condition],
        indel_lst=all_ind_dict_wt[condition],
    )

In [None]:
# Save the per-replicate edited tables, one CSV per (condition, replicate).
for condition in ('ALTC', 'ALT7', 'REXC', 'REX4'):
    for replicate in (0, 1):
        edited_csv = f'{condition}consensus_indel_rates_hg38_new_con_filtered_mp20{replicate}.csv'
        edited_dfs[condition][replicate].to_csv(edited_csv, index=False)


# Show and save the control table of every condition.
for condition in ('ALTC', 'ALT7', 'REXC', 'REX4'):
    df = control_dfs[condition]
    display(df)
    control_csv = f'{condition}consensus_indel_rates_control_hg38_new_con_filtered_mp20.csv'
    df.to_csv(control_csv, index=False)

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import fisher_exact
import scipy.stats as stats


def construct_comp_tbl(wt, edited):
    """Compare indel counts between a control table and an edited table, per region.

    For every row of ``wt`` the row of ``edited`` with the same ``REGION`` is
    looked up (previously the rows were paired by position, which silently
    compared different regions whenever the two tables were not identically
    ordered and indexed). A 2x2 contingency table

        [[wt reads with indels,    edited reads with indels],
         [wt reads without indels, edited reads without indels]]

    is tested with Fisher's exact test (two-sided, scipy default).

    Args:
        wt: control DataFrame with ``REGION``, ``chrom``, ``start``, ``end``,
            ``number of reads with indels``, ``total_reads`` and
            ``% reads with an indel`` columns.
        edited: edited DataFrame with the same columns.

    Returns:
        pandas.DataFrame: one row per row of ``wt``, in the same order, with the
        columns ``REGION``, ``chrom``, ``start``, ``end`` (all taken from
        ``wt``), ``edited indel %``, ``wt indel %``, ``total edited reads``,
        ``total wt reads``, ``edited reads with indels `` (the trailing space is
        kept so existing CSV consumers keep working), ``wt reads with indels``
        and ``indel p-value``.

    Raises:
        KeyError: if a region of ``wt`` is absent from ``edited``.
        ValueError: if a count needed for the contingency table is missing (NaN).
    """
    columns = [
        'REGION', 'chrom', 'start', 'end',
        'edited indel %', 'wt indel %',
        'total edited reads', 'total wt reads',
        'edited reads with indels ', 'wt reads with indels',
        'indel p-value',
    ]

    # Index the edited rows by region; the first occurrence wins on duplicates.
    edited_by_region = {}
    for _, edited_row in edited.iterrows():
        edited_by_region.setdefault(edited_row['REGION'], edited_row)

    # Collect plain records and build the frame once instead of filling it cell by cell.
    records = []
    for _, wt_row in wt.iterrows():
        region = wt_row['REGION']
        if region not in edited_by_region:
            raise KeyError(f"{region!r} is in the control table but missing from the edited table")
        edited_row = edited_by_region[region]

        wt_indels = int(wt_row['number of reads with indels'])
        wt_total = int(wt_row['total_reads'])
        edited_indels = int(edited_row['number of reads with indels'])
        edited_total = int(edited_row['total_reads'])

        contingency = [
            [wt_indels, edited_indels],
            [wt_total - wt_indels, edited_total - edited_indels],
        ]
        _, p_value = stats.fisher_exact(contingency)

        records.append({
            'REGION': region,
            'chrom': wt_row['chrom'],
            'start': wt_row['start'],
            'end': wt_row['end'],
            'edited indel %': edited_row['% reads with an indel'],
            'wt indel %': wt_row['% reads with an indel'],
            'total edited reads': edited_row['total_reads'],
            'total wt reads': wt_row['total_reads'],
            'edited reads with indels ': edited_row['number of reads with indels'],
            'wt reads with indels': wt_row['number of reads with indels'],
            'indel p-value': p_value,
        })

    return pd.DataFrame(records, columns=columns)

In [None]:
# One comparison table per (condition, replicate): control vs. edited.
comparison_tables = {name: [None, None] for name in ('ALTC', 'ALT7', 'REX4', 'REXC')}

for condition in ('ALTC', 'ALT7', 'REXC', 'REX4'):
    control_df = control_dfs[condition]
    for replicate in (0, 1):
        display(control_df)
        comparison = construct_comp_tbl(control_df, edited_dfs[condition][replicate])
        # Tag every row with the replicate it came from.
        comparison['trial'] = replicate
        comparison_tables[condition][replicate] = comparison

In [None]:
# Persist every comparison table (the DataFrame index is written as the first column).
for condition in ('ALTC', 'ALT7', 'REXC', 'REX4'):
    for replicate, comparison in enumerate(comparison_tables[condition][:2]):
        comparison_csv = f"{condition}_{replicate}consensus_comparison_hg38_new_con_filtered_mp20.csv"
        comparison.to_csv(comparison_csv)

In [None]:
# Reload the comparison tables from the CSVs written by the previous cell, so the
# downstream cells can be re-run without recomputing them.
# The file name now matches the one that cell writes; the old name
# ("..._hg38_con_filtered.csv") is never produced by this notebook, so it either
# failed on a fresh run or silently loaded results from an earlier analysis.
# index_col=0 restores the saved index instead of adding an 'Unnamed: 0' column.
for condition in ['ALT7', 'ALTC', 'REXC', 'REX4']:
    for replicate in range(2):
        comparison_csv = f"{condition}_{replicate}consensus_comparison_hg38_new_con_filtered_mp20.csv"
        comparison_tables[condition][replicate] = pd.read_csv(comparison_csv, index_col=0)
        comparison_tables[condition][replicate]['trial'] = replicate

In [None]:
# Stack the two replicate tables of every condition, preview them, and save the result.
merged_comparison_tables = dict.fromkeys(('ALTC', 'ALT7', 'REX4', 'REXC'))

for condition in ('ALTC', 'ALT7', 'REXC', 'REX4'):
    first_replicate, second_replicate = comparison_tables[condition][0], comparison_tables[condition][1]
    merged = pd.concat([first_replicate, second_replicate])
    merged_comparison_tables[condition] = merged

    # Spot-check: the rows of REGION_0 from both replicates, then each replicate table.
    display(merged[merged['REGION'] == 'REGION_0.bam'])
    display(first_replicate)
    display(second_replicate)

    merged.to_csv(f"{condition}consensus_merged_comparisond_con_new_hg38_filtered_mp20.csv")

In [None]:
def plot_scatter(tbl, on_target_index=167,
                 on_target_label="Edited Region (ALT7 Guide - chr9:27575447-27575470 )",
                 title="Indel Percentage Comparison: Non-Edited vs. Edited (Replicate 1)"):
    """Scatter the control vs. edited indel rate of every region, highlighting one row.

    All rows except the highlighted one are drawn as translucent black points;
    the highlighted (on-target) row is drawn once, larger and in red.
    Previously the background series dropped row 0 regardless of which row was
    highlighted, so row 0 was never drawn and the highlighted row was drawn twice.

    The plotted values are the ``"wt indel %"`` / ``"edited indel %"`` columns
    as stored in ``tbl``; they are not rescaled here.

    Args:
        tbl: DataFrame with ``"wt indel %"`` and ``"edited indel %"`` columns.
        on_target_index: positional index of the row to highlight (negative
            values count from the end). Defaults to the previously hard-coded 167.
        on_target_label: legend label of the highlighted point.
        title: figure title.

    Raises:
        IndexError: if ``on_target_index`` is outside the table.
    """
    wt_values = tbl["wt indel %"].to_list()
    edited_values = tbl["edited indel %"].to_list()

    n_rows = len(wt_values)
    if not -n_rows <= on_target_index < n_rows:
        raise IndexError(
            f"on_target_index {on_target_index} is out of range for a table with {n_rows} rows")
    target = on_target_index % n_rows

    # Every row except the highlighted one.
    background = [i for i in range(n_rows) if i != target]

    plt.figure(figsize=(8,6))

    sns.scatterplot(
        x=[wt_values[i] for i in background],
        y=[edited_values[i] for i in background],
        color='black',
        alpha=0.5
    )

    # The highlighted row: drawn once, larger and in red.
    sns.scatterplot(
        x=[wt_values[target]],
        y=[edited_values[target]],
        color='red',
        s=100,
        label=on_target_label
    )

    plt.xlabel("Non-Edited Indel Percentage (%)", fontsize=12)
    plt.ylabel("Edited Indel Percentage (%)", fontsize=12)
    plt.title(title, fontsize=14)

    plt.legend()
    plt.show()

# Pick the comparison table to plot (here REX4, replicate index 0); change as needed.
# NOTE(review): plot_scatter highlights row 167 and labels it as the ALT7 guide —
# confirm both match the table selected here.
tbl = comparison_tables['REX4'][0]
display(tbl)
plot_scatter(tbl)

In [None]:
import matplotlib.pyplot as plt
import ast
import numpy as np  # For histogram calculations

# Condition and region whose deletion lengths are plotted; change as needed.
HIST_CONDITION = 'REX4'
HIST_REGION = 'REGION_14.bam'


def _deletion_lengths_for_region(df, region):
    """Return every deletion length stored for ``region`` in ``df`` as a flat list.

    The 'deletion lengths' cell holds the string form of a list with one inner
    list per read; it is parsed with ``ast.literal_eval`` and flattened, so a
    read carrying several deletions contributes all of them (previously only
    the first deletion of each read was kept). A missing region or an empty
    cell yields an empty list.
    """
    cells = df.loc[df['REGION'] == region, 'deletion lengths'].to_list()
    if len(cells) == 1 and isinstance(cells[0], str):
        per_read = ast.literal_eval(cells[0])
    else:
        # Already-parsed cells are used as they are; anything else (e.g. NaN) is ignored.
        per_read = [cell for cell in cells if isinstance(cell, (list, tuple))]
    return [length for read_lengths in per_read for length in read_lengths]


control_deletion_lengths = _deletion_lengths_for_region(control_dfs[HIST_CONDITION], HIST_REGION)
edited_deletion_lengths_1 = _deletion_lengths_for_region(edited_dfs[HIST_CONDITION][0], HIST_REGION)
edited_deletion_lengths_2 = _deletion_lengths_for_region(edited_dfs[HIST_CONDITION][1], HIST_REGION)

all_deletion_lengths = edited_deletion_lengths_1 + edited_deletion_lengths_2 + control_deletion_lengths

if not all_deletion_lengths:
    print(f"No deletions recorded for {HIST_REGION} in {HIST_CONDITION}; nothing to plot.")
else:
    # One bin per integer length, centred on the integer: edges at k - 0.5 and k + 0.5.
    # Integer edges range(min, max + 1) merged the two largest lengths into the last
    # (closed) bin and produced no bin at all when min == max.
    bin_edges = np.arange(min(all_deletion_lengths) - 0.5, max(all_deletion_lengths) + 1.5, 1.0)

    control_hist, _ = np.histogram(control_deletion_lengths, bins=bin_edges)
    edited_hist_1, _ = np.histogram(edited_deletion_lengths_1, bins=bin_edges)
    edited_hist_2, _ = np.histogram(edited_deletion_lengths_2, bins=bin_edges)

    width = 0.3  # Width of each bar; three bars share one unit-wide bin.
    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2  # Equal to the integer lengths.

    plt.figure(figsize=(10, 5))

    # Three bars per length, side by side: control, edited replicate 1, edited replicate 2.
    plt.bar(bin_centers - width, control_hist, width=width, alpha=0.7, color='red', edgecolor='black', label='Control')
    plt.bar(bin_centers, edited_hist_1, width=width, alpha=0.7, color='blue', edgecolor='black', label='Edited 1')
    plt.bar(bin_centers + width, edited_hist_2, width=width, alpha=0.7, color='cyan', edgecolor='black', label='Edited 2')

    plt.xlabel("Deletion Length (bp)")
    plt.ylabel("Frequency")
    plt.title(f"Deletion Lengths On Target Region - {HIST_CONDITION}")
    plt.legend()

    plt.show()