In [1]:
import pandas as pd
import numpy as np
import sys
import os
import pysam
import pyranges as pr

import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.axes_grid1 import make_axes_locatable
from matplotlib.colors import LogNorm, Normalize

# Load the mappings to V5 constructs

In [6]:
bam_path = "/scratch/indikar_root/indikar1/cstansbu/HSC/v5_tagged/all_reads_factor_mapped.bam"

# Build one record per alignment in the V5-construct BAM.
# The splits below assume query names shaped like
# "<barcode>_<umi>#<read_name>" followed by a 2-character suffix that is dropped.
v5 = []

# The context manager closes the BAM handle once iteration finishes (or fails);
# previously the file was left open for the life of the kernel.
with pysam.AlignmentFile(bam_path, "rb") as bamfile:
    for read in bamfile:
        # `query_name` is the supported attribute; `qname` is a deprecated alias
        qname = read.query_name
        barcode = qname.split('_')[0]
        umi = qname.split('_')[1].split("#")[0]
        read_name = qname.split("#")[1][:-2]

        row = {
            'barcode' : barcode,
            'umi' : umi,
            'read_name' : read_name,
            'forward' : read.is_forward,
            'mapping_quality' : read.mapping_quality,
            'query_length' : read.query_length,
            'query_name' : qname,
            'reference_name' : read.reference_name,
            'reference_start' : read.reference_start,
            'reference_end' : read.reference_end,
        }
        v5.append(row)

v5 = pd.DataFrame(v5)

# Drop records with no reference assigned and records with mapping quality 0
v5 = v5[v5['reference_name'].notna()]
v5 = v5[v5['mapping_quality'] > 0]

# Sort best mapping quality first so that drop_duplicates (which keeps the
# first occurrence) retains the highest-quality row per
# (barcode, umi, reference_name) combination
v5 = v5.sort_values(by='mapping_quality', ascending=False)
v5 = v5.drop_duplicates(subset=['barcode', 'umi', 'reference_name'])

print(f"{v5.shape=}")
v5.head()

v5.shape=(38672, 10)


Unnamed: 0,barcode,umi,read_name,forward,mapping_quality,query_length,query_name,reference_name,reference_start,reference_end
42389,CGAAGGAAGTGTACAA,CCTACGGCCCGC,2454610c-8369-4117-a3db-a66d03a69a8a,False,60,610,CGAAGGAAGTGTACAA_CCTACGGCCCGC#2454610c-8369-41...,STAT5A_tre-3ltr,2655,2746.0
11,CGTAATGCACAAATCC,GCCAGTGCGCTT,a4b1f06b-aabd-41a9-bbbe-988d66b2a575,True,60,845,CGTAATGCACAAATCC_GCCAGTGCGCTT#a4b1f06b-aabd-41...,FOS_tre-3ltr,0,684.0
12,ACCTACCTCCATTCGC,AATCAGCACTAC,b638a499-a273-40d5-978a-2c5ec7545322,True,60,1011,ACCTACCTCCATTCGC_AATCAGCACTAC#b638a499-a273-40...,FOS_tre-3ltr,0,766.0
42350,TATCAGGTCCCAGGAC,GGTCATTCACGG,8706740d-3bca-493c-a8c8-39d9c26dcbee,True,60,409,TATCAGGTCCCAGGAC_GGTCATTCACGG#8706740d-3bca-49...,STAT5A_tre-3ltr,2470,2747.0
42351,GTCCCATAGGTCACAG,TCACAGGGATCG,ba8621fa-2aa6-4f98-8bcd-c8354857854c,False,60,357,GTCCCATAGGTCACAG_TCACAGGGATCG#ba8621fa-2aa6-4f...,STAT5A_tre-3ltr,2481,2747.0


In [7]:
# Number of distinct barcodes in `v5`, as a fraction of 8500.
# NOTE(review): 8500 is presumably the expected number of cells/barcodes — confirm and name it.
v5['barcode'].nunique() / 8500

0.774235294117647

In [8]:
# Row count per reference_name in `v5`; dropna=False also counts missing names
v5['reference_name'].value_counts(dropna=False)

reference_name
FOS_tre-3ltr       23163
REL_tre-3ltr       11664
GATA2_tre-3ltr      3667
STAT5A_tre-3ltr      178
Name: count, dtype: int64

In [5]:
# NOTE(review): a bare `break` outside a loop is a SyntaxError; presumably left
# here on purpose to stop "Run All" at this point — confirm.
break

SyntaxError: 'break' outside loop (668683560.py, line 1)

# Factor Tables

In [38]:
# Read the factor table CSV into `df` and display it
fpath = "/scratch/indikar_root/indikar1/cstansbu/HSC/v5_tagged/v5_result.factor_table.csv"
df = pd.read_csv(filepath_or_buffer=fpath)

df

Unnamed: 0,barcode,umi,read_name,patternName,strand,start,end
0,CATAGACAGGATGCGT,GATAGTGTGTGC,f5a43d5b-f95e-41fc-a1fa-391af47efa14,attb2_hpgk,+,1055,1677
1,TCATGTTCATAACTCG,GGCTGACATCCT,6cf07e9c-af31-41b5-804c-b5276cb93013,tTRE_promoter,+,1568,1878
2,TTCACGCTCATCGTAG,GATAACTCCATA,8b01f79b-2f0d-491f-bbf3-364a028c5bac,attb2_hpgk,+,1269,1891
3,AAAGGGCCAGAGTTCT,TACTGGCGTACC,4158d354-7385-4df6-82f2-97a7ace07f5f,tTRE_promoter,+,229,539
4,AGGTAGGTCCCTCATG,CCGGTACCTCGA,9dd0af2c-b466-48b3-866c-8142d3134c30,attb2_hpgk,-,2219,2841
5,GAAATGAGTTCTGACA,GGTTCACTATAA,29128759-0a43-4d7a-b4af-8dd533910f21,attb2_hpgk,+,302,924
6,CCTCTCCTCCCGTTCA,GAGTACCGGTAT,313836b9-2eac-4589-a487-12edecaaa0fe,attb2_hpgk,+,655,1277
7,ATCCTATAGTATGATG,CGGGTAGAATTT,e7f5e150-6e78-413b-a3bb-42ac9c8c09fb,attb2_hpgk,+,260,882


# Load the mappings to the reference loci

In [None]:
# load the GTF
gtf_path = "/scratch/indikar_root/indikar1/cstansbu/HSC/references/annotations.gtf"
gf = pr.read_gtf(gtf_path)
gdf = gf.df
print(f"{gdf.shape=}")
print(gdf.columns)
gdf.head()

In [None]:
# get the gtf records we care about
TFs = [
    'GATA2', 
    'GFI1B', 
    'FOS', 
    'STAT5A',
    'REL',  
]
print(f"{len(TFs)=}")

genes = gdf.copy()
genes = gdf[gdf['Feature'].isin(['gene'])]
genes = genes[genes['gene_name'].isin(TFs)]

genes = genes.groupby(['gene_name', 'Feature']).agg(
    Chromosome = ('Chromosome', 'first'),
    Strand = ('Strand', 'first'),
    Start = ('Start', 'min'),
    End = ('End', 'max'),
).reset_index(drop=False)


genes['Length'] = genes['End'] - genes['Start']

genes

In [None]:
bam_path = "/scratch/indikar_root/indikar1/cstansbu/HSC/merged/merged.bam"
buffer_bp = 1000 # base pair fudge factor

# When True, stop after the first row of `genes`. This keeps the behaviour of
# the bare `break` that ended the gene loop; set to False to collect reads for
# every gene in `genes`.
first_gene_only = True

# Homopolymer probes used to flag reads containing a poly-A / poly-T stretch
# (built once instead of once per read)
polyA = "A" * 7
polyT = "T" * 7

res = []

# The context manager closes the BAM handle when the loop finishes or fails
with pysam.AlignmentFile(bam_path, "rb") as bamfile:
    for _, gene_rec in genes.iterrows():
        chrom = gene_rec['Chromosome']
        # Pad the gene interval on both sides; clamp at 0 because a negative
        # start is not a valid fetch coordinate
        start = max(0, int(gene_rec['Start']) - buffer_bp)
        end = int(gene_rec['End']) + buffer_bp

        for read in bamfile.fetch(chrom, start, end):
            # Secondary alignments are not recorded
            if read.is_secondary:
                continue

            # Query names are assumed to look like
            # "<barcode>_<umi>#<read_name>" plus a 2-character suffix.
            # `query_name` is the supported attribute (`qname` is deprecated).
            qname = read.query_name
            barcode = qname.split('_')[0]
            umi = qname.split('_')[1].split("#")[0]
            read_name = qname.split("#")[1][:-2]

            # query_sequence can be None when the record stores no sequence;
            # treat that as "no homopolymer found" instead of raising TypeError
            seq = read.query_sequence or ""

            row = {
                'barcode' : barcode,
                'umi' : umi,
                'read_name' : read_name,
                'forward' : read.is_forward,
                'mapping_quality' : read.mapping_quality,
                'gene_name' : gene_rec['gene_name'],
                'query_length' : read.query_length,
                'query_name' : qname,
                'reference_start' : read.reference_start,
                'reference_end' : read.reference_end,
                'has_polyA' : polyA in seq,
                'has_polyT' : polyT in seq,
            }
            res.append(row)

        if first_gene_only:
            break

res = pd.DataFrame(res)
print(f"{res.shape=}")
res

In [None]:
# NOTE(review): a bare `break` outside a loop is a SyntaxError; presumably left
# here on purpose to stop "Run All" at this point — confirm.
break

In [None]:
# Distribution of mapping quality, split by target gene.
# `res` is built with a 'gene_name' column and has no 'reference_name' column,
# so hue='reference_name' could not be resolved; 'gene_name' is the grouping
# column that actually exists.
sns.histplot(data=res, 
             x='mapping_quality',
             hue='gene_name')

In [None]:
# Number of distinct barcodes in `res`, as a fraction of 8500.
# NOTE(review): 8500 is presumably the expected number of cells/barcodes — confirm and name it.
res['barcode'].nunique() / 8500

In [None]:
# Row count per target gene. `res` has a 'gene_name' column but no
# 'reference_name' column, so the previous lookup raised KeyError.
res['gene_name'].value_counts(dropna=False)

In [None]:
# NOTE(review): a bare `break` outside a loop is a SyntaxError; presumably left
# here on purpose to stop "Run All" at this point — confirm.
break

In [None]:
# Read the tagged-read table into `df`; the file carries a .csv extension but
# is parsed as tab-separated
fpath = "/scratch/indikar_root/indikar1/cstansbu/HSC/v5_tagged/Gridr1.tagged.csv"
df = pd.read_csv(filepath_or_buffer=fpath, sep='\t')

# Report the dimensions, then preview the first five rows
print(f"{df.shape=}")
df.head(n=5)

In [None]:
# load the GTF
gtf_path = "/scratch/indikar_root/indikar1/cstansbu/HSC/references/annotations.gtf"
gf = pr.read_gtf(gtf_path)
gdf = gf.df
print(f"{gdf.shape=}")
print(gdf.columns)
gdf.head()

In [None]:
# Number of GTF records per value of the 'Feature' column
gdf['Feature'].value_counts()

In [None]:
# NOTE(review): a bare `break` outside a loop is a SyntaxError; presumably left
# here on purpose to stop "Run All" at this point — confirm.
break

In [None]:
# get the gtf records we care about
TFs = [
    'GATA2', 
    'GFI1B', 
    'FOS', 
    'STAT5A',
    'REL',  
]
print(f"{len(TFs)=}")

# Keep only gene-level records whose gene_name is one of the TFs.
# (A previous `genes = gdf.copy()` was removed: it copied the whole GTF frame
# and was overwritten on the very next line.)
genes = gdf[gdf['Feature'].isin(['gene'])]
genes = genes[genes['gene_name'].isin(TFs)]

# Collapse to one row per (gene_name, Feature) spanning min Start .. max End.
# observed=True keeps only key combinations that actually occur; this matters
# if a grouping column is categorical (unused categories would otherwise add
# empty, all-NaN groups) and has no effect on non-categorical keys.
genes = genes.groupby(['gene_name', 'Feature'], observed=True).agg(
    Chromosome = ('Chromosome', 'first'),
    Strand = ('Strand', 'first'),
    Start = ('Start', 'min'),
    End = ('End', 'max'),
).reset_index(drop=False)


# Span of each gene record in reference coordinates
genes['Length'] = genes['End'] - genes['Start']

genes

In [None]:
bam_path = "/scratch/indikar_root/indikar1/cstansbu/HSC/merged/merged.bam"
buffer_bp = 1000 # base pair fudge factor

# When True, stop after the first row of `genes`. This keeps the behaviour of
# the bare `break` that ended the gene loop; set to False to collect reads for
# every gene in `genes`.
first_gene_only = True

# Homopolymer probes used to flag reads containing a poly-A / poly-T stretch
# (built once instead of once per read)
polyA = "A" * 7
polyT = "T" * 7

res = []

# The context manager closes the BAM handle when the loop finishes or fails
with pysam.AlignmentFile(bam_path, "rb") as bamfile:
    for _, gene_rec in genes.iterrows():
        chrom = gene_rec['Chromosome']
        # Pad the gene interval on both sides; clamp at 0 because a negative
        # start is not a valid fetch coordinate
        start = max(0, int(gene_rec['Start']) - buffer_bp)
        end = int(gene_rec['End']) + buffer_bp

        for read in bamfile.fetch(chrom, start, end):
            # Secondary alignments are not recorded
            if read.is_secondary:
                continue

            # Query names are assumed to look like
            # "<barcode>_<umi>#<read_name>" plus a 2-character suffix.
            # `query_name` is the supported attribute (`qname` is deprecated).
            qname = read.query_name
            barcode = qname.split('_')[0]
            umi = qname.split('_')[1].split("#")[0]
            read_name = qname.split("#")[1][:-2]

            # query_sequence can be None when the record stores no sequence;
            # treat that as "no homopolymer found" instead of raising TypeError
            seq = read.query_sequence or ""

            row = {
                'barcode' : barcode,
                'umi' : umi,
                'read_name' : read_name,
                'forward' : read.is_forward,
                'mapping_quality' : read.mapping_quality,
                'gene_name' : gene_rec['gene_name'],
                'query_length' : read.query_length,
                'query_name' : qname,
                'reference_start' : read.reference_start,
                'reference_end' : read.reference_end,
                'has_polyA' : polyA in seq,
                'has_polyT' : polyT in seq,
            }
            res.append(row)

        if first_gene_only:
            break

res = pd.DataFrame(res)
print(f"{res.shape=}")
res

In [None]:
# Number of distinct barcodes among the collected reads
res['barcode'].nunique()

In [None]:
# (rows, columns) of the collected read table
res.shape

In [None]:
# Number of rows whose has_polyA flag is True
res['has_polyA'].sum()

In [None]:
# NOTE(review): hard-coded numbers; presumably the has_polyA count divided by
# the number of rows in `res` from an earlier run — confirm, and consider
# computing it from `res` (e.g. res['has_polyA'].mean()) so it cannot go stale.
21841 / 25533

In [None]:
# Mean reference span per row (reference_end minus reference_start)
(res['reference_end'] - res['reference_start']).mean()

In [None]:
# Show where a random subset of reads start (red) and end (blue) on the
# reference, one row per read.
# Cap the sample at the number of available rows (sampling more rows than exist
# without replacement raises) and fix the seed so the figure is reproducible.
n_plot = min(100, len(res))
pdf = res.sample(n_plot, random_state=0).reset_index(drop=True)

# Expose the 0..n-1 row position as an 'index' column to use as the y value
pdf = pdf.reset_index()

plt.rcParams['figure.dpi'] = 300
plt.rcParams['figure.figsize'] = 5, 3

sns.scatterplot(data=pdf,
                x='reference_start',
                y='index',
                color='r',
                label='reference_start')

sns.scatterplot(data=pdf,
                x='reference_end',
                y='index',
                color='b',
                label='reference_end')

# Vertical guide lines at two hard-coded reference coordinates.
# NOTE(review): presumably the FOS gene Start/End from the `genes` table — confirm.
locus_start, locus_end = 75278825, 75282230
plt.axvline(x=locus_start, zorder=0, c='r')
plt.axvline(x=locus_end, zorder=0, c='b')

# Both point sets share the x axis, so label it neutrally; otherwise the axis
# keeps the name of whichever column was plotted last
plt.xlabel("reference position")
plt.ylabel("sampled read")
plt.title("FOS")
