In [1]:
# import packages
import pandas as pd
import os
os.environ['R_HOME'] = '/Users/geba9152/miniconda3/envs/rpy2_env/lib/R'
import rpy2.rinterface

# loads ipython extension
%load_ext rpy2.ipython

06/03/2024

I am performing a preliminary analysis for my F31 grant. Essentially I want to identify...

Paper for validated gene-enhancer pairs: `Nasser, J. et al. Genome-wide enhancer maps link risk variants to disease genes. Nature 593, 238–243 (2021)`

1. The number of eRNAs present in run-on regions
2. Distance from PAS to nearest eRNA contained within run-on region
3. Correlation analysis with mT taken into account

In [6]:
%%R
# Tab-delimited table with a header row; strings stay character vectors (no factors).
# NOTE(review): the file name looks like a Nature 2021 supplementary table - confirm provenance.
enhancer_gene_path <- "/scratch/Users/geba9152/LIET-summer2024/correlation-erna-anaysis/41586_2021_3446_MOESM7_ESM.txt"
data <- read.table(
    enhancer_gene_path,
    header = TRUE,
    sep = "\t",
    stringsAsFactors = FALSE
)

# Cell types covered by the table.
# Candidates that are also present in DBNascent:
#   1. K562, 2. Jurkat, 3. THP, 4. LNCAP, 5. GM12878 (only 3 samples, though)
unique(data$CellType)

# Plan: start with all the K562 data that has already been processed for LIET


 [1] "K562"                        "NCCIT"                      
 [3] "PrimaryHepatocytes"          "BJAB_anti-IgM_anti-CD40_4hr"
 [5] "BJAB"                        "Jurkat_anti-CD3_PMA_4hr"    
 [7] "Jurkat"                      "THP1"                       
 [9] "THP1_LPS_4hr"                "GM12878"                    
[11] "LNCAP"                      


In [12]:
## I queried the whole database so that the K562 SRRs I have already run LIET on
## can be mapped back to their respective papers.
# See query below
query_results_path = "/Users/geba9152/summer_2024_3prime/queries/all-human-PRO-GRO-query.txt"
all_srrs = pd.read_csv(query_results_path, sep="\t")
all_srrs.columns


Index(['srr', 'sample_name', 'replicate', 'single_paired', 'rcomp', 'unusable',
       'trim_read_depth', 'sample_qc_score', 'paper_id', 'sample_id',
       'protocol', 'paper_name', 'organism', 'cell_type', 'sample_id.1',
       'condition_type', 'treatment', 'conc_intens', 'start_time', 'end_time',
       'duration', 'duration_unit'],
      dtype='object')

# First try: correlation analysis on the K562 samples for which LIET has already been run on the preliminary gene set

### SQL Query for this analysis- querying everything & filtering for samples I want

```
# Password is prompted for interactively (-p); never store credentials in the notebook
mysql --host=socotra.int.colorado.edu --user=geba9152 -p dbnascent -e "\
SELECT
    sampleEquiv.srr,
    samples.sample_name,
    samples.replicate,
    samples.single_paired,
    samples.rcomp,
    samples.unusable,
    samples.trim_read_depth,
    samples.sample_qc_score,
    linkIDs.paper_id,
    linkIDs.sample_id,
    papers.protocol,
    papers.paper_name,
    organisms.organism,
    genetics.cell_type,
    conditionLink.sample_id,
    conditions.condition_type,
    conditions.treatment,
    conditions.conc_intens,
    conditions.start_time,
    conditions.end_time,
    conditions.duration,
    conditions.duration_unit
FROM
    sampleEquiv
    INNER JOIN samples ON samples.id = sampleEquiv.sample_id
    INNER JOIN conditionLink ON samples.id = conditionLink.sample_id
    INNER JOIN conditions ON conditionLink.condition_id = conditions.id
    INNER JOIN linkIDs ON linkIDs.sample_id = samples.id
    INNER JOIN papers ON papers.id = linkIDs.paper_id
    INNER JOIN genetics ON genetics.id = linkIDs.genetic_id
    INNER JOIN organisms ON genetics.organism_id = organisms.id
WHERE
    (papers.protocol = 'PRO-seq' OR papers.protocol = 'GRO-seq')
    AND organisms.organism = 'H. sapiens'
ORDER BY
    genetics.cell_type;" > /Users/geba9152/summer_2024_3prime/queries/all-human-PRO-GRO-query.txt
```

In [14]:
# K562 samples that LIET has been run on so far
k562s = [
    "SRR11793825", "SRR11793826",
    "SRR12083664", "SRR12083665",
    "SRR4454567", "SRR4454568",
    "SRR5364303", "SRR5364304",
    "SRR8137173",
    "SRR8669162", "SRR8669163",
    "SRZ1554311",
]

# Keep only the two columns needed to map each sample back to its paper,
# then show the rows whose sample name is one of the K562 samples above.
all_srrs = all_srrs.loc[:, ['sample_name', 'paper_name']]
all_srrs.loc[all_srrs['sample_name'].isin(k562s)]

Unnamed: 0,sample_name,paper_name
1112,SRR12083665,Blumberg2021characterizing
1113,SRR8137173,Wang2019identification
1116,SRR4454568,Vihervaara2017transcriptional
1117,SRR4454567,Vihervaara2017transcriptional
1121,SRR11793826,Judd2020unpublished
1122,SRR11793825,Judd2020unpublished
1127,SRR12083664,Blumberg2021characterizing
1129,SRZ1554311,Core2014analysis
1130,SRZ1554311,Core2014analysis
1131,SRR5364303,Dukler2017nascent


**K562 papers & associated SRRs (that LIET has been run on so far)^^:**

Now I am going to make a `bash` script to pull TFit and Dreg calls for each paper

I think I will do the muMerge calls **first** by paper and **second** by condition.

All of these samples have no treatment, so we can do mu merge by papers only

### Make mumerge input files

In [31]:
# Which bidirectional caller's output is fed to muMerge. Exactly one
# (dreg_tfit, prefix) pair should be active; swap the comments to switch.
# `prefix` is the text that follows the SRR in each call file's name
# (the path is built as <srr><prefix>.bed).

# dreg
# dreg_tfit = 'dreg'
# prefix = '.sorted.dREG.full.covfiltered'

# tfit
dreg_tfit = 'tfit'
prefix = '.sorted_split_bidir_cov_filtered'

# Table of SRR -> paper assignments passed to make_mumerge_file. It was previously
# assigned only inside the commented-out dreg settings, which left `input_file`
# undefined on a fresh kernel.
# NOTE(review): assumed to be the same table for both callers - confirm.
input_file = '/scratch/Users/geba9152/LIET-summer2024/correlation-erna-anaysis/paper_names.txt'


In [32]:
def make_mumerge_file(dreg_tfit, prefix, input_file,
                      base_dir='/scratch/Users/geba9152/LIET-summer2024/correlation-erna-anaysis'):
    """Write the tab-separated muMerge input table for one bidirectional caller.

    Parameters
    ----------
    dreg_tfit : str
        Caller label. Used as the sub-directory of ``base_dir`` that holds the
        call files and as part of the output file name.
    prefix : str
        Text placed between the SRR and the ``.bed`` extension in each call
        file name (despite the name, it follows the SRR).
    input_file : str
        Path to a tab-separated file with at least the columns ``srr`` and
        ``paper``.
    base_dir : str, optional
        Directory that contains the ``<dreg_tfit>/`` call files and receives
        the output table. Defaults to the original hard-coded location.

    Returns
    -------
    None
        The table is written to ``<base_dir>/mumerge-<dreg_tfit>-input.txt``
        with the header ``file``, ``sampid``, ``group``.

    Raises
    ------
    KeyError
        If ``input_file`` lacks an ``srr`` or ``paper`` column.
    """
    df = pd.read_csv(input_file, sep='\t')

    # Running number of samples seen per paper; gives sample1, sample2, ...
    # independently within each paper.
    sample_counts = {}

    # Header first, then one line per input row (input order is preserved).
    output_lines = ['file\tsampid\tgroup']
    for srr, paper in zip(df['srr'], df['paper']):
        sample_counts[paper] = sample_counts.get(paper, 0) + 1

        sample_id = f'sample{sample_counts[paper]}'
        file_path = f'{base_dir}/{dreg_tfit}/{srr}{prefix}.bed'
        # Replace spaces with underscores to avoid issues with file names
        group = paper.replace(' ', '_')
        output_lines.append(f'{file_path}\t{sample_id}\t{group}')

    output_file = f'{base_dir}/mumerge-{dreg_tfit}-input.txt'
    with open(output_file, 'w') as f:
        f.write('\n'.join(output_lines) + '\n')

# Write the mumerge input table; dreg_tfit, prefix and input_file must already be defined in the kernel.
make_mumerge_file(dreg_tfit, prefix, input_file)

### Make BED files spanning PAS − 1 kb to mT + 2 sigma
- I will intersect this with the mumerged bidir file

In [10]:
# K562 samples to process
srrs = [
    "SRR11793825",
    "SRR11793826",
    "SRR12083664",
    "SRR12083665",
    "SRR4454567",
    "SRR4454568",
    "SRR5364303",
    "SRR5364304",
    "SRR8137173",
    "SRR8669162",
    "SRR8669163",
    "SRZ1554311",
]

# Directory prefix for the LIET results; make_intersect_bed appends "<srr>.liet" to it,
# so the trailing slash is required.
filepath = '/scratch/Users/geba9152/LIET-summer2024/k562-batch/results/tight-sT/'


In [11]:
def make_intersect_bed(srrs, filepath):
    """Unfinished stub for building the per-sample BED files to intersect.

    At present this only assembles the ``<filepath><srr>.liet`` path for each
    entry of ``srrs``; nothing is read, written, or returned.

    Parameters
    ----------
    srrs : iterable
        Sample accessions; each one is substituted into the ``.liet`` file name.
    filepath : str
        Prefix placed directly before the accession (no separator is added).
    """
    # Expected LIET result file for every sample (not used yet).
    liet_paths = ["{}{}.liet".format(filepath, srr) for srr in srrs]

    # TODO: finish the implementation. The earlier draft (previously kept here as
    # commented-out code) planned to:
    #   - optionally discover the .liet / log / config files with glob.glob and
    #     take the cell type from os.path.basename(lietfile).split('.')[0]
    #   - parse each .liet file with FitParse(lietfile)
    #   - collect, per gene, a table with the columns
    #       Gene,
    #       mL, mL_std, sL, sL_std, tI, tI_std,
    #       mT, mT_std, sT, sT_std, w, w_std,
    #       mL_a, mL_a_std, sL_a, sL_a_std, tI_a, tI_a_std, w_a, w_a_std
    #   - return that table as a DataFrame
    

# Call the helper with the SRR list and LIET results directory defined in the previous cell.
make_intersect_bed(srrs, filepath)

ParserError: Error tokenizing data. C error: Expected 1 fields in line 5, saw 10
