In [2]:
import os
import re
import pandas as pd
import numpy as np
import polars as pl
from natsort import natsorted

In [3]:
# Dictionary to hold strand pairs.
strand_pairs = {
    # GM12878
    "ENCFF074SXQ": "ENCFF164VLA", # from experiment ENCSR000AED; file format conversion: signal generation (alignment)
    "ENCFF546NVF": "ENCFF182LTN", # from experiment ENCSR00AED; file format conversion: file format conversion (signal generation and chromosome sizes)
    "ENCFF078ATR": "ENCFF037DUE", # from experiment ENCSR000AEF; file format conversion: signal generation (alignment)
    "ENCFF892WMR": "ENCFF985TNZ", # from experiment ENCSR00AEF; file format conversion: file format conversion (signal generation and chromosome sizes)

    # K562
    "ENCFF829PNJ": "ENCFF336COA", # from experiment ENCSR000AEM; file format conversion: signal generation (alignment)
    "ENCFF964BAP": "ENCFF006DQI", # from experiment ENCSR000AEM; signal generation: signal generation (signal generation and chromosome sizes) 
    "ENCFF777EAJ": "ENCFF040DXX", # from experiment ENCSR000AEO; signal generation: file format conversion (alignment)
    "ENCFF528VFJ": "ENCFF097ASF", # from experiment ENCSR000AEO; signal generation: file format conversion (signal generation and  chromosome sizes)
}

In [4]:
# Splitting .tab and .txt files into different arrays.
# Sorting each array first by whether it is a plus or minus and then by its cell line.
data = os.listdir("./data/RNASeq_bw")
tab_files = sorted([file for file in data if file.endswith(".tab") and "unstranded" not in file], key=lambda x: (x[x.find(".") + 1], x[0]))
coverage_files = sorted([file for file in data if file.endswith(".txt") and "unstranded" not in file], key=lambda x: (x[x.find(".") + 1], x[0]))

In [5]:
coverage_files = ['GM12878.minus.ENCFF074SXQ.coverage.txt', 'GM12878.plus.ENCFF164VLA.coverage.txt']

In [None]:
# Loop to combine the signals of the minus and plus strands into a single dataset for .tab files.
for i in range(len(tab_files) // 2):
    # Identify the cell line splicing the string up to the first "."
    # Identify the minus strand's and plus strand's name, the ENCFF*.
    cell_line = tab_files[i][ :tab_files[i].find(".")]
    key_file_name = re.search(r"E[a-zA-Z0-9]+", tab_files[i]).group(0)
    for j in range(len(tab_files) // 2, len(tab_files)):
        value_file_name = re.search(r"E[a-zA-Z0-9]+", tab_files[j]).group(0)

        # Create dataframes for our plus and minus strand data.
        # Rename the columns, so the data will actually add.
        if strand_pairs[key_file_name] == value_file_name:
            key_df = pd.read_csv(f"./data/RNASeq_bw/{tab_files[i]}", sep="\t", skiprows=3, header=None)
            value_df = pd.read_csv(f"./data/RNASeq_bw/{tab_files[j]}", sep="\t", skiprows=2)

            res_df = key_df.add(value_df)
            res_df.to_csv(f"/home/coder/data-REU/stranded_files/{cell_line}.stranded.{value_file_name}.{key_file_name}_values_TSS.tab", sep="\t", index=False)

KeyboardInterrupt: 

In [6]:
coverage_files

['GM12878.minus.ENCFF074SXQ.coverage.txt',
 'GM12878.plus.ENCFF164VLA.coverage.txt']

In [13]:
# Loop to combine the signals of the minus and plus strands into a single dataset for .txt files.
for i in range(len(coverage_files) // 2):
    # Identify the cell line splicing the string up to the first "."
    # Identify the minus strand's and plus strand's name, the ENCFF*.
    cell_line = coverage_files[i][ :coverage_files[i].find(".")]
    key_file_name = re.search(r"E[a-zA-Z0-9]+", coverage_files[i]).group(0)
    for j in range(len(coverage_files) // 2, len(coverage_files)):
        value_file_name = re.search(r"E[a-zA-Z0-9]+", coverage_files[j]).group(0)
        # Create dataframes for our plus and minus strand data.
        # Rename the columns, so the data will actually add.
        if strand_pairs[key_file_name] == value_file_name:
            key_df = pd.read_csv(f"./data/RNASeq_bw/{coverage_files[i]}", sep="\t", header=None)
            value_df = pd.read_csv(f"./data/RNASeq_bw/{coverage_files[j]}", sep="\t", header=None)

            # Stack key_df and value_df on top of each other.
            stacked_df = pd.concat([key_df, value_df], ignore_index=True)

            # Perform a natural sort on stacked_df on the chromosome, as well as the start and end locations.
            sorted_df = stacked_df.sort_values(by=[6, 7, 8], key=natsorted)

            # Get the total RNA signal from each location.
            signal_df = sorted_df.groupby(by=[6, 7, 8]).agg({9: "sum"}).round(5).reset_index()

            # Merge left on the sum of the RNA signals.
            res_df = pd.merge(signal_df, sorted_df, on=[6, 7, 8, 9], how="left")
            res_df = res_df[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]]
            # res_df.to_csv(f"/home/coder/data-REU/stranded_files/{cell_line}.stranded.{value_file_name}.{key_file_name}.coverage.txt", sep="\t", index=False)

In [15]:
res_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,chr1,28359.0,30359.0,ENSG00000310526,0.0,-,chr1,28359,28360,0.01987
1,chr1,28359.0,30359.0,ENSG00000310526,0.0,-,chr1,28360,28361,0.01987
2,chr1,28359.0,30359.0,ENSG00000310526,0.0,-,chr1,28361,28362,0.01987
3,chr1,28359.0,30359.0,ENSG00000310526,0.0,-,chr1,28362,28363,0.01987
4,chr1,28359.0,30359.0,ENSG00000310526,0.0,-,chr1,28363,28364,0.01987
...,...,...,...,...,...,...,...,...,...,...
17694934,chrY,13702902.0,13704902.0,ENSG00000290853,0.0,+,chrY,13703509,13703510,0.00482
17694935,chrY,13702902.0,13704902.0,ENSG00000290853,0.0,+,chrY,13703510,13703511,0.00385
17694936,chrY,13702902.0,13704902.0,ENSG00000290853,0.0,+,chrY,13703511,13703512,0.00289
17694937,chrY,13702902.0,13704902.0,ENSG00000290853,0.0,+,chrY,13703512,13703513,0.00193


In [7]:
plus = pd.read_csv(f"./data/RNASeq_bw/{coverage_files[1]}", sep="\t", header=None)
minus = pd.read_csv(f"./data/RNASeq_bw/{coverage_files[0]}", sep="\t", header=None)

In [8]:
plus

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,chr1,28359,30359,ENSG00000310526,0,-,chr1,29900,29901,0.00076
1,chr1,28359,30359,ENSG00000310526,0,-,chr1,29901,29902,0.00076
2,chr1,28359,30359,ENSG00000310526,0,-,chr1,29902,29903,0.00076
3,chr1,28359,30359,ENSG00000310526,0,-,chr1,29903,29904,0.00076
4,chr1,28359,30359,ENSG00000310526,0,-,chr1,29904,29905,0.00076
...,...,...,...,...,...,...,...,...,...,...
13828669,chrY,13702902,13704902,ENSG00000290853,0,+,chrY,13703509,13703510,0.00482
13828670,chrY,13702902,13704902,ENSG00000290853,0,+,chrY,13703510,13703511,0.00385
13828671,chrY,13702902,13704902,ENSG00000290853,0,+,chrY,13703511,13703512,0.00289
13828672,chrY,13702902,13704902,ENSG00000290853,0,+,chrY,13703512,13703513,0.00193


In [9]:
minus

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,chr1,28359,30359,ENSG00000310526,0,-,chr1,28359,28360,0.01987
1,chr1,28359,30359,ENSG00000310526,0,-,chr1,28360,28361,0.01987
2,chr1,28359,30359,ENSG00000310526,0,-,chr1,28361,28362,0.01987
3,chr1,28359,30359,ENSG00000310526,0,-,chr1,28362,28363,0.01987
4,chr1,28359,30359,ENSG00000310526,0,-,chr1,28363,28364,0.01987
...,...,...,...,...,...,...,...,...,...,...
13509197,chrX,155611952,155613952,ENSG00000185973,0,-,chrX,155612950,155612951,0.00530
13509198,chrX,155611952,155613952,ENSG00000185973,0,-,chrX,155612951,155612952,0.00530
13509199,chrX,155611952,155613952,ENSG00000185973,0,-,chrX,155612952,155612953,0.00530
13509200,chrX,155611952,155613952,ENSG00000185973,0,-,chrX,155612953,155612954,0.00530


In [10]:
plus_minus = plus.merge(minus, how="outer", on=[6, 7, 8], suffixes=["_plus", "_minus"])

In [None]:
plus_minus.iloc[:, :6].bfill(axis=1).iloc[:, 10:16]

In [18]:
plus_minus

Unnamed: 0,0_plus,1_plus,2_plus,3_plus,4_plus,5_plus,6,7,8,9_plus,0_minus,1_minus,2_minus,3_minus,4_minus,5_minus,9_minus
0,,,,,,,chr1,28359,28360,,chr1,28359.0,30359.0,ENSG00000310526,0.0,-,0.01987
1,,,,,,,chr1,28360,28361,,chr1,28359.0,30359.0,ENSG00000310526,0.0,-,0.01987
2,,,,,,,chr1,28361,28362,,chr1,28359.0,30359.0,ENSG00000310526,0.0,-,0.01987
3,,,,,,,chr1,28362,28363,,chr1,28359.0,30359.0,ENSG00000310526,0.0,-,0.01987
4,,,,,,,chr1,28363,28364,,chr1,28359.0,30359.0,ENSG00000310526,0.0,-,0.01987
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27058308,chrY,13702902.0,13704902.0,ENSG00000290853,0.0,+,chrY,13703509,13703510,0.00482,,,,,,,
27058309,chrY,13702902.0,13704902.0,ENSG00000290853,0.0,+,chrY,13703510,13703511,0.00385,,,,,,,
27058310,chrY,13702902.0,13704902.0,ENSG00000290853,0.0,+,chrY,13703511,13703512,0.00289,,,,,,,
27058311,chrY,13702902.0,13704902.0,ENSG00000290853,0.0,+,chrY,13703512,13703513,0.00193,,,,,,,
