In [1]:
import pandas as pd
from tqdm import tqdm
from pandarallel import pandarallel

In [2]:
df = pd.read_csv("data/4.2_hg38_paired_introns.tsv", sep="\t", usecols=["chr","start","end","strand","RC3-Score"])
df

Unnamed: 0,chr,start,end,strand,RC3-Score
0,chr1,12227,12612,+,59151
1,chr1,12721,13220,+,61021
2,chr1,12057,12178,+,-1
3,chr1,12697,12974,+,21
4,chr1,13052,13220,+,526
...,...,...,...,...,...
499138,chrY,24883840,24886132,+,-1
499139,chrY,24888605,24889352,+,-1
499140,chrY,24889386,24901111,+,-1
499141,chrY,24833970,24840730,+,-1


In [3]:
gap = 10

In [4]:
def matchrc(row):
    potential_matches = !tabix ftp-data/recount3.bed.gz {row.chr}:{row.start-gap}-{row.start+gap} | cut -f 2,5


    potential_matches = [
        (int(entry.split('\t')[0]), int(entry.split('\t')[1]))
        for entry in potential_matches
        if row['start'] - 5 <= int(entry.split('\t')[0]) <= row['start'] + 5 and int(entry.split('\t')[1]) > row["RC3-Score"]
    ]
    if potential_matches:
        return True
    return False



In [5]:
pandarallel.initialize(nb_workers=12)

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [6]:
%%time
df_test = df.copy().head(1000)
df_test["better_match_5"] = df_test.parallel_apply(matchrc, axis=1)
df_test

CPU times: user 15 ms, sys: 37 ms, total: 52 ms
Wall time: 980 ms


Unnamed: 0,chr,start,end,strand,RC3-Score,better_match_5
0,chr1,12227,12612,+,59151,False
1,chr1,12721,13220,+,61021,True
2,chr1,12057,12178,+,-1,False
3,chr1,12697,12974,+,21,True
4,chr1,13052,13220,+,526,False
...,...,...,...,...,...,...
995,chr1,1817875,1839189,-,24700,True
996,chr1,1839238,1891024,-,-1,True
997,chr1,1839238,1853151,-,561893,True
998,chr1,1853297,1890819,-,389375,False


In [7]:
del df_test

In [8]:
%%time
df["better_match_5nt_start_ss"] = df.parallel_apply(matchrc, axis=1)
df

CPU times: user 28.5 ms, sys: 43 ms, total: 71.6 ms
Wall time: 5min 3s


Unnamed: 0,chr,start,end,strand,RC3-Score,better_match_5nt_start_ss
0,chr1,12227,12612,+,59151,False
1,chr1,12721,13220,+,61021,True
2,chr1,12057,12178,+,-1,False
3,chr1,12697,12974,+,21,True
4,chr1,13052,13220,+,526,False
...,...,...,...,...,...,...
499138,chrY,24883840,24886132,+,-1,True
499139,chrY,24888605,24889352,+,-1,True
499140,chrY,24889386,24901111,+,-1,False
499141,chrY,24833970,24840730,+,-1,False


In [9]:
def matchrc_end(row):
    potential_matches = !tabix ftp-data/recount3.bed.gz {row.chr}:{row.end-gap}-{row.end+gap} | cut -f 2,5


    potential_matches = [
        (int(entry.split('\t')[0]), int(entry.split('\t')[1]))
        for entry in potential_matches
        if row['end'] - 5 <= int(entry.split('\t')[0]) <= row['end'] + 5 and int(entry.split('\t')[1]) > row["RC3-Score"]
    ]
    if potential_matches:
        return True
    return False

In [10]:
%%time
df["better_match_5nt_end_ss"] = df.parallel_apply(matchrc_end, axis=1)
df

CPU times: user 28.9 ms, sys: 38.4 ms, total: 67.3 ms
Wall time: 5min 5s


Unnamed: 0,chr,start,end,strand,RC3-Score,better_match_5nt_start_ss,better_match_5nt_end_ss
0,chr1,12227,12612,+,59151,False,False
1,chr1,12721,13220,+,61021,True,False
2,chr1,12057,12178,+,-1,False,False
3,chr1,12697,12974,+,21,True,False
4,chr1,13052,13220,+,526,False,False
...,...,...,...,...,...,...,...
499138,chrY,24883840,24886132,+,-1,True,False
499139,chrY,24888605,24889352,+,-1,True,False
499140,chrY,24889386,24901111,+,-1,False,False
499141,chrY,24833970,24840730,+,-1,False,False


In [11]:
# output
df.to_csv("data/4.6-better-nearby-match-and-rc3-10nt.tsv", sep="\t", index=False)