Commit
add dual guide count function
abearab committed Mar 7, 2024
1 parent 2aa8a51 commit fe9a27d
Showing 1 changed file with 63 additions and 21 deletions.
84 changes: 63 additions & 21 deletions screenpro/ngs.py
@@ -7,42 +7,84 @@
import biobear as bb


def load_fastq(fastq_file_path: str, verbose: bool=False):
    t0 = time()
    if verbose: print('load FASTQ file ...')

    # Read the FASTQ file with biobear, using gzip decompression for .gz files
    if '.gz' in fastq_file_path:
        out = bb.FastqReader(fastq_file_path, compression=bb.Compression.GZIP)
    else:
        out = bb.FastqReader(fastq_file_path)

    if verbose: print("done in %0.3fs" % (time() - t0))

    return out
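
A minimal usage sketch for the helper above; the file path is illustrative and not part of this commit:

# hypothetical example, not part of this commit
reads = load_fastq('example_R1.fastq.gz', verbose=True)  # returns a biobear FastqReader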


def fastq_to_count_single_guide(
    fastq_file_path: str,
    trim5p_start: int=None, trim5p_length: int=None,
    verbose: bool=False) -> pl.DataFrame:

    if verbose: print('count unique sequences ...')
    t0 = time()

    session = bb.connect()

    if trim5p_start and trim5p_length:
        sql_cmd = f"""
        SELECT substr(f.sequence, {trim5p_start}, {trim5p_length}) AS sequence, COUNT(*) as count
        FROM fastq_scan('{fastq_file_path}') f
        GROUP BY substr(f.sequence, {trim5p_start}, {trim5p_length})
        """
    else:
        sql_cmd = f"""
        SELECT f.sequence AS sequence, COUNT(*) as count
        FROM fastq_scan('{fastq_file_path}') f
        GROUP BY f.sequence
        """

    df_count = session.sql(sql_cmd).to_polars()

    if verbose: print("done in %0.3fs" % (time() - t0))

    return df_count
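
For illustration, a hedged call to the single-guide counter; the path and trim window below are hypothetical (substr in the SQL is 1-based):

# hypothetical example, not part of this commit
counts = fastq_to_count_single_guide(
    'example_library.fastq.gz',
    trim5p_start=2, trim5p_length=19,
    verbose=True,
)
# counts is a polars DataFrame with 'sequence' and 'count' columns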


def fastq_to_count_dual_guide(
    R1_fastq_file_path: str, R2_fastq_file_path: str,
    trim5p_pos1_start: int=None, trim5p_pos1_length: int=None,
    trim5p_pos2_start: int=None, trim5p_pos2_length: int=None,
    verbose: bool=False) -> pl.DataFrame:

    if verbose: print('count unique sequences ...')
    t0 = time()

    session = bb.connect()

    if trim5p_pos1_start and trim5p_pos1_length and trim5p_pos2_start and trim5p_pos2_length:
        sql_cmd = f"""
        WITH pos1 AS (
            SELECT REPLACE(name, '_R1', '') trimmed_name, *
            FROM fastq_scan('{R1_fastq_file_path}')
        ), pos2 AS (
            SELECT REPLACE(name, '_R2', '') trimmed_name, *
            FROM fastq_scan('{R2_fastq_file_path}')
        )
        SELECT
            substr(pos1.sequence, {trim5p_pos1_start}, {trim5p_pos1_length}) pos1_sequence,
            substr(reverse_complement(pos2.sequence), {trim5p_pos2_start}, {trim5p_pos2_length}) pos2_sequence,
            COUNT(*) count
        FROM pos1
        JOIN pos2
        ON pos1.trimmed_name = pos2.trimmed_name
        GROUP BY pos1_sequence, pos2_sequence
        """
    elif trim5p_pos1_start is None and trim5p_pos1_length is None and trim5p_pos2_start is None and trim5p_pos2_length is None:
        sql_cmd = f"""
        WITH pos1 AS (
            SELECT REPLACE(name, '_R1', '') trimmed_name, *
            FROM fastq_scan('{R1_fastq_file_path}')
        ), pos2 AS (
            SELECT REPLACE(name, '_R2', '') trimmed_name, *
            FROM fastq_scan('{R2_fastq_file_path}')
        )
        SELECT pos1.sequence pos1_sequence, reverse_complement(pos2.sequence) pos2_sequence, COUNT(*) count
        FROM pos1
        JOIN pos2
        ON pos1.trimmed_name = pos2.trimmed_name
        GROUP BY pos1_sequence, pos2_sequence
        """
    else:
        raise ValueError(
            "trim5p_pos1_start, trim5p_pos1_length, "
            "trim5p_pos2_start, and trim5p_pos2_length "
            "must be provided concurrently!"
        )

    df_count = session.sql(sql_cmd).to_polars()

    if verbose: print("done in %0.3fs" % (time() - t0))

    return df_count
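
Similarly, a hedged sketch of calling the new dual-guide counter; the file paths and trim coordinates are hypothetical:

# hypothetical example, not part of this commit
dual_counts = fastq_to_count_dual_guide(
    'example_R1.fastq.gz', 'example_R2.fastq.gz',
    trim5p_pos1_start=2, trim5p_pos1_length=19,
    trim5p_pos2_start=2, trim5p_pos2_length=19,
    verbose=True,
)
# dual_counts has pos1_sequence, pos2_sequence, and count columns;
# the R2 sequence is reverse-complemented before trimming, per the SQL above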
@@ -65,4 +107,4 @@ def map_sample_counts_to_library(library, sample):
    counts_df['counts'] = 0
    counts_df.loc[overlap, 'counts'] = sample.set_index('sequence').loc[overlap, 'count']

    return counts_df
