/
checkpamandscore.py
623 lines (560 loc) · 38.8 KB
/
checkpamandscore.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
import os
import re
import string
from Bio import SeqIO
import subprocess
from subprocess import call, Popen, PIPE
import glob
from Bio.Seq import Seq
##from Bio.Alphabet import generic_dna
import numpy as np
import numpy
import pandas as pd
import pyranges as pr
import sys
import csv
from datetime import datetime
csv.field_size_limit(sys.maxsize)
# def checkpam(argu, chr,start,secondaryhitpattern,strand, getgenome, querysequence, secondaryhitmismatch):
# length = score = ggfound = 0
# forwardseq = revcomplementseq = complementseq = "NONE"
# for record in getgenome:
# recordid = str(record.id)
# #if (strand == 0 or strand <= 256) and recordid == chr:
# if (strand == "+" ) and recordid == chr:
# forwardseq = str(record.seq)[start-1:start+22]
# if forwardseq.endswith(argu.pamsequence):
# ggfound = ggfound + 1
# elif argu.pamsequence in forwardseq[-4:]:
# ggfound = ggfound + 1
# elif (strand == "-") and recordid == chr:
# complementseq = str(record.seq)[start-1:start + 22]
# newseq = Seq(complementseq)
# revcomplementseq = str(newseq.reverse_complement())
# if revcomplementseq.endswith(argu.pamsequence):
# ggfound = ggfound + 1
# elif argu.pamsequence in revcomplementseq[-4:]:
# ggfound = ggfound + 1
# if ggfound > 0:
# #print ("inside checkpam.. before getscore")
# (length, score) = getscore(secondaryhitpattern, secondaryhitmismatch)
# return(length, score, ggfound,forwardseq,revcomplementseq)
def getscore(firstlineunused, pattern1,secondaryhitmismatch):
pattern = str(pattern1)
#print(pattern)
#print("patternprinted...")
#print (secondaryhitmismatch)
#secondaryhitmismatch = int(secondaryhitmismatch1)
#print (pattern)
#print (secondaryhitmismatch)
matchpat = re.findall("(\d+)M", pattern)
#mismatchpat = re.findall("(\d+)X", pattern)
insertionpat = re.findall("(\d+)I", pattern)
deletionpat = re.findall("(\d+)D", pattern)
substitutionpat = re.findall("(\d+)D", pattern)
#print("inside checkpam.. inside getscore")
if len(matchpat) > 0:
summatchpat = sum(map(int, matchpat))
else:
summatchpat = 0
if len(substitutionpat) > 0:
substitutionpat = sum(map(int, substitutionpat))
else:
substitutionpat = 0
if len(insertionpat) > 0:
suminsertionpat = sum(map(int, insertionpat))
else:
suminsertionpat = 0
length = summatchpat + substitutionpat + suminsertionpat
#score = round(float(summatchpat - secondaryhitmismatch - suminsertionpat)/float(length), 3)
score = (summatchpat - secondaryhitmismatch - suminsertionpat) / length
##return(length, score)
#print(score)
return (score)
#
# def calculatewrite(argu, fileoutmap, gc_content, counthits, countoffhits, inwindscore, outwindscore, queryname, querychr, arbitraryid, querystart, offtargetlist, secondaryhitschr, bestwindow, newlist, querysequence, getstddev):
# import string
# suminwindscore = sum(map(float, inwindscore))
# sumoutwindscore = sum(map(float, outwindscore))
# finalscore = ((float(counthits) * (suminwindscore/float(counthits)))/float(counthits+countoffhits)) - ((float(countoffhits) * (sumoutwindscore/float(countoffhits)))/float(counthits+countoffhits))
# ### Below sequence translate take more time ###
# if counthits > argu.minhits:
# if "candidaternars" in queryname:
# ## For python 3.0 it changed to str.maketrans
# #complements = string.maketrans('ATGC', 'TACG')
# #complements = str.maketrans('ATGC', 'TACG')
# #print (querysequence)
# #querysequence = querysequence.translate(complements)[::-1]
# #print (querysequence)
# ############ Above sequence translate take more time ###
# fileoutmap.write(str(queryname) + "_" + str(querychr) + "_" + str(arbitraryid) + "_" + str(querystart) + "\t" + str(querysequence) + "\t" + str(gc_content)+ "\t" + str(finalscore) + "\t" + str(getstddev) + "\t" + str(suminwindscore) + "\t" + str(sumoutwindscore) + "\t" + str(counthits) + "\t" + str(len(offtargetlist)) + "\t" + str(querychr) + ":" + str(min(bestwindow)) + "-" + str(max(bestwindow)) + "\t" + str(newlist) + "\n")
# elif "candidaternafs" in queryname:
# fileoutmap.write(str(queryname) + "_" + str(querychr) + "_" + str(arbitraryid) + "_" + str(querystart) + "\t" + str(querysequence) + "\t" + str(gc_content) + "\t" + str(finalscore) + "\t" + str(getstddev) + "\t" + str(suminwindscore) + "\t" + str(sumoutwindscore) + "\t" + str(counthits) + "\t" + str(len(offtargetlist)) + "\t" + str(querychr) + ":" + str(min(bestwindow)) + "-" + str(max(bestwindow)) + "\t" + str(newlist) + "\n")
# elif counthits < 1:
# print ("Pattern Match error")
def calculatestdev(bestwindow):
if len(bestwindow) > 4:
getstddev = numpy.std(bestwindow, axis=0)
else:
getstddev = "NULL"
return getstddev
def getallresults(argu):
dataresult = pd.read_csv("Crispr_broad_multihits.xls", names=['Chromosome', 'End', 'Start', 'Totalhits', 'finalscore', 'offtargetshits', 'score', 'Sequence', 'stdev', 'stringforgroup', 'taggingcount'], sep="\t", header=None)
##Chromosome End Start Totalhits finalscore offtargetshits score sequence stdev stringforgroup taggingcount
print ("Pooling the results...")
column_names = ['Chromosome', 'Start', 'End', 'Totalhits', 'finalscore', 'offtargetshits', 'score', 'Sequence', 'stdev', 'stringforgroup', 'taggingcount']
dataresult = dataresult.reindex(columns=column_names)
print (dataresult)
dataresult1 = dataresult.sort_values(['offtargetshits', 'stringforgroup'], ascending=[True, True])
dataresult1.to_csv("Crispr_broad_multihits_sorted.xls", sep="\t", index=False, header=True)
def checkpandapamquerywindow (rowdatchunk, argu, splitinputfile):
genomesplitoriginal = "Candidate_crisprnatabbed.txt"
datachunkallcrna = pd.read_csv(genomesplitoriginal, header=None, names=['name', 'Str', 'Chromosome', 'crnanumber', 'Startggposition'], sep="\t", dtype=str, engine='c', index_col=False, low_memory=False, na_filter=True)
eachrowdataframe = pd.DataFrame(rowdatchunk['colCIGAR'], columns=['flips'])
eachrowdataframe['flips'] = eachrowdataframe['flips'].str.replace('XA:Z:', '', regex=False)
eachrowdataframe['flips'] = eachrowdataframe['flips'].str.replace('+', '+,', regex=False)
eachrowdataframe['flips'] = eachrowdataframe['flips'].str.replace('-', '-,', regex=False)
eachrowdataframe = eachrowdataframe['flips'].str.replace('XA:Z:', '', regex=False)
eachrowdataframe1 = pd.DataFrame(eachrowdataframe.str.split(',').tolist(), columns=['Chromosome', 'Str', 'Start', 'mismatch', 'unusedstrings'])
eachrowdataframe2 = eachrowdataframe1.copy()
eachrowdataframe2.loc[eachrowdataframe1.index.max()+1] = [rowdatchunk['chr'],rowdatchunk['strand'], rowdatchunk['firsthitstart'], '23M','0']
eachrowdataframe2 = eachrowdataframe2.dropna()
eachrowdataframe2['score'] = eachrowdataframe2.apply(lambda x: getscore(x, x['mismatch'], pd.to_numeric(x['unusedstrings']) ), axis=1)
eachrowdataframe2.loc[:,'taggingcount'] = 1
#########################
eachrowdataframe3 = eachrowdataframe2[['Chromosome', 'Start', 'score','taggingcount']]
eachrowdataframe3.loc[:,'End'] = eachrowdataframe3['Start'].astype(int) + 27
eachrowdataframe3 = eachrowdataframe3.dropna()
datachunkallcrna1 = pd.DataFrame()
datachunkallcrna1['Chromosome'] = datachunkallcrna['Chromosome']
datachunkallcrna1['Start'] = datachunkallcrna['Startggposition']
datachunkallcrna1.loc[:,'Start'] = datachunkallcrna1['Start'].astype(int) - 4
datachunkallcrna1.loc[:,'End'] = datachunkallcrna['Startggposition'].astype(int) + 4
overlappedggdataframe = (pr.PyRanges(eachrowdataframe3).overlap(pr.PyRanges(datachunkallcrna1))).df
if (len(overlappedggdataframe) >= int(argu.minhits) & len(overlappedggdataframe) <= int(argu.maxhits)):
#allchrmax = overlappedggdataframe[['Chromosome', 'Start']].groupby('Chromosome').Start.agg('max').reset_index()
#allchrmin = overlappedggdataframe[['Chromosome', 'Start']].groupby('Chromosome').Start.agg('min')
#allchrnew = pd.merge(allchrmin, allchrmax, on='Chromosome')
#allchrnew.columns = ['Chromosome', 'Start', 'End']
################### Window from the given ########
filemakewindowsinwindow = pd.read_csv(argu.inputbed, sep="\t", names=['Chromosome', 'Start', 'End'])
filemakewindowsinwindow1 =pr.PyRanges(filemakewindowsinwindow)
###################
getscorewindow = (filemakewindowsinwindow1.join(pr.PyRanges(overlappedggdataframe))).df
suminwindscore = getscorewindow['score'].sum()
counthits = getscorewindow['score'].shape[0]
getscorewindow['stringforgroup'] = getscorewindow[['Chromosome', 'Start', 'End']].astype(str).agg('-'.join, axis=1)
#getscorewindow1 = getscorewindow.groupby(['stringforgroup'], as_index=False).agg({'score': "sum"})
#print (getscorewindow)
getscorewindowsum = getscorewindow.groupby(['stringforgroup'], as_index=False)['score'].sum()
#print(getscorewindow2)
getscorewindowcount = getscorewindow.groupby(['stringforgroup'], as_index=False)['taggingcount'].count()
getscorewindow2 = pd.merge(getscorewindowcount, getscorewindowsum, on='stringforgroup')
#print (getscorewindow2)
hitslistforstdevdf = getscorewindow.groupby('stringforgroup')['Start_b'].apply(list).reset_index(name='starthistaslist')
#print(hitslistforstdevdf)
getscorewindow2["stdev"] = hitslistforstdevdf.apply(lambda x: calculatestdev( x['starthistaslist']), axis=1)
#print (getscorewindow2)
#getscorewindow1[['Chromosome', 'Start', 'End']] = getscorewindow1['stringforgroup'].str.split("-", expand=True)
#print(getscorewindow2)
##########
getscorenonwindow = (pr.PyRanges(overlappedggdataframe).join(filemakewindowsinwindow1, how="left")).df
sumoutwindscore = getscorenonwindow[getscorenonwindow['Start_b'] < 0]['score'].sum()
countoffhits = getscorenonwindow[getscorenonwindow['Start_b'] < 0].shape[0]
#############
#suminwindscore = sum(map(float, inwindscore))
#sumoutwindscore = sum(map(float, outwindscore))
finalscore = ((float(counthits) * (suminwindscore / float(counthits))) / float(counthits + countoffhits)) - ((float(countoffhits) * (sumoutwindscore / float(countoffhits))) / float(counthits + countoffhits))
###########
#print (finalscore)
#getstddev = calculatestdev(bestwindow)
getscorewindow2.loc[:,"finalscore"] = finalscore
getscorewindow2["crnaid"] = rowdatchunk['crnaid']
getscorewindow2["sequence"] = rowdatchunk['sequence']
getscorewindow2.loc[:,"Totalhits"] = len(overlappedggdataframe)
#print(getscorewindow2)
####################
getscorewindow3 = getscorewindow2.sort_values(['taggingcount'], ascending=[False]).head(argu.windownumbers)
# print(data1['Totalhits'].iloc[0])
# print(data1)
getscorewindow3['offtargetshits'] = pd.to_numeric(getscorewindow3['Totalhits']).iloc[0] - getscorewindow3['taggingcount'].sum()
# data1['off-targets-hits'] = data1.apply(lambda row: TotalHitsforthatRNA - inwindowtarget, axis=1)
# print ("I am writing")
getscorewindow3[['Chromosome', 'Start', 'End']] = getscorewindow3['stringforgroup'].str.split("-", expand=True)
#print (getscorewindow3)
fileoutstd = open(os.path.join(argu.workingdirectory, "Crispr_broad_multihits.xls"), 'a')
getscorewindow3.to_csv(fileoutstd, sep="\t", index=False, header=False)
def checkpandapam(rowdatchunk, argu, splitinputfile):
genomesplitoriginal = "Candidate_crisprnatabbed.txt"
datachunkallcrna = pd.read_csv(genomesplitoriginal, header=None, names=['name', 'Str', 'Chromosome', 'crnanumber', 'Startggposition'], sep="\t", dtype=str, engine='c', index_col=False, low_memory=False, na_filter=True)
eachrowdataframe = pd.DataFrame(rowdatchunk['colCIGAR'], columns=['flips'])
eachrowdataframe['flips'] = eachrowdataframe['flips'].str.replace('XA:Z:', '', regex=False)
eachrowdataframe['flips'] = eachrowdataframe['flips'].str.replace('+', '+,', regex=False)
eachrowdataframe['flips'] = eachrowdataframe['flips'].str.replace('-', '-,', regex=False)
eachrowdataframe = eachrowdataframe['flips'].str.replace('XA:Z:', '', regex=False)
eachrowdataframe1 = pd.DataFrame(eachrowdataframe.str.split(',').tolist(), columns=['Chromosome', 'Str', 'Start', 'mismatch', 'unusedstrings'])
eachrowdataframe2 = eachrowdataframe1.copy()
eachrowdataframe2.loc[eachrowdataframe1.index.max()+1] = [rowdatchunk['chr'],rowdatchunk['strand'], rowdatchunk['firsthitstart'], '23M','0']
eachrowdataframe2 = eachrowdataframe2.dropna()
eachrowdataframe2['score'] = eachrowdataframe2.apply(lambda x: getscore(x, x['mismatch'], pd.to_numeric(x['unusedstrings']) ), axis=1)
eachrowdataframe2.loc[:,'taggingcount'] = 1
#########################
eachrowdataframe3 = eachrowdataframe2[['Chromosome', 'Start', 'score','taggingcount']]
#eachrowdataframe3['End'] = eachrowdataframe3['Start'].astype(int) + 27
eachrowdataframe3.loc[:,'End'] = eachrowdataframe3['Start'].astype(int) + 27
eachrowdataframe3 = eachrowdataframe3.dropna()
#print (eachrowdataframe3)
datachunkallcrna1 = pd.DataFrame()
datachunkallcrna1['Chromosome'] = datachunkallcrna['Chromosome']
datachunkallcrna1['Start'] = datachunkallcrna['Startggposition']
datachunkallcrna1.loc[:,'Start'] = datachunkallcrna1['Start'].astype(int) - 4
datachunkallcrna1.loc[:,'End'] = datachunkallcrna['Startggposition'].astype(int) + 4
ts = time.time()
print(ts)
print ("before overlap")
overlappedggdataframe = (pr.PyRanges(eachrowdataframe3).overlap(pr.PyRanges(datachunkallcrna1))).df
ts = time.time()
print(ts)
print ("After overlap")
if (len(overlappedggdataframe) >= int(argu.minhits) & len(overlappedggdataframe) <= int(argu.maxhits)):
allchrmax = overlappedggdataframe[['Chromosome', 'Start']].groupby('Chromosome').Start.agg('max').reset_index()
allchrmin = overlappedggdataframe[['Chromosome', 'Start']].groupby('Chromosome').Start.agg('min')
allchrnew = pd.merge(allchrmin, allchrmax, on='Chromosome')
allchrnew.columns = ['Chromosome', 'Start', 'End']
filemakewindowsinwindow1 = pr.PyRanges(allchrnew).window(argu.windowsize)
#filemakewindowsinwindow1 = pr.PyRanges(filemakewindowsinwindow)
getscorewindow = (filemakewindowsinwindow1.join(pr.PyRanges(overlappedggdataframe))).df
suminwindscore = getscorewindow['score'].sum()
counthits = getscorewindow['score'].shape[0]
getscorewindow['stringforgroup'] = getscorewindow[['Chromosome', 'Start', 'End']].astype(str).agg('-'.join, axis=1)
#getscorewindow1 = getscorewindow.groupby(['stringforgroup'], as_index=False).agg({'score': "sum"})
#print (getscorewindow)
getscorewindowsum = getscorewindow.groupby(['stringforgroup'], as_index=False)['score'].sum()
#print(getscorewindow2)
getscorewindowcount = getscorewindow.groupby(['stringforgroup'], as_index=False)['taggingcount'].count()
getscorewindow2 = pd.merge(getscorewindowcount, getscorewindowsum, on='stringforgroup')
#print (getscorewindow2)
hitslistforstdevdf = getscorewindow.groupby('stringforgroup')['Start_b'].apply(list).reset_index(name='starthistaslist')
#print(hitslistforstdevdf)
getscorewindow2["stdev"] = hitslistforstdevdf.apply(lambda x: calculatestdev( x['starthistaslist']), axis=1)
##print (getscorewindow2)
#getscorewindow1[['Chromosome', 'Start', 'End']] = getscorewindow1['stringforgroup'].str.split("-", expand=True)
#print(getscorewindow2)
##########
getscorenonwindow = (pr.PyRanges(overlappedggdataframe).join(filemakewindowsinwindow1, how="left")).df
sumoutwindscore = getscorenonwindow[getscorenonwindow['Start_b'] < 0]['score'].sum()
countoffhits = getscorenonwindow[getscorenonwindow['Start_b'] < 0].shape[0]
#############
#suminwindscore = sum(map(float, inwindscore))
#sumoutwindscore = sum(map(float, outwindscore))
finalscore = ((float(counthits) * (suminwindscore / float(counthits))) / float(counthits + countoffhits)) - ((float(countoffhits) * (sumoutwindscore / float(countoffhits))) / float(counthits + countoffhits))
###########
#print (finalscore)
#getstddev = calculatestdev(bestwindow)
getscorewindow2.loc[:,"finalscore"] = finalscore
getscorewindow2["crnaid"] = rowdatchunk['crnaid']
getscorewindow2["sequence"] = rowdatchunk['sequence']
getscorewindow2.loc[:,"Totalhits"] = len(overlappedggdataframe)
#print(getscorewindow2)
####################
getscorewindow3 = getscorewindow2.sort_values(['taggingcount'], ascending=[False]).head(argu.windownumbers)
# print(data1['Totalhits'].iloc[0])
# print(data1)
getscorewindow3['offtargetshits'] = pd.to_numeric(getscorewindow3['Totalhits']).iloc[0] - getscorewindow3['taggingcount'].sum()
# data1['off-targets-hits'] = data1.apply(lambda row: TotalHitsforthatRNA - inwindowtarget, axis=1)
# print ("I am writing")
getscorewindow3[['Chromosome', 'Start', 'End']] = getscorewindow3['stringforgroup'].str.split("-", expand=True)
#print (getscorewindow3)
fileoutstd = open(os.path.join(argu.workingdirectory, "Crispr_broad_multihits.xls"), 'a')
getscorewindow3.to_csv(fileoutstd, sep="\t", index=False, header=False)
def overlapeachchrommultiwindow(argu, splitinputfile):
print ("Processing:",splitinputfile)
for datachunk in pd.read_csv(splitinputfile, header=None, names =['crnaid', 'strand', 'firsthitschr', 'firsthitstart', 'sequence', 'colCIGAR', 'Totalhits'], sep="\t", comment='@', dtype=str, engine='c', index_col=False, low_memory=False, na_filter=True, chunksize=200000, iterator=True):
datachunk["colCIGAR"] = datachunk["colCIGAR"].str.split(";", expand=False)
datachunk = datachunk[datachunk["colCIGAR"].str.len() > int(argu.minhits)]
#datachunk["colCIGAR"] = datachunk["colCIGAR"].str.split(";", expand=False)
pd.set_option('display.max_rows', 10)
#print (datachunk)
#print (datachunk["colCIGAR"])
#datachunk.apply(lambda x: print (x['crnaid']), axis =1)
if argu.inputbed is None:
datachunk.apply(lambda x: checkpandapam(x, argu, splitinputfile), axis =1 )
else:
datachunk.apply(lambda x: checkpandapamquerywindow(x, argu, splitinputfile), axis=1)
#def checkpandapamsingle(rowdatchunk, argu, datachunkallcrna1):
def checkpandapamsingle(eachrowdataframe, argu, datachunkallcrna1):
#working module raj 25 april 2022
#eachrowdataframe = pd.DataFrame(rowdatchunk['colCIGAR'], columns=['flips'])
#eachrowdataframe['flips'] = eachrowdataframe['flips'].str.replace('XA:Z:', '', regex=False)
eachrowdataframe['colCIGAR'] = eachrowdataframe['colCIGAR'].str.replace('+', '+,', regex=False)
eachrowdataframe['colCIGAR'] = eachrowdataframe['colCIGAR'].str.replace('-', '-,', regex=False)
eachrowdataframe['colCIGAR'] = eachrowdataframe['colCIGAR'].str.replace('XA:Z:', '', regex=False)
#print(eachrowdataframe.head(10))
print("Before split")
eachrowdataframe[['Chromosome', 'Str', 'Start', 'mismatch', 'unusedstrings']] = eachrowdataframe['colCIGAR'].str.split(',', expand=True)
if argu.avoidscoreoff == 1:
eachrowdataframe['score'] = 1
eachrowdataframe.loc[:, 'taggingcount'] = 1
else:
eachrowdataframe['score'] = eachrowdataframe.apply(lambda x: getscore(x, x['mismatch'], pd.to_numeric(x['unusedstrings']) ), axis=1)
eachrowdataframe.loc[:,'taggingcount'] = 1
#########################
eachrowdataframe3 = eachrowdataframe[['Chromosome', 'Start', 'score','taggingcount','crnaid','sequence']]
#print (eachrowdataframe3.head(10))
eachrowdataframe3 = eachrowdataframe3.dropna()
#eachrowdataframe3['End'] = eachrowdataframe3['Start'].astype(int) + 27
eachrowdataframe3.loc[:,'End'] = eachrowdataframe3['Start'].astype(int) + 27
#eachrowdataframe3 = eachrowdataframe3.dropna()
#print (eachrowdataframe3.head(10))
#datachunkallcrna1 = pd.DataFrame()
#datachunkallcrna1['Chromosome'] = datachunkallcrna['Chromosome']
#datachunkallcrna1['Start'] = datachunkallcrna['Startggposition']
#datachunkallcrna1.loc[:,'Start'] = datachunkallcrna1['Start'].astype(int) - 4
#datachunkallcrna1.loc[:,'End'] = datachunkallcrna['Startggposition'].astype(int) + 4
#now = datetime.now()
#current_time = now.strftime("%H:%M:%S")
#print(current_time)
#print ("before overlap")
overlappedggdataframe = (pr.PyRanges(eachrowdataframe3).overlap(pr.PyRanges(datachunkallcrna1))).df
#print ("overlap done")
#now = datetime.now()
#current_time = now.strftime("%H:%M:%S")
#print(current_time)
#print(overlappedggdataframe.head(10))
foreachoverlappedggdataframe = overlappedggdataframe[['Chromosome','Start', 'End', 'score','taggingcount','crnaid','sequence']].groupby('crnaid')
#print(foreachoverlappedggdataframe.head(10))
for eachname, eachgroupedframe in foreachoverlappedggdataframe:
#print(eachname)
#print (eachgroupedframe.head(10))
if (len(eachgroupedframe) >= int(argu.minhits)):
allchrmax = eachgroupedframe[['Chromosome', 'End']].groupby('Chromosome').End.agg('max').reset_index()
allchrmin = eachgroupedframe[['Chromosome', 'Start']].groupby('Chromosome').Start.agg('min')
allchrnew = pd.merge(allchrmin, allchrmax, on='Chromosome')
allchrnew.columns = ['Chromosome', 'Start', 'End']
allchrnew = allchrnew[~allchrnew.isin([np.nan, np.inf, -np.inf]).any(1)]
filemakewindowsinwindow1 = pr.PyRanges(allchrnew).window(argu.windowsize)
if argu.avoidscoreoff == 1:
getscorewindow = (filemakewindowsinwindow1.join(pr.PyRanges(eachgroupedframe))).df
#print(getscorewindow.head(10))
getscorewindow['stringforgroup'] = getscorewindow[['Chromosome', 'Start', 'End']].astype(str).agg('-'.join, axis=1)
getscorewindowsum = getscorewindow.groupby(['stringforgroup'], as_index=False)['score'].sum()
getscorewindowcount = getscorewindow.groupby(['stringforgroup'], as_index=False)['taggingcount'].count()
getscorewindow2 = pd.merge(getscorewindowcount, getscorewindowsum, on='stringforgroup')
#print(getscorewindow2.head(10))
#getscorewindow['stringforgroup'] = getscorewindow[['Chromosome', 'Start', 'End']].astype(str).agg('-'.join, axis=1)
#getscorewindow2.loc[:,"sequence1"] = eachgroupedframe['sequence'][0]
getscorewindow2.loc[:,"Totalhits"] = len(eachgroupedframe)
getscorewindow2.loc[:,"sequence"] = eachgroupedframe['sequence'].iloc[0]
#print(hitslistforstdevdf)
hitslistforstdevdf = getscorewindow.groupby('stringforgroup')['Start_b'].apply(list).reset_index(name='starthistaslist')
getscorewindow2["stdev"] = hitslistforstdevdf.apply(lambda x: calculatestdev( x['starthistaslist']), axis=1)
getscorewindow2.loc[:,"finalscore"] = 1
#argu.windownumbers = 1
#print(getscorewindow2.head(10))
#print(eachgroupedframe['sequence'].iloc[0])
#getscorewindow['stringforgroup'] = getscorewindow[['Chromosome', 'Start', 'End']].astype(str).agg('-'.join, axis=1)
getscorewindow3 = getscorewindow2.sort_values(['taggingcount'], ascending=[False]).head(argu.windownumbers)
getscorewindow3['offtargetshits'] = pd.to_numeric(getscorewindow3['Totalhits']).iloc[0] - getscorewindow3['taggingcount'].sum()
getscorewindow3[['Chromosome', 'Start', 'End']] = getscorewindow3['stringforgroup'].str.split("-", expand=True)
fileoutstd = open(os.path.join(argu.workingdirectory, "Crispr_broad_multihits.xls"), 'a')
getscorewindow3 = getscorewindow3.reindex(sorted(getscorewindow3.columns), axis=1)
getscorewindow3.to_csv(fileoutstd, sep="\t", index=False, header=False)
#print (getscorewindow3.head(10))
#now = datetime.now()
#current_time = now.strftime("%H:%M:%S")
#print(current_time)
else:
getscorewindow = (filemakewindowsinwindow1.join(pr.PyRanges(eachgroupedframe))).df
suminwindscore = getscorewindow['score'].sum()
counthits = getscorewindow['score'].shape[0]
getscorewindow['stringforgroup'] = getscorewindow[['Chromosome', 'Start', 'End']].astype(str).agg('-'.join, axis=1)
#getscorewindow1 = getscorewindow.groupby(['stringforgroup'], as_index=False).agg({'score': "sum"})
#print (getscorewindow)
getscorewindowsum = getscorewindow.groupby(['stringforgroup'], as_index=False)['score'].sum()
#print(getscorewindow2)
getscorewindowcount = getscorewindow.groupby(['stringforgroup'], as_index=False)['taggingcount'].count()
getscorewindow2 = pd.merge(getscorewindowcount, getscorewindowsum, on='stringforgroup')
#print (getscorewindow2)
hitslistforstdevdf = getscorewindow.groupby('stringforgroup')['Start_b'].apply(list).reset_index(name='starthistaslist')
#print(hitslistforstdevdf)
getscorewindow2["stdev"] = hitslistforstdevdf.apply(lambda x: calculatestdev( x['starthistaslist']), axis=1)
#print (getscorewindow2)
#getscorewindow1[['Chromosome', 'Start', 'End']] = getscorewindow1['stringforgroup'].str.split("-", expand=True)
#print(getscorewindow2)
########## OFF Target
getscorenonwindow = (pr.PyRanges(eachgroupedframe).join(filemakewindowsinwindow1, how="left")).df
sumoutwindscore = getscorenonwindow[getscorenonwindow['Start_b'] < 0]['score'].sum()
countoffhits = getscorenonwindow[getscorenonwindow['Start_b'] < 0].shape[0]
#############
#suminwindscore = sum(map(float, inwindscore))
#sumoutwindscore = sum(map(float, outwindscore))
finalscore = ((float(counthits) * (suminwindscore / float(counthits))) / float(counthits + countoffhits)) - ( (float(countoffhits) * (sumoutwindscore / float(countoffhits))) / float(counthits + countoffhits))
###########
#print (finalscore)
#getstddev = calculatestdev(bestwindow)
getscorewindow2.loc[:,"finalscore"] = finalscore
#getscorewindow2["crnaid"] = eachgroupedframe['crnaid']
getscorewindow2.loc[:,"Totalhits"] = len(eachgroupedframe)
getscorewindow2.loc[:,"sequence"] = eachgroupedframe['sequence'].iloc[0]
#argu.windownumbers = 1
#print(getscorewindow2)
####################
argu.windownumbers = 1
getscorewindow3 = getscorewindow2.sort_values(['taggingcount'], ascending=[False]).head(argu.windownumbers)
# print(data1['Totalhits'].iloc[0])
# print(data1)
getscorewindow3['offtargetshits'] = pd.to_numeric(getscorewindow3['Totalhits']).iloc[0] - getscorewindow3['taggingcount'].sum()
# data1['off-targets-hits'] = data1.apply(lambda row: TotalHitsforthatRNA - inwindowtarget, axis=1)
# print ("I am writing")
getscorewindow3[['Chromosome', 'Start', 'End']] = getscorewindow3['stringforgroup'].str.split("-", expand=True)
#print (getscorewindow3)
fileoutstd = open(os.path.join(argu.workingdirectory, "Crispr_broad_multihits.xls"), 'a')
getscorewindow3 = getscorewindow3.reindex(sorted(getscorewindow3.columns), axis=1)
getscorewindow3.to_csv(fileoutstd, sep="\t", index=False, header=False)
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print(current_time)
def overlapeachchromosomesingle(argu, splitinputfile):
print ("Processing:",splitinputfile)
genomesplitoriginal = "Candidate_crisprnatabbed.txt2col"
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print(current_time)
###datachunkallcrna = pd.read_csv(genomesplitoriginal, header=None, names=['name', 'Str', 'Chromosome', 'crnanumber', 'Startggposition'], sep="\t", dtype=str, engine='c', index_col=False, low_memory=False, na_filter=True)
datachunkallcrna = pd.read_csv(genomesplitoriginal, header=None, names=['Chromosome', 'Startggposition'], sep="\t", dtype=str, engine='c', index_col=False, low_memory=False, na_filter=True)
print ("Two col data loaded")
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print(current_time)
datachunkallcrna1 = pd.DataFrame()
datachunkallcrna1['Chromosome'] = datachunkallcrna['Chromosome']
datachunkallcrna1['Start'] = datachunkallcrna['Startggposition']
datachunkallcrna1.loc[:,'Start'] = datachunkallcrna1['Start'].astype(int) - 4
datachunkallcrna1.loc[:,'End'] = datachunkallcrna['Startggposition'].astype(int) + 4
for datachunk in pd.read_csv(splitinputfile, header=None, usecols =['crnaid','colCIGAR','sequence'], names =['crnaid', 'strand', 'firsthitschr', 'firsthitstart', 'sequence','waste1','waste2', 'colCIGAR', 'Totalhits'], sep="\t", comment='@', dtype=str, engine='c', index_col=False, low_memory=False, na_filter=True, chunksize=2000, iterator=True):
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print(current_time)
print ("Filtered hits started after loading chunk1")
#datachunk["colCIGAR"] = datachunk["colCIGAR"].str.split(";", expand=False)
datachunk['colCIGAR'] = datachunk["colCIGAR"].str.split(";")
datachunk = datachunk[datachunk["colCIGAR"].str.len() > int(argu.minhits)]
datachunk = datachunk.set_index(['crnaid','sequence']).apply(pd.Series.explode).reset_index()
datachunk1 = datachunk[['crnaid', 'colCIGAR','sequence']]
#print (datachunk1.head(10))
#datachunk = datachunk[datachunk["colCIGAR"].str.len() > int(argu.minhits)]
#checkpandapamsingle(datachunk1, argu, datachunkallcrna1)
checkpandapamsingle(datachunk1, argu, datachunkallcrna1)
###datachunk.apply(lambda x: checkpandapamsingle(x, argu, splitinputfile, datachunkallcrna1), axis =1 )
def getmultisgrna(argu):
#multisgrnafile = "crispr_broad_multisgrna.xls"
multisgrnafile= pr.PyRanges(pd.read_csv(argu.crisprbroadresfile, sep="\t"))
getscorewindow = (multisgrnafile.join(multisgrnafile,slack=-10)).df
#getscorewindow1 = getscorewindow.groupby(['crnaid', 'crnaid_b']).agg(['count'])
getscorewindow1 = getscorewindow.groupby(['crnaid', 'crnaid_b'])['crnaid'].count().reset_index(name='counts').sort_values(['counts'], ascending=False)
#getscorewindow2 = getscorewindow1.sort_values(['count'], ascending=[False]).head(argu.windownumbers)
#getscorewindow1 = getscorewindow.groupby(['crnaid', 'crnaid_b'])['crnaid'].count().reset_index(name='counts')
getscorewindow2 = getscorewindow1.groupby(['crnaid']).head(argu.sgrnanumbers)
getscorewindow3 = pd.merge(getscorewindow2, getscorewindow, how='left', left_on=['crnaid','crnaid_b'], right_on = ['crnaid','crnaid_b'] )
#overlappedggdataframe = (pr.PyRanges(eachrowdataframe2).overlap(pr.PyRanges(datachunkallcrna1))).df
#getscorewindow.to_csv(os.path.join(argu.workingdirectory, "crispr_broad_multisgrna.xls"), sep="\t", index=False, header=True)
#getscorewindow1.to_csv(os.path.join(argu.workingdirectory, "crispr_broad_grouped_multisgrna.xls"), sep="\t", index=False,header=True)
#getscorewindow2.to_csv(os.path.join(argu.workingdirectory, "crispr_broad_grouped_multisgrna2.xls"), sep="\t", index=False, header=True)
getscorewindow3.to_csv(os.path.join(argu.workingdirectory, "Crispr_broad_multisgrna.xls"), sep="\t", index=False, header=True)
#filemultisgrnain = open(os.path.join(argu.workingdirectory, "crispr_broad_multisgrna.xls"), 'w+')
#p2 = subprocess.Popen(['bedtools', 'intersect', '-a', str(argu.crisprbroadresfile), '-b', str(argu.crisprbroadresfile), '-wao'], stdout=filemultisgrnain)
#p2.communicate()
def checkpandapamsingleWORKINGOLD(rowdatchunk, argu, splitinputfile, datachunkallcrna1):
#genomesplitoriginal = "Candidate_crisprnatabbed.txt"
#datachunkallcrna = pd.read_csv(genomesplitoriginal, header=None, names=['name', 'Str', 'Chromosome', 'crnanumber', 'Startggposition'], sep="\t", dtype=str, engine='c', index_col=False, low_memory=False, na_filter=True)
eachrowdataframe = pd.DataFrame(rowdatchunk['colCIGAR'], columns=['flips'])
eachrowdataframe['flips'] = eachrowdataframe['flips'].str.replace('XA:Z:', '', regex=False)
eachrowdataframe['flips'] = eachrowdataframe['flips'].str.replace('+', '+,', regex=False)
eachrowdataframe['flips'] = eachrowdataframe['flips'].str.replace('-', '-,', regex=False)
eachrowdataframe = eachrowdataframe['flips'].str.replace('XA:Z:', '', regex=False)
eachrowdataframe1 = pd.DataFrame(eachrowdataframe.str.split(',').tolist(), columns=['Chromosome', 'Str', 'Start', 'mismatch', 'unusedstrings'])
eachrowdataframe2 = eachrowdataframe1.copy()
eachrowdataframe2.loc[eachrowdataframe1.index.max()+1] = [rowdatchunk['chr'],rowdatchunk['strand'], rowdatchunk['firsthitstart'], '23M','0']
eachrowdataframe2 = eachrowdataframe2.dropna()
if argu.avoidscoreoff == 1:
eachrowdataframe2['score'] = 1
eachrowdataframe2.loc[:, 'taggingcount'] = 1
else:
eachrowdataframe2['score'] = eachrowdataframe2.apply(lambda x: getscore(x, x['mismatch'], pd.to_numeric(x['unusedstrings']) ), axis=1)
eachrowdataframe2.loc[:,'taggingcount'] = 1
#########################
eachrowdataframe3 = eachrowdataframe2[['Chromosome', 'Start', 'score','taggingcount']]
#eachrowdataframe3['End'] = eachrowdataframe3['Start'].astype(int) + 27
eachrowdataframe3.loc[:,'End'] = eachrowdataframe3['Start'].astype(int) + 27
eachrowdataframe3 = eachrowdataframe3.dropna()
#print (eachrowdataframe3)
#datachunkallcrna1 = pd.DataFrame()
#datachunkallcrna1['Chromosome'] = datachunkallcrna['Chromosome']
#datachunkallcrna1['Start'] = datachunkallcrna['Startggposition']
#datachunkallcrna1.loc[:,'Start'] = datachunkallcrna1['Start'].astype(int) - 4
#datachunkallcrna1.loc[:,'End'] = datachunkallcrna['Startggposition'].astype(int) + 4
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print(current_time)
print ("before overlap")
overlappedggdataframe = (pr.PyRanges(eachrowdataframe3).overlap(pr.PyRanges(datachunkallcrna1))).df
print ("overlap done")
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print(current_time)
#print (overlappedggdataframe)
if (len(overlappedggdataframe) >= int(argu.minhits)):
allchrmax = overlappedggdataframe[['Chromosome', 'End']].groupby('Chromosome').End.agg('max').reset_index()
allchrmin = overlappedggdataframe[['Chromosome', 'Start']].groupby('Chromosome').Start.agg('min')
allchrnew = pd.merge(allchrmin, allchrmax, on='Chromosome')
allchrnew.columns = ['Chromosome', 'Start', 'End']
allchrnew = allchrnew[~allchrnew.isin([np.nan, np.inf, -np.inf]).any(1)]
filemakewindowsinwindow1 = pr.PyRanges(allchrnew).window(argu.windowsize)
#print (filemakewindowsinwindow1)
#filemakewindowsinwindow1 = pr.PyRanges(filemakewindowsinwindow)
getscorewindow = (filemakewindowsinwindow1.join(pr.PyRanges(overlappedggdataframe))).df
#print (getscorewindow)
suminwindscore = getscorewindow['score'].sum()
counthits = getscorewindow['score'].shape[0]
getscorewindow['stringforgroup'] = getscorewindow[['Chromosome', 'Start', 'End']].astype(str).agg('-'.join, axis=1)
#getscorewindow1 = getscorewindow.groupby(['stringforgroup'], as_index=False).agg({'score': "sum"})
#print (getscorewindow)
getscorewindowsum = getscorewindow.groupby(['stringforgroup'], as_index=False)['score'].sum()
#print(getscorewindow2)
getscorewindowcount = getscorewindow.groupby(['stringforgroup'], as_index=False)['taggingcount'].count()
getscorewindow2 = pd.merge(getscorewindowcount, getscorewindowsum, on='stringforgroup')
#print (getscorewindow2)
hitslistforstdevdf = getscorewindow.groupby('stringforgroup')['Start_b'].apply(list).reset_index(name='starthistaslist')
#print(hitslistforstdevdf)
getscorewindow2["stdev"] = hitslistforstdevdf.apply(lambda x: calculatestdev( x['starthistaslist']), axis=1)
#print (getscorewindow2)
#getscorewindow1[['Chromosome', 'Start', 'End']] = getscorewindow1['stringforgroup'].str.split("-", expand=True)
#print(getscorewindow2)
##########
getscorenonwindow = (pr.PyRanges(overlappedggdataframe).join(filemakewindowsinwindow1, how="left")).df
sumoutwindscore = getscorenonwindow[getscorenonwindow['Start_b'] < 0]['score'].sum()
countoffhits = getscorenonwindow[getscorenonwindow['Start_b'] < 0].shape[0]
#############
#suminwindscore = sum(map(float, inwindscore))
#sumoutwindscore = sum(map(float, outwindscore))
finalscore = ((float(counthits) * (suminwindscore / float(counthits))) / float(counthits + countoffhits)) - ( (float(countoffhits) * (sumoutwindscore / float(countoffhits))) / float(counthits + countoffhits))
###########
#print (finalscore)
#getstddev = calculatestdev(bestwindow)
getscorewindow2.loc[:,"finalscore"] = finalscore
getscorewindow2["crnaid"] = rowdatchunk['crnaid']
getscorewindow2["sequence"] = rowdatchunk['sequence']
getscorewindow2.loc[:,"Totalhits"] = len(overlappedggdataframe)
#print(getscorewindow2)
####################
argu.windownumbers = 1
getscorewindow3 = getscorewindow2.sort_values(['taggingcount'], ascending=[False]).head(argu.windownumbers)
# print(data1['Totalhits'].iloc[0])
# print(data1)
getscorewindow3['offtargetshits'] = pd.to_numeric(getscorewindow3['Totalhits']).iloc[0] - getscorewindow3['taggingcount'].sum()
# data1['off-targets-hits'] = data1.apply(lambda row: TotalHitsforthatRNA - inwindowtarget, axis=1)
# print ("I am writing")
getscorewindow3[['Chromosome', 'Start', 'End']] = getscorewindow3['stringforgroup'].str.split("-", expand=True)
#print (getscorewindow3)
fileoutstd = open(os.path.join(argu.workingdirectory, "Crispr_broad_multihits.xls"), 'a')
getscorewindow3.to_csv(fileoutstd, sep="\t", index=False, header=False)