In [1]:
import pandas as pd

In [3]:
# Load the table of primary 3' end positions together with the strand of the
# associated gene; the bare expression below renders the frame for inspection.
all_terms = pd.read_csv(filepath_or_buffer="3endpos.csv")
all_terms


Unnamed: 0,primary 3' end position,Gene strand
0,308.0,+
1,5076.0,+
2,9230.0,+
3,9920.0,+
4,14144.0,+
...,...,...
1048570,,+
1048571,,-
1048572,,+
1048573,,+


In [4]:
# Discard every row that has a missing value in any column
# (dropna defaults: axis=0, how='any').
all_terms = all_terms.dropna()
all_terms

Unnamed: 0,primary 3' end position,Gene strand
0,308.0,+
1,5076.0,+
2,9230.0,+
3,9920.0,+
4,14144.0,+
...,...,...
1093,4457393.0,+
1094,3651345.0,-
1095,848301.0,-
1096,3641507.0,+


In [5]:
# Load the per-position termination mechanism annotations and display them.
rho_status = pd.read_csv(filepath_or_buffer="rho_status.csv")
rho_status

Unnamed: 0,primary 3' end position,Termination mechanism
0,5076,Independent
1,9230,rho-dependent
2,20763,Independent
3,27282,Independent
4,29276,Independent
...,...,...
457,4618512,rho-dependent
458,4624203,rho-dependent
459,4629357,Independent
460,1483934,undecided


In [7]:
# Show the exact column labels of rho_status; the merge that follows joins on
# one of them, so the spelling has to match all_terms exactly.
rho_status.columns

Index(['primary 3' end position', 'Termination mechanism'], dtype='object')

In [8]:
# Attach the termination mechanism to each terminator by joining on the shared
# "primary 3' end position" column (the key is the position, not a gene name).
# how='left' keeps every row of all_terms; positions that are absent from
# rho_status end up with NaN in "Termination mechanism".
# Any "Termination mechanism" column left over from a previous run of this cell
# is dropped first, so re-running does not produce _x/_y suffixed duplicates.
all_terms = (
    all_terms
    .drop(columns="Termination mechanism", errors="ignore")
    .merge(rho_status, on="primary 3' end position", how='left')
)
all_terms

Unnamed: 0,primary 3' end position,Gene strand,Termination mechanism
0,308.0,+,
1,5076.0,+,Independent
2,9230.0,+,rho-dependent
3,9920.0,+,
4,14144.0,+,
...,...,...,...
1093,4457393.0,+,
1094,3651345.0,-,
1095,848301.0,-,
1096,3641507.0,+,


In [10]:
# Sanity check on the merge: `count` is the number of rows that received a
# non-null termination mechanism.
# NOTE(review): rho_status is displayed with 461 rows (index 0..460), so 462
# matches suggests one position occurs twice in all_terms — confirm.
all_terms["Termination mechanism"].describe() # expected count: 462

count             462
unique              3
top       Independent
freq              174
Name: Termination mechanism, dtype: object

In [11]:
# Label terminators without an annotated mechanism as "unknown".
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected column: the chained in-place form
# emits a FutureWarning on pandas 2.x and does not modify the frame once
# Copy-on-Write is enabled.
all_terms["Termination mechanism"] = all_terms["Termination mechanism"].fillna("unknown")
all_terms

Unnamed: 0,primary 3' end position,Gene strand,Termination mechanism
0,308.0,+,unknown
1,5076.0,+,Independent
2,9230.0,+,rho-dependent
3,9920.0,+,unknown
4,14144.0,+,unknown
...,...,...,...
1093,4457393.0,+,unknown
1094,3651345.0,-,unknown
1095,848301.0,-,unknown
1096,3641507.0,+,unknown


In [13]:
# BED6 column layout being assembled: chromosome, start, end, name, score, strand.
# The 'gene' column holds the chromosome field (the genome accession), not a
# gene name; the label is kept because later cells select the column by it.
# A scalar assignment is broadcast to every row, so no per-row list is needed.
# NOTE(review): the saved output of this cell shows "CP009273" without the ".1"
# version suffix, so it appears to predate this literal — re-run to refresh.
all_terms['gene'] = "CP009273.1"
all_terms

Unnamed: 0,primary 3' end position,Gene strand,Termination mechanism,gene
0,308.0,+,unknown,CP009273
1,5076.0,+,Independent,CP009273
2,9230.0,+,rho-dependent,CP009273
3,9920.0,+,unknown,CP009273
4,14144.0,+,unknown,CP009273
...,...,...,...,...
1093,4457393.0,+,unknown,CP009273
1094,3651345.0,-,unknown,CP009273
1095,848301.0,-,unknown,CP009273
1096,3641507.0,+,unknown,CP009273


In [14]:
# Interval start is the position immediately before the 3' end, so each
# terminator becomes the one-base interval (position - 1, position).
all_terms["start"] = all_terms["primary 3' end position"].sub(1)
all_terms

Unnamed: 0,primary 3' end position,Gene strand,Termination mechanism,gene,start
0,308.0,+,unknown,CP009273,307.0
1,5076.0,+,Independent,CP009273,5075.0
2,9230.0,+,rho-dependent,CP009273,9229.0
3,9920.0,+,unknown,CP009273,9919.0
4,14144.0,+,unknown,CP009273,14143.0
...,...,...,...,...,...
1093,4457393.0,+,unknown,CP009273,4457392.0
1094,3651345.0,-,unknown,CP009273,3651344.0
1095,848301.0,-,unknown,CP009273,848300.0
1096,3641507.0,+,unknown,CP009273,3641506.0


In [15]:
# Placeholder score of 0 for every row; a scalar assignment is broadcast to the
# whole column, so no per-row list is needed.
all_terms["score"] = 0
all_terms

Unnamed: 0,primary 3' end position,Gene strand,Termination mechanism,gene,start,score
0,308.0,+,unknown,CP009273,307.0,0
1,5076.0,+,Independent,CP009273,5075.0,0
2,9230.0,+,rho-dependent,CP009273,9229.0,0
3,9920.0,+,unknown,CP009273,9919.0,0
4,14144.0,+,unknown,CP009273,14143.0,0
...,...,...,...,...,...,...
1093,4457393.0,+,unknown,CP009273,4457392.0,0
1094,3651345.0,-,unknown,CP009273,3651344.0,0
1095,848301.0,-,unknown,CP009273,848300.0,0
1096,3641507.0,+,unknown,CP009273,3641506.0,0


In [17]:
# Arrange the columns in the order they are written to the BED file:
# chromosome, start, end, name (termination mechanism), score, strand.
bed_column_order = [
    'gene',
    'start',
    "primary 3' end position",
    'Termination mechanism',
    'score',
    'Gene strand',
]
all_terms = all_terms[bed_column_order]
all_terms

Unnamed: 0,gene,start,primary 3' end position,Termination mechanism,score,Gene strand
0,CP009273,307.0,308.0,unknown,0,+
1,CP009273,5075.0,5076.0,Independent,0,+
2,CP009273,9229.0,9230.0,rho-dependent,0,+
3,CP009273,9919.0,9920.0,unknown,0,+
4,CP009273,14143.0,14144.0,unknown,0,+
...,...,...,...,...,...,...
1093,CP009273,4457392.0,4457393.0,unknown,0,+
1094,CP009273,3651344.0,3651345.0,unknown,0,-
1095,CP009273,848300.0,848301.0,unknown,0,-
1096,CP009273,3641506.0,3641507.0,unknown,0,+


In [18]:
# Cast both coordinate columns from float to integer so they are written
# without a trailing ".0".
# A single astype() with a column -> dtype mapping returns a new frame, which
# avoids the SettingWithCopyWarning that per-column assignment can raise on the
# frame produced by the column selection in the previous cell.
# "int64" is spelled out so the width does not depend on the platform.
all_terms = all_terms.astype({
    'start': "int64",
    "primary 3' end position": "int64",
})
all_terms

Unnamed: 0,gene,start,primary 3' end position,Termination mechanism,score,Gene strand
0,CP009273,307,308,unknown,0,+
1,CP009273,5075,5076,Independent,0,+
2,CP009273,9229,9230,rho-dependent,0,+
3,CP009273,9919,9920,unknown,0,+
4,CP009273,14143,14144,unknown,0,+
...,...,...,...,...,...,...
1093,CP009273,4457392,4457393,unknown,0,+
1094,CP009273,3651344,3651345,unknown,0,-
1095,CP009273,848300,848301,unknown,0,-
1096,CP009273,3641506,3641507,unknown,0,+


In [19]:
# Write the frame as a headerless, tab-separated BED file without the index.
# File naming scheme: {FullNameOfTheBacteria}_{strainCode}_{GenomeAccessionNumber}.bed
bed_path = 'EscherichiaColi_K-12_CP009273.1.bed'
all_terms.to_csv(path_or_buf=bed_path, sep='\t', index=False, header=False)

In [2]:
import pandas as pd
# Read the BED file back; it has no header row, so the columns are labelled 0..5.
# read_table uses a tab separator by default.
terms = pd.read_table("EscherichiaColi_K-12_CP009273.1.bed", header=None)


In [3]:
# Display the BED table as loaded (columns 0..5, no header in the file).
terms

Unnamed: 0,0,1,2,3,4,5
0,CP009273,307,308,unknown,0,+
1,CP009273,5075,5076,Independent,0,+
2,CP009273,9229,9230,rho-dependent,0,+
3,CP009273,9919,9920,unknown,0,+
4,CP009273,14143,14144,unknown,0,+
...,...,...,...,...,...,...
1093,CP009273,4457392,4457393,unknown,0,+
1094,CP009273,3651344,3651345,unknown,0,-
1095,CP009273,848300,848301,unknown,0,-
1096,CP009273,3641506,3641507,unknown,0,+


In [4]:
# Load the read counts; the file has no header row, so the single column is
# labelled 0.
avgt = pd.read_csv(filepath_or_buffer="avg_term_reads.csv", header=None)
avgt

Unnamed: 0,0
0,11
1,7
2,426
3,11
4,163
...,...
1093,28
1094,98
1095,34
1096,81


In [5]:
# Replace the score column (label 4) with the values from avg_term_reads.csv.
# Column assignment aligns on the row index, so a row-count mismatch would
# silently leave NaN scores in the BED file; fail loudly instead.
# NOTE(review): this presumes avg_term_reads.csv lists its rows in the same
# order as the BED file — confirm against how that file was produced.
if len(avgt) != len(terms):
    raise ValueError(
        f"avg_term_reads.csv has {len(avgt)} rows but the BED table has {len(terms)}"
    )
terms[4] = avgt[0].copy()
terms

Unnamed: 0,0,1,2,3,4,5
0,CP009273,307,308,unknown,11,+
1,CP009273,5075,5076,Independent,7,+
2,CP009273,9229,9230,rho-dependent,426,+
3,CP009273,9919,9920,unknown,11,+
4,CP009273,14143,14144,unknown,163,+
...,...,...,...,...,...,...
1093,CP009273,4457392,4457393,unknown,28,+
1094,CP009273,3651344,3651345,unknown,98,-
1095,CP009273,848300,848301,unknown,34,-
1096,CP009273,3641506,3641507,unknown,81,+


In [7]:
# Append "_<row number>" (0-based position in the table) to column 0 so that
# every row gets a distinct value in that column.
# The row numbers are built directly as a string Series that shares the
# frame's index, instead of round-tripping a Python list through str() and
# split().
# NOTE: this cell is not idempotent — running it a second time appends another
# suffix (e.g. "CP009273_0_0"); reload `terms` before re-running.
# NOTE(review): column 0 of a BED file is the chromosome/sequence name; with the
# suffix it no longer matches the reference sequence name — confirm that the
# downstream consumer expects this.
row_numbers = pd.Series(range(len(terms)), index=terms.index).astype(str)
terms[0] = terms[0] + "_" + row_numbers
terms

Unnamed: 0,0,1,2,3,4,5
0,CP009273_0,307,308,unknown,11,+
1,CP009273_1,5075,5076,Independent,7,+
2,CP009273_2,9229,9230,rho-dependent,426,+
3,CP009273_3,9919,9920,unknown,11,+
4,CP009273_4,14143,14144,unknown,163,+
...,...,...,...,...,...,...
1093,CP009273_1093,4457392,4457393,unknown,28,+
1094,CP009273_1094,3651344,3651345,unknown,98,-
1095,CP009273_1095,848300,848301,unknown,34,-
1096,CP009273_1096,3641506,3641507,unknown,81,+


In [8]:
# Save the updated table as a headerless, tab-separated BED file without the index.
terms.to_csv(
    path_or_buf='EscherichiaColi_K12-BW25113_CP009273.1.bed',
    sep='\t',
    index=False,
    header=False,
)