In [1]:
import pandas as pd
import numpy as np

In [2]:
pos_df = pd.read_csv('WRKY_info_20190507/WRKY_info_table_positive.csv', sep='\t')
neg1_df = pd.read_csv('WRKY_info_20190507/WRKY_info_table_negative_one.csv', sep='\t')
neg2_df = pd.read_csv('WRKY_info_20190507/WRKY_info_table_negative_two.csv', sep='\t')
neg3_df = pd.read_csv('WRKY_info_20190507/WRKY_info_table_negative_three.csv', sep='\t')

In [3]:
print(pos_df.info())
pos_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26480 entries, 0 to 26479
Data columns (total 5 columns):
TF_ID        26480 non-null object
Pseq_ID      26480 non-null object
Pseq         26480 non-null object
DBD_seq      26480 non-null object
matrix_ID    26480 non-null object
dtypes: object(5)
memory usage: 1.0+ MB
None


Unnamed: 0,TF_ID,Pseq_ID,Pseq,DBD_seq,matrix_ID
0,AT1G13960,TFprotseq_12499,MSEKEEAPSTSKSTGAPSRPTLSLPPRPFSEMFFNGGVGFSPGPMT...,ADDGYNWRKYGQKQVKGSEFPRSYYKCTNPGCPVKKKVERSLDGQV...,TF_motif_seq_0270
1,AT1G13960,TFprotseq_12499,MSEKEEAPSTSKSTGAPSRPTLSLPPRPFSEMFFNGGVGFSPGPMT...,ADDGYNWRKYGQKQVKGSEFPRSYYKCTNPGCPVKKKVERSLDGQV...,TF_motif_seq_0339
2,AT1G13960,TFprotseq_12499,MSEKEEAPSTSKSTGAPSRPTLSLPPRPFSEMFFNGGVGFSPGPMT...,ADDGYNWRKYGQKQVKGSEFPRSYYKCTNPGCPVKKKVERSLDGQV...,TFmatrixID_0449
3,AT1G13960,TFprotseq_12499,MSEKEEAPSTSKSTGAPSRPTLSLPPRPFSEMFFNGGVGFSPGPMT...,ADDGYNWRKYGQKQVKGSEFPRSYYKCTNPGCPVKKKVERSLDGQV...,TFmatrixID_0451
4,AT1G13960,TFprotseq_12499,MSEKEEAPSTSKSTGAPSRPTLSLPPRPFSEMFFNGGVGFSPGPMT...,ADDGYNWRKYGQKQVKGSEFPRSYYKCTNPGCPVKKKVERSLDGQV...,TFmatrixID_0465


In [4]:
col_names = ['matrix_ID', 'alength', 'width', 'nsites', 'E', 'ATCG_prob_list', 'DNA_seq']
matrices_df = pd.DataFrame(columns=col_names)

In [5]:
matricesFile = open('WRKY_info_20190507/All_matrices.txt', 'r')
for i in range(8):
    matricesFile.readline()

In [6]:
# decode index to ATCG character
def getATCGchar(index):
    if (index == -1):
        return '_'
    elif (index == 0):
        return 'A'
    elif (index == 1):
        return 'C'
    elif (index == 2):
        return 'G'
    elif (index == 3):
        return 'T'

In [7]:
while True:
    # eat empty line
    matricesFile.readline()
    
    # Read matrix header
    header = matricesFile.readline().rstrip('\n')
    
    # If the header have nothing, it means we are at the EOF, then break and stop reading it
    if len(header) == 0:
        break
    header = header.split()[1]
    
    # eat empty line
    matricesFile.readline()
    
    # read matrix basic info(alength, width, nsites, E)
    matrix_info = matricesFile.readline()
    matrix_info = matrix_info.split()
    
    alength = int(matrix_info[3])
    width = int(matrix_info[5])
    nsites = int(matrix_info[7])
    E = int(matrix_info[9])
    
    # read DNA sequence(ATCG probability)
    ATCG_prob_list = np.zeros((width, 4))
    DNA_seq = ''
    for j in range(width):
        nucleotide = matricesFile.readline()
        nucleotide = nucleotide.split()
        
        # store the posibility of ATCG in every position of DNA sequence into ATCG_prob_list
        ATCG_prob_list[j][0] = float(nucleotide[0])
        ATCG_prob_list[j][1] = float(nucleotide[1])
        ATCG_prob_list[j][2] = float(nucleotide[2])
        ATCG_prob_list[j][3] = float(nucleotide[3])
        
        # Find the max posibility in every position, if prob > 0.5 then choose that one. 
        # if no prob is > 0.5, then drop that position and store it as '_', which represents empty
        max_ATCG = -1
        for k in range(4):
            if (ATCG_prob_list[j][k] > 0.5):
                max_ATCG = k
                break
        DNA_seq += getATCGchar(max_ATCG)
    
    # Append each matrix to dataframe
    matrices_df.loc[len(matrices_df)] = [header, alength, width, nsites, E, ATCG_prob_list, DNA_seq]

In [8]:
matricesFile.close()

In [9]:
print(matrices_df.info())
matrices_df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2361 entries, 0 to 2360
Data columns (total 7 columns):
matrix_ID         2361 non-null object
alength           2361 non-null object
width             2361 non-null object
nsites            2361 non-null object
E                 2361 non-null object
ATCG_prob_list    2361 non-null object
DNA_seq           2361 non-null object
dtypes: object(7)
memory usage: 147.6+ KB
None


Unnamed: 0,matrix_ID,alength,width,nsites,E,ATCG_prob_list,DNA_seq
0,TF_motif_seq_0001,4,10,1,0,"[[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [...",AACCTAACCT
1,TF_motif_seq_0002,4,10,1,0,"[[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [...",AACGCGTGTC
2,TF_motif_seq_0003,4,10,1,0,"[[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [...",AAGCGTAAGT
3,TF_motif_seq_0004,4,10,1,0,"[[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [...",AATAAA_AAA
4,TF_motif_seq_0005,4,10,1,0,"[[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [...",AATGGAAATG
