In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
pos_df = pd.read_csv('WRKY_info_20190507/WRKY_info_table_positive.csv', sep='\t')
neg1_df = pd.read_csv('WRKY_info_20190507/WRKY_info_table_negative_one.csv', sep='\t')
neg2_df = pd.read_csv('WRKY_info_20190507/WRKY_info_table_negative_two.csv', sep='\t')
neg3_df = pd.read_csv('WRKY_info_20190507/WRKY_info_table_negative_three.csv', sep='\t')

In [3]:
pos_df['is_combined'] = np.ones(len(pos_df)).astype(int)
neg1_df['is_combined'] = np.zeros(len(pos_df)).astype(int)
neg2_df['is_combined'] = np.zeros(len(pos_df)).astype(int)
neg3_df['is_combined'] = np.zeros(len(pos_df)).astype(int)

In [4]:
print(pos_df.info())
pos_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26480 entries, 0 to 26479
Data columns (total 6 columns):
TF_ID          26480 non-null object
Pseq_ID        26480 non-null object
Pseq           26480 non-null object
DBD_seq        26480 non-null object
matrix_ID      26480 non-null object
is_combined    26480 non-null int64
dtypes: int64(1), object(5)
memory usage: 1.2+ MB
None


Unnamed: 0,TF_ID,Pseq_ID,Pseq,DBD_seq,matrix_ID,is_combined
0,AT1G13960,TFprotseq_12499,MSEKEEAPSTSKSTGAPSRPTLSLPPRPFSEMFFNGGVGFSPGPMT...,ADDGYNWRKYGQKQVKGSEFPRSYYKCTNPGCPVKKKVERSLDGQV...,TF_motif_seq_0270,1
1,AT1G13960,TFprotseq_12499,MSEKEEAPSTSKSTGAPSRPTLSLPPRPFSEMFFNGGVGFSPGPMT...,ADDGYNWRKYGQKQVKGSEFPRSYYKCTNPGCPVKKKVERSLDGQV...,TF_motif_seq_0339,1
2,AT1G13960,TFprotseq_12499,MSEKEEAPSTSKSTGAPSRPTLSLPPRPFSEMFFNGGVGFSPGPMT...,ADDGYNWRKYGQKQVKGSEFPRSYYKCTNPGCPVKKKVERSLDGQV...,TFmatrixID_0449,1
3,AT1G13960,TFprotseq_12499,MSEKEEAPSTSKSTGAPSRPTLSLPPRPFSEMFFNGGVGFSPGPMT...,ADDGYNWRKYGQKQVKGSEFPRSYYKCTNPGCPVKKKVERSLDGQV...,TFmatrixID_0451,1
4,AT1G13960,TFprotseq_12499,MSEKEEAPSTSKSTGAPSRPTLSLPPRPFSEMFFNGGVGFSPGPMT...,ADDGYNWRKYGQKQVKGSEFPRSYYKCTNPGCPVKKKVERSLDGQV...,TFmatrixID_0465,1


In [5]:
ATCG_1D_list = ['A', 'C', 'G', 'T']
ATCG_2D_list = []
ATCG_3D_list = []

for i in range(4):
    for j in range(4):
        ATCG_2D_list.append(ATCG_1D_list[i] + ATCG_1D_list[j])
        for k in range(4):
            ATCG_3D_list.append(ATCG_1D_list[i] + ATCG_1D_list[j] + ATCG_1D_list[k])

## Read matrices

In [6]:
col_names = ['matrix_ID', 'alength', 'width', 'nsites', 'E', 'ATCG_prob_list', 'DNA_seq'] + ATCG_2D_list + ATCG_3D_list
matrices_df = pd.DataFrame(columns=col_names)

In [7]:
matricesFile = open('WRKY_info_20190507/All_matrices.txt', 'r')
for i in range(8):
    matricesFile.readline()

In [8]:
# decode index to ATCG character
def getATCGchar(index):
    if (index == -1):
        return '_'
    elif (index == 0):
        return 'A'
    elif (index == 1):
        return 'C'
    elif (index == 2):
        return 'G'
    elif (index == 3):
        return 'T'

In [None]:
while True:
    # eat empty line
    matricesFile.readline()
    
    # Read matrix header
    header = matricesFile.readline().rstrip('\n')
    
    # If the header have nothing, it means we are at the EOF, then break and stop reading it
    if len(header) == 0:
        break
    header = header.split()[1]
    
    # eat empty line
    matricesFile.readline()
    
    # read matrix basic info(alength, width, nsites, E)
    matrix_info = matricesFile.readline()
    matrix_info = matrix_info.split()
    
    alength = int(matrix_info[3])
    width = int(matrix_info[5])
    nsites = int(matrix_info[7])
    E = int(matrix_info[9])
    
    # read DNA sequence(ATCG probability)
    ATCG_prob_list = np.zeros((width, 4))
    DNA_seq = ''
    for j in range(width):
        nucleotide = matricesFile.readline()
        nucleotide = nucleotide.split()
        
        # store the posibility of ATCG in every position of DNA sequence into ATCG_prob_list
        ATCG_prob_list[j][0] = float(nucleotide[0])
        ATCG_prob_list[j][1] = float(nucleotide[1])
        ATCG_prob_list[j][2] = float(nucleotide[2])
        ATCG_prob_list[j][3] = float(nucleotide[3])
        
        # Find the max posibility in every position, if prob > 0.5 then choose that one. 
        # if no prob is > 0.5, then drop that position and store it as '_', which represents empty
        max_ATCG = -1
        for k in range(4):
            if (ATCG_prob_list[j][k] > 0.5):
                max_ATCG = k
                break
        DNA_seq += getATCGchar(max_ATCG)
    
    # Append each matrix to dataframe
    matrices_df.loc[len(matrices_df)] = [header, alength, width, nsites, E, ATCG_prob_list, DNA_seq] + [0] * (4*4 + 4*4*4)

In [None]:
matricesFile.close()

In [None]:
# print(matrices_df.info())
matrices_df.head()

Unnamed: 0,matrix_ID,alength,width,nsites,E,ATCG_prob_list,DNA_seq,AA,AC,AG,...,TCG,TCT,TGA,TGC,TGG,TGT,TTA,TTC,TTG,TTT
0,TF_motif_seq_0001,4,10,1,0,"[[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [...",AACCTAACCT,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,TF_motif_seq_0002,4,10,1,0,"[[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [...",AACGCGTGTC,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,TF_motif_seq_0003,4,10,1,0,"[[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [...",AAGCGTAAGT,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,TF_motif_seq_0004,4,10,1,0,"[[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [...",AATAAA_AAA,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,TF_motif_seq_0005,4,10,1,0,"[[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [...",AATGGAAATG,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
for i in range(len(matrices_df)):
    for nucleo in ATCG_2D_list + ATCG_3D_list:
        if (nucleo in matrices_df.iloc[i]['DNA_seq']):
            matrices_df.iloc[i][nucleo] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


## join pos dataset with matrices_df by matrix_ID

In [None]:
merged_pos_df = pd.merge(pos_df, matrices_df)
merged_pos_df.head(10)

In [None]:
pos_ATCG_sum_dict = merged_pos_df.loc[ : , ATCG_2D_list + ATCG_3D_list].sum()/len(pos_df)

In [None]:
sorted(pos_ATCG_sum_dict.items(), key=lambda d: d[1], reverse=True)

In [None]:
plt.figure(figsize=(20,10))
plt.bar(range(len(pos_ATCG_sum_dict)), list(pos_ATCG_sum_dict.values()), align='center')
plt.xticks(range(len(pos_ATCG_sum_dict)), list(pos_ATCG_sum_dict.keys()))
plt.show()

In [None]:
merged_neg1_df = pd.merge(neg1_df, matrices_df)
merged_neg2_df = pd.merge(neg2_df, matrices_df)
merged_neg3_df = pd.merge(neg3_df, matrices_df)

In [None]:
neg1_ATCG_sum_dict = merged_neg1_df.loc[ : , ATCG_2D_list + ATCG_3D_list].sum()/len(neg1_df)
neg2_ATCG_sum_dict = merged_neg2_df.loc[ : , ATCG_2D_list + ATCG_3D_list].sum()/len(neg2_df)
neg3_ATCG_sum_dict = merged_neg3_df.loc[ : , ATCG_2D_list + ATCG_3D_list].sum()/len(neg3_df)

In [None]:
print(sorted(neg1_ATCG_sum_dict.items(), key=lambda d: d[1], reverse=True), end='\n\n')
print(sorted(neg2_ATCG_sum_dict.items(), key=lambda d: d[1], reverse=True), end='\n\n')
print(sorted(neg3_ATCG_sum_dict.items(), key=lambda d: d[1], reverse=True), end='\n\n')

# K means

In [None]:
#接下來匯入KMeans函式庫
from sklearn.cluster import KMeans

#請KMeans分成三類
clf = KMeans(n_clusters=15)

#開始訓練！
clf.fit(merged_pos_df.loc[ : , ATCG_2D_list + ATCG_3D_list])

#這樣就可以取得預測結果了！
clf.labels_

In [None]:
allData_df = merged_pos_df

In [None]:
allData_df = allData_df.append([merged_neg1_df, merged_neg3_df], ignore_index=True)

In [None]:
from sklearn.model_selection import train_test_split

y = allData_df['is_combined']
X_train, X_test, y_train, y_test = train_test_split(allData_df.drop(columns=['is_combined', 'DNA_seq']), y, test_size=0.1)
