In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU, Embedding, Dropout, BatchNormalization, SimpleRNN
from tensorflow.keras.preprocessing import sequence
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import KFold

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import json
import itertools
from rdkit import Chem
from rdkit.Chem import AllChem

# Training table read from the competition parquet file.
da = pd.read_parquet('data/de_train.parquet')
# Distinct SMILES strings present in the training table (np.unique returns them sorted).
drug_smile = np.unique(da['SMILES'])
# Columns from position 5 onward form the regression target matrix.
# NOTE(review): the first five columns are presumably metadata - confirm against the file schema.
y = da.iloc[:, 5:]




### Loss function: mean rowwise RMSE (MRRMSE)

In [3]:
def mrrmse(y_true, y_pred):
    """Mean rowwise root-mean-squared error, usable as a Keras loss.

    For every row the squared error is averaged over axis 1 and square-rooted;
    the resulting per-row RMSE values are then averaged into a single scalar.

    Args:
        y_true: Target values, tensor-like, same shape as ``y_pred``.
        y_pred: Predicted values, tensor-like.

    Returns:
        Scalar tensor with the mean of the per-row RMSE values.
    """
    # Align dtypes so the function also works when called directly, e.g. with
    # float64 NumPy targets against float32 model output; TensorFlow refuses
    # to subtract tensors whose dtypes differ.
    y_pred = tf.convert_to_tensor(y_pred)
    y_true = tf.cast(y_true, y_pred.dtype)
    # Mean squared error of each row.
    mse_per_row = tf.reduce_mean(tf.square(y_true - y_pred), axis=1)
    # Root-mean-squared error of each row.
    rmse_per_row = tf.sqrt(mse_per_row)
    # Average over all rows.
    return tf.reduce_mean(rmse_per_row)

In [4]:
# Pre-computed feature tables: one for the training rows and one for the rows
# that are predicted later in the notebook.
attention_train, attention_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'feature test/Half_fea_train_500.csv',
        'feature test/Half_fea_test_500.csv',
    )
)

In [7]:
# Train and test features are fed to the models positionally, so they must
# expose the same columns in the same order. Index.equals also copes with
# frames of different width, where an element-wise `==` comparison raises.
attention_train.columns.equals(attention_test.columns)

True

In [None]:
# Model dimensions: number of feature columns and number of response columns.
n_f, n_r = attention_train.shape[1], y.shape[1]


In [11]:
# Input width (feature columns) and output width (response columns).
n_f, n_r = attention_train.shape[1], y.shape[1]

def create_rnn_model(n_features = n_f, n_responses = n_r):
    """Build and compile a three-layer SimpleRNN regressor.

    The network takes inputs of shape ``(1, n_features)`` (a single time step)
    and ends in a linear ``Dense`` layer with ``n_responses`` units. It is
    compiled with Adam (learning rate 0.001) and the ``mrrmse`` loss.

    Args:
        n_features: Width of the feature vector at the single time step. The
            default is the value ``n_f`` had when this function was defined.
        n_responses: Number of output units. The default is the value ``n_r``
            had when this function was defined.

    Returns:
        The compiled ``Sequential`` model.
    """
    layer_stack = [
        # Keeps the time axis so the next recurrent layer receives a sequence.
        SimpleRNN(256, activation='tanh', return_sequences=True, input_shape=(1, n_features)),
        Dropout(0.3),
        # Still emits a sequence; it is batch-normalised before the last RNN.
        SimpleRNN(128, activation='tanh', return_sequences=True),
        BatchNormalization(),
        # Returns only the final output, which feeds the dense head.
        SimpleRNN(128, activation='tanh', return_sequences=False),
        Dense(n_responses),
    ]
    rnn = Sequential(layer_stack)
    rnn.compile(optimizer=Adam(learning_rate=0.001), loss=mrrmse)
    return rnn

# Cross-validation setup. Seeding Python/NumPy/TensorFlow and fixing the fold
# shuffle keeps the fold assignment (and, as far as the backend is
# deterministic, the fold scores) the same from one run to the next.
tf.keras.utils.set_random_seed(42)
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Held-out loss of each fold.
scores = []
# Insert a length-1 time axis: (n_samples, n_features) -> (n_samples, 1, n_features).
attention_train_reshaped = np.expand_dims(attention_train, axis=1)

for train, test in kfold.split(attention_train_reshaped, y):
    # A fresh model per fold so that no weights carry over between folds.
    model = create_rnn_model()
    model.fit(attention_train_reshaped[train], y.iloc[train], epochs=10, batch_size=32, verbose=0)
    scores.append(model.evaluate(attention_train_reshaped[test], y.iloc[test], verbose=0))

# NOTE(review): `model` now refers to the last fold's network, which was fitted
# on 4 of the 5 folds only; the following cell predicts with it.






In [13]:
# Give the test features the same length-1 time axis as the training array,
# then predict with the model left behind by the preceding training cell.
attention_test_reshaped = np.asarray(attention_test)[:, np.newaxis, :]
predictions = model.predict(attention_test_reshaped)
print(predictions.shape)

(255, 18211)


In [14]:
# Start from the sample submission so its header and first column are kept.
# NOTE(review): the first column is presumably the row id - confirm against the file.
submission = pd.read_csv('data/sample_submission.csv') 
# Overwrite every column after the first with the model predictions.
submission.iloc[:, 1:] = predictions
# Write the submission file without the DataFrame index.
submission.to_csv('RNN_final_1.csv', index=False)

### GRU

In [16]:
from tensorflow.keras.regularizers import L1L2

def create_complex_gru_model(n_features = n_f, n_responses = n_r):
    """Build and compile a four-layer GRU regressor with a dense head.

    The network takes inputs of shape ``(1, n_features)`` (a single time step).
    The first two GRU layers carry an L1+L2 kernel penalty (0.01 each). The
    recurrent stack is followed by a 128-unit tanh ``Dense`` layer and a linear
    ``Dense`` layer with ``n_responses`` units. The model is compiled with Adam
    (learning rate 0.001) and the ``mrrmse`` loss.

    Args:
        n_features: Width of the feature vector at the single time step. The
            default is the value ``n_f`` had when this function was defined.
        n_responses: Number of output units. The default is the value ``n_r``
            had when this function was defined.

    Returns:
        The compiled ``Sequential`` model.
    """
    layer_stack = [
        GRU(512, activation='tanh', return_sequences=True, input_shape=(1, n_features),
            kernel_regularizer=L1L2(l1=0.01, l2=0.01)),
        Dropout(0.4),
        GRU(256, activation='tanh', return_sequences=True,
            kernel_regularizer=L1L2(l1=0.01, l2=0.01)),
        BatchNormalization(),
        GRU(256, activation='tanh', return_sequences=True),
        Dropout(0.4),
        # Last recurrent layer returns only its final output, so the dense
        # layers below operate on a plain (batch, units) tensor.
        GRU(128, activation='tanh', return_sequences=False),
        Dense(128, activation='tanh'),
        Dense(n_responses),
    ]
    gru_net = Sequential(layer_stack)
    gru_net.compile(optimizer=Adam(learning_rate=0.001), loss=mrrmse)
    return gru_net

# Seed Python/NumPy/TensorFlow and fix the fold shuffle so the fold assignment
# (and, as far as the backend is deterministic, the scores and the submission)
# do not change from one run to the next.
tf.keras.utils.set_random_seed(42)
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Held-out loss of each fold.
scores = []
# Cross-validation. The split is taken on the same 3-D array that is indexed
# below; `attention_train_reshaped` comes from the SimpleRNN training cell.
for train, test in kfold.split(attention_train_reshaped, y):
    # A fresh model per fold so that no weights carry over between folds.
    model = create_complex_gru_model()
    model.fit(attention_train_reshaped[train], y.iloc[train], epochs=10, batch_size=32, verbose=0)
    scores.append(model.evaluate(attention_train_reshaped[test], y.iloc[test], verbose=0))

# NOTE(review): predictions come from the last fold's model, which was fitted
# on 4 of the 5 folds only.
attention_test_reshaped = np.expand_dims(attention_test, axis=1)
predictions = model.predict(attention_test_reshaped)
print(predictions.shape)

# `submission` is the frame loaded from the sample submission in an earlier
# cell; every column after the first is overwritten with the new predictions.
submission.iloc[:, 1:] = predictions
submission.to_csv('GRU_final_1.csv', index=False)

(255, 18211)


### LSTM

In [18]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, BatchNormalization
from keras.optimizers import Adam

def create_lstm_model(n_features, n_responses):
    """Build and compile a three-layer LSTM regressor.

    The network takes inputs of shape ``(1, n_features)`` (a single time step)
    and ends in a linear ``Dense`` layer with ``n_responses`` units. It is
    compiled with Adam (learning rate 0.001) and the notebook's ``mrrmse``
    loss, the same objective the SimpleRNN and GRU builders use, so the
    losses reported by ``evaluate`` are on the same scale for all three.

    Args:
        n_features: Width of the feature vector at the single time step.
        n_responses: Number of output units.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    # First LSTM layer keeps the time axis for the next recurrent layer.
    model.add(LSTM(256, activation='tanh', return_sequences=True, input_shape=(1, n_features)))
    model.add(Dropout(0.3))
    # Second LSTM layer still emits a sequence, which is batch-normalised.
    model.add(LSTM(128, activation='tanh', return_sequences=True))
    model.add(BatchNormalization())
    # Last LSTM layer returns only its final output for the dense head.
    model.add(LSTM(128, activation='tanh', return_sequences=False))
    model.add(Dense(n_responses))
    # Train on the custom MRRMSE loss rather than the placeholder MSE.
    model.compile(optimizer=Adam(learning_rate=0.001), loss=mrrmse)
    return model


# Seed Python/NumPy/TensorFlow and fix the fold shuffle so the fold assignment
# (and, as far as the backend is deterministic, the scores and the submission)
# do not change from one run to the next.
tf.keras.utils.set_random_seed(42)
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Held-out loss of each fold.
scores = []
# Cross-validation. The split is taken on the same 3-D array that is indexed
# below; `attention_train_reshaped` comes from the SimpleRNN training cell.
for train, test in kfold.split(attention_train_reshaped, y):
    # A fresh model per fold so that no weights carry over between folds.
    model = create_lstm_model(n_f, n_r)
    model.fit(attention_train_reshaped[train], y.iloc[train], epochs=10, batch_size=32, verbose=0)
    scores.append(model.evaluate(attention_train_reshaped[test], y.iloc[test], verbose=0))

# NOTE(review): predictions come from the last fold's model, which was fitted
# on 4 of the 5 folds only.
attention_test_reshaped = np.expand_dims(attention_test, axis=1)
predictions = model.predict(attention_test_reshaped)
print(predictions.shape)

# `submission` is the frame loaded from the sample submission in an earlier
# cell; every column after the first is overwritten with the new predictions.
submission.iloc[:, 1:] = predictions
submission.to_csv('LSTM_final_1.csv', index=False)

(255, 18211)
