### packages and multioutput regression index

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import json
import itertools
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.decomposition import PCA

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import KFold, cross_val_score
from sklearn.cluster import KMeans
from sklearn.multioutput import MultiOutputRegressor

# Candidate base regressors, selected by position through `model_index`.
# A plain list replaces the former integer-dtype pd.Series: writing estimator
# objects into an int64 Series depends on implicit upcasting to object dtype
# (deprecated in recent pandas), and label 7 only existed through
# setting-with-enlargement because range(1, 8) creates just seven slots.
model_list = [
    LinearRegression(),           # 0
    DecisionTreeRegressor(),      # 1
    RandomForestRegressor(),      # 2
    GradientBoostingRegressor(),  # 3
    SVR(),                        # 4
    Ridge(),                      # 5
    Lasso(),                      # 6
    ElasticNet(),                 # 7
]

### read in data for gene expression

In [2]:
# Training table read from parquet.
da = pd.read_parquet('data/de_train.parquet')

# Submission template; every column after the first is overwritten later on.
submission = pd.read_csv('data/sample_submission.csv')

# Sorted unique SMILES strings found in the training table.
drug_smile = np.unique(da['SMILES'])

# Regression targets: every column from position 5 onward.
# NOTE(review): assumes the first five columns are metadata -- confirm against the parquet schema.
y = da.iloc[:, 5:]
gene_list = y.columns

# Transposed targets (one row per target column); this is what KMeans is fitted on.
Y = y.transpose()

In [3]:
# Partition the target columns into 5 clusters (K = 5 chosen from an elbow plot).
# n_init is set explicitly to 10, the default this scikit-learn version applies,
# so the FutureWarning about the changing default is silenced and the clustering
# does not shift after a library upgrade.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=0)
kmeans.fit(Y)
len_y = pd.Series(range(1, 5))  # not referenced by the visible cells; kept in case other cells use it

cluster_info = kmeans.labels_

# Map each cluster label to the list of target-column names assigned to it.
clusters = {}
for column_name, label in zip(Y.index, cluster_info):
    clusters.setdefault(label, []).append(column_name)

# One target sub-table per cluster; each is modelled separately below.
y_0 = y.loc[:, clusters[0]]
y_1 = y.loc[:, clusters[1]]
y_2 = y.loc[:, clusters[2]]
y_3 = y.loc[:, clusters[3]]
y_4 = y.loc[:, clusters[4]]

  super()._check_params_vs_input(X, default_n_init=10)


### read in features: case-250

In [5]:
# Load the train/test feature tables for the "All" / 250 variant.
attention_train, attention_test = map(
    pd.read_csv,
    ('feature test/All_fea_train_250.csv', 'feature test/All_fea_test_250.csv'),
)

### prediction

In [8]:
def pca_train_test(compomenets = 10, y_train = y_0, model_index = 2, X_train = None, X_test = None):
    """Fit a regressor on PCA-compressed targets and predict the test targets.

    The targets are projected onto `compomenets` principal components, one copy
    of the chosen estimator is fitted per component via MultiOutputRegressor,
    and the test predictions are mapped back to the original target space.

    Parameters
    ----------
    compomenets : int
        Number of principal components kept for the targets (spelling kept so
        existing keyword callers keep working).
    y_train : DataFrame
        Training targets; defaults to the cluster-0 targets `y_0`.
    model_index : int
        Position of the base estimator in `model_list`.
    X_train, X_test : optional
        Feature tables. When omitted, the module-level `attention_train` /
        `attention_test` are looked up at call time, so reloading the features
        takes effect without redefining this function (the previous defaults
        were frozen at definition time).

    Returns
    -------
    The array produced by `PCA.inverse_transform`: one row per row of `X_test`,
    one column per column of `y_train`.
    """
    if X_train is None:
        X_train = attention_train
    if X_test is None:
        X_test = attention_test

    # NOTE(review): neither PCA nor the estimators in model_list are seeded here,
    # so repeated runs may differ slightly.
    pca_y = PCA(n_components = compomenets)
    y_train_pca = pca_y.fit_transform(y_train)

    model = MultiOutputRegressor(model_list[model_index])
    model.fit(X_train, y_train_pca)

    y_test_pca_pred = model.predict(X_test)
    y_test_pred = pca_y.inverse_transform(y_test_pca_pred)

    return y_test_pred


In [9]:
# Fit one model per cluster; the results are unpacked into the y_test_<k>
# names that the following cell consumes.
cluster_targets = (y_0, y_1, y_2, y_3, y_4)
cluster_outputs = []
for cluster_id, targets in enumerate(cluster_targets):
    cluster_outputs.append(pca_train_test(y_train = targets))
    print(cluster_id)  # progress marker: this cluster is done
y_test_0, y_test_1, y_test_2, y_test_3, y_test_4 = cluster_outputs

0
1
2
3
4


In [10]:
# Reassemble the per-cluster predictions into one table in the original column order.
# The blocks are referenced explicitly instead of through eval() on variable names.
cluster_preds = (y_test_0, y_test_1, y_test_2, y_test_3, y_test_4)
# Shape is derived from the predictions and gene_list rather than hard-coded 255 x 18211.
df_zeros = pd.DataFrame(0.0, index=range(len(cluster_preds[0])), columns=gene_list)
for i, cluster_pred in enumerate(cluster_preds):
    df_zeros[clusters[i]] = cluster_pred
print("write in")
# Overwrite every column after the first one of the submission template.
submission.iloc[:, 1:] = df_zeros
submission.to_csv('feature test/All_250_df.csv', index=False)

write in


### read in features: case-500

In [11]:
# Feature tables for the "All" / 500 variant.
attention_train = pd.read_csv('feature test/All_fea_train_500.csv')
attention_test = pd.read_csv('feature test/All_fea_test_500.csv')

def pca_train_test(compomenets = 10, y_train = y_0, model_index = 2, X_train = None, X_test = None):
    """Fit a regressor on PCA-compressed targets and predict the test targets.

    Parameters
    ----------
    compomenets : int
        Number of principal components kept for the targets (spelling kept so
        existing keyword callers keep working).
    y_train : DataFrame
        Training targets; defaults to the cluster-0 targets `y_0`.
    model_index : int
        Position of the base estimator in `model_list`.
    X_train, X_test : optional
        Feature tables. When omitted, the module-level `attention_train` /
        `attention_test` are looked up at call time instead of being frozen
        when the function is defined.

    Returns
    -------
    The array produced by `PCA.inverse_transform`: one row per row of `X_test`,
    one column per column of `y_train`.
    """
    if X_train is None:
        X_train = attention_train
    if X_test is None:
        X_test = attention_test

    pca_y = PCA(n_components = compomenets)
    y_train_pca = pca_y.fit_transform(y_train)

    # One copy of the selected estimator is fitted per principal component.
    model = MultiOutputRegressor(model_list[model_index])
    model.fit(X_train, y_train_pca)

    y_test_pca_pred = model.predict(X_test)
    y_test_pred = pca_y.inverse_transform(y_test_pca_pred)

    return y_test_pred

In [12]:
# Fit one model per cluster; the results are unpacked into the y_test_<k>
# names that the following cell consumes.
cluster_targets = (y_0, y_1, y_2, y_3, y_4)
cluster_outputs = []
for cluster_id, targets in enumerate(cluster_targets):
    cluster_outputs.append(pca_train_test(y_train = targets))
    print(cluster_id)  # progress marker: this cluster is done
y_test_0, y_test_1, y_test_2, y_test_3, y_test_4 = cluster_outputs

0
1
2
3
4


In [13]:
# Reassemble the per-cluster predictions into one table in the original column order.
# The blocks are referenced explicitly instead of through eval() on variable names.
cluster_preds = (y_test_0, y_test_1, y_test_2, y_test_3, y_test_4)
# Shape is derived from the predictions and gene_list rather than hard-coded 255 x 18211.
df_zeros = pd.DataFrame(0.0, index=range(len(cluster_preds[0])), columns=gene_list)
for i, cluster_pred in enumerate(cluster_preds):
    df_zeros[clusters[i]] = cluster_pred
print("write in")
# Overwrite every column after the first one of the submission template.
submission.iloc[:, 1:] = df_zeros
submission.to_csv('feature test/All_500_df.csv', index=False)

write in


### read in features: case-768

In [14]:
# Feature tables for the "All" / 768 variant.
attention_train = pd.read_csv('feature test/All_fea_train_768.csv')
attention_test = pd.read_csv('feature test/All_fea_test_768.csv')

def pca_train_test(compomenets = 10, y_train = y_0, model_index = 2, X_train = None, X_test = None):
    """Fit a regressor on PCA-compressed targets and predict the test targets.

    Parameters
    ----------
    compomenets : int
        Number of principal components kept for the targets (spelling kept so
        existing keyword callers keep working).
    y_train : DataFrame
        Training targets; defaults to the cluster-0 targets `y_0`.
    model_index : int
        Position of the base estimator in `model_list`.
    X_train, X_test : optional
        Feature tables. When omitted, the module-level `attention_train` /
        `attention_test` are looked up at call time instead of being frozen
        when the function is defined.

    Returns
    -------
    The array produced by `PCA.inverse_transform`: one row per row of `X_test`,
    one column per column of `y_train`.
    """
    if X_train is None:
        X_train = attention_train
    if X_test is None:
        X_test = attention_test

    pca_y = PCA(n_components = compomenets)
    y_train_pca = pca_y.fit_transform(y_train)

    # One copy of the selected estimator is fitted per principal component.
    model = MultiOutputRegressor(model_list[model_index])
    model.fit(X_train, y_train_pca)

    y_test_pca_pred = model.predict(X_test)
    y_test_pred = pca_y.inverse_transform(y_test_pca_pred)

    return y_test_pred

In [15]:
# Fit one model per cluster; the results are unpacked into the y_test_<k>
# names that the following cell consumes.
cluster_targets = (y_0, y_1, y_2, y_3, y_4)
cluster_outputs = []
for cluster_id, targets in enumerate(cluster_targets):
    cluster_outputs.append(pca_train_test(y_train = targets))
    print(cluster_id)  # progress marker: this cluster is done
y_test_0, y_test_1, y_test_2, y_test_3, y_test_4 = cluster_outputs

0
1
2
3
4


In [16]:
# Reassemble the per-cluster predictions into one table in the original column order.
# The blocks are referenced explicitly instead of through eval() on variable names.
cluster_preds = (y_test_0, y_test_1, y_test_2, y_test_3, y_test_4)
# Shape is derived from the predictions and gene_list rather than hard-coded 255 x 18211.
df_zeros = pd.DataFrame(0.0, index=range(len(cluster_preds[0])), columns=gene_list)
for i, cluster_pred in enumerate(cluster_preds):
    df_zeros[clusters[i]] = cluster_pred
print("write in")
# Overwrite every column after the first one of the submission template.
submission.iloc[:, 1:] = df_zeros
submission.to_csv('feature test/All_768_df.csv', index=False)

write in


### read in features: case-1000

In [4]:
# Feature tables for the "All" / 1000 variant.
attention_train = pd.read_csv('feature test/All_fea_train_1000.csv')
attention_test = pd.read_csv('feature test/All_fea_test_1000.csv')

def pca_train_test(compomenets = 10, y_train = y_0, model_index = 2, X_train = None, X_test = None):
    """Fit a regressor on PCA-compressed targets and predict the test targets.

    Parameters
    ----------
    compomenets : int
        Number of principal components kept for the targets (spelling kept so
        existing keyword callers keep working).
    y_train : DataFrame
        Training targets; defaults to the cluster-0 targets `y_0`.
    model_index : int
        Position of the base estimator in `model_list`.
    X_train, X_test : optional
        Feature tables. When omitted, the module-level `attention_train` /
        `attention_test` are looked up at call time instead of being frozen
        when the function is defined.

    Returns
    -------
    The array produced by `PCA.inverse_transform`: one row per row of `X_test`,
    one column per column of `y_train`.
    """
    if X_train is None:
        X_train = attention_train
    if X_test is None:
        X_test = attention_test

    pca_y = PCA(n_components = compomenets)
    y_train_pca = pca_y.fit_transform(y_train)

    # One copy of the selected estimator is fitted per principal component.
    model = MultiOutputRegressor(model_list[model_index])
    model.fit(X_train, y_train_pca)

    y_test_pca_pred = model.predict(X_test)
    y_test_pred = pca_y.inverse_transform(y_test_pca_pred)

    return y_test_pred

In [5]:
# Fit one model per cluster; the results are unpacked into the y_test_<k>
# names that the following cell consumes.
cluster_targets = (y_0, y_1, y_2, y_3, y_4)
cluster_outputs = []
for cluster_id, targets in enumerate(cluster_targets):
    cluster_outputs.append(pca_train_test(y_train = targets))
    print(cluster_id)  # progress marker: this cluster is done
y_test_0, y_test_1, y_test_2, y_test_3, y_test_4 = cluster_outputs

0
1
2
3
4


In [6]:
# Reassemble the per-cluster predictions into one table in the original column order.
# The blocks are referenced explicitly instead of through eval() on variable names.
cluster_preds = (y_test_0, y_test_1, y_test_2, y_test_3, y_test_4)
# Shape is derived from the predictions and gene_list rather than hard-coded 255 x 18211.
df_zeros = pd.DataFrame(0.0, index=range(len(cluster_preds[0])), columns=gene_list)
for i, cluster_pred in enumerate(cluster_preds):
    df_zeros[clusters[i]] = cluster_pred
print("write in")
# Overwrite every column after the first one of the submission template.
submission.iloc[:, 1:] = df_zeros
submission.to_csv('feature test/All_1000_df.csv', index=False)

write in


### half-250

In [4]:
# Feature tables for the "Half" / 250 variant.
attention_train = pd.read_csv('feature test/Half_fea_train_250.csv')
attention_test = pd.read_csv('feature test/Half_fea_test_250.csv')

def pca_train_test(compomenets = 10, y_train = y_0, model_index = 2, X_train = None, X_test = None):
    """Fit a regressor on PCA-compressed targets and predict the test targets.

    Parameters
    ----------
    compomenets : int
        Number of principal components kept for the targets (spelling kept so
        existing keyword callers keep working).
    y_train : DataFrame
        Training targets; defaults to the cluster-0 targets `y_0`.
    model_index : int
        Position of the base estimator in `model_list`.
    X_train, X_test : optional
        Feature tables. When omitted, the module-level `attention_train` /
        `attention_test` are looked up at call time instead of being frozen
        when the function is defined.

    Returns
    -------
    The array produced by `PCA.inverse_transform`: one row per row of `X_test`,
    one column per column of `y_train`.
    """
    if X_train is None:
        X_train = attention_train
    if X_test is None:
        X_test = attention_test

    pca_y = PCA(n_components = compomenets)
    y_train_pca = pca_y.fit_transform(y_train)

    # One copy of the selected estimator is fitted per principal component.
    model = MultiOutputRegressor(model_list[model_index])
    model.fit(X_train, y_train_pca)

    y_test_pca_pred = model.predict(X_test)
    y_test_pred = pca_y.inverse_transform(y_test_pca_pred)

    return y_test_pred

In [5]:
# Fit one model per cluster; the results are unpacked into the y_test_<k>
# names that the following cell consumes.
cluster_targets = (y_0, y_1, y_2, y_3, y_4)
cluster_outputs = []
for cluster_id, targets in enumerate(cluster_targets):
    cluster_outputs.append(pca_train_test(y_train = targets))
    print(cluster_id)  # progress marker: this cluster is done
y_test_0, y_test_1, y_test_2, y_test_3, y_test_4 = cluster_outputs

0
1
2
3
4


In [6]:
# Reassemble the per-cluster predictions into one table in the original column order.
# The blocks are referenced explicitly instead of through eval() on variable names.
cluster_preds = (y_test_0, y_test_1, y_test_2, y_test_3, y_test_4)
# Shape is derived from the predictions and gene_list rather than hard-coded 255 x 18211.
df_zeros = pd.DataFrame(0.0, index=range(len(cluster_preds[0])), columns=gene_list)
for i, cluster_pred in enumerate(cluster_preds):
    df_zeros[clusters[i]] = cluster_pred
print("write in")
# Overwrite every column after the first one of the submission template.
submission.iloc[:, 1:] = df_zeros
submission.to_csv('feature test/Half_250_df.csv', index=False)

write in


### half-500

In [7]:
# Feature tables for the "Half" / 500 variant.
attention_train = pd.read_csv('feature test/Half_fea_train_500.csv')
attention_test = pd.read_csv('feature test/Half_fea_test_500.csv')

def pca_train_test(compomenets = 10, y_train = y_0, model_index = 2, X_train = None, X_test = None):
    """Fit a regressor on PCA-compressed targets and predict the test targets.

    Parameters
    ----------
    compomenets : int
        Number of principal components kept for the targets (spelling kept so
        existing keyword callers keep working).
    y_train : DataFrame
        Training targets; defaults to the cluster-0 targets `y_0`.
    model_index : int
        Position of the base estimator in `model_list`.
    X_train, X_test : optional
        Feature tables. When omitted, the module-level `attention_train` /
        `attention_test` are looked up at call time instead of being frozen
        when the function is defined.

    Returns
    -------
    The array produced by `PCA.inverse_transform`: one row per row of `X_test`,
    one column per column of `y_train`.
    """
    if X_train is None:
        X_train = attention_train
    if X_test is None:
        X_test = attention_test

    pca_y = PCA(n_components = compomenets)
    y_train_pca = pca_y.fit_transform(y_train)

    # One copy of the selected estimator is fitted per principal component.
    model = MultiOutputRegressor(model_list[model_index])
    model.fit(X_train, y_train_pca)

    y_test_pca_pred = model.predict(X_test)
    y_test_pred = pca_y.inverse_transform(y_test_pca_pred)

    return y_test_pred

In [8]:
# Fit one model per cluster; the results are unpacked into the y_test_<k>
# names that the following cell consumes.
cluster_targets = (y_0, y_1, y_2, y_3, y_4)
cluster_outputs = []
for cluster_id, targets in enumerate(cluster_targets):
    cluster_outputs.append(pca_train_test(y_train = targets))
    print(cluster_id)  # progress marker: this cluster is done
y_test_0, y_test_1, y_test_2, y_test_3, y_test_4 = cluster_outputs

0
1
2
3
4


In [9]:
# Reassemble the per-cluster predictions into one table in the original column order.
# The blocks are referenced explicitly instead of through eval() on variable names.
cluster_preds = (y_test_0, y_test_1, y_test_2, y_test_3, y_test_4)
# Shape is derived from the predictions and gene_list rather than hard-coded 255 x 18211.
df_zeros = pd.DataFrame(0.0, index=range(len(cluster_preds[0])), columns=gene_list)
for i, cluster_pred in enumerate(cluster_preds):
    df_zeros[clusters[i]] = cluster_pred
print("write in")
# Overwrite every column after the first one of the submission template.
submission.iloc[:, 1:] = df_zeros
submission.to_csv('feature test/Half_500_df.csv', index=False)

write in


### half-768

In [10]:
# Feature tables for the "Half" / 768 variant.
attention_train = pd.read_csv('feature test/Half_fea_train_768.csv')
attention_test = pd.read_csv('feature test/Half_fea_test_768.csv')

def pca_train_test(compomenets = 10, y_train = y_0, model_index = 2, X_train = None, X_test = None):
    """Fit a regressor on PCA-compressed targets and predict the test targets.

    Parameters
    ----------
    compomenets : int
        Number of principal components kept for the targets (spelling kept so
        existing keyword callers keep working).
    y_train : DataFrame
        Training targets; defaults to the cluster-0 targets `y_0`.
    model_index : int
        Position of the base estimator in `model_list`.
    X_train, X_test : optional
        Feature tables. When omitted, the module-level `attention_train` /
        `attention_test` are looked up at call time instead of being frozen
        when the function is defined.

    Returns
    -------
    The array produced by `PCA.inverse_transform`: one row per row of `X_test`,
    one column per column of `y_train`.
    """
    if X_train is None:
        X_train = attention_train
    if X_test is None:
        X_test = attention_test

    pca_y = PCA(n_components = compomenets)
    y_train_pca = pca_y.fit_transform(y_train)

    # One copy of the selected estimator is fitted per principal component.
    model = MultiOutputRegressor(model_list[model_index])
    model.fit(X_train, y_train_pca)

    y_test_pca_pred = model.predict(X_test)
    y_test_pred = pca_y.inverse_transform(y_test_pca_pred)

    return y_test_pred

In [11]:
# Fit one model per cluster; the results are unpacked into the y_test_<k>
# names that the following cell consumes.
cluster_targets = (y_0, y_1, y_2, y_3, y_4)
cluster_outputs = []
for cluster_id, targets in enumerate(cluster_targets):
    cluster_outputs.append(pca_train_test(y_train = targets))
    print(cluster_id)  # progress marker: this cluster is done
y_test_0, y_test_1, y_test_2, y_test_3, y_test_4 = cluster_outputs

0
1
2
3
4


In [12]:
# Reassemble the per-cluster predictions into one table in the original column order.
# The blocks are referenced explicitly instead of through eval() on variable names.
cluster_preds = (y_test_0, y_test_1, y_test_2, y_test_3, y_test_4)
# Shape is derived from the predictions and gene_list rather than hard-coded 255 x 18211.
df_zeros = pd.DataFrame(0.0, index=range(len(cluster_preds[0])), columns=gene_list)
for i, cluster_pred in enumerate(cluster_preds):
    df_zeros[clusters[i]] = cluster_pred
print("write in")
# Overwrite every column after the first one of the submission template.
submission.iloc[:, 1:] = df_zeros
submission.to_csv('feature test/Half_768_df.csv', index=False)

write in


### half-1000

In [4]:
# Feature tables for the "Half" / 1000 variant.
attention_train = pd.read_csv('feature test/Half_fea_train_1000.csv')
attention_test = pd.read_csv('feature test/Half_fea_test_1000.csv')

def pca_train_test(compomenets = 10, y_train = y_0, model_index = 2, X_train = None, X_test = None):
    """Fit a regressor on PCA-compressed targets and predict the test targets.

    Parameters
    ----------
    compomenets : int
        Number of principal components kept for the targets (spelling kept so
        existing keyword callers keep working).
    y_train : DataFrame
        Training targets; defaults to the cluster-0 targets `y_0`.
    model_index : int
        Position of the base estimator in `model_list`.
    X_train, X_test : optional
        Feature tables. When omitted, the module-level `attention_train` /
        `attention_test` are looked up at call time instead of being frozen
        when the function is defined.

    Returns
    -------
    The array produced by `PCA.inverse_transform`: one row per row of `X_test`,
    one column per column of `y_train`.
    """
    if X_train is None:
        X_train = attention_train
    if X_test is None:
        X_test = attention_test

    pca_y = PCA(n_components = compomenets)
    y_train_pca = pca_y.fit_transform(y_train)

    # One copy of the selected estimator is fitted per principal component.
    model = MultiOutputRegressor(model_list[model_index])
    model.fit(X_train, y_train_pca)

    y_test_pca_pred = model.predict(X_test)
    y_test_pred = pca_y.inverse_transform(y_test_pca_pred)

    return y_test_pred

In [5]:
# Fit one model per cluster; the results are unpacked into the y_test_<k>
# names that the following cell consumes.
cluster_targets = (y_0, y_1, y_2, y_3, y_4)
cluster_outputs = []
for cluster_id, targets in enumerate(cluster_targets):
    cluster_outputs.append(pca_train_test(y_train = targets))
    print(cluster_id)  # progress marker: this cluster is done
y_test_0, y_test_1, y_test_2, y_test_3, y_test_4 = cluster_outputs

0
1
2
3
4


In [6]:
# Reassemble the per-cluster predictions into one table in the original column order.
# The blocks are referenced explicitly instead of through eval() on variable names.
cluster_preds = (y_test_0, y_test_1, y_test_2, y_test_3, y_test_4)
# Shape is derived from the predictions and gene_list rather than hard-coded 255 x 18211.
df_zeros = pd.DataFrame(0.0, index=range(len(cluster_preds[0])), columns=gene_list)
for i, cluster_pred in enumerate(cluster_preds):
    df_zeros[clusters[i]] = cluster_pred
print("write in")
# Overwrite every column after the first one of the submission template.
submission.iloc[:, 1:] = df_zeros
submission.to_csv('feature test/Half_1000_df.csv', index=False)

write in
