In [121]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from impyute.imputation.cs import mice
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [166]:
# Load the descriptor table from the Excel workbook and preview its first rows.
df = pd.read_excel(io='OUTPUT.xlsx')
df.head(n=5)

Unnamed: 0,ID,name,smiles,output,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds
0,1,ABACAVIRSULFATE,Nc1nc(NC2CC2)c3ncn(C4CC(CO)C=C4)c3n1,0,0,-1.8653,3.479344,47.8891,44.044274,9,...,0.862046,0.116831,0.560654,0.37945,0.268039,20.70684,52.049186,91.644726,0.793068,1.208143
1,2,ACARBOSE,CC1OC(OC2C(O)C(O)C(OC3C(O)C(O)C(O)OC3CO)OC2CO)...,0,0,-7.2263,52.219412,137.768,88.208099,0,...,0.883822,0.065994,0.572409,0.487711,0.434117,42.761478,193.812432,465.446927,0.825733,1.494237
2,3,ACEBUTOLOL,CCCC(=O)Nc1ccc(OCC(O)CNC(C)C)c(c1)C(=O)C,0,0,-2.1004,4.41168,67.0727,55.758204,6,...,0.910758,0.057143,0.494202,0.319559,0.386393,29.370226,71.693573,143.387359,0.866137,1.200154
3,4,ACECAINIDE,CCN(CC)CCNC(=O)c1ccc(NC(=O)C)cc1,0,0,-0.5179,0.26822,52.6108,46.640239,6,...,0.90819,0.048952,0.533958,0.387559,0.470449,24.128665,49.765298,100.659835,0.862285,1.391967
4,5,ACECLOFENAC,OC(=O)COC(=O)Cc1ccccc1Nc2c(Cl)cccc2Cl,0,1,0.5703,0.325242,37.1534,45.496309,12,...,0.629985,0.3333,0.474489,0.401925,0.37442,15.58087,59.559927,104.301091,0.444977,1.250833


In [123]:
# Build the ground-truth matrix for the imputation benchmark: keep only the
# descriptor columns and only the rows that are fully observed.
df_imputation_ans = df.drop(columns=['ID', 'name', 'smiles', 'output'])
# StandardScaler rejects infinite values, so treat +/-inf as missing and let
# dropna() discard those rows together with the rows that contain NaN.
df_imputation_ans = df_imputation_ans.replace([np.inf, -np.inf], np.nan).dropna()
selected_features = df_imputation_ans.columns
# Standardization (z-score normalization) of every remaining column.
scaler = StandardScaler()
# fit_transform returns a plain array, so re-attach the column labels.
df_imputation_ans = pd.DataFrame(
    scaler.fit_transform(df_imputation_ans),
    columns=selected_features,
)
df_imputation_ans.head()

Unnamed: 0,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,nH,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds
0,-0.398264,-0.935271,0.199276,-0.061023,-0.201572,0.186626,0.320195,-0.141073,-0.101282,-0.157913,...,0.881479,-1.083026,0.058215,-0.822817,-0.521293,0.499674,-0.065784,-0.212705,0.869784,-0.283531
1,-0.398264,-4.736758,8.346996,3.457954,2.457485,-1.58982,-1.559989,2.967302,3.13705,2.547691,...,1.001887,-1.623495,0.09001,0.629402,0.17234,2.851051,2.903921,2.799093,1.001296,0.209188
2,-0.398264,-1.101981,0.355132,0.690061,0.503712,-0.405522,-0.431879,0.700778,0.321109,0.924329,...,1.150825,-1.717595,-0.121517,-1.626201,-0.026984,1.423329,0.345734,0.204196,1.163969,-0.29729
3,-0.398264,0.02017,-0.337517,0.123843,-0.045271,-0.405522,-0.431879,0.117958,-0.242079,0.383208,...,1.136626,-1.804682,-0.013988,-0.714038,0.324082,0.864495,-0.113627,-0.140069,1.148461,0.033056
4,1.311243,0.791813,-0.327985,-0.481352,-0.114146,0.778775,0.696232,-0.335347,0.180312,-0.699034,...,-0.401662,1.218376,-0.174836,-0.521337,-0.076989,-0.046836,0.091554,-0.11073,-0.531691,-0.210009


In [124]:
# Choose the flat cell positions (indices into the flattened frame) that will
# be blanked out and later imputed.
np.random.seed(0)  # fixed seed so the same cells are selected on every run
total_cells = df_imputation_ans.size  # total number of cells in the frame
desired_missing_cells = int(total_cells * 0.05)  # 5% of all cells
missing_indices = np.random.choice(
    total_cells, size=desired_missing_cells, replace=False
)
missing_indices

array([  91941,  120677,  898584, ...,  385688, 1186785,  340649])

In [125]:
# Blank out the selected cells to create the benchmark input.
# `missing_indices` are flat positions, and an array's `.flat` view is indexed
# in row-major order, so this hits the same cells as
# (index // n_columns, index % n_columns) without a per-cell Python loop.
masked_values = df_imputation_ans.to_numpy(dtype=float, copy=True)
masked_values.flat[missing_indices] = np.nan
# Rebuild a frame with the same labels; the ground-truth frame is not modified.
df_imputation_test = pd.DataFrame(
    masked_values,
    index=df_imputation_ans.index,
    columns=df_imputation_ans.columns,
)

In [127]:
# Preview the first five rows of the frame with the blanked-out cells.
df_imputation_test.head(n=5)

Unnamed: 0,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,nH,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds
0,-0.398264,-0.935271,0.199276,-0.061023,-0.201572,0.186626,0.320195,,-0.101282,-0.157913,...,0.881479,-1.083026,0.058215,-0.822817,,0.499674,-0.065784,-0.212705,0.869784,-0.283531
1,-0.398264,-4.736758,8.346996,3.457954,2.457485,-1.58982,-1.559989,2.967302,3.13705,2.547691,...,1.001887,-1.623495,0.09001,0.629402,0.17234,2.851051,2.903921,2.799093,1.001296,0.209188
2,-0.398264,-1.101981,0.355132,0.690061,0.503712,-0.405522,-0.431879,0.700778,0.321109,0.924329,...,1.150825,-1.717595,-0.121517,-1.626201,-0.026984,1.423329,0.345734,0.204196,1.163969,-0.29729
3,-0.398264,0.02017,-0.337517,0.123843,-0.045271,-0.405522,-0.431879,0.117958,-0.242079,,...,1.136626,-1.804682,-0.013988,-0.714038,0.324082,0.864495,-0.113627,-0.140069,1.148461,0.033056
4,1.311243,0.791813,-0.327985,-0.481352,-0.114146,0.778775,0.696232,,0.180312,-0.699034,...,-0.401662,1.218376,-0.174836,-0.521337,-0.076989,-0.046836,0.091554,-0.11073,-0.531691,


In [128]:
# Impute the blanked cells with KNN (5 neighbours), then score the result
# against the ground truth at exactly the blanked positions.
knn_imputer = KNNImputer(n_neighbors=5)
knn_imputed_df = knn_imputer.fit_transform(df_imputation_test)
# Compare only the cells that were blanked, using the same flat positions.
imputed_cells = knn_imputed_df.flat[missing_indices]
true_cells = df_imputation_ans.values.flat[missing_indices]
knn_rmse = np.sqrt(mean_squared_error(imputed_cells, true_cells))
print("KNN RMSE:", knn_rmse)

KNN RMSE: 0.6072806189375165


In [None]:
# Impute with MICE (takes a long time to run)
# mice_imputed_df = mice(df_imputation_test.values)

In [167]:
# Frame for the real imputation run: drop only the identifier columns.
data = df.drop(['ID', 'name', 'smiles'], axis='columns')
data.head(n=5)

Unnamed: 0,output,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds
0,0,0,-1.8653,3.479344,47.8891,44.044274,9,10,39,21,...,0.862046,0.116831,0.560654,0.37945,0.268039,20.70684,52.049186,91.644726,0.793068,1.208143
1,0,0,-7.2263,52.219412,137.768,88.208099,0,0,87,44,...,0.883822,0.065994,0.572409,0.487711,0.434117,42.761478,193.812432,465.446927,0.825733,1.494237
2,0,0,-2.1004,4.41168,67.0727,55.758204,6,6,52,24,...,0.910758,0.057143,0.494202,0.319559,0.386393,29.370226,71.693573,143.387359,0.866137,1.200154
3,0,0,-0.5179,0.26822,52.6108,46.640239,6,6,43,20,...,0.90819,0.048952,0.533958,0.387559,0.470449,24.128665,49.765298,100.659835,0.862285,1.391967
4,0,1,0.5703,0.325242,37.1534,45.496309,12,12,36,23,...,0.629985,0.3333,0.474489,0.401925,0.37442,15.58087,59.559927,104.301091,0.444977,1.250833


In [168]:
# Flag infinite entries; np.isinf marks both +inf and -inf.
inf_locations = np.isinf(data)
# The printed label reads "Locations of infinite values:"; np.where yields the
# (row positions, column positions) of the flagged cells.
print("無窮大值的位置：", np.where(inf_locations), sep="\n")

無窮大值的位置：
(array([416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416,
       416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416,
       416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416,
       416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416,
       416, 416, 416, 416]), array([ 66, 120, 184, 185, 186, 187, 188, 189, 190, 191, 247, 248, 249,
       250, 251, 252, 253, 254, 730, 731, 733, 756, 758, 761, 762, 764,
       765, 777, 778, 779, 780, 862, 863, 865, 888, 890, 893, 894, 896,
       897, 909, 910, 911, 912, 920, 921, 923, 927, 928, 929, 930, 931,
       932, 962, 963, 964]))


In [169]:
# Cap +inf entries at the largest non-inf value found in the same column.
# Assigning through data.loc writes into `data` itself. The earlier
# `data[column].replace(..., inplace=True)` form is chained assignment through
# an inplace method: pandas warns about it, and under Copy-on-Write it leaves
# `data` unchanged.
for column in data.columns:
    is_pos_inf = data[column] == np.inf
    if not is_pos_inf.any():
        continue  # nothing to cap in this column
    # NaN rows remain in the selection but are skipped by max().
    max_value = data.loc[~is_pos_inf, column].max()
    data.loc[is_pos_inf, column] = max_value
# NOTE(review): -inf entries are not touched by this loop; confirm none exist
# before scaling.

In [173]:
# Spot-check one cell that was reported as infinite (row 416, column position 66).
data.iat[416, 66]

1247.874914266118

In [174]:
# Fill the NaNs of the real data with KNN imputation.
# NOTE(review): `data` still contains the `output` column, so it is scaled and
# used by the imputer as well; confirm this is intended.
scaler = MinMaxScaler()
scaled_df = scaler.fit(data).transform(data)

# KNN imputation on the min-max scaled values (5 neighbours).
knn_imputer = KNNImputer(n_neighbors=5)
knn_imputed_df = knn_imputer.fit(scaled_df).transform(scaled_df)

# Map the imputed values back to the original value ranges.
original_df = scaler.inverse_transform(knn_imputed_df)

In [175]:
# Re-check the same cell of `data` after the scaling/imputation cell has run.
data.iat[416, 66]

1247.874914266118

In [178]:
# Wrap the inverse-transformed array in a DataFrame that carries the column
# labels of `data`, then look at the same spot-check cell.
selected_features = data.columns
original_df = pd.DataFrame(data=original_df, columns=selected_features)
original_df.iat[416, 66]

1247.874914266118

In [179]:
# Write the imputed table to CSV; the row index is not written.
original_df.to_csv(path_or_buf='knn_imputed_data.csv', index=False)