In [1]:
# 导入相关库
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from lightgbm import log_evaluation, early_stopping
import xgboost as xgb
from rdkit import Chem
from rdkit.Chem import AllChem
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [2]:
df_train_mord_schr = pd.read_csv('df_train_feats.csv')
df_test_mord_schr = pd.read_csv('df_test_feats.csv')

**Here starts LightGBMPrediction**

In [4]:
callbacks = [ early_stopping(stopping_rounds=30)]
param = {'num_leaves': 31,
         'min_data_in_leaf': 30, 
         'objective':'regression',
         'max_depth': -1,
         'learning_rate': 0.01,
         "min_child_samples": 20,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         "bagging_seed": 11,
         "metric": 'mae',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "nthread": 8,
         "random_state": 666}

groups = df_train_mord_schr.groupby('Lab')

errors = {}
models = {}


for name, group in groups:
    X = group.drop(['SMILES','Lab','RT',], axis=1)
    y = group['RT']*60
    
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    
    mse_list = []
    
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        train_data = lgb.Dataset(X_train, label=y_train)
        valid_data = lgb.Dataset(X_test, label=y_test)
        reg = lgb.train(param, train_data, valid_sets=[train_data, valid_data],callbacks=callbacks )
        y_pred = reg.predict(X_test,num_iteration=reg.best_iteration )
        
        mse = mean_absolute_error(y_test, y_pred)
        
        mse_list.append(mse)
    
    errors[name] = np.mean(mse_list)
    
    models[name] = reg

Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[100]	training's l1: 36.4286	valid_1's l1: 47.9469
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[100]	training's l1: 36.4147	valid_1's l1: 37.4515
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[100]	training's l1: 37.2456	valid_1's l1: 43.4536
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[100]	training's l1: 37.8175	valid_1's l1: 35.4427
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[100]	training's l1: 37.2836	valid_1's l1: 45.5823
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[100]	training's l1: 95.8939	valid_1's l1: 140.613
Training until validation scores don't i

Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[100]	training's l1: 44.2169	valid_1's l1: 42.0046
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[100]	training's l1: 42.4379	valid_1's l1: 49.7656
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[100]	training's l1: 43.7908	valid_1's l1: 46.0972
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[100]	training's l1: 42.2967	valid_1's l1: 49.9056
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[100]	training's l1: 42.873	valid_1's l1: 49.7668
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[1]	training's l1: 108.498	valid_1's l1: 104.042
Training until validation scores don't improve for 30 ro

Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[1]	training's l1: 132.128	valid_1's l1: 122.58
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[1]	training's l1: 129.537	valid_1's l1: 133.304
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[1]	training's l1: 135.711	valid_1's l1: 110.271
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[1]	training's l1: 130.821	valid_1's l1: 128.125
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[1]	training's l1: 123.12	valid_1's l1: 160.894


In [10]:
for name in errors:
    print(f'Category: {name}, MAE: {errors[name]}')

Category: Aarhus, MAE: 42.38260456810935
Category: Academy of Forensic Science, MAE: 119.83885356490066
Category: Adelaide, MAE: 72.51840038857029
Category: Australian Racing Forensic Laboratory, MAE: 28.14223155314489
Category: CFSRE, MAE: 53.23866913900139
Category: ChemCentre, MAE: 68.71052277414437
Category: Copenhagen, MAE: 84.29632045187395
Category: Estonian Forensic Science Institute, MAE: 84.19100320699984
Category: Finnish Customs Laboratory, MAE: 63.6021270407564
Category: Ghent University, MAE: 87.7731087598759
Category: IUPA, UJI I (E), MAE: 214.68768475677217
Category: King's College Hospital, MAE: 47.85094981149651
Category: LADR, MAE: 112.24566538887898
Category: Labor Krone, MAE: 91.06019970703123
Category: Mainz, MAE: 110.63710244408153
Category: Odense, MAE: 82.85909543504901
Category: San Francisco OCME, MAE: 49.04655211610766
Category: The University of Queensland, MAE: 151.5551894039578
Category: Trondheim, MAE: 140.17995455909497
Category: University Hospital of 

**Here starts XGBOOST Prediction**

In [5]:
callbacks = [ early_stopping(stopping_rounds=30)]

groups = df_train_mord_schr.groupby('Lab')
param = {'max_depth': 3, 'eta': 0.1, 'objective': 'reg:absoluteerror','nthread':12,'min_child_weight':1.1,'gamma':0.1,'lambda':10,'subsample':0.7,'colsample_bytree':0.7,'colsample_bylevel':0.7,}

errors = {}
models = {}

lab_fold_predictions = pd.DataFrame()

for name, group in groups:
    X = group.drop(['SMILES','Lab','RT',], axis=1)
    y = group['RT']*60
    
    test_group = df_test_mord_schr[df_test_mord_schr['Lab'] == name]
    testset_X  = test_group.drop(['SMILES','Lab',], axis=1)
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    mse_list = []
    
    predictions = np.zeros(len(testset_X)) 
    for train_index, test_index in kf.split(X):

        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        train_data = xgb.DMatrix(X_train, label=y_train)   ##lgb.Dataset(X_train, label=y_train)
        valid_data = xgb.DMatrix(X_test, label=y_test)
        watchlist = [(train_data, "train"), (valid_data, "valid")]
        
        testset_X_data = xgb.DMatrix(testset_X, )
        reg =  xgb.train(param, train_data, evals=watchlist,  early_stopping_rounds=10)
        y_pred = reg.predict(valid_data,)#num_iteration=reg.best_iteration )
        
        predictions += reg.predict(testset_X_data, ) / kf.n_splits
        
        mse = mean_absolute_error(y_test, y_pred)
        mse_list.append(mse)
        
    fold_predictions = pd.DataFrame([test_group['SMILES'].to_list(),[name]*testset_X.shape[0], predictions],index=['SMILES','Lab','RT']).T   
 #   print(fold_predictions)
    lab_fold_predictions = pd.concat([lab_fold_predictions,fold_predictions],axis=0, )

    
    errors[name] = np.mean(mse_list)
    models[name] = reg
    

[0]	train-mae:65.91335	valid-mae:81.17083
[1]	train-mae:61.92058	valid-mae:77.67286
[2]	train-mae:58.59920	valid-mae:74.94598
[3]	train-mae:55.03186	valid-mae:70.85737
[4]	train-mae:51.29728	valid-mae:67.34432
[5]	train-mae:47.56782	valid-mae:63.86177
[6]	train-mae:45.20125	valid-mae:61.11004
[7]	train-mae:42.50628	valid-mae:57.46641
[8]	train-mae:39.77574	valid-mae:54.39702
[9]	train-mae:37.85309	valid-mae:51.96983
[0]	train-mae:69.77958	valid-mae:60.52250
[1]	train-mae:65.89429	valid-mae:57.51742
[2]	train-mae:61.68697	valid-mae:54.81317
[3]	train-mae:56.54469	valid-mae:50.09444
[4]	train-mae:52.91424	valid-mae:46.20170
[5]	train-mae:49.85347	valid-mae:43.97014
[6]	train-mae:46.20351	valid-mae:41.87288
[7]	train-mae:42.86742	valid-mae:39.79364
[8]	train-mae:39.78798	valid-mae:37.45081
[9]	train-mae:37.89063	valid-mae:35.36612
[0]	train-mae:69.16826	valid-mae:67.62584
[1]	train-mae:64.59672	valid-mae:65.22601
[2]	train-mae:60.79687	valid-mae:63.08180
[3]	train-mae:56.88181	valid-mae:5

[3]	train-mae:32.06615	valid-mae:40.21460
[4]	train-mae:29.63578	valid-mae:37.65764
[5]	train-mae:27.68820	valid-mae:35.02747
[6]	train-mae:25.60597	valid-mae:33.29625
[7]	train-mae:24.02767	valid-mae:32.27664
[8]	train-mae:22.53155	valid-mae:30.92761
[9]	train-mae:21.07958	valid-mae:29.97605
[0]	train-mae:105.58369	valid-mae:110.61834
[1]	train-mae:96.97710	valid-mae:102.31543
[2]	train-mae:89.72734	valid-mae:96.60306
[3]	train-mae:82.28254	valid-mae:89.82616
[4]	train-mae:75.85624	valid-mae:83.41523
[5]	train-mae:69.93790	valid-mae:78.18571
[6]	train-mae:65.12303	valid-mae:73.91459
[7]	train-mae:60.01689	valid-mae:69.27722
[8]	train-mae:55.41992	valid-mae:65.11564
[9]	train-mae:51.61899	valid-mae:61.65949
[0]	train-mae:104.18387	valid-mae:118.61426
[1]	train-mae:95.80590	valid-mae:110.24645
[2]	train-mae:88.22529	valid-mae:102.69982
[3]	train-mae:81.08531	valid-mae:95.60284
[4]	train-mae:75.09077	valid-mae:89.38844
[5]	train-mae:70.31005	valid-mae:84.75026
[6]	train-mae:65.29238	vali

[6]	train-mae:56.18211	valid-mae:52.04375
[7]	train-mae:52.04207	valid-mae:50.50691
[8]	train-mae:48.37587	valid-mae:47.85227
[9]	train-mae:44.84047	valid-mae:47.50509
[0]	train-mae:78.17948	valid-mae:65.62714
[1]	train-mae:72.34813	valid-mae:65.07932
[2]	train-mae:68.20544	valid-mae:62.74989
[3]	train-mae:63.90939	valid-mae:60.91833
[4]	train-mae:60.17708	valid-mae:58.20264
[5]	train-mae:57.38416	valid-mae:57.13224
[6]	train-mae:53.62487	valid-mae:56.82493
[7]	train-mae:51.60747	valid-mae:56.16911
[8]	train-mae:48.82057	valid-mae:53.85532
[9]	train-mae:46.96421	valid-mae:52.59977
[0]	train-mae:88.49406	valid-mae:81.67565
[1]	train-mae:80.96683	valid-mae:76.54278
[2]	train-mae:74.89683	valid-mae:70.07076
[3]	train-mae:69.73959	valid-mae:67.10218
[4]	train-mae:64.39207	valid-mae:63.55259
[5]	train-mae:59.58667	valid-mae:60.52910
[6]	train-mae:55.68538	valid-mae:58.71131
[7]	train-mae:52.29583	valid-mae:56.91837
[8]	train-mae:49.17925	valid-mae:56.52446
[9]	train-mae:47.07745	valid-mae:5

[9]	train-mae:43.17108	valid-mae:53.57586
[0]	train-mae:75.91779	valid-mae:75.46171
[1]	train-mae:69.93256	valid-mae:70.75478
[2]	train-mae:64.71840	valid-mae:67.38104
[3]	train-mae:59.66606	valid-mae:65.16084
[4]	train-mae:55.30681	valid-mae:61.33779
[5]	train-mae:51.88153	valid-mae:58.12658
[6]	train-mae:48.90880	valid-mae:56.29420
[7]	train-mae:46.69463	valid-mae:53.67359
[8]	train-mae:44.10868	valid-mae:52.74690
[9]	train-mae:41.90000	valid-mae:51.27416
[0]	train-mae:73.88591	valid-mae:87.31000
[1]	train-mae:68.42113	valid-mae:80.79150
[2]	train-mae:63.79097	valid-mae:75.69861
[3]	train-mae:58.81313	valid-mae:70.61024
[4]	train-mae:54.95749	valid-mae:66.23417
[5]	train-mae:51.73669	valid-mae:64.04051
[6]	train-mae:48.72309	valid-mae:60.84363
[7]	train-mae:46.45117	valid-mae:58.41313
[8]	train-mae:43.56775	valid-mae:56.60640
[9]	train-mae:40.92278	valid-mae:54.49323
[0]	train-mae:95.16708	valid-mae:122.09785
[1]	train-mae:89.13135	valid-mae:113.67770
[2]	train-mae:82.86817	valid-mae

[2]	train-mae:67.39701	valid-mae:65.76443
[3]	train-mae:63.81127	valid-mae:64.00320
[4]	train-mae:59.49094	valid-mae:64.70615
[5]	train-mae:56.50527	valid-mae:64.95079
[6]	train-mae:53.97501	valid-mae:64.03585
[7]	train-mae:51.60325	valid-mae:59.60419
[8]	train-mae:49.49119	valid-mae:59.85394
[9]	train-mae:47.54093	valid-mae:56.80502
[0]	train-mae:74.56285	valid-mae:59.93250
[1]	train-mae:68.41079	valid-mae:54.96862
[2]	train-mae:63.69584	valid-mae:54.03751
[3]	train-mae:59.21806	valid-mae:47.69621
[4]	train-mae:56.12417	valid-mae:46.84427
[5]	train-mae:53.40440	valid-mae:45.90147
[6]	train-mae:50.64588	valid-mae:44.86810
[7]	train-mae:48.38701	valid-mae:44.68892
[8]	train-mae:45.68122	valid-mae:43.70605
[9]	train-mae:43.58299	valid-mae:42.81878
[0]	train-mae:56.71372	valid-mae:145.11375
[1]	train-mae:53.87057	valid-mae:141.33300
[2]	train-mae:48.87036	valid-mae:138.80183
[3]	train-mae:45.96376	valid-mae:136.75188
[4]	train-mae:42.22106	valid-mae:131.08039
[5]	train-mae:39.63797	valid-

[4]	train-mae:128.83680	valid-mae:242.49194
[5]	train-mae:117.10279	valid-mae:227.38247
[6]	train-mae:107.74723	valid-mae:215.04855
[7]	train-mae:99.44260	valid-mae:206.73770
[8]	train-mae:91.25123	valid-mae:197.00118
[9]	train-mae:88.30149	valid-mae:192.43477
[0]	train-mae:209.50941	valid-mae:202.25999
[1]	train-mae:193.02564	valid-mae:186.33149
[2]	train-mae:175.78768	valid-mae:170.34584
[3]	train-mae:159.47487	valid-mae:156.36624
[4]	train-mae:145.13047	valid-mae:138.65178
[5]	train-mae:133.59813	valid-mae:127.39506
[6]	train-mae:120.83262	valid-mae:123.44635
[7]	train-mae:109.96358	valid-mae:110.47536
[8]	train-mae:101.85684	valid-mae:102.50411
[9]	train-mae:93.01979	valid-mae:93.51734
[0]	train-mae:207.28677	valid-mae:226.98375
[1]	train-mae:189.35868	valid-mae:209.58599
[2]	train-mae:171.82753	valid-mae:191.73803
[3]	train-mae:155.40628	valid-mae:176.60110
[4]	train-mae:143.74319	valid-mae:163.96788
[5]	train-mae:131.21828	valid-mae:152.41796
[6]	train-mae:119.84613	valid-mae:138

In [17]:
for name in errors:
    print(f'Category: {name}, MAE: {errors[name]}')

Category: Aarhus, MAE: 45.41688212948754
Category: Academy of Forensic Science, MAE: 106.92542345201646
Category: Adelaide, MAE: 74.20083655996172
Category: Australian Racing Forensic Laboratory, MAE: 27.258848537868925
Category: CFSRE, MAE: 58.176997435461956
Category: ChemCentre, MAE: 68.1953873181217
Category: Copenhagen, MAE: 89.80087748970642
Category: Estonian Forensic Science Institute, MAE: 63.94618242536272
Category: Finnish Customs Laboratory, MAE: 60.570656523572595
Category: Ghent University, MAE: 57.18158027455054
Category: IUPA, UJI I (E), MAE: 119.01755353291828
Category: King's College Hospital, MAE: 50.75984756377248
Category: LADR, MAE: 68.49790982089199
Category: Labor Krone, MAE: 66.90434252929688
Category: Mainz, MAE: 118.99505207309446
Category: Odense, MAE: 61.32505811903211
Category: San Francisco OCME, MAE: 54.87039311108996
Category: The University of Queensland, MAE: 98.2032533094618
Category: Trondheim, MAE: 107.13513605075414
Category: University Hospital o

In [None]:
#lab_fold_predictions['RT'] = lab_fold_predictions['RT']/60
#lab_fold_predictions.to_csv('new-xgb-out-mins.csv')