In [1]:
import numpy as np
import pandas as pd
import jax.numpy as jnp

In [2]:
def selu(x, alpha=1.67, lmbda=1.05):
  """Scaled exponential linear unit, applied element-wise.

  Inputs greater than zero pass through unchanged; all other inputs are
  mapped to ``alpha * exp(x) - alpha``. The selected values are then
  multiplied by ``lmbda``.
  """
  saturating_branch = alpha * jnp.exp(x) - alpha
  selected = jnp.where(x > 0, x, saturating_branch)
  return lmbda * selected

# Quick sanity check: apply selu to the values 0.0 through 4.0 and print the result.
x = jnp.arange(5.0)
print(selu(x))

[0.        1.05      2.1       3.1499999 4.2      ]


In [3]:
from jax import random

# Benchmark the un-jitted selu on one million normally distributed samples.
# Note that this rebinds `x`, replacing the small array used in the earlier cell.
key = random.key(1701)
x = random.normal(key, (1_000_000,))
# block_until_ready() is called on the result so the timing includes the computation itself.
%timeit selu(x).block_until_ready()

7.26 ms ± 1.05 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [4]:
from jax import jit

# Wrap selu with jax.jit and time it on the same input `x`.
# The untimed first call keeps compilation out of the %timeit measurement.
selu_jit = jit(selu)
_ = selu_jit(x)  # compiles on first call
%timeit selu_jit(x).block_until_ready()

1.9 ms ± 184 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [7]:
import torch
import torch.nn as nn

class MLPModel(nn.Module):
    """Four-layer perceptron (8 -> 8 -> 4 -> 2 -> 2) with sigmoid activations.

    A sigmoid follows each of the first three linear layers; the output of the
    last linear layer is returned without an activation.
    """

    def __init__(self):
        super().__init__()
        # (attribute name, in_features, out_features), created in forward order.
        layer_specs = (
            ("fc0", 8, 8),
            ("fc1", 8, 4),
            ("fc2", 4, 2),
            ("fc3", 2, 2),
        )
        for attr_name, n_in, n_out in layer_specs:
            setattr(self, attr_name, nn.Linear(n_in, n_out, bias=True))

    def forward(self, tensor_x: torch.Tensor):
        """Apply fc0..fc2 with sigmoid activations, then fc3."""
        hidden = tensor_x
        for layer in (self.fc0, self.fc1, self.fc2):
            hidden = torch.sigmoid(layer(hidden))
        return self.fc3(hidden)

# Instantiate the MLP and export it to ONNX, using a random (97, 8) float32
# tensor as the example input for the exporter.
model = MLPModel()
tensor_x = torch.rand((97, 8), dtype=torch.float32)
onnx_program = torch.onnx.export(model, (tensor_x,), dynamo=True)



In [9]:
# Save the exported ONNX program to mlp.onnx in the working directory.
onnx_program.save("mlp.onnx")

In [7]:
import torch, time
import torch.nn as nn

class SimpleNet(nn.Module):
    """Small feed-forward net: Linear(4, 16) -> ReLU -> Linear(16, 1) -> Sigmoid."""

    def __init__(self):
        super().__init__()
        stages = [
            nn.Linear(4, 16),
            nn.ReLU(),
            nn.Linear(16, 1),
            nn.Sigmoid(),
        ]
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through the sequential stack and return its output."""
        return self.net(x)

# Build the eager model (this rebinds `model`), switch it to eval mode, and
# create a random (100000, 4) input for the timing comparison.
model = SimpleNet()
model.eval()
example_input = torch.rand(100000, 4)

# TorchScript version of the same model, compiled via torch.jit.script.
scripted_model = torch.jit.script(model)

def measure(model, x, n_iters=1000):
    """Time repeated forward passes of ``model`` on ``x`` with gradients disabled.

    Args:
        model: Callable invoked as ``model(x)``.
        x: Input passed unchanged to every call.
        n_iters: Number of calls to time. Defaults to 1000, the count that was
            previously hard-coded.

    Returns:
        Elapsed seconds (float) for all ``n_iters`` calls.
    """
    # perf_counter is a monotonic, high-resolution clock intended for measuring
    # intervals; time.time() can jump if the system clock is adjusted.
    start = time.perf_counter()
    with torch.no_grad():
        for _ in range(n_iters):
            model(x)
    return time.perf_counter() - start

# Compare the total time of the eager and TorchScript models on the same input.
# NOTE(review): the scripted model gets no warm-up call before timing, so its
# measurement may include one-time JIT overhead — confirm.
print("Regular model:", measure(model, example_input))
print("JIT model:", measure(scripted_model, example_input))

Regular model: 2.734799385070801
JIT model: 2.6656041145324707


In [2]:
import torch, time

In [8]:
# Random (100000, 1) tensor used by the softmax experiment below.
# NOTE(review): the name `input` shadows Python's built-in input() in this kernel.
input = torch.rand(100000, 1)

In [None]:
# Manual softmax experiment. `fast_softmax` and `start` are set here but not
# used again within this cell.
# NOTE(review): the loop index `i` is never used — every iteration recomputes
# the softmax over the whole `input` tensor and appends another full-size
# result. If `input` is the (100000, 1) tensor created above, that is 100000
# tensors of 100000 values each. Per-element indexing (e.g. input[i]) or a
# single computation outside the loop was probably intended — confirm.
trad_sorftmax = []
fast_softmax = []
start = time.time()
for i in range(len(input)):
    e_power = torch.exp(input)
    sum_e_power = torch.sum(e_power)
    softmax = e_power / sum_e_power
    trad_sorftmax.append(softmax)

In [3]:
def majority_element(nums):
    """Return the element occupying more than half of ``nums``, or ``None``.

    A Boyer-Moore voting pass picks the only possible candidate; a counting
    pass then confirms that it really is a strict majority.
    """
    # Voting pass: matching values add a vote, differing values cancel one.
    leader = None
    votes = 0
    for value in nums:
        if votes == 0:
            leader = value
        if value == leader:
            votes += 1
        else:
            votes -= 1

    # Confirmation pass: the survivor must appear more than len(nums) // 2 times.
    is_majority = nums.count(leader) > len(nums) // 2
    return leader if is_majority else None

# Example usage: 3 occurs 8 times in this 15-element list, so it is the majority element.
nums = [2, 2, 1, 1, 1, 2, 2,3,3,3,3,3,3,3,3]
print("Majority Element:", majority_element(nums))


Majority Element: 3


In [None]:
# Install RDKit 2025.3.3 from a local wheel under /kaggle/input
# (the plain `pip install rdkit` alternative is left commented out).
# ! pip install rdkit
!pip install /kaggle/input/rdkit-2025-3-3-cp311/rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl
Processing /kaggle/input/rdkit-2025-3-3-cp311/rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl
Requirement already satisfied: numpy in /usr/local/lib/python3.11/dist-packages (from rdkit==2025.3.3) (1.26.4)
Requirement already satisfied: Pillow in /usr/local/lib/python3.11/dist-packages (from rdkit==2025.3.3) (11.1.0)
Requirement already satisfied: mkl_fft in /usr/local/lib/python3.11/dist-packages (from numpy->rdkit==2025.3.3) (1.3.8)
Requirement already satisfied: mkl_random in /usr/local/lib/python3.11/dist-packages (from numpy->rdkit==2025.3.3) (1.2.4)
Requirement already satisfied: mkl_umath in /usr/local/lib/python3.11/dist-packages (from numpy->rdkit==2025.3.3) (0.1.1)
Requirement already satisfied: mkl in /usr/local/lib/python3.11/dist-packages (from numpy->rdkit==2025.3.3) (2025.1.0)
Requirement already satisfied: tbb4py in /usr/local/lib/python3.11/dist-packages (from numpy->rdkit==2025.3.3) (2022.1.0)
Requirement already satisfied: mkl-service in /usr/local/lib/python3.11/dist-packages (from numpy->rdkit==2025.3.3) (2.4.1)
Requirement already satisfied: intel-openmp<2026,>=2024 in /usr/local/lib/python3.11/dist-packages (from mkl->numpy->rdkit==2025.3.3) (2024.2.0)
Requirement already satisfied: tbb==2022.* in /usr/local/lib/python3.11/dist-packages (from mkl->numpy->rdkit==2025.3.3) (2022.1.0)
Requirement already satisfied: tcmlib==1.* in /usr/local/lib/python3.11/dist-packages (from tbb==2022.*->mkl->numpy->rdkit==2025.3.3) (1.3.0)
Requirement already satisfied: intel-cmplr-lib-rt in /usr/local/lib/python3.11/dist-packages (from mkl_umath->numpy->rdkit==2025.3.3) (2024.2.0)
Requirement already satisfied: intel-cmplr-lib-ur==2024.2.0 in /usr/local/lib/python3.11/dist-packages (from intel-openmp<2026,>=2024->mkl->numpy->rdkit==2025.3.3) (2024.2.0)
Installing collected packages: rdkit
Successfully installed rdkit-2025.3.3
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold,KFold,StratifiedGroupKFold,GroupKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD,PCA
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import os
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit.Chem import Descriptors
from rdkit import DataStructs
from rdkit import RDLogger  
# Turn off RDKit logging for every logger matching 'rdApp.*'.
RDLogger.DisableLog('rdApp.*')  
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Load the competition train/test tables and name the five target columns.
train = pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/train.csv')
test = pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/test.csv')
targets = ['Tg', 'FFV', 'Tc', 'Density', 'Rg']
# Display the raw training frame.
train
/usr/local/lib/python3.11/dist-packages/pandas/io/formats/format.py:1458: RuntimeWarning: invalid value encountered in greater
  has_large_values = (abs_vals > 1e6).any()
/usr/local/lib/python3.11/dist-packages/pandas/io/formats/format.py:1459: RuntimeWarning: invalid value encountered in less
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
/usr/local/lib/python3.11/dist-packages/pandas/io/formats/format.py:1459: RuntimeWarning: invalid value encountered in greater
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
id	SMILES	Tg	FFV	Tc	Density	Rg
0	87817	*CC(*)c1ccccc1C(=O)OCCCCCC	NaN	0.374645	0.205667	NaN	NaN
1	106919	*Nc1ccc([C@H](CCC)c2ccc(C3(c4ccc([C@@H](CCC)c5...	NaN	0.370410	NaN	NaN	NaN
2	388772	*Oc1ccc(S(=O)(=O)c2ccc(Oc3ccc(C4(c5ccc(Oc6ccc(...	NaN	0.378860	NaN	NaN	NaN
3	519416	*Nc1ccc(-c2c(-c3ccc(C)cc3)c(-c3ccc(C)cc3)c(N*)...	NaN	0.387324	NaN	NaN	NaN
4	539187	*Oc1ccc(OC(=O)c2cc(OCCCCCCCCCOCC3CCCN3c3ccc([N...	NaN	0.355470	NaN	NaN	NaN
...	...	...	...	...	...	...	...
7968	2146592435	*Oc1cc(CCCCCCCC)cc(OC(=O)c2cccc(C(*)=O)c2)c1	NaN	0.367498	NaN	NaN	NaN
7969	2146810552	*C(=O)OCCN(CCOC(=O)c1ccc2c(c1)C(=O)N(c1cccc(N3...	NaN	0.353280	NaN	NaN	NaN
7970	2147191531	*c1cc(C(=O)NCCCCCCCC)cc(N2C(=O)c3ccc(-c4ccc5c(...	NaN	0.369411	NaN	NaN	NaN
7971	2147435020	*C=C(*)c1ccccc1C	261.662355	NaN	NaN	NaN	NaN
7972	2147438299	*c1ccc(OCCCCCCCCCCCOC(=O)CCCCC(=O)OCCCCCCCCCCC...	NaN	0.374049	NaN	NaN	NaN
7973 rows × 7 columns

# Display the raw test frame (id and SMILES only).
test
id	SMILES
0	1109053969	*Oc1ccc(C=NN=Cc2ccc(Oc3ccc(C(c4ccc(*)cc4)(C(F)...
1	1422188626	*Oc1ccc(C(C)(C)c2ccc(Oc3ccc(C(=O)c4cccc(C(=O)c...
2	2032016830	*c1cccc(OCCCCCCCCOc2cccc(N3C(=O)c4ccc(-c5cccc6...
# Count missing values per column; every target column has missing entries.
train.isnull().sum()
id            0
SMILES        0
Tg         7462
FFV         943
Tc         7236
Density    7360
Rg         7359
dtype: int64
%%time

def compute_all_descriptors(smiles):
    """Compute every descriptor in ``Descriptors.descList`` for one SMILES string.

    Args:
        smiles: SMILES string passed to ``Chem.MolFromSmiles``.

    Returns:
        A list with one value per entry of ``Descriptors.descList``, in that
        order. If the SMILES cannot be parsed (``MolFromSmiles`` returns
        ``None``), a list of ``None`` of the same length is returned so every
        row has the same width.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Size the placeholder row from descList itself rather than from the
        # module-level desc_names, which is only defined after this function.
        return [None] * len(Descriptors.descList)
    return [desc[1](mol) for desc in Descriptors.descList]

# Descriptor names, in the same order as the values returned by compute_all_descriptors.
desc_names = [desc[0] for desc in Descriptors.descList]
# Compute the descriptors for every training SMILES and collect them into a frame.
descriptors = [compute_all_descriptors(smi) for smi in train['SMILES'].to_list()]
descriptors = pd.DataFrame(descriptors, columns=desc_names)

# NOTE(review): `train` is overwritten here, so re-running this cell appends
# the descriptor columns a second time.
train = pd.concat([train,descriptors],axis=1)
CPU times: user 4min 2s, sys: 777 ms, total: 4min 3s
Wall time: 3min 9s
# Same descriptor computation for the test SMILES (reuses the `descriptors` name).
descriptors = [compute_all_descriptors(smi) for smi in test['SMILES'].to_list()]
descriptors = pd.DataFrame(descriptors, columns=desc_names)
# NOTE(review): `test` is overwritten here, so re-running this cell appends
# the descriptor columns a second time.
test = pd.concat([test,descriptors],axis=1)
# Display the training frame with the descriptor columns attached.
train
/usr/local/lib/python3.11/dist-packages/pandas/io/formats/format.py:1458: RuntimeWarning: invalid value encountered in greater
  has_large_values = (abs_vals > 1e6).any()
/usr/local/lib/python3.11/dist-packages/pandas/io/formats/format.py:1459: RuntimeWarning: invalid value encountered in less
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
/usr/local/lib/python3.11/dist-packages/pandas/io/formats/format.py:1459: RuntimeWarning: invalid value encountered in greater
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
/usr/local/lib/python3.11/dist-packages/pandas/io/formats/format.py:1458: RuntimeWarning: invalid value encountered in greater
  has_large_values = (abs_vals > 1e6).any()
/usr/local/lib/python3.11/dist-packages/pandas/io/formats/format.py:1459: RuntimeWarning: invalid value encountered in less
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
/usr/local/lib/python3.11/dist-packages/pandas/io/formats/format.py:1459: RuntimeWarning: invalid value encountered in greater
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
id	SMILES	Tg	FFV	Tc	Density	Rg	MaxAbsEStateIndex	MaxEStateIndex	MinAbsEStateIndex	...	fr_sulfide	fr_sulfonamd	fr_sulfone	fr_term_acetylene	fr_tetrazole	fr_thiazole	fr_thiocyan	fr_thiophene	fr_unbrch_alkane	fr_urea
0	87817	*CC(*)c1ccccc1C(=O)OCCCCCC	NaN	0.374645	0.205667	NaN	NaN	12.144536	12.144536	0.105927	...	0	0	0	0	0	0	0	0	3	0
1	106919	*Nc1ccc([C@H](CCC)c2ccc(C3(c4ccc([C@@H](CCC)c5...	NaN	0.370410	NaN	NaN	NaN	3.523412	3.523412	0.098918	...	0	0	0	0	0	0	0	0	2	0
2	388772	*Oc1ccc(S(=O)(=O)c2ccc(Oc3ccc(C4(c5ccc(Oc6ccc(...	NaN	0.378860	NaN	NaN	NaN	13.714745	13.714745	0.107441	...	0	0	2	0	0	0	0	0	0	0
3	519416	*Nc1ccc(-c2c(-c3ccc(C)cc3)c(-c3ccc(C)cc3)c(N*)...	NaN	0.387324	NaN	NaN	NaN	3.978671	3.978671	0.054569	...	0	0	0	0	0	0	0	0	0	0
4	539187	*Oc1ccc(OC(=O)c2cc(OCCCCCCCCCOCC3CCCN3c3ccc([N...	NaN	0.355470	NaN	NaN	NaN	13.703218	13.703218	0.068062	...	0	0	0	0	0	0	0	0	12	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
7968	2146592435	*Oc1cc(CCCCCCCC)cc(OC(=O)c2cccc(C(*)=O)c2)c1	NaN	0.367498	NaN	NaN	NaN	12.522270	12.522270	0.172388	...	0	0	0	0	0	0	0	0	5	0
7969	2146810552	*C(=O)OCCN(CCOC(=O)c1ccc2c(c1)C(=O)N(c1cccc(N3...	NaN	0.353280	NaN	NaN	NaN	13.679392	13.679392	0.005765	...	0	0	0	0	0	0	0	0	0	0
7970	2147191531	*c1cc(C(=O)NCCCCCCCC)cc(N2C(=O)c3ccc(-c4ccc5c(...	NaN	0.369411	NaN	NaN	NaN	13.555573	13.555573	0.193809	...	0	0	0	0	0	0	0	0	5	0
7971	2147435020	*C=C(*)c1ccccc1C	261.662355	NaN	NaN	NaN	NaN	2.502315	2.502315	0.396204	...	0	0	0	0	0	0	0	0	0	0
7972	2147438299	*c1ccc(OCCCCCCCCCCCOC(=O)CCCCC(=O)OCCCCCCCCCCC...	NaN	0.374049	NaN	NaN	NaN	12.020227	12.020227	0.116434	...	0	0	0	0	0	0	0	0	17	0
7973 rows × 224 columns

# Display the test frame with the descriptor columns attached.
test
id	SMILES	MaxAbsEStateIndex	MaxEStateIndex	MinAbsEStateIndex	MinEStateIndex	qed	SPS	MolWt	HeavyAtomMolWt	...	fr_sulfide	fr_sulfonamd	fr_sulfone	fr_term_acetylene	fr_tetrazole	fr_thiazole	fr_thiocyan	fr_thiophene	fr_unbrch_alkane	fr_urea
0	1109053969	*Oc1ccc(C=NN=Cc2ccc(Oc3ccc(C(c4ccc(*)cc4)(C(F)...	14.296609	14.296609	0.084660	-5.631140	0.133192	13.384615	540.463	522.319	...	0	0	0	0	0	0	0	0	0	0
1	1422188626	*Oc1ccc(C(C)(C)c2ccc(Oc3ccc(C(=O)c4cccc(C(=O)c...	13.208391	13.208391	0.079396	-0.162743	0.195542	11.743590	510.589	484.381	...	0	0	0	0	0	0	0	0	0	0
2	2032016830	*c1cccc(OCCCCCCCCOc2cccc(N3C(=O)c4ccc(-c5cccc6...	13.556487	13.556487	0.203889	-0.654083	0.137097	14.454545	586.644	556.404	...	0	0	0	0	0	0	0	0	5	0
3 rows × 219 columns

# List every column name of the test frame as an array.
test.columns.values
array(['id', 'SMILES', 'MaxAbsEStateIndex', 'MaxEStateIndex',
       'MinAbsEStateIndex', 'MinEStateIndex', 'qed', 'SPS', 'MolWt',
       'HeavyAtomMolWt', 'ExactMolWt', 'NumValenceElectrons',
       'NumRadicalElectrons', 'MaxPartialCharge', 'MinPartialCharge',
       'MaxAbsPartialCharge', 'MinAbsPartialCharge', 'FpDensityMorgan1',
       'FpDensityMorgan2', 'FpDensityMorgan3', 'BCUT2D_MWHI',
       'BCUT2D_MWLOW', 'BCUT2D_CHGHI', 'BCUT2D_CHGLO', 'BCUT2D_LOGPHI',
       'BCUT2D_LOGPLOW', 'BCUT2D_MRHI', 'BCUT2D_MRLOW', 'AvgIpc',
       'BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n',
       'Chi1v', 'Chi2n', 'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v',
       'HallKierAlpha', 'Ipc', 'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA',
       'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12',
       'PEOE_VSA13', 'PEOE_VSA14', 'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4',
       'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9',
       'SMR_VSA1', 'SMR_VSA10', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4',
       'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7', 'SMR_VSA8', 'SMR_VSA9',
       'SlogP_VSA1', 'SlogP_VSA10', 'SlogP_VSA11', 'SlogP_VSA12',
       'SlogP_VSA2', 'SlogP_VSA3', 'SlogP_VSA4', 'SlogP_VSA5',
       'SlogP_VSA6', 'SlogP_VSA7', 'SlogP_VSA8', 'SlogP_VSA9', 'TPSA',
       'EState_VSA1', 'EState_VSA10', 'EState_VSA11', 'EState_VSA2',
       'EState_VSA3', 'EState_VSA4', 'EState_VSA5', 'EState_VSA6',
       'EState_VSA7', 'EState_VSA8', 'EState_VSA9', 'VSA_EState1',
       'VSA_EState10', 'VSA_EState2', 'VSA_EState3', 'VSA_EState4',
       'VSA_EState5', 'VSA_EState6', 'VSA_EState7', 'VSA_EState8',
       'VSA_EState9', 'FractionCSP3', 'HeavyAtomCount', 'NHOHCount',
       'NOCount', 'NumAliphaticCarbocycles', 'NumAliphaticHeterocycles',
       'NumAliphaticRings', 'NumAmideBonds', 'NumAromaticCarbocycles',
       'NumAromaticHeterocycles', 'NumAromaticRings',
       'NumAtomStereoCenters', 'NumBridgeheadAtoms', 'NumHAcceptors',
       'NumHDonors', 'NumHeteroatoms', 'NumHeterocycles',
       'NumRotatableBonds', 'NumSaturatedCarbocycles',
       'NumSaturatedHeterocycles', 'NumSaturatedRings', 'NumSpiroAtoms',
       'NumUnspecifiedAtomStereoCenters', 'Phi', 'RingCount', 'MolLogP',
       'MolMR', 'fr_Al_COO', 'fr_Al_OH', 'fr_Al_OH_noTert', 'fr_ArN',
       'fr_Ar_COO', 'fr_Ar_N', 'fr_Ar_NH', 'fr_Ar_OH', 'fr_COO',
       'fr_COO2', 'fr_C_O', 'fr_C_O_noCOO', 'fr_C_S', 'fr_HOCCN',
       'fr_Imine', 'fr_NH0', 'fr_NH1', 'fr_NH2', 'fr_N_O',
       'fr_Ndealkylation1', 'fr_Ndealkylation2', 'fr_Nhpyrrole', 'fr_SH',
       'fr_aldehyde', 'fr_alkyl_carbamate', 'fr_alkyl_halide',
       'fr_allylic_oxid', 'fr_amide', 'fr_amidine', 'fr_aniline',
       'fr_aryl_methyl', 'fr_azide', 'fr_azo', 'fr_barbitur',
       'fr_benzene', 'fr_benzodiazepine', 'fr_bicyclic', 'fr_diazo',
       'fr_dihydropyridine', 'fr_epoxide', 'fr_ester', 'fr_ether',
       'fr_furan', 'fr_guanido', 'fr_halogen', 'fr_hdrzine', 'fr_hdrzone',
       'fr_imidazole', 'fr_imide', 'fr_isocyan', 'fr_isothiocyan',
       'fr_ketone', 'fr_ketone_Topliss', 'fr_lactam', 'fr_lactone',
       'fr_methoxy', 'fr_morpholine', 'fr_nitrile', 'fr_nitro',
       'fr_nitro_arom', 'fr_nitro_arom_nonortho', 'fr_nitroso',
       'fr_oxazole', 'fr_oxime', 'fr_para_hydroxylation', 'fr_phenol',
       'fr_phenol_noOrthoHbond', 'fr_phos_acid', 'fr_phos_ester',
       'fr_piperdine', 'fr_piperzine', 'fr_priamide', 'fr_prisulfonamd',
       'fr_pyridine', 'fr_quatN', 'fr_sulfide', 'fr_sulfonamd',
       'fr_sulfone', 'fr_term_acetylene', 'fr_tetrazole', 'fr_thiazole',
       'fr_thiocyan', 'fr_thiophene', 'fr_unbrch_alkane', 'fr_urea'],
      dtype=object)
%%time
def lgb_kfold(train_df, test_df, target, feats, folds):
    """Train one LightGBM model per CV fold and average the test predictions.

    Args:
        train_df: DataFrame containing the ``feats`` columns and the ``target`` column.
        test_df: DataFrame containing the ``feats`` columns.
        target: Name of the target column in ``train_df``.
        feats: List of feature column names.
        folds: Splitter exposing ``split(X, y)`` and ``get_n_splits(X, y)``
            (e.g. ``sklearn.model_selection.KFold``).

    Returns:
        ``(oof_preds, sub_preds)``: out-of-fold predictions positionally aligned
        with the rows of ``train_df``, and the fold-averaged predictions for
        ``test_df``.
    """
    params = {
        'objective': 'mae',
        'metric': 'mae',
        'num_leaves': 31,
        'min_data_in_leaf': 30,
        'learning_rate': 0.01,
        'max_depth': -1,
        'max_bin': 256,
        'boosting': 'gbdt',
        'feature_fraction': 0.7,
        'bagging_freq': 1,
        'bagging_fraction': 0.7,
        'bagging_seed': 42,
        'lambda_l1': 1,
        'lambda_l2': 1,
        'verbosity': -1,
        # Upper bound on boosting rounds; the early-stopping callback below can
        # end training sooner.
        'num_boost_round': 20000,
        'device_type': 'cpu',
    }

    # Take the fold count from the splitter itself instead of the notebook-level
    # n_splits global, so the test average is right for whatever splitter is passed.
    n_folds = folds.get_n_splits(train_df, train_df[target])

    # The test features are identical for every fold, so select them once.
    test_x = test_df[feats]

    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df, train_df[target])):
        print('n_fold:', n_fold)

        # Positional (iloc) selection, so train_df may carry a non-contiguous index.
        train_x = train_df[feats].iloc[train_idx].values
        train_y = train_df[target].iloc[train_idx].values
        valid_x = train_df[feats].iloc[valid_idx].values
        valid_y = train_df[target].iloc[valid_idx].values

        print('train_x', train_x.shape)
        print('valid_x', valid_x.shape)
        print('test_x', test_x.shape)

        dtrain = lgb.Dataset(train_x, label=train_y)
        dval = lgb.Dataset(valid_x, label=valid_y, reference=dtrain)
        callbacks = [
            lgb.log_evaluation(period=100),
            lgb.early_stopping(200),
        ]
        bst = lgb.train(params, dtrain, valid_sets=[dval, dtrain], callbacks=callbacks)

        # Print the 30 features with the largest gain importance in this fold.
        feature_importances = sorted(
            zip(feats, bst.feature_importance('gain')),
            key=lambda pair: pair[1],
            reverse=True,
        )
        for f in feature_importances[:30]:
            print(f)

        oof_preds[valid_idx] = bst.predict(valid_x, num_iteration=bst.best_iteration)
        # Each fold contributes an equal share to the averaged test prediction.
        sub_preds += bst.predict(test_x, num_iteration=bst.best_iteration) / n_folds

    return oof_preds, sub_preds

# Cross-validation setup: 5 shuffled folds with a fixed random_state.
n_splits = 5
seed = 817
folds = KFold(n_splits=n_splits, random_state=seed, shuffle=True)
feats = ['MaxAbsEStateIndex', 'MaxEStateIndex',
       'MinAbsEStateIndex', 'MinEStateIndex', 'qed', 'SPS', 'MolWt',
       'HeavyAtomMolWt', 'ExactMolWt', 'NumValenceElectrons',
       'NumRadicalElectrons', 'MaxPartialCharge', 'MinPartialCharge',
       'MaxAbsPartialCharge', 'MinAbsPartialCharge', 'FpDensityMorgan1',
       'FpDensityMorgan2', 'FpDensityMorgan3', 'BCUT2D_MWHI',
       'BCUT2D_MWLOW', 'BCUT2D_CHGHI', 'BCUT2D_CHGLO', 'BCUT2D_LOGPHI',
       'BCUT2D_LOGPLOW', 'BCUT2D_MRHI', 'BCUT2D_MRLOW', 'AvgIpc',
       'BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n',
       'Chi1v', 'Chi2n', 'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v',
       'HallKierAlpha', 'Ipc', 'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA',
       'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12',
       'PEOE_VSA13', 'PEOE_VSA14', 'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4',
       'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9',
       'SMR_VSA1', 'SMR_VSA10', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4',
       'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7', 'SMR_VSA8', 'SMR_VSA9',
       'SlogP_VSA1', 'SlogP_VSA10', 'SlogP_VSA11', 'SlogP_VSA12',
       'SlogP_VSA2', 'SlogP_VSA3', 'SlogP_VSA4', 'SlogP_VSA5',
       'SlogP_VSA6', 'SlogP_VSA7', 'SlogP_VSA8', 'SlogP_VSA9', 'TPSA',
       'EState_VSA1', 'EState_VSA10', 'EState_VSA11', 'EState_VSA2',
       'EState_VSA3', 'EState_VSA4', 'EState_VSA5', 'EState_VSA6',
       'EState_VSA7', 'EState_VSA8', 'EState_VSA9', 'VSA_EState1',
       'VSA_EState10', 'VSA_EState2', 'VSA_EState3', 'VSA_EState4',
       'VSA_EState5', 'VSA_EState6', 'VSA_EState7', 'VSA_EState8',
       'VSA_EState9', 'FractionCSP3', 'HeavyAtomCount', 'NHOHCount',
       'NOCount', 'NumAliphaticCarbocycles', 'NumAliphaticHeterocycles',
       'NumAliphaticRings', 'NumAmideBonds', 'NumAromaticCarbocycles',
       'NumAromaticHeterocycles', 'NumAromaticRings',
       'NumAtomStereoCenters', 'NumBridgeheadAtoms', 'NumHAcceptors',
       'NumHDonors', 'NumHeteroatoms', 'NumHeterocycles',
       'NumRotatableBonds', 'NumSaturatedCarbocycles',
       'NumSaturatedHeterocycles', 'NumSaturatedRings', 'NumSpiroAtoms',
       'NumUnspecifiedAtomStereoCenters', 'Phi', 'RingCount', 'MolLogP',
       'MolMR', 'fr_Al_COO', 'fr_Al_OH', 'fr_Al_OH_noTert', 'fr_ArN',
       'fr_Ar_COO', 'fr_Ar_N', 'fr_Ar_NH', 'fr_Ar_OH', 'fr_COO',
       'fr_COO2', 'fr_C_O', 'fr_C_O_noCOO', 'fr_C_S', 'fr_HOCCN',
       'fr_Imine', 'fr_NH0', 'fr_NH1', 'fr_NH2', 'fr_N_O',
       'fr_Ndealkylation1', 'fr_Ndealkylation2', 'fr_Nhpyrrole', 'fr_SH',
       'fr_aldehyde', 'fr_alkyl_carbamate', 'fr_alkyl_halide',
       'fr_allylic_oxid', 'fr_amide', 'fr_amidine', 'fr_aniline',
       'fr_aryl_methyl', 'fr_azide', 'fr_azo', 'fr_barbitur',
       'fr_benzene', 'fr_benzodiazepine', 'fr_bicyclic', 'fr_diazo',
       'fr_dihydropyridine', 'fr_epoxide', 'fr_ester', 'fr_ether',
       'fr_furan', 'fr_guanido', 'fr_halogen', 'fr_hdrzine', 'fr_hdrzone',
       'fr_imidazole', 'fr_imide', 'fr_isocyan', 'fr_isothiocyan',
       'fr_ketone', 'fr_ketone_Topliss', 'fr_lactam', 'fr_lactone',
       'fr_methoxy', 'fr_morpholine', 'fr_nitrile', 'fr_nitro',
       'fr_nitro_arom', 'fr_nitro_arom_nonortho', 'fr_nitroso',
       'fr_oxazole', 'fr_oxime', 'fr_para_hydroxylation', 'fr_phenol',
       'fr_phenol_noOrthoHbond', 'fr_phos_acid', 'fr_phos_ester',
       'fr_piperdine', 'fr_piperzine', 'fr_priamide', 'fr_prisulfonamd',
       'fr_pyridine', 'fr_quatN', 'fr_sulfide', 'fr_sulfonamd',
       'fr_sulfone', 'fr_term_acetylene', 'fr_tetrazole', 'fr_thiazole',
       'fr_thiocyan', 'fr_thiophene', 'fr_unbrch_alkane', 'fr_urea']
# Fit one set of fold models per target and write the predictions into `test`.
# When `test` has fewer than 10 rows, training is skipped and zeros are written.
# NOTE(review): presumably this keeps interactive runs on the small visible
# test set fast, with training only on a larger test set — confirm.
for t in targets:
    print (t)
    if len(test)<10:
        test[t] = 0
    else:    
        # Train only on the rows where this target is present.
        train_df = train[train[t].notnull()]
        # oof_preds is returned but not used further in this loop.
        oof_preds,sub_preds = lgb_kfold(train_df, test, t, feats, folds)
        test[t] = sub_preds
Tg
FFV
Tc
Density
Rg
CPU times: user 2.03 ms, sys: 0 ns, total: 2.03 ms
Wall time: 1.96 ms
# Display the test frame with the five target columns added.
test
id	SMILES	MaxAbsEStateIndex	MaxEStateIndex	MinAbsEStateIndex	MinEStateIndex	qed	SPS	MolWt	HeavyAtomMolWt	...	fr_thiazole	fr_thiocyan	fr_thiophene	fr_unbrch_alkane	fr_urea	Tg	FFV	Tc	Density	Rg
0	1109053969	*Oc1ccc(C=NN=Cc2ccc(Oc3ccc(C(c4ccc(*)cc4)(C(F)...	14.296609	14.296609	0.084660	-5.631140	0.133192	13.384615	540.463	522.319	...	0	0	0	0	0	0	0	0	0	0
1	1422188626	*Oc1ccc(C(C)(C)c2ccc(Oc3ccc(C(=O)c4cccc(C(=O)c...	13.208391	13.208391	0.079396	-0.162743	0.195542	11.743590	510.589	484.381	...	0	0	0	0	0	0	0	0	0	0
2	2032016830	*c1cccc(OCCCCCCCCOc2cccc(N3C(=O)c4ccc(-c5cccc6...	13.556487	13.556487	0.203889	-0.654083	0.137097	14.454545	586.644	556.404	...	0	0	0	5	0	0	0	0	0	0
3 rows × 224 columns

# Write the id and the five target columns to submission.csv, without the index.
test[['id','Tg', 'FFV', 'Tc', 'Density', 'Rg']].to_csv('submission.csv',index=False)