In [2]:
%pip install lightgbm

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix



Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: lightgbm
Successfully installed lightgbm-4.6.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
# Location of the competition CSVs. The default is the original Codespace
# directory, so behaviour is unchanged there; set NEURIPS_DATA_DIR to point the
# notebook at another copy of the data without editing any code.
import os
from pathlib import Path

DATA_DIR = Path(os.environ.get('NEURIPS_DATA_DIR', '/home/codespace/.jupyter/NeurIPS/Data'))

train_df = pd.read_csv(DATA_DIR / 'train.csv')         # training rows (SMILES + target columns)
ts = pd.read_csv(DATA_DIR / 'test.csv')                # test rows to predict
ss = pd.read_csv(DATA_DIR / 'sample_submission.csv')   # submission template

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split


# Carve the labelled data into train / validation / test partitions
# (roughly 60% / 20% / 20%) using two successive shuffled splits that share
# the same seed, so the partition is reproducible.
split_options = dict(random_state=42, shuffle=True)

# Stage 1: hold out 20% of all rows as the dev test set.
temp_df, dev_test = train_test_split(train_df, test_size=0.2, **split_options)

# Stage 2: 25% of the remaining 80% (0.25 * 0.8 = 0.2 of the original)
# becomes the validation set; the rest is the training set.
dev_train, dev_val = train_test_split(temp_df, test_size=0.25, **split_options)

# Report each partition's row count and its share of the full dataset.
n_total = len(train_df)
n_train, n_val, n_test = len(dev_train), len(dev_val), len(dev_test)
print(f"Total rows:   {n_total}")
print(f"Dev train:    {n_train} ({n_train/n_total:.2%})")
print(f"Dev valid:    {n_val} ({n_val/n_total:.2%})")
print(f"Dev test:     {n_test} ({n_test/n_total:.2%})")

# Show a few SMILES strings and the available columns as a sanity check.
first_three = dev_train['SMILES'].head(3).to_list()
print(f"Polymer example:{first_three}")
print(f"Columns:{dev_train.columns}")

Total rows:   7973
Dev train:    4783 (59.99%)
Dev valid:    1595 (20.01%)
Dev test:     1595 (20.01%)
Polymer example:['*Nc1ccc(CC(CC(C)(C)c2ccc(N*)cc2)=C(C)C)cc1', '*CC(*)(CC(=O)OC)C(=O)OC12CC3CC(C)(CC(C)(C3)C1)C2', '*OP(=O)(Oc1c(Cl)cc(Cl)cc1Cl)Oc1c(Cl)c(Cl)c(*)c(Cl)c1Cl']
Columns:Index(['id', 'SMILES', 'Tg', 'FFV', 'Tc', 'Density', 'Rg'], dtype='object')


In [5]:

# Character -> integer index vocabulary for SMILES strings.
# Index 0 is reserved for padding; every other character gets a unique,
# consecutive index (1..21).
char_dic = {
    '<pad>': 0,
    '#': 1,   # Triple bond
    '%': 2,   # Two-digit ring closure (e.g., '%10')
    '(': 3,   # Branch opening
    ')': 4,   # Branch closing
    '*': 5,   # Wildcard atom (used in BigSMILES for polymer repeating units)
    '+': 6,   # Positive charge
    '-': 7,   # Negative charge
    '0': 8,   # Ring closure digit
    '1': 9,
    '2': 10,
    '3': 11,
    '4': 12,
    '5': 13,
    '6': 14,
    '7': 15,
    '8': 16,
    '9': 17,
    '.': 18,  # Disconnected structures
    '/': 19,  # Stereochemistry (up bond)
    # Backslash is the "down bond" symbol. This entry was previously keyed
    # '1', which duplicated the ring-closure digit above: the later value
    # silently overwrote '1' -> 9 and left '\' without any index.
    '\\': 20,  # Stereochemistry (down bond)
    'A': 21
}

In [15]:
# Install dependencies
%pip install torch_molecule tqdm

# Import libraries
from tqdm.notebook import tqdm as notebook_tqdm
import tqdm
tqdm.tqdm = notebook_tqdm
tqdm.trange = notebook_tqdm

from torch_molecule import LSTMMolecularPredictor
from torch_molecule.utils.search import ParameterType, ParameterSpec
import pandas as pd

# Define search parameters for hyperparameter tuning
search_parameters = {
    "output_dim": ParameterSpec(ParameterType.INTEGER, (8, 32)),
    "LSTMunits": ParameterSpec(ParameterType.INTEGER, (30, 120)),
    "learning_rate": ParameterSpec(ParameterType.LOG_FLOAT, (1e-4, 1e-2)),
}

# Initialize LSTM model for regression with 5 targets
lstm = LSTMMolecularPredictor(
    task_type="regression",
    num_task=5,
    batch_size=192,
    epochs=200,
    verbose=True
)

print("Model initialized successfully")

# Prepare training and validation data
X_train = dev_train['SMILES'].to_list()
y_train = dev_train[['Tg', 'FFV', 'Tc', 'Density', 'Rg']].to_numpy()

X_val = dev_val['SMILES'].to_list()
y_val = dev_val[['Tg', 'FFV', 'Tc', 'Density', 'Rg']].to_numpy()

# Train the model with hyperparameter search
lstm.autofit(
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    search_parameters=search_parameters,
    n_trials=10  # number of hyperparameter search trials
)

print("Model training complete!")



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


[I 2025-09-12 23:45:23,598] A new study created in memory with name: LSTMMolecularPredictor_optimization


Note: you may need to restart the kernel to use updated packages.
Model initialized successfully

Parameter Search Configuration:
--------------------------------------------------

 Parameters being searched:
  • LSTMunits: [30, 120]
  • learning_rate: [0.0001, 0.01]
  • output_dim: [8, 32]

 Fixed parameters (not being searched):
  • batch_size: 192
  • device: cpu
  • epochs: 200
  • evaluate_criterion: <function mean_absolute_error at 0x7c4277d6d800>
  • evaluate_higher_better: False
  • evaluate_name: mae
  • fitting_epoch: 0
  • fitting_loss: []
  • input_dim: 54
  • loss_criterion: MSELoss()
  • max_input_len: 200
  • num_task: 5
  • patience: 50
  • scheduler_factor: 0.5
  • scheduler_patience: 5
  • task_type: regression
  • use_lr_scheduler: False
  • verbose: True
  • weight_decay: 0.0

--------------------------------------------------

Starting hyperparameter optimization using mae metric
Direction: minimize
Number of trials: 10


--- Logging error ---                 
Traceback (most recent call last):
  File "/usr/local/python/3.12.1/lib/python3.12/site-packages/optuna/progress_bar.py", line 24, in emit
    tqdm.write(msg)
  File "/usr/local/python/3.12.1/lib/python3.12/site-packages/tqdm/std.py", line 720, in write
    with cls.external_write_mode(file=file, nolock=nolock):
  File "/usr/local/python/3.12.1/lib/python3.12/contextlib.py", line 144, in __exit__
    next(self.gen)
  File "/usr/local/python/3.12.1/lib/python3.12/site-packages/tqdm/std.py", line 750, in external_write_mode
    inst.refresh(nolock=True)
  File "/usr/local/python/3.12.1/lib/python3.12/site-packages/tqdm/std.py", line 1347, in refresh
    self.display()
  File "/usr/local/python/3.12.1/lib/python3.12/site-packages/tqdm/notebook.py", line 156, in display
    ltext, pbar, rtext = self.container.children
                         ^^^^^^^^^^^^^^
AttributeError: 'tqdm_notebook' object has no attribute 'container'
Call stack:
  File "<frozen

[W 2025-09-12 23:45:23,605] Trial 0 failed with parameters: {'output_dim': 29, 'LSTMunits': 42, 'learning_rate': 0.0003796094420115622} because of the following error: ImportError('IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html').
Traceback (most recent call last):
  File "/usr/local/python/3.12.1/lib/python3.12/site-packages/optuna/study/_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/usr/local/python/3.12.1/lib/python3.12/site-packages/torch_molecule/predictor/lstm/modeling_lstm.py", line 304, in objective
    self.fit(X_train, y_train, X_val, y_val)
  File "/usr/local/python/3.12.1/lib/python3.12/site-packages/torch_molecule/predictor/lstm/modeling_lstm.py", line 423, in fit
    train_dataset = self._convert_to_pytorch_data(X_train, y_train)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/python/3.12.

--- Logging error ---
Traceback (most recent call last):
  File "/usr/local/python/3.12.1/lib/python3.12/site-packages/optuna/progress_bar.py", line 24, in emit
    tqdm.write(msg)
  File "/usr/local/python/3.12.1/lib/python3.12/site-packages/tqdm/std.py", line 720, in write
    with cls.external_write_mode(file=file, nolock=nolock):
  File "/usr/local/python/3.12.1/lib/python3.12/contextlib.py", line 144, in __exit__
    next(self.gen)
  File "/usr/local/python/3.12.1/lib/python3.12/site-packages/tqdm/std.py", line 750, in external_write_mode
    inst.refresh(nolock=True)
  File "/usr/local/python/3.12.1/lib/python3.12/site-packages/tqdm/std.py", line 1347, in refresh
    self.display()
  File "/usr/local/python/3.12.1/lib/python3.12/site-packages/tqdm/notebook.py", line 156, in display
    ltext, pbar, rtext = self.container.children
                         ^^^^^^^^^^^^^^
AttributeError: 'tqdm_notebook' object has no attribute 'container'
Call stack:
  File "<frozen runpy>", line 19

[W 2025-09-12 23:45:23,658] Trial 3 failed with parameters: {'output_dim': 21, 'LSTMunits': 97, 'learning_rate': 0.004524421285451781} because of the following error: ImportError('IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html').
Traceback (most recent call last):
  File "/usr/local/python/3.12.1/lib/python3.12/site-packages/optuna/study/_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/usr/local/python/3.12.1/lib/python3.12/site-packages/torch_molecule/predictor/lstm/modeling_lstm.py", line 304, in objective
    self.fit(X_train, y_train, X_val, y_val)
  File "/usr/local/python/3.12.1/lib/python3.12/site-packages/torch_molecule/predictor/lstm/modeling_lstm.py", line 423, in fit
    train_dataset = self._convert_to_pytorch_data(X_train, y_train)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/python/3.12.1

--- Logging error ---
Traceback (most recent call last):
  File "/usr/local/python/3.12.1/lib/python3.12/site-packages/optuna/progress_bar.py", line 24, in emit
    tqdm.write(msg)
  File "/usr/local/python/3.12.1/lib/python3.12/site-packages/tqdm/std.py", line 720, in write
    with cls.external_write_mode(file=file, nolock=nolock):
  File "/usr/local/python/3.12.1/lib/python3.12/contextlib.py", line 144, in __exit__
    next(self.gen)
  File "/usr/local/python/3.12.1/lib/python3.12/site-packages/tqdm/std.py", line 750, in external_write_mode
    inst.refresh(nolock=True)
  File "/usr/local/python/3.12.1/lib/python3.12/site-packages/tqdm/std.py", line 1347, in refresh
    self.display()
  File "/usr/local/python/3.12.1/lib/python3.12/site-packages/tqdm/notebook.py", line 156, in display
    ltext, pbar, rtext = self.container.children
                         ^^^^^^^^^^^^^^
AttributeError: 'tqdm_notebook' object has no attribute 'container'
Call stack:
  File "<frozen runpy>", line 19

[W 2025-09-12 23:45:23,699] Trial 5 failed with parameters: {'output_dim': 8, 'LSTMunits': 113, 'learning_rate': 0.00033125219830987273} because of the following error: ImportError('IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html').
Traceback (most recent call last):
  File "/usr/local/python/3.12.1/lib/python3.12/site-packages/optuna/study/_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/usr/local/python/3.12.1/lib/python3.12/site-packages/torch_molecule/predictor/lstm/modeling_lstm.py", line 304, in objective
    self.fit(X_train, y_train, X_val, y_val)
  File "/usr/local/python/3.12.1/lib/python3.12/site-packages/torch_molecule/predictor/lstm/modeling_lstm.py", line 423, in fit
    train_dataset = self._convert_to_pytorch_data(X_train, y_train)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/python/3.12




In [16]:
# Load sample submission and test set (copies, so the loaded frames stay untouched)
sample_sub = ss.copy()
ts_test = ts.copy()

print("Sample submission:")
print(sample_sub.head())
print("Test set:")
print(ts_test.head())

# Prepare test SMILES list
X_test = ts_test['SMILES'].to_list()

# Predict using the trained LSTM model.
# predict() returns a dict; the prediction array is stored under the
# 'prediction' key, so it must be extracted before it can be assigned to the
# submission columns (assigning the dict itself would not work).
preds_dict = lstm.predict(X_test)
lstm_preds = preds_dict['prediction']  # expected shape (num_samples, 5) — one column per target

# Build the submission DataFrame
submission_df = sample_sub.copy()
submission_df[['Tg', 'FFV', 'Tc', 'Density', 'Rg']] = lstm_preds

print("Submission DataFrame:")
print(submission_df.head())

Exception ignored in: <function tqdm.__del__ at 0x7c4296bde980>
Traceback (most recent call last):
  File "/usr/local/python/3.12.1/lib/python3.12/site-packages/tqdm/std.py", line 1148, in __del__
    self.close()
  File "/usr/local/python/3.12.1/lib/python3.12/site-packages/tqdm/notebook.py", line 279, in close
    self.disp(bar_style='danger', check_delay=False)
    ^^^^^^^^^
AttributeError: 'tqdm_notebook' object has no attribute 'disp'
Exception ignored in: <function tqdm.__del__ at 0x7c4296bde980>
Traceback (most recent call last):
  File "/usr/local/python/3.12.1/lib/python3.12/site-packages/tqdm/std.py", line 1148, in __del__
    self.close()
  File "/usr/local/python/3.12.1/lib/python3.12/site-packages/tqdm/notebook.py", line 279, in close
    self.disp(bar_style='danger', check_delay=False)
    ^^^^^^^^^
AttributeError: 'tqdm_notebook' object has no attribute 'disp'


Sample submission:
           id  Tg  FFV  Tc  Density  Rg
0  1109053969   0    0   0        0   0
1  1422188626   0    0   0        0   0
2  2032016830   0    0   0        0   0
Test set:
           id                                             SMILES
0  1109053969  *Oc1ccc(C=NN=Cc2ccc(Oc3ccc(C(c4ccc(*)cc4)(C(F)...
1  1422188626  *Oc1ccc(C(C)(C)c2ccc(Oc3ccc(C(=O)c4cccc(C(=O)c...
2  2032016830  *c1cccc(OCCCCCCCCOc2cccc(N3C(=O)c4ccc(-c5cccc6...


AttributeError: This model is not fitted yet. Call 'fit' before using it.

In [19]:
# Columns of the submission file that receive model predictions.
target_columns = ['Tg', 'FFV', 'Tc', 'Density', 'Rg']

# Test-set SMILES, coerced to str before being handed to the model.
test_smiles = ts['SMILES'].astype(str)
X_test = test_smiles.to_list()

# Run inference; the prediction array sits under the 'prediction' key of the
# returned dict.
preds_dict = lstm.predict(X_test)
lstm_preds = preds_dict['prediction']

# Start from the submission template and fill in the target columns.
submission_df = ss.copy()
submission_df[target_columns] = lstm_preds

# Quick look at the first rows before writing to disk.
print(submission_df.head())

# Persist the predictions without the DataFrame index.
submission_df.to_csv('submission.csv', index=False)
print("Predictions for test set saved as submission.csv")


AttributeError: This model is not fitted yet. Call 'fit' before using it.

Exception ignored in: <function tqdm.__del__ at 0x7c4296bde980>
Traceback (most recent call last):
  File "/usr/local/python/3.12.1/lib/python3.12/site-packages/tqdm/std.py", line 1148, in __del__
    self.close()
  File "/usr/local/python/3.12.1/lib/python3.12/site-packages/tqdm/notebook.py", line 279, in close
    self.disp(bar_style='danger', check_delay=False)
    ^^^^^^^^^
AttributeError: 'tqdm_notebook' object has no attribute 'disp'


In [None]:
# Write the submission frame to disk, omitting the row index.
submission_df.to_csv(path_or_buf='submission.csv', index=False)