In [1]:
import os
import warnings

import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.feature_selection import VarianceThreshold

# Ignore every warning whose category is UserWarning (or a subclass) from here on.
# The filter is process-wide, so it also applies to all later cells.
warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
# Read the raw train/test CSV files (paths are relative to the working directory).
print("Loading Raw Data...")
train_df, test_df = (
    pd.read_csv(raw_path)
    for raw_path in ('../data/raw/train.csv', '../data/raw/test.csv')
)

# Keep the test identifiers at hand so a submission file can be assembled later.
test_ids = test_df['id']

# Report the starting dimensions of both tables.
print(
    f"Initial train shape: {train_df.shape}",
    f"Initial test shape: {test_df.shape}",
    sep="\n",
)

Loading Raw Data...
Initial train shape: (2662, 427)
Initial test shape: (666, 426)


In [3]:
# 2. Log-transform the target: Tm is replaced by log(1 + Tm).
# NOTE: the column is overwritten, so re-running this cell applies the transform again.
print("Applying Log Transform to target variable (Tm)")
train_df = train_df.assign(Tm=lambda frame: np.log1p(frame['Tm']))
print("Tm transformed to log scale")

Applying Log Transform to target variable (Tm)
Tm transformed to log scale


In [4]:
# 3. Drop 'Group*' columns whose variance on the training set is at most 0.01.
print("\nRemoving low and zero variance features")
group_features = list(filter(lambda name: name.startswith('Group'), train_df.columns))

# The selector is fitted on the training rows only; the same column choice is
# then applied to the test frame below.
selector = VarianceThreshold(threshold=0.01).fit(train_df[group_features])

# Split the group columns into kept / dropped using the fitted support mask.
keep_mask = selector.get_support()
group_columns = train_df[group_features].columns
retained_features = group_columns[keep_mask]
removed_features = group_columns[~keep_mask]

print(f"Number of features removed: {len(removed_features)}")
print(f"Number of features retained: {len(retained_features)}")

# Keep identifiers, SMILES (and the target for train) plus the retained features.
leading_cols = ['id', 'SMILES']
train_df = train_df.loc[:, leading_cols + ['Tm'] + retained_features.tolist()]
test_df = test_df.loc[:, leading_cols + retained_features.tolist()]

print(f"Train shape after variance removal: {train_df.shape}")
print(f"Test shape after variance removal: {test_df.shape}")


Removing low and zero variance features
Number of features removed: 319
Number of features retained: 105
Train shape after variance removal: (2662, 108)
Test shape after variance removal: (666, 107)


In [5]:
# Progress message for the descriptor-generation stage (plain string: nothing to interpolate).
print("Engineering Features from SMILES using RDKit")

def generate_rdkit_description(smiles_string):
    """Compute every descriptor listed in ``Descriptors._descList`` for one molecule.

    Parameters
    ----------
    smiles_string : str
        SMILES text of the molecule. Non-string values (for example the float
        NaN pandas uses for an empty CSV cell) are treated as unparseable.

    Returns
    -------
    dict
        Maps each descriptor name to its computed value. Every value is NaN
        when the input is not a string or RDKit cannot parse it; a single
        value is NaN when that particular descriptor function raises.
    """
    descriptor_table = Descriptors._descList

    # Only hand real strings to the parser; anything else is not valid SMILES
    # input and is reported the same way as a molecule that fails to parse.
    mol = Chem.MolFromSmiles(smiles_string) if isinstance(smiles_string, str) else None
    if mol is None:
        return {desc_name: np.nan for desc_name, _ in descriptor_table}

    # Calculate all available 2D descriptors.
    descriptors = {}
    for desc_name, func in descriptor_table:
        try:
            descriptors[desc_name] = func(mol)
        except Exception:
            # One failing descriptor must not abort the whole featurisation;
            # record it as missing, the same marker used for unparseable input.
            descriptors[desc_name] = np.nan

    return descriptors

# Featurise every SMILES string: one descriptor dict per row, train first, then test.
train_descriptors, test_descriptors = (
    frame['SMILES'].apply(generate_rdkit_description)
    for frame in (train_df, test_df)
)

# Expand each Series of dicts into a table with one column per descriptor.
train_descriptors_df, test_descriptors_df = (
    pd.DataFrame(records.tolist())
    for records in (train_descriptors, test_descriptors)
)

n_descriptors = train_descriptors_df.shape[1]
print(f"Generated {n_descriptors} RDKit Descriptors")

Engineering Features from SMILES using RDKit
Generated 217 RDKit Descriptors


In [6]:
# Combine the new descriptors with the original dataframes.

# Descriptor values may be +/-inf, and fillna() only treats NaN as missing, so
# map infinities to NaN first and let the median imputation below replace them.
train_features = train_descriptors_df.replace([np.inf, -np.inf], np.nan)
test_features = test_descriptors_df.replace([np.inf, -np.inf], np.nan)

# Reset the index on both sides so the column-wise concat pairs rows by
# position (the descriptor rows were generated in the source frame's row order).
train_processed = pd.concat(
    [
        train_df.drop('SMILES', axis=1).reset_index(drop=True),
        train_features.reset_index(drop=True),
    ],
    axis=1,
)
test_processed = pd.concat(
    [
        test_df.drop('SMILES', axis=1).reset_index(drop=True),
        test_features.reset_index(drop=True),
    ],
    axis=1,
)

# Drop feature columns that are entirely missing in EITHER frame: a column that
# is all-NaN in train has a NaN median and could never be imputed.
feature_cols = test_processed.columns
all_nan_mask = (
    test_processed.isna().all().to_numpy()
    | train_processed[feature_cols].isna().all().to_numpy()
)
nan_cols = feature_cols[all_nan_mask].tolist()
if nan_cols:
    print(f"Removing columns with all NaN values: {nan_cols}")
    train_processed = train_processed.drop(columns=nan_cols)
    test_processed = test_processed.drop(columns=nan_cols)

# Impute with medians learned from the training frame only, computed once and
# reused for both frames. Entries for columns absent from test are ignored there.
train_medians = train_processed.median(numeric_only=True)
train_processed = train_processed.fillna(train_medians)
test_processed = test_processed.fillna(train_medians)

print(f"Final train shape: {train_processed.shape}")
print(f"Final test shape: {test_processed.shape}")

Final train shape: (2662, 324)
Final test shape: (666, 323)


In [7]:
# Save it
# Create the output folder first so to_csv does not fail when it is missing
# (for example on a fresh checkout); exist_ok makes re-running the cell safe.
os.makedirs('../data/processed', exist_ok=True)
train_processed.to_csv('../data/processed/train_processed.csv', index=False)
test_processed.to_csv('../data/processed/test_processed.csv', index=False)
print("Processed files saved to 'data/processed/' directory.")

Processed files saved to 'data/processed/' directory.
