### Importing the required dependencies

In [3]:
from rdkit import Chem
from rdkit.Chem import Descriptors, AllChem
from rdkit.Chem.EState import Fingerprinter
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
import numpy as np
import pandas as pd

In [12]:
def preprocess_data(smiles_list, temperatures):
    """Convert SMILES entries to RDKit molecules, keeping only usable rows.

    The two inputs are walked in lockstep. A row is dropped when its SMILES
    entry is not a string (for example the float NaN that pandas uses for a
    blank CSV cell), is blank, or is rejected by ``Chem.MolFromSmiles``
    (which signals a parse failure by returning ``None``).

    Args:
        smiles_list: Iterable of SMILES entries.
        temperatures: Iterable of target values aligned with ``smiles_list``.

    Returns:
        Tuple ``(molecules, valid_temps)`` of two equal-length lists: the
        parsed molecules and the temperatures of the rows that were kept,
        both in their original order.
    """
    molecules = []
    valid_temps = []

    for smiles, temp in zip(smiles_list, temperatures):
        # Filter before calling RDKit: MolFromSmiles expects a string, so a
        # missing value would otherwise raise instead of being skipped like
        # any other unusable row. Blank strings carry no structure either.
        if not isinstance(smiles, str) or not smiles.strip():
            continue

        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            continue

        molecules.append(mol)
        valid_temps.append(temp)

    return molecules, valid_temps

In [13]:
def extract_chemical_features(molecules):
    """Compute physico-chemical descriptors for each molecule.

    Args:
        molecules: Iterable of RDKit molecule objects.

    Returns:
        pandas.DataFrame with one row per molecule, in input order, and the
        columns ``MolWt``, ``LogP``, ``TPSA``, ``HBD``, ``HBA``,
        ``RotableBonds``, ``AromaticRings`` and ``HeavyAtoms``. The columns
        are present even when ``molecules`` is empty.
    """
    # Output column -> name of the rdkit.Chem.Descriptors function that
    # fills it. This table is the single source of truth for both the row
    # contents and the column order. 'RotableBonds' keeps its existing
    # spelling so downstream references to the column stay valid.
    descriptor_functions = {
        'MolWt': 'MolWt',
        'LogP': 'MolLogP',
        'TPSA': 'TPSA',
        'HBD': 'NumHDonors',
        'HBA': 'NumHAcceptors',
        'RotableBonds': 'NumRotatableBonds',
        'AromaticRings': 'NumAromaticRings',
        'HeavyAtoms': 'HeavyAtomCount',
    }

    rows = [
        {
            column: getattr(Descriptors, function_name)(mol)
            for column, function_name in descriptor_functions.items()
        }
        for mol in molecules
    ]

    # Naming the columns explicitly keeps the schema stable for empty input;
    # without it an empty list yields a frame with no columns at all.
    return pd.DataFrame(rows, columns=list(descriptor_functions))

In [14]:
def extract_topological_features(molecules):
    """Compute topological descriptors for each molecule.

    Args:
        molecules: Iterable of RDKit molecule objects.

    Returns:
        pandas.DataFrame with one row per molecule, in input order, and the
        columns ``BertzCT``, ``Chi0v``, ``Chi1v``, ``Chi2v``,
        ``HallKierAlpha``, ``Kappa1``, ``Kappa2`` and ``Kappa3``. The
        columns are present even when ``molecules`` is empty.
    """
    # Each output column is named after the rdkit.Chem.Descriptors function
    # that produces it, so one tuple drives both the lookup and the schema.
    descriptor_names = (
        'BertzCT',
        'Chi0v',
        'Chi1v',
        'Chi2v',
        'HallKierAlpha',
        'Kappa1',
        'Kappa2',
        'Kappa3',
    )

    rows = [
        {name: getattr(Descriptors, name)(mol) for name in descriptor_names}
        for mol in molecules
    ]

    # Naming the columns explicitly keeps the schema stable for empty input;
    # without it an empty list yields a frame with no columns at all.
    return pd.DataFrame(rows, columns=list(descriptor_names))

In [15]:
def split_and_scale_data(X, y, test_size=0.2, val_size=0.25, random_state=42):
    """Split data into train, validation and test sets and scale the features.

    The split is done in two passes with ``train_test_split``: the test set
    is carved off first, then the remainder is divided into train and
    validation. Features are standardized with a ``StandardScaler``.

    Args:
        X: Feature table, one row per sample.
        y: Target values aligned with ``X``.
        test_size: ``test_size`` forwarded to the first split, i.e. the share
            of all samples held out as the test set.
        val_size: ``test_size`` forwarded to the second split, i.e. the share
            of the remaining (non-test) samples used for validation. With the
            defaults, 0.25 of the remaining 80% gives a 60/20/20 split.
        random_state: Seed used for both splits.

    Returns:
        Tuple ``(X_train_scaled, X_val_scaled, X_test_scaled, y_train, y_val,
        y_test, scaler)``. The targets are returned unscaled, and ``scaler``
        is the fitted ``StandardScaler``.
    """
    # First split: separate test set
    X_temp, X_test, y_temp, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Second split: separate train and validation
    X_train, X_val, y_train, y_val = train_test_split(
        X_temp, y_temp, test_size=val_size, random_state=random_state
    )

    # Fit the scaler on the training rows only, then reuse its statistics for
    # validation and test so no information from those sets leaks into it.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    return (X_train_scaled, X_val_scaled, X_test_scaled,
            y_train, y_val, y_test, scaler)

In [16]:
def get_model(model_name, **params):
    """Build an unfitted regressor selected by name.

    Args:
        model_name: ``'random_forest'`` or ``'xgboost'``, matched
            case-insensitively.
        **params: Keyword arguments for the estimator constructor; they are
            merged over the built-in defaults and win on conflict.

    Returns:
        A ``RandomForestRegressor`` or ``xgb.XGBRegressor`` instance.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    key = model_name.lower()

    # Reject unknown names up front so the construction code below only
    # has to distinguish between the two supported estimators.
    if key not in ('random_forest', 'xgboost'):
        raise ValueError(f"Unsupported model: {model_name}")

    if key == 'random_forest':
        settings = {'n_estimators': 100, 'random_state': 42, **params}
        return RandomForestRegressor(**settings)

    settings = {
        'n_estimators': 100,
        'learning_rate': 0.1,
        'random_state': 42,
        **params,
    }
    return xgb.XGBRegressor(**settings)

In [17]:
def evaluate_model(model, X_train, X_val, X_test, y_train, y_val, y_test):
    """Fit ``model`` on the training split and score it on all three splits.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train, X_val, X_test: Feature inputs for each split.
        y_train, y_val, y_test: Matching target values for each split.

    Returns:
        Dict keyed by ``'train'``, ``'validation'`` and ``'test'``; each value
        is a dict holding the split's ``'r2'`` score and its ``'rmse'`` (the
        square root of the mean squared error).
    """
    model.fit(X_train, y_train)

    # (inputs, targets) per split, in the order the results are reported.
    partitions = {
        'train': (X_train, y_train),
        'validation': (X_val, y_val),
        'test': (X_test, y_test),
    }

    # Produce every prediction before any scoring takes place.
    predictions = {
        name: model.predict(inputs)
        for name, (inputs, _) in partitions.items()
    }

    results = {}
    for name, (_, targets) in partitions.items():
        predicted = predictions[name]
        results[name] = {
            'r2': r2_score(targets, predicted),
            'rmse': np.sqrt(mean_squared_error(targets, predicted)),
        }

    return results

In [18]:
# Read the Tg dataset and keep only the temperature and SMILES columns,
# in that order.
data = pd.read_csv("TgDataSet.csv")[
    ['Glass Transition Temperature', 'SMILES String']
]

In [19]:
# Preview the first five rows of the selected columns.
data.head(n=5)

Unnamed: 0,Glass Transition Temperature,SMILES String
0,279.0,C=CC(=O)OCc1ccccc1
1,383.0,C=CC(=O)Oc2ccc(c1ccccc1)cc2
2,219.0,CCCCOC(=O)C=C
3,250.0,CC(OC(=O)C=C)CC
4,345.0,C=CC(=O)Oc1ccccc1C(C)(C)C


In [20]:
# Parse every SMILES string; rows that cannot be parsed are dropped together
# with their temperature so both lists stay aligned.
molecules, temperatures = preprocess_data(
    smiles_list=data['SMILES String'],
    temperatures=data['Glass Transition Temperature'],
)

In [None]:
# Two descriptor families are computed from the same parsed molecules so
# they can be benchmarked against each other.
chemical_features = extract_chemical_features(molecules=molecules)
topological_features = extract_topological_features(molecules=molecules)

In [23]:
# Benchmark both regressors on each descriptor family, one family at a time.
feature_sets = {
    "Chemical": chemical_features,
    "Topological": topological_features,
}

for feature_type, features in feature_sets.items():
    print(f"\n{feature_type} Features Results:")

    # Fresh train/validation/test split (and scaler) per descriptor family.
    splits = split_and_scale_data(features, temperatures)
    (X_train, X_val, X_test,
     y_train, y_val, y_test, scaler) = splits

    for model_name in ('random_forest', 'xgboost'):
        print(f"\n{model_name.upper()} Results:")
        model = get_model(model_name)
        results = evaluate_model(
            model, X_train, X_val, X_test, y_train, y_val, y_test
        )

        # One three-line report per split: header, R^2, RMSE.
        for split_name, metrics in results.items():
            print(
                f"{split_name.capitalize()}:",
                f"R^2 score: {metrics['r2']:.3f}",
                f"RMSE: {metrics['rmse']:.3f}",
                sep="\n",
            )


Chemical Features Results:

RANDOM_FOREST Results:
Train:
R^2 score: 0.976
RMSE: 16.487
Validation:
R^2 score: 0.865
RMSE: 35.901
Test:
R^2 score: 0.891
RMSE: 34.606

XGBOOST Results:
Train:
R^2 score: 0.989
RMSE: 11.116
Validation:
R^2 score: 0.850
RMSE: 37.786
Test:
R^2 score: 0.877
RMSE: 36.759

Topological Features Results:

RANDOM_FOREST Results:
Train:
R^2 score: 0.973
RMSE: 17.483
Validation:
R^2 score: 0.769
RMSE: 46.867
Test:
R^2 score: 0.823
RMSE: 44.058

XGBOOST Results:
Train:
R^2 score: 0.992
RMSE: 9.399
Validation:
R^2 score: 0.717
RMSE: 51.951
Test:
R^2 score: 0.813
RMSE: 45.254
