### Loading Dataset 

#### Train 

In [1]:
#import dataset 

import os
import pandas as pd

# === Paths ===
BASE_DIR = 'molecular-property-prediction-challenge'
DIPOLE_FILE = os.path.join(BASE_DIR, 'dipole_moments_train.csv')
STRUCTURE_DIR = os.path.join(BASE_DIR, 'structures_train')  #for test can change dir later

# === Load dipole moment labels ===
dipole_df = pd.read_csv(DIPOLE_FILE)

# === Function to load .xyz file and return DataFrame ===
def parse_xyz(filepath):
    with open(filepath, 'r') as f:
        lines = f.readlines()
    
    num_atoms = int(lines[0].strip())
    atom_lines = lines[2:2 + num_atoms]
    
    data = []
    for line in atom_lines:
        parts = line.strip().split()
        atom = parts[0]
        x, y, z = map(float, parts[1:])
        data.append((atom, x, y, z))
        
    return pd.DataFrame(data, columns=['atom', 'x', 'y', 'z'])

# === Load all structure files into a single DataFrame ===
def load_structures(structure_dir):
    all_data = []
    
    for filename in os.listdir(structure_dir):
        if filename.endswith('.xyz'):
            mol_name = filename.replace('.xyz', '')
            filepath = os.path.join(structure_dir, filename)
            df = parse_xyz(filepath)
            df['molecule_name'] = mol_name
            df['atom_index'] = range(len(df))  #keeping track of atom index, this is not informative 
            all_data.append(df)
    
    structures_df = pd.concat(all_data, ignore_index=True)
    return structures_df

# === Load training structure data ===
train_structures = load_structures(STRUCTURE_DIR)

# === Merge dipole moment target with training structures ===
train_df = train_structures.merge(dipole_df, on='molecule_name')

In [87]:
# dipole_df

In [86]:
# train_df

#### Loading test

In [4]:
# === For test, structure loading only ===
def load_test_structures(test_structure_dir):
    return load_structures(test_structure_dir)

In [28]:
#get test here 
STRUCTURE_DIR_TEST = os.path.join(BASE_DIR, 'structures_test')  #for test can change dir later

test_structures = load_test_structures(STRUCTURE_DIR_TEST)

### XYZ format strings to SMILES (Simplified Molecular Input Line Entry System)

In [89]:
from rdkit.Chem import Descriptors

def mol_to_descriptors(mol):
    descriptor_funcs = [
        Descriptors.MolWt,
        Descriptors.MolLogP,
        Descriptors.NumHDonors,
        Descriptors.NumHAcceptors,
        Descriptors.TPSA,
        Descriptors.HeavyAtomCount,
        Descriptors.FractionCSP3,
        Descriptors.NumRotatableBonds,
        Descriptors.RingCount
    ]
    return [func(mol) for func in descriptor_funcs]

In [None]:
from openbabel import openbabel, pybel
from rdkit import Chem
from rdkit.Chem import rdmolops

def xyz_to_smiles(xyz_string):
    # Save the XYZ string to a temporary file
    with open("temp.xyz", "w") as f:
        f.write(xyz_string.strip())

    # Read the XYZ with Pybel
    mol = next(pybel.readfile("xyz", "temp.xyz"))

    # Add hydrogens and perceive connectivity
    mol.addh()
    mol.make3D()

    # Convert to RDKit mol
    obmol = mol.OBMol
    obConversion = openbabel.OBConversion()
    obConversion.SetOutFormat("mol")
    mol_block = obConversion.WriteString(obmol)
    
    rdkit_mol = Chem.MolFromMolBlock(mol_block, sanitize=True)

    # Generate SMILES
    if rdkit_mol:
        smiles = Chem.MolToSmiles(rdkit_mol, canonical=True)
        return smiles
    else:
        return None

In [7]:
import os
from tqdm import tqdm  # for progress bar

# Store SMILES and molecule names
smiles_list = []
mol_names = []

for filename in tqdm(os.listdir(STRUCTURE_DIR)):
    if filename.endswith('.xyz'):
        mol_name = filename.replace('.xyz', '')
        filepath = os.path.join(STRUCTURE_DIR, filename)

        with open(filepath, 'r') as f:
            xyz_string = f.read()

        smiles = xyz_to_smiles(xyz_string)
        smiles_list.append(smiles)
        mol_names.append(mol_name)

smiles_df = pd.DataFrame({
    'molecule_name': mol_names,
    'smiles': smiles_list
})

smiles_df.to_csv('smiles_output.csv', index=False)

  0%|          | 67/20000 [00:00<01:51, 178.58it/s][20:55:10] Cannot process coordinates on line 12
  There exists NaN in calculated coordinates.
  1%|          | 157/20000 [00:00<01:35, 207.16it/s][20:55:11] Explicit valence for atom # 1 C, 5, is greater than permitted
  1%|          | 180/20000 [00:00<01:33, 212.04it/s][20:55:11] Cannot process coordinates on line 7
  There exists NaN in calculated coordinates.
  Failed to kekulize aromatic bonds in OBMol::PerceiveBondOrders (title is temp.xyz)

  Failed to kekulize aromatic bonds in OBMol::PerceiveBondOrders (title is temp.xyz)

  Failed to kekulize aromatic bonds in OBMol::PerceiveBondOrders (title is temp.xyz)

  2%|▏         | 401/20000 [00:02<01:31, 214.96it/s][20:55:12] Cannot process coordinates on line 12
  There exists NaN in calculated coordinates.
  2%|▏         | 494/20000 [00:02<01:30, 216.73it/s][20:55:12] Cannot process coordinates on line 12
  There exists NaN in calculated coordinates.
  3%|▎         | 611/20000 [00:

In [88]:
# smiles_df

### Smiles for Test 

In [None]:
# Store SMILES and molecule names
smiles_list_test = []
mol_names_test = []

for filename in tqdm(os.listdir(STRUCTURE_DIR_TEST)):
    if filename.endswith('.xyz'):
        mol_name = filename.replace('.xyz', '')
        filepath = os.path.join(STRUCTURE_DIR_TEST, filename)

        with open(filepath, 'r') as f:
            xyz_string = f.read()

        smiles = xyz_to_smiles(xyz_string)
        smiles_list_test.append(smiles)
        mol_names_test.append(mol_name)


In [None]:
smiles_test_df = pd.DataFrame({
    'molecule_name': mol_names_test,
    'smiles': smiles_list_test
})

smiles_test_df.to_csv('smiles_output_test.csv', index=False)
smiles_test_df

Unnamed: 0,molecule_name,smiles
0,mol_968,C1=C2[C@@H]3[C@H]4O[C@@H]3[C@@]24CC1
1,mol_3484,C#C[C@@H](C)N(C)C=O
2,mol_2942,C#C[C@H]1C[C]2[CH]CO[C@H]21
3,mol_940,Cc1cc(=O)c(O)c[nH]1
4,mol_798,[CH]1C[C@H]2[C@@H]3O[C@H]4[C@@H]3[C]1[C@@H]24
...,...,...
4995,mol_4494,C1CC2(C1)CC1(CO1)C2
4996,mol_971,CC[C@]1(C)COCO1
4997,mol_2973,C[C@H](C=O)c1ccoc1
4998,mol_965,CCO[C@@H](C)CC#N


#### SMILES (Simplified Molecular Input Line Entry System) are strings that encode the 2D structure of molecules using ASCII characters.

Each part represents:

Atoms: C, O, N, etc.

Bonds:

1.  Single: implicit (no symbol)

2. Double: =

3. Triple: #

Branches: ()

Rings: 1, 2, etc., to denote where rings start/end

Stereochemistry: @, @@ denote chiral centers

#### Morgan fingerprints (aka ECFP – Extended-Connectivity Fingerprints) 
are fixed-length binary vectors. Each bit encodes the presence/absence of circular substructures around atoms.


In [12]:
from rdkit import Chem
from rdkit.Chem import AllChem

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack, csr_matrix
import numpy as np
import pandas as pd

# === Load data ===
df = pd.read_csv("smiles_output.csv")  # contains molecule_name and smiles
labels = pd.read_csv("molecular-property-prediction-challenge/dipole_moments_train.csv")  # contains molecule_name and dipole_moment

# === Merge SMILES with dipole moment ===
data = df.merge(labels, on='molecule_name')
data = data.dropna(subset=['smiles'])  # remove missing SMILES
data['smiles'] = data['smiles'].astype(str)

# === Apply RDKit to get Mol objects ===
data['mol'] = data['smiles'].apply(Chem.MolFromSmiles)
data = data[data['mol'].notnull()]  # drop invalid molecules

# === Generate Morgan Fingerprints ===
def mol_to_fp(mol, radius=2, nBits=2048):
    return AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=nBits)

# === Generate RDKit Descriptors ===
# (Note: I ran this function definition in a separate cell)
def mol_to_descriptors(mol):
    descriptor_funcs = [
        Descriptors.MolWt,
        Descriptors.MolLogP,
        Descriptors.NumHDonors,
        Descriptors.NumHAcceptors,
        Descriptors.TPSA,
        Descriptors.HeavyAtomCount,
        Descriptors.FractionCSP3,
        Descriptors.NumRotatableBonds,
        Descriptors.RingCount
    ]
    return [func(mol) for func in descriptor_funcs]

# === Create features ===
fps = [mol_to_fp(mol) for mol in data['mol']]
X_fp = np.array(fps)

desc = [mol_to_descriptors(mol) for mol in data['mol']]
X_desc = StandardScaler().fit_transform(desc)

# === Combine fingerprints and descriptors ===
X_combined = hstack([csr_matrix(X_fp), csr_matrix(X_desc)])
y = data['dipole_moment'].values



MAE (Fingerprints + Descriptors): 0.5278


In [71]:
# Convert to numpy array
import numpy as np
X = np.array([np.asarray(fp) for fp in fps])
y = data['dipole_moment'].values

#### Extra descriptors 

In [98]:
# === Train/Test Split and Model Training ===
X_train, X_val, y_train, y_val = train_test_split(X_combined, y, test_size=0.2, random_state=42)

model_rf_extradescp = RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1)
model_rf_extradescp.fit(X_train, y_train)

preds = model_rf_extradescp.predict(X_val)
mae = mean_absolute_error(y_val, preds)
print(f"MAE (Fingerprints + Descriptors): {mae:.4f}")

MAE (Fingerprints + Descriptors): 0.5278


In [99]:
# === Train-test split ===
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# === Train model ===
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# === Evaluate ===
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"Test MAE: {mae:.4f}")

Test MAE: 0.5399


In [94]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

In [83]:
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"Test MAE: {mae:.4f}")

Test MAE: 0.2673


In [75]:
import xgboost as xgb

In [97]:
xgb_model = xgb.XGBRegressor(       
    n_estimators=800,  # number of trees
    learning_rate=0.1, # step size
    objective='reg:squarederror',
    random_state=42,
    n_jobs=-1
)

xgb_model.fit(X_train, y_train)

# === Evaluate ===
y_pred = xgb_model.predict(X_val)
mae = mean_absolute_error(y_val, y_pred) #X_train, X_val, y_train, y_val
print(f"XGBoost Test MAE: {mae:.4f}")

XGBoost Test MAE: 0.5555


In [None]:
test_smiles = pd.read_csv("smiles_output_test.csv")  # molecule_name, smiles
test_ids = pd.read_csv("molecular-property-prediction-challenge/dipole_moments_test.csv")  # molecule_name

# === Merge and preprocess ===
test_ids = test_ids.rename(columns={"ID": "molecule_name"})

test_data = test_ids.merge(test_smiles, on='molecule_name')
test_data = test_data.dropna(subset=['smiles'])
test_data['smiles'] = test_data['smiles'].astype(str)
test_data['mol'] = test_data['smiles'].apply(Chem.MolFromSmiles)
test_data = test_data[test_data['mol'].notnull()]

# === Generate fingerprints and predict ===
X_test = [mol_to_fp(mol) for mol in test_data['mol']]
X_test = np.array([list(fp) for fp in X_test])

test_preds = xgb_model.predict(X_test)



In [59]:
test_ids = pd.read_csv("molecular-property-prediction-challenge/dipole_moments_test.csv")
test_smiles = pd.read_csv("smiles_output_test.csv")

print("Expected (from test_ids):", len(test_ids))       # Should be 5000
print("Got SMILES for        :", len(test_smiles))       # Should match or be more


Expected (from test_ids): 5000
Got SMILES for        : 5000


In [60]:
test_ids = test_ids.rename(columns={"ID": "molecule_name"})
test_data = test_ids.merge(test_smiles, on='molecule_name', how='left')

print("After merging:", len(test_data))  # Still should be 5000

After merging: 5000


In [61]:
missing_smiles = test_data['smiles'].isna().sum()
print("Rows with missing SMILES:", missing_smiles)

# Now convert
test_data['mol'] = test_data['smiles'].apply(lambda x: Chem.MolFromSmiles(x) if pd.notnull(x) else None)
invalid_mols = test_data['mol'].isna().sum()
print("Rows with invalid mols:", invalid_mols)


Rows with missing SMILES: 89
Rows with invalid mols: 89


In [84]:
from xgboost import XGBRegressor
import numpy as np

# 1. Get valid rows
valid_rows = test_data['mol'].notnull()

# 2. Generate features only for valid molecules
X_test = [mol_to_fp(mol) for mol in test_data.loc[valid_rows, 'mol']]

# 3. Predict only on valid rows
preds = model.predict(X_test)

# 4. Create a full-size predictions array
full_preds = pd.Series(index=test_data.index, dtype=float)
full_preds.loc[valid_rows] = preds

# 5. Fill missing ones with the mean of valid predictions
mean_pred = full_preds.loc[valid_rows].mean()
full_preds = full_preds.fillna(mean_pred)



In [85]:
submission = pd.DataFrame({
    'ID': test_data['molecule_name'],
    'dipole_moment': full_preds
})
submission.to_csv('submission_smiles_mean_rf.csv', index=False)
print("Predictions saved to submission_smiles_mean_xgb.csv")

Predictions saved to submission_smiles_mean_xgb.csv
