<a href="https://colab.research.google.com/github/AryanPhanse/dop/blob/main/ZnO_bandgap.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install compatible DeepChem version for Python 3.10
!pip install deepchem tensorflow==2.11.0 rdkit-pypi



In [3]:
# Step 1: Set Up the Environment
!pip install rdkit-pypi scikit-learn



In [4]:
# Step 2: Import Required Libraries
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [5]:
# Step 3: Create the dataset with the provided format
data = {
    'SMILES': [
        '[Zn+2]([O-2])[O-2][Zn+2] ZnO', '[Zn+2].[Zn+2]1[O-2][Zn+2][O-2][Zn+2][O-2]1.[O-2] ZnO',
        '[Zn+2][O-2][Zn+2]([Zn+2][O-2])([O-2])[O-2][Zn+2] ZnO', '[Zn+2].[Zn+2]12[O-2][Zn+2]([O-2][Zn+2][O-2]1)[O-2]2 ZnO',
        '[Zn+2]([O-2][Zn+2][O-2][Zn+2][O-2][Zn+2][O-2][Zn+2])[O-2][Zn+2][O-2][Zn+2][O-2][Zn+2][O-2][Zn+2][O-2][Zn+2][O-2][Zn+2][O-2] ZnO',
        '[Zn+2][O-2].[Zn+2][O-2] ZnO',
        '[Zn+2][O-2][Zn+2]1[Zn+2]([Zn+2]23([Zn+2]([O-2]2)[O-2][Zn+2][O-2][Zn+2])[O-2][Zn+2][O-2][Zn+2]([O-2])[O-2][Zn+2]2[O-2][Zn+2]([O-2][Zn+2])[O-2][Zn+2]([O-2][Zn@+2]45[O-2][Zn@+2]6([Zn@+2]7([Zn@+2]([Zn@+2]([O-2][Zn+2])([O-2][Zn+2]([O-2]1)[O-2]5)[O-2]3)([O-2][Zn+2][O-2][Zn+2])[O-2]7)[O-2][Zn+2])[Zn+2][O-2][Zn@+2]1([Zn+2]([Zn+2]([O-2])[O-2][Zn+2])([O-2]4)([O-2]1)[O-2][Zn+2][O-2][Zn+2][O-2][Zn+2][O-2])[O-2][Zn+2]6)[O-2]2)[O-2][Zn+2].[Zn+2]([O-2])[O-2][Zn+2][O-2][Zn+2][O-2].[Zn+2]([O-2])[O-2].[Zn+2].[Zn+2].[Zn+2] ZnO',
        '[Zn+2][Zn+2][O-2].[O-2] ZnO'
    ],
    'Band_Gap': [0.72, 0.63, 0.74, 0.72, 0.28, 0.80, 1.22, 0.45],  # Band gap in eV
    'Formation_Energy': [-1.65, 1.64, -1.61, -1.50, -1.47, -1.42, -1.40, -1.35]  # In eV/atom
}

# Create DataFrame
df = pd.DataFrame(data)

In [7]:
# Step 4: Extract Molecular Descriptors (using more descriptors)
def compute_descriptors(smiles_list):
    mols = [Chem.MolFromSmiles(smiles.split()[0]) for smiles in smiles_list]  # Split to handle extra labels like "ZnO"
    descriptors = []
    for mol in mols:
        if mol is None:  # Check if the molecule was successfully created
            descriptors.append([0] * 6)  # Fallback for invalid SMILES
            continue
        mol_weight = Descriptors.MolWt(mol)
        rotatable_bonds = Descriptors.NumRotatableBonds(mol)
        h_donors = Descriptors.NumHDonors(mol)
        h_acceptors = Descriptors.NumHAcceptors(mol)
        logp = Descriptors.MolLogP(mol)
        polar_surface_area = Descriptors.TPSA(mol)
        descriptors.append([mol_weight, rotatable_bonds, h_donors, h_acceptors, logp, polar_surface_area])
    return np.array(descriptors)

# Generate features
X_descriptors = compute_descriptors(df['SMILES'])

# Prepare target variables
y_bandgap = df['Band_Gap'].values
y_stability = df['Formation_Energy'].values

[19:44:41] Explicit valence for atom # 1 O, 3, is greater than permitted
[19:44:41] Explicit valence for atom # 2 O, 4, is greater than permitted
[19:44:41] Explicit valence for atom # 1 O, 4, is greater than permitted
[19:44:41] Explicit valence for atom # 2 O, 4, is greater than permitted
[19:44:41] Explicit valence for atom # 1 O, 4, is greater than permitted
[19:44:41] Explicit valence for atom # 1 O, 3, is greater than permitted
[19:44:41] Explicit valence for atom # 1 O, 4, is greater than permitted
[19:44:41] Explicit valence for atom # 2 O, 3, is greater than permitted


In [8]:
# Step 5: Split Data into Training and Test Sets
X_train_bandgap, X_test_bandgap, y_train_bandgap, y_test_bandgap = train_test_split(X_descriptors, y_bandgap, test_size=0.2, random_state=42)
X_train_stability, X_test_stability, y_train_stability, y_test_stability = train_test_split(X_descriptors, y_stability, test_size=0.2, random_state=42)

In [9]:
# Step 6: Train a RandomForest Model for Band Gap Prediction
rf_bandgap = RandomForestRegressor(random_state=42)
rf_bandgap.fit(X_train_bandgap, y_train_bandgap)

In [10]:
# Step 7: Predict Band Gap
pred_bandgap_rf = rf_bandgap.predict(X_test_bandgap)
print("Predicted Band Gaps (RandomForest):", pred_bandgap_rf)

Predicted Band Gaps (RandomForest): [0.69126667 0.69126667]


In [None]:
# Step 8: Train a RandomForest Model for Formation energy Prediction
rf_stability = RandomForestRegressor(random_state=42)
rf_stability.fit(X_train_stability, y_train_stability)


In [11]:
# Step 9: Predict Thermal Stability
pred_stability_rf = rf_stability.predict(X_test_stability)
print("Predicted formation energy (RandomForest):", pred_stability_rf)

Predicted formation energy (RandomForest): [-1.49731667 -1.49731667]
