In [12]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Point to your extracted folder
dataset_folder = "/home/sriram/Downloads/dsgdb9nsd.xyz"

# 1. Define the official QM9 column names for the properties
columns = [
    "tag", "id", "A", "B", "C", "mu", "alpha", 
    "homo", "lumo", "gap", "r2", "zpve", 
    "U0", "U", "H", "G", "Cv"
]

data_rows = []

# We'll just load the first 10,000 molecules for a quick analysis
all_files = sorted(os.listdir(dataset_folder))
files_to_load = all_files[:10000]

print("Compiling data into Pandas DataFrame...")

for filename in files_to_load:
    filepath = os.path.join(dataset_folder, filename)
    with open(filepath, 'r') as f:
        lines = f.readlines()
        
        # Line 0 is the number of atoms
        num_atoms = int(lines[0].strip())
        
        # Line 1 contains all the target properties separated by spaces
        properties = lines[1].strip().split()
        
        if len(properties) == 17:
            # Create a dictionary mapping the column names to the values
            row_data = {col: prop for col, prop in zip(columns, properties)}
            row_data["num_atoms"] = num_atoms
            row_data["filename"] = filename
            data_rows.append(row_data)

# 2. CREATE THE PANDAS DATAFRAME
df = pd.DataFrame(data_rows)

# 3. Clean the data
# Drop the useless 'tag' column (they all just say 'gdb')
df.drop(columns=['tag'], inplace=True)

# Convert all the math columns from strings to floats
numeric_cols = ["id", "A", "B", "C", "mu", "alpha", "homo", "lumo", "gap", "r2", "zpve", "U0", "U", "H", "G", "Cv", "num_atoms"]
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric)

# ==========================================
# LET'S LOOK AT THE DATA!
# ==========================================
print("\n‚úÖ DATASET READY!")
print("-" * 50)
print(f"Total Rows: {len(df)}")
print(f"Total Columns: {len(df.columns)}")
print("-" * 50)

print("\nüîç THE COLUMNS WE CAN PREDICT (The 'Y' Variables):")
print(df.columns.tolist())

print("\nüìä FIRST 5 ROWS (df.head()):")
display(df.head()) # If in Jupyter, this makes a pretty table. Otherwise, use print(df.head())

# ==========================================
# HOW TO USE SKLEARN WITH THIS
# ==========================================
# Let's say we choose the Energy Gap as our target
X_files = df['filename'] # We need the filenames to know which 3D coords to load later
y_targets = df['gap']

# Sklearn Trick 1: Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X_files, y_targets, test_size=0.2, random_state=42)

# Sklearn Trick 2: Scaling Targets
# Neural networks (quantum or classical) hate tiny decimal ranges.
# We scale the 'gap' to have a mean of 0 and variance of 1.
scaler = StandardScaler()
y_train_scaled = scaler.fit_transform(y_train.values.reshape(-1, 1))

print("\n‚úÇÔ∏è DATA SPLITTING (Sklearn):")
print(f"Training set size: {len(X_train)} molecules")
print(f"Testing set size: {len(X_test)} molecules")

Compiling data into Pandas DataFrame...

‚úÖ DATASET READY!
--------------------------------------------------
Total Rows: 10000
Total Columns: 18
--------------------------------------------------

üîç THE COLUMNS WE CAN PREDICT (The 'Y' Variables):
['id', 'A', 'B', 'C', 'mu', 'alpha', 'homo', 'lumo', 'gap', 'r2', 'zpve', 'U0', 'U', 'H', 'G', 'Cv', 'num_atoms', 'filename']

üìä FIRST 5 ROWS (df.head()):


Unnamed: 0,id,A,B,C,mu,alpha,homo,lumo,gap,r2,zpve,U0,U,H,G,Cv,num_atoms,filename
0,1,157.7118,157.70997,157.70699,0.0,13.21,-0.3877,0.1171,0.5048,35.3641,0.044749,-40.47893,-40.476062,-40.475117,-40.498597,6.469,5,dsgdb9nsd_000001.xyz
1,2,293.60975,293.54111,191.39397,1.6256,9.46,-0.257,0.0829,0.3399,26.1563,0.034358,-56.525887,-56.523026,-56.522082,-56.544961,6.316,4,dsgdb9nsd_000002.xyz
2,3,799.58812,437.90386,282.94545,1.8511,6.31,-0.2928,0.0687,0.3615,19.0002,0.021375,-76.404702,-76.401867,-76.400922,-76.422349,6.002,3,dsgdb9nsd_000003.xyz
3,4,0.0,35.610036,35.610036,0.0,16.28,-0.2845,0.0506,0.3351,59.5248,0.026841,-77.308427,-77.305527,-77.304583,-77.327429,8.574,4,dsgdb9nsd_000004.xyz
4,5,0.0,44.593883,44.593883,2.8937,12.99,-0.3604,0.0191,0.3796,48.7476,0.016601,-93.411888,-93.40937,-93.408425,-93.431246,6.278,3,dsgdb9nsd_000005.xyz



‚úÇÔ∏è DATA SPLITTING (Sklearn):
Training set size: 8000 molecules
Testing set size: 2000 molecules


In [11]:
!pip install scikit-learn pandas numpy matplotlib seaborn

Collecting scikit-learn
  Using cached scikit_learn-1.8.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (11 kB)
Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting joblib>=1.3.0 (from scikit-learn)
  Using cached joblib-1.5.3-py3-none-any.whl.metadata (5.5 kB)
Collecting threadpoolctl>=3.2.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.8.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (8.9 MB)
Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
Using cached joblib-1.5.3-py3-none-any.whl (309 kB)
Using cached threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn, seaborn
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m4/4[0m [seaborn]m2/4[0m [scikit-learn]
[1A[2KSuccessfully installed

mu: Dipole moment (How polarized the molecule is).

alpha: Isotropic polarizability.

homo: Energy of the Highest Occupied Molecular Orbital.

lumo: Energy of the Lowest Unoccupied Molecular Orbital.

gap: The difference between HOMO and LUMO. (Highly recommended target!)

zpve: Zero-point vibrational energy.

U0: Internal energy at 0 Kelvin.

U, H, G: Energy, Enthalpy, and Free Energy at room temperature.

Cv: Heat capacity.

num_atoms: The size of the molecule

In [None]:
import os
import numpy as np

# Pointing directly to the folder from your screenshot
dataset_folder = "/home/sriram/Downloads/dsgdb9nsd.xyz"

def parse_qm9_file(filepath):
    """Reads a .xyz file and converts X,Y,Z into r, theta, phi"""
    with open(filepath, 'r') as file:
        lines = file.readlines()
        
    num_atoms = int(lines[0].strip())
    
    # Line index 1 contains the properties (HOMO, LUMO, Gap, etc.)
    # The Gap is the 5th value in the properties array (index 4 after 'gdb' and 'id')
    properties = lines[1].split()
    energy_gap = float(properties[4]) 
    
    atom_dict = {'H': 1, 'C': 6, 'N': 7, 'O': 8, 'F': 9}
    parsed_atoms = []
    
    # Atoms start at line index 2
    for line in lines[2:2+num_atoms]:
        parts = line.split()
        symbol = parts[0]
        x, y, z = float(parts[1]), float(parts[2]), float(parts[3])
        
        # Convert Cartesian to Spherical (The Math for the Quantum Encoder)
        r = np.sqrt(x**2 + y**2 + z**2)
        theta = np.arccos(z / r) if r != 0 else 0.0
        phi = np.arctan2(y, x)
        
        atomic_num = atom_dict.get(symbol, 0)
        
        parsed_atoms.append({
            'element': symbol,
            'type': atomic_num,
            'r': r,
            'theta': theta,
            'phi': phi
        })
        
    return parsed_atoms, energy_gap

# Let's test it on the very first file (Methane)
# Usually named 'dsgdb9nsd_000001.xyz'
all_files = sorted(os.listdir(dataset_folder))
first_file_path = os.path.join(dataset_folder, all_files[0])

print(f"Reading file: {all_files[0]}")
print("-" * 40)

atoms, target_gap = parse_qm9_file(first_file_path)

print(f"Target Value (Energy Gap to predict): {target_gap}")
print("\nParsed Atomic Coordinates for Quantum Circuit:")
for atom in atoms:
    print(f"Atom {atom['element']} | r={atom['r']:.4f}, Œ∏={atom['theta']:.4f}, œÜ={atom['phi']:.4f}")

Reading file: dsgdb9nsd_000001.xyz
----------------------------------------
Target Value (Energy Gap to predict): 157.70699

Parsed Atomic Coordinates for Quantum Circuit:
Atom C | r=1.0859, Œ∏=1.5634, œÜ=1.5825
Atom H | r=0.0067, Œ∏=1.2715, œÜ=-1.2283
Atom H | r=1.7794, Œ∏=1.5706, œÜ=0.9660
Atom H | r=1.7766, Œ∏=2.0868, œÜ=1.9284
Atom H | r=1.7786, Œ∏=1.0361, œÜ=1.9201
