# Import des packages

In [15]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from torch.utils.data import random_split
from torch_geometric.data import Data, Dataset, DataLoader
from torch_geometric.nn import GCNConv, global_mean_pool
from torch_geometric.utils import to_networkx

# Récupération du dataset sous forme de graphes

In [17]:
# Locations of the training inputs, all relative to DATA_ROOT.
DATA_ROOT = Path("data")
XYZ_DIR = DATA_ROOT.joinpath("atoms", "train")
ENERGY_CSV = DATA_ROOT.joinpath("energies", "train.csv")

# Supported element symbols; an atom's integer type is its position in this list.
ATOM_TYPES = "H C N O F S Cl".split()

# Maximum valence per atom type (used as the per-atom neighbour cap).
VALENCE_MAX = dict(H=1, C=4, N=3, O=2, F=1, S=6, Cl=1)


def atom_to_int(symbol):
    """Return the position of ``symbol`` in ``ATOM_TYPES``.

    Raises:
        ValueError: propagated from ``list.index`` when ``symbol`` is not listed.
    """
    type_index = ATOM_TYPES.index(symbol)
    return type_index

def z_to_one_hot(z, num_classes=7):
    """One-hot encode integer class indices as a float tensor.

    Args:
        z: Integer (long) tensor of class indices.
        num_classes: Width of the encoding; defaults to 7.

    Returns:
        Float tensor shaped like ``z`` with one extra trailing dimension of
        size ``num_classes``.
    """
    encoded = F.one_hot(z, num_classes=num_classes)
    return encoded.to(torch.float)


def build_edge_index(pos, z, cutoff=1.5, valence_max=None, atom_types=None):
    """Build a valence-capped, distance-based edge list for one molecule.

    Every ordered pair ``(i, j)`` with ``0 < dist(i, j) < cutoff`` is a
    candidate edge. For each source atom ``i`` only its closest candidates are
    kept, up to the cap that ``valence_max`` gives for the atom's element.

    The cap is applied independently per source atom, so the result is a
    *directed* edge list: ``(i, j)`` can be present without ``(j, i)`` when
    ``j`` already has enough closer neighbours.

    Args:
        pos: 2-D float tensor of atom coordinates, one row per atom.
        z: 1-D integer tensor; ``z[i]`` is the index of atom ``i``'s symbol in
            ``atom_types``.
        cutoff: Strict upper bound on the distance of a candidate edge.
        valence_max: Mapping from element symbol to the maximum number of
            outgoing edges. Defaults to the module-level ``VALENCE_MAX``.
        atom_types: Sequence of element symbols indexed by the values of ``z``.
            Defaults to the module-level ``ATOM_TYPES``.

    Returns:
        Long tensor of shape ``(2, num_edges)`` holding (source, target) pairs
        sorted by source then target. The shape is ``(2, 0)`` when no pair
        qualifies (e.g. a single atom).
    """
    if valence_max is None:
        valence_max = VALENCE_MAX
    if atom_types is None:
        atom_types = ATOM_TYPES

    num_atoms = len(z)
    if num_atoms == 0:
        return torch.empty((2, 0), dtype=torch.long)

    dist = torch.cdist(pos, pos)
    # dist > 0 drops self-pairs (and exactly coincident atoms).
    mask = (dist < cutoff) & (dist > 0)
    row, col = torch.where(mask)

    # Group candidate neighbours by source atom; distances are pulled out of
    # the tensor in one call instead of one .item() per pair.
    candidates = [[] for _ in range(num_atoms)]
    for i, j, d in zip(row.tolist(), col.tolist(), dist[row, col].tolist()):
        candidates[i].append((j, d))

    final_edges = []
    for i, neighbors in enumerate(candidates):
        max_conn = valence_max[atom_types[z[i].item()]]
        # Stable sort by distance: ties keep ascending neighbour index.
        neighbors.sort(key=lambda pair: pair[1])
        final_edges.extend((i, j) for j, _ in neighbors[:max_conn])

    # zip(*[]) cannot be unpacked into two names, so handle "no edges" explicitly.
    if not final_edges:
        return torch.empty((2, 0), dtype=torch.long)

    # Each (i, j) appears at most once already; sorting only fixes the order.
    final_edges.sort()
    sources, targets = zip(*final_edges)
    edge_index = torch.tensor([sources, targets], dtype=torch.long)
    return edge_index


def int_to_atom(index):
    """Return the element symbol stored at position ``index`` of ``ATOM_TYPES``."""
    symbol = ATOM_TYPES[index]
    return symbol

In [18]:
class QM7XDataset(Dataset):
    """Molecular graphs built on the fly from ``.xyz`` files and an energy table.

    Item ``idx`` corresponds to the ``idx``-th row of the energy CSV. It is a
    ``Data`` object with one-hot atom types as node features ``x``, edges from
    ``build_edge_index``, atom coordinates ``pos`` and the row's ``energy``
    value as target ``y``.
    """

    def __init__(self, xyz_dir, energy_csv, cutoff=1.8):
        """
        Args:
            xyz_dir: Directory holding one ``id_<mol_id>.xyz`` file per molecule
                (``str`` or ``Path``).
            energy_csv: CSV file with an ``id`` column (used as the index) and
                an ``energy`` column.
            cutoff: Distance threshold forwarded to ``build_edge_index``.
        """
        super().__init__()
        # Accept plain strings too: __getitem__ relies on the ``/`` operator.
        self.xyz_dir = Path(xyz_dir)
        self.cutoff = cutoff
        self.energy_df = pd.read_csv(energy_csv, index_col='id')

    def __len__(self):
        """Number of molecules, i.e. number of rows in the energy table."""
        return len(self.energy_df)

    def _load_xyz(self, file_path):
        """Parse one ``.xyz`` file into atom-type indices and coordinates.

        The first two lines are skipped as header; every remaining non-blank
        line is read as ``<symbol> <x> <y> <z> ...``.

        Returns:
            ``(z, pos)``: a long tensor of atom-type indices and a float tensor
            of coordinates, one entry/row per atom line.
        """
        with open(file_path, 'r') as f:
            lines = f.readlines()[2:]
        atoms = []
        positions = []
        for line in lines:
            parts = line.split()
            if not parts:
                # Blank (e.g. trailing) lines carry no atom; parts[0] would fail.
                continue
            atoms.append(atom_to_int(parts[0]))
            positions.append([float(x) for x in parts[1:4]])
        z = torch.tensor(atoms, dtype=torch.long)
        pos = torch.tensor(positions, dtype=torch.float)
        return z, pos

    def __getitem__(self, idx):
        """Build the graph for the molecule at integer position ``idx``.

        Negative positions count from the end.

        Raises:
            IndexError: if ``idx`` is outside ``[-len(self), len(self))``. This
                is also what lets ``for data in dataset`` stop cleanly.
        """
        num_molecules = len(self)
        if not -num_molecules <= idx < num_molecules:
            raise IndexError(
                f"index {idx} is out of range for a dataset of {num_molecules} molecules"
            )
        if idx < 0:
            idx += num_molecules

        # Take the id from the table itself rather than assuming ids are 1..N.
        mol_id = self.energy_df.index[idx]
        file_path = self.xyz_dir / f"id_{mol_id}.xyz"
        z, pos = self._load_xyz(file_path)
        x = z_to_one_hot(z)
        edge_index = build_edge_index(pos, z, self.cutoff)

        # Positional lookup keeps the energy tied to the same row as mol_id.
        energy = torch.tensor([self.energy_df['energy'].iloc[idx]], dtype=torch.float)
        data = Data(x=x, edge_index=edge_index, pos=pos, y=energy)
        return data


In [19]:
# Training dataset; cutoff is left at the QM7XDataset default.
dataset = QM7XDataset(xyz_dir=XYZ_DIR, energy_csv=ENERGY_CSV)

# Construction des histogrammes de types d'atomes par molécule

In [13]:
# One row per molecule: X holds the count of each atom type, y the target energy.
num_molecules, num_atom_types = len(dataset), len(ATOM_TYPES)

X = np.zeros(shape=(num_molecules, num_atom_types), dtype=int)
y = np.zeros(shape=num_molecules)

for i in range(num_molecules):
    data = dataset[i]
    # argmax over each one-hot row recovers the atom-type index of every atom.
    atom_types = data.x.argmax(dim=1).tolist()
    counts = np.bincount(atom_types, minlength=num_atom_types)
    X[i, :], y[i] = counts, data.y.item()

print("Exemple de vecteur de caractéristiques pour molécule 0 :", X[0])
print("Énergie cible :", y[0])


Exemple de vecteur de caractéristiques pour molécule 0 : [13  6  1  0  0  0  0]
Énergie cible : -90.10787963867188


# Prédiction de l'énergie

In [14]:
# Hold out 20% of the molecules for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Baseline: ordinary least-squares regression fitted on the training split.
model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

print(f"RMSE sur test: {mse**0.5:.4f}")

# Cross-validation
# This scorer returns the *negated* MSE of each fold (scikit-learn scorers are
# "greater is better"), hence the negative values printed below.
scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
print(f"Scores de validation croisée : {scores}")

# Convert the per-fold scores to RMSE so they are directly comparable with the
# hold-out RMSE printed above.
cv_rmse = (-scores) ** 0.5
print(f"RMSE de validation croisée : {cv_rmse.mean():.4f} ± {cv_rmse.std():.4f}")


RMSE sur test: 0.5552
Scores de validation croisée : [-0.30327407 -0.29145346 -0.30686539 -0.30227561 -0.31632483]
