In [13]:
import pickle
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load pickle
with open("combined_scattering_data.pkl", "rb") as f:
    data = pickle.load(f)

print("Loaded type:", type(data))

if isinstance(data, pd.DataFrame):
    print("Columns:", list(data.columns))
    images = np.array(data["scattering_pattern"].tolist())
    labels = np.array(data["particle_type"].tolist())

elif isinstance(data, dict):
    print("Keys:", list(data.keys()))
    images = np.array(data.get("scattering_pattern"))
    labels = np.array(data.get("particle_type"))

elif isinstance(data, (tuple, list)):
    print("Length of data:", len(data))
    images = np.array(data[0])
    labels = np.array(data[1])

else:
    raise ValueError("Unsupported data format in pickle file.")

print("Images shape:", images.shape)
print("Labels shape:", labels.shape)

#  Encode string labels
le = LabelEncoder()
labels_encoded = le.fit_transform(labels)

#  Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    images, labels_encoded, test_size=0.2, random_state=42
)

#  Convert to PyTorch tensors
X_train = torch.tensor(X_train, dtype=torch.float32).unsqueeze(1)  # add channel dim
X_test = torch.tensor(X_test, dtype=torch.float32).unsqueeze(1)
y_train = torch.tensor(y_train, dtype=torch.long)
y_test = torch.tensor(y_test, dtype=torch.long)

#  Data loaders
train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=32, shuffle=True)
test_loader = DataLoader(TensorDataset(X_test, y_test), batch_size=32)

#  Simple CNN
class SimpleCNN(nn.Module):
    def __init__(self, num_classes):
        super(SimpleCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, 3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(16, 32, 3, padding=1)
        self.fc1 = nn.Linear(32 * (X_train.shape[2]//4) * (X_train.shape[3]//4), 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        x = self.pool(torch.relu(self.conv1(x)))
        x = self.pool(torch.relu(self.conv2(x)))
        x = x.view(x.size(0), -1)
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

model = SimpleCNN(num_classes=len(le.classes_))
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

#  Training loop
for epoch in range(5):
    model.train()
    running_loss = 0.0
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {running_loss/len(train_loader):.4f}")

#  Evaluation
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for inputs, targets in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        total += targets.size(0)
        correct += (predicted == targets).sum().item()

accuracy = 100 * correct / total
print(f"Test Accuracy: {accuracy:.2f}%")


Loaded type: <class 'pandas.core.frame.DataFrame'>
Columns: ['scattering_pattern', 'particle_type', 'radius_nm', 'n_particle', 'data_shape']
Images shape: (703, 227, 227)
Labels shape: (703,)
Epoch 1, Loss: 0.7873
Epoch 2, Loss: 0.6846
Epoch 3, Loss: 0.6817
Epoch 4, Loss: 0.6848
Epoch 5, Loss: 0.6816
Test Accuracy: 54.61%
