Authors : Jinsu Kim, JunHo Park

ⓒ 2022 CCNets, Inc. All Rights Reserved.

https://ccnets.org

In [None]:
import sys

# Relative path to the parent directory of this notebook. It is kept as a
# module-level name because the data-loading cell builds its path from it.
path_append = "../"

# Guard against duplicates so re-running this cell does not keep growing
# sys.path with the same entry.
if path_append not in sys.path:
    sys.path.append(path_append)  # Go up one directory from where you are.

In [None]:
import os
import pandas as pd

# Define the base directory and CSV file name
base_dir = path_append + "../data/"  # Update this to the directory where your data folder is located
csv_file = "Data_Cortex_Nuclear.csv"  # Update this to your CSV file name if different

# Full path to the CSV file
full_path = os.path.join(base_dir, csv_file)

# Load the dataset
try:
    df = pd.read_csv(full_path)
except FileNotFoundError:
    print("Failed to load data. File not found at:", full_path)
    # Re-raise instead of continuing: without `df` every following cell would
    # fail with a NameError that hides the real cause (the missing file).
    raise
else:
    print("Data loaded successfully!")


TrainLoader / DataLoader

In [None]:
# IterativeImputer is still experimental in scikit-learn: it can only be
# imported from sklearn.impute after this enabling import has been executed,
# so the enabling import has to come first.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import IterativeImputer

# Preprocess dataset
# Remove the MouseID column so it is not carried into the later steps.
df = df.drop("MouseID", axis=1)

# Columns that are label-encoded below; 'class' is the prediction target
# used when the data is split later in the notebook.
label_cols = ["Genotype", "Treatment", "Behavior", "class"]
for col in label_cols:
    # A fresh encoder per column, so each column gets its own integer codes.
    df[col] = LabelEncoder().fit_transform(df[col].values)

In [None]:
# Fill in missing entries with scikit-learn's iterative imputer.
# max_iter was num_features; adjust as appropriate
imputer = IterativeImputer(random_state=0, max_iter=10)
imputed_values = imputer.fit_transform(df)

# Slice-assign so the imputed array is written back into the existing frame,
# which keeps its index and column labels.
df[:] = imputed_values

In [None]:
# Standardise the feature columns, then create the train/test split
from sklearn.model_selection import train_test_split

# Every column except the target 'class' is treated as an input feature.
is_feature = df.columns != 'class'
feature_cols = df.columns[is_feature]

# NOTE(review): the scaler is fit on all rows before the split below, so the
# held-out rows contribute to the scaling statistics - confirm this is intended.
scaler = StandardScaler()
df[feature_cols] = scaler.fit_transform(df[feature_cols])

# Sizes needed later to configure the model's input and output
num_features = len(feature_cols)
num_classes = df['class'].nunique(dropna=False)

# Hold out 20% of the rows for testing; the seed is fixed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    df[feature_cols],
    df['class'],
    test_size=0.2,
    random_state=1,
)


In [None]:
# Custom dataset class
import torch
from torch.utils.data import Dataset
from sklearn.experimental import enable_iterative_imputer

class CustomDataset(Dataset):
    """Map-style dataset pairing a feature tensor with a label tensor.

    Both inputs are copied into ``torch.float32`` tensors at construction
    time; item ``i`` is the pair ``(x[i], y[i])``.
    """

    def __init__(self, features, labels):
        """Convert ``features`` and ``labels`` to float32 tensors and store them."""
        as_float = torch.float32
        self.x = torch.tensor(features, dtype=as_float)
        self.y = torch.tensor(labels, dtype=as_float)

    def __len__(self):
        """Return the number of samples, i.e. the length of the feature tensor."""
        sample_count = len(self.x)
        return sample_count

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        sample = self.x[index]
        target = self.y[index]
        return sample, target


In [None]:
# Wrap the train and test splits in CustomDataset objects.
# The pandas objects are handed over as plain NumPy arrays.
train_dataset = CustomDataset(X_train.to_numpy(), y_train.to_numpy())
test_dataset = CustomDataset(X_test.to_numpy(), y_test.to_numpy())


In [None]:
# Example usage: show only the first (features, label) pair of the training set
first_sample = next(iter(train_dataset), None)
if first_sample is not None:  # nothing is printed for an empty dataset
    features, labels = first_sample
    print(features, labels)

In [None]:
from tools.setting.ml_params import MLParameters
from tools.setting.data_config import DataConfig

# Describe the dataset for the framework: a multi-class classification task
# whose observation shape and label size come from the preprocessing above.
data_config = DataConfig(
    dataset_name='genetic_variant',
    task_type='multi_class_classification',
    obs_shape=[num_features],
    label_size=num_classes,
)

# Model parameters: 'gpt' as the core model and 'none' as the encoder model.
ml_params = MLParameters(core_model='gpt', encoder_model='none')

In [None]:
from trainer_hub import TrainerHub

# Use the GPU when CUDA is available, otherwise fall back to the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Build the TrainerHub from the model parameters, the data configuration and
# the device; console printing is enabled and Weights & Biases logging is off.
trainer_hub = TrainerHub(
    ml_params,
    data_config,
    device,
    use_print=True,
    use_wandb=False,
)