Build Module

Libraries

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,LabelEncoder, MinMaxScaler
import pandas as pd
import numpy as np

Dataset Loader

In [2]:
class GeneDataset(Dataset):
    """In-memory dataset that pairs each feature row with its label row.

    Both inputs are converted to ``float32`` tensors once, at construction
    time, so ``__getitem__`` is a plain index lookup.

    Args:
        X: Array-like of features, indexed along its first dimension.
        y: Array-like of labels with the same number of rows as ``X``.

    Raises:
        ValueError: If ``X`` and ``y`` differ in their number of rows.
    """

    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

        # Fail fast on misaligned inputs. __len__ is driven by X alone, so a
        # shorter y would raise IndexError mid-epoch and a longer y would have
        # its trailing rows silently ignored.
        if self.X.shape[:1] != self.y.shape[:1]:
            raise ValueError(
                f"X and y must have the same number of rows, "
                f"got {tuple(self.X.shape)} and {tuple(self.y.shape)}"
            )

    def __len__(self):
        """Return the number of samples (rows of ``X``)."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, labels)`` tensor pair stored at ``idx``."""
        return self.X[idx], self.y[idx]

1D Convolutional Neural Net

In [3]:
import torch
import torch.nn as nn

class GenePredictorCNN(nn.Module):
    """1-D convolutional network for multi-label prediction on a flat feature vector.

    The feature vector is treated as a single-channel sequence. Four
    Conv1d -> ReLU -> BatchNorm1d stages widen the channels (1 -> 32 -> 64 ->
    128 -> 256); the last two stages are each followed by ``MaxPool1d(2)``.
    The result is flattened and passed through a two-layer head ending in a
    sigmoid, so every output lies in the range 0..1.

    Args:
        input_size: Number of features per sample (length of the sequence).
            Must be at least 4 so that something is left after the two
            pooling stages.
        output_size: Number of independent labels to predict.

    Raises:
        ValueError: If ``input_size`` is smaller than 4.
    """

    def __init__(self, input_size, output_size=11):
        super().__init__()

        # Two MaxPool1d(2) layers reduce the length to input_size // 4. Below
        # 4 that is zero, which would silently build a Linear layer with no
        # inputs instead of reporting the real problem.
        if input_size < 4:
            raise ValueError(
                f"input_size must be at least 4 to survive two 2x poolings, got {input_size}"
            )

        # kernel_size=3 with padding=1 keeps the sequence length unchanged,
        # so only the pooling layers shorten it.
        self.conv_layers = nn.Sequential(
            nn.Conv1d(1, 32, kernel_size=3, padding=1),   # input channels=1, output=32
            nn.ReLU(),
            nn.BatchNorm1d(32),
            nn.Conv1d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.BatchNorm1d(64),
            nn.Conv1d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.MaxPool1d(kernel_size=2),  # Downsample by 2
            nn.Conv1d(128, 256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.MaxPool1d(kernel_size=2)
        )

        # Length after both poolings; floor(floor(n / 2) / 2) == n // 4.
        reduced_size = input_size // 4

        self.fc_layers = nn.Sequential(
            nn.Flatten(),
            nn.Linear(256 * reduced_size, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, output_size),
            nn.Sigmoid()  # for multi-label classification
        )

    def forward(self, x):
        """Map a ``(batch, input_size)`` tensor to ``(batch, output_size)`` sigmoid outputs."""
        x = x.unsqueeze(1)  # Add channel dimension (batch_size, 1, features)
        x = self.conv_layers(x)
        x = self.fc_layers(x)
        return x


Training Function

In [4]:
def train_model(model, train_loader, val_loader, epochs=10, lr=0.001, device=None):
    """Train ``model`` with Adam and binary cross-entropy, printing losses per epoch.

    Each epoch runs one pass over ``train_loader`` with gradient updates and
    one gradient-free pass over ``val_loader``, then prints the mean
    per-batch loss of each pass. The model is left on the training device.

    Args:
        model: Module whose outputs are probabilities (``nn.BCELoss`` is
            applied directly to them).
        train_loader: Iterable of ``(inputs, targets)`` batches used for
            optimisation.
        val_loader: Iterable of ``(inputs, targets)`` batches used only for
            the reported validation loss.
        epochs: Number of passes over ``train_loader``.
        lr: Learning rate passed to Adam.
        device: Optional device (string or ``torch.device``) to train on.
            When omitted, CUDA is used if available, otherwise the CPU.
            Pass ``'mps'`` explicitly to train on Apple-silicon GPUs.

    Returns:
        None. Progress is reported via ``print``.
    """
    if device is None:
        # The availability check and the chosen backend must match. Testing
        # CUDA and then requesting 'mps' fails on CUDA hosts and never
        # selects MPS on a Mac.
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    else:
        device = torch.device(device)
    model.to(device)

    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        model.train()
        train_loss, train_batches = 0.0, 0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            output = model(X_batch)
            loss = criterion(output, y_batch)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            train_batches += 1

        model.eval()
        val_loss, val_batches = 0.0, 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                output = model(X_batch)
                loss = criterion(output, y_batch)
                val_loss += loss.item()
                val_batches += 1

        # Counting batches, rather than dividing by len(loader), lets an empty
        # loader report nan instead of raising ZeroDivisionError.
        avg_train = train_loss / train_batches if train_batches else float('nan')
        avg_val = val_loss / val_batches if val_batches else float('nan')

        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {avg_train:.4f}, Val Loss: {avg_val:.4f}")


Main Flow

In [6]:
if __name__ == "__main__":
    import os

    # 1. Load data
    # The CSV location can be overridden with the DNA_CSV_PATH environment
    # variable so the notebook runs on other machines; the default is the
    # original path.
    csv_path = os.environ.get(
        "DNA_CSV_PATH", "/Users/bearcheung/Documents/Year3/FYP/dna_100k.csv"
    )
    df = pd.read_csv(csv_path)

    # 2. Prepare input features (X) and labels (y)
    # Chromosome_1 .. Chromosome_22 followed by the two sex chromosomes.
    feature_columns = [f'Chromosome_{i}' for i in range(1, 23)] + ['Chromosome_X', 'Chromosome_Y']
    label_columns = ['DISC1', 'TCF4', 'BDNF', 'DRD2', 'COMT', 'GRIN2B', 'NRG1', 'RELN', 'DTNBP1', 'HTR2A', 'Mental_Disorder']

    # Only select feature columns + label columns. The explicit copy makes the
    # column assignments below write to an independent frame instead of a
    # slice of the loaded one (avoids SettingWithCopyWarning).
    df = df[feature_columns + label_columns].copy()

    # Encode each feature column independently: every distinct string value
    # in a column is mapped to an integer id.
    for col in feature_columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))  # Ensure all are strings first

    X = df[feature_columns].values
    y = df[label_columns].values

    # 3. Split into train/validation sets
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Normalize X to 0-1. The scaler is fit on the training rows only and then
    # applied to the validation rows, so validation statistics do not leak
    # into preprocessing.
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)

    # Seed torch so weight initialisation and batch shuffling are repeatable.
    torch.manual_seed(42)

    # 4. Create DataLoaders
    train_dataset = GeneDataset(X_train, y_train)
    val_dataset = GeneDataset(X_val, y_val)
    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=64)

    # 5. Define the model
    input_size = X_train.shape[1]
    output_size = y_train.shape[1]
    model = GenePredictorCNN(input_size, output_size)

    # 6. Train the model
    train_model(model, train_loader, val_loader, epochs=10)

Epoch 1/10, Train Loss: 0.0737, Val Loss: 0.0684
Epoch 2/10, Train Loss: 0.0712, Val Loss: 0.0688
Epoch 3/10, Train Loss: 0.0704, Val Loss: 0.0673
Epoch 4/10, Train Loss: 0.0698, Val Loss: 0.0678
Epoch 5/10, Train Loss: 0.0690, Val Loss: 0.0674
Epoch 6/10, Train Loss: 0.0684, Val Loss: 0.0679
Epoch 7/10, Train Loss: 0.0677, Val Loss: 0.0671
Epoch 8/10, Train Loss: 0.0673, Val Loss: 0.0668
Epoch 9/10, Train Loss: 0.0669, Val Loss: 0.0669
Epoch 10/10, Train Loss: 0.0664, Val Loss: 0.0672


Performance

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def evaluate_model(model, data_loader, threshold=0.5):
    """Print pooled classification metrics for ``model`` over ``data_loader``.

    Predictions and targets from every batch are stacked, then flattened so
    that every (sample, label) pair counts as one binary decision. Accuracy,
    precision, recall and F1 are computed on predictions binarised at
    ``threshold``; ROC AUC is computed on the raw scores.

    NOTE(review): because all labels are pooled, accuracy will be dominated
    by the majority class if positives are rare. Read it together with
    precision and recall.

    Args:
        model: Trained ``nn.Module`` returning one score per label.
        data_loader: Iterable of ``(inputs, targets)`` batches.
        threshold: Scores greater than or equal to this value count as
            positive.

    Returns:
        None. The metrics are printed.

    Raises:
        ValueError: Propagated from ``roc_auc_score`` when the pooled targets
            contain only one class.
    """
    # Evaluate on the device the model already lives on. Picking a device
    # independently, without moving the model, breaks whenever the two differ.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')
    model.eval()
    preds = []
    trues = []

    with torch.no_grad():
        for X_batch, y_batch in data_loader:
            outputs = model(X_batch.to(device))
            preds.append(outputs.cpu().numpy())
            trues.append(y_batch.cpu().numpy())

    preds = np.vstack(preds)
    trues = np.vstack(trues)

    # Flatten once; all five metrics share the same pooled views.
    y_true = trues.flatten()
    y_score = preds.flatten()

    # Apply threshold to get binary predictions
    y_pred = (y_score >= threshold).astype(int)

    # Calculate scores
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    roc_auc = roc_auc_score(y_true, y_score)

    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall: {rec:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC AUC Score: {roc_auc:.4f}")

# Report metrics for the model on the validation batches, using the 0.5 cut-off.
evaluate_model(model, data_loader=val_loader, threshold=0.5)

Accuracy: 0.9879
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000
ROC AUC Score: 0.5748


In [None]:
# Persist only the learned weights (the state_dict), not the whole module object.
model_path = "checker.pth"
weights = model.state_dict()
torch.save(weights, model_path)
print(f"Model saved to {model_path}")


Model saved to checker.pth
