### Imports and Set Up

In [1]:
import scanpy as sc
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score 
import numpy as np
import pandas as pd
import time
import modelMLP 
import matplotlib.pyplot as plt
from IPython.display import display, clear_output

In [2]:
# scanpy log level: errors (0), warnings (1), info (2), hints (3).
# This has to be an assignment -- a bare `sc.settings.verbosity` expression
# only reads the attribute and changes nothing.
sc.settings.verbosity = 3
# Print the versions of scanpy and its key dependencies for reproducibility.
sc.logging.print_header()
# Default figure styling for any scanpy/matplotlib plots made below.
sc.settings.set_figure_params(dpi=80, facecolor='white')

### Prepping Data

In [3]:
# Load the annotated expression matrix (AnnData) from disk.
H5AD_PATH = "data/Norman_2019.h5ad"  # replace with your path
adata = sc.read_h5ad(H5AD_PATH)

In [12]:
# Build per-cell, per-gene feature vectors: [expression value, positional encoding...].
#
# NOTE: `positional_encodings` is created by a cell further down this notebook
# (the one that applies `positional_encoding` to `gene_info_df`). On a fresh
# kernel that cell must run before this one, otherwise this raises NameError.
ddata = adata.X.toarray()
n_cells, n_genes = ddata.shape

# The encodings are per gene, so there must be exactly one row per expression column.
if positional_encodings.shape[0] != n_genes:
    raise ValueError(
        f"positional_encodings has {positional_encodings.shape[0]} rows "
        f"but the expression matrix has {n_genes} genes"
    )

# Both of these are views (no data is copied): the expression matrix gains a
# trailing feature axis, and the gene encodings are broadcast across all cells.
ddata_reshaped = ddata[:, :, np.newaxis]
positional_encodings_expanded = np.broadcast_to(
    positional_encodings, (n_cells,) + positional_encodings.shape
)

# Fill a single preallocated array instead of np.tile + np.concatenate, which
# would hold a second full-size (n_cells, n_genes, encoding_width) copy in memory
# at the same time. The result is still a dense array with
# n_cells * n_genes * (1 + encoding_width) elements, so it can be very large.
X_sequence_features = np.empty(
    (n_cells, n_genes, 1 + positional_encodings.shape[1]),
    # Same dtype promotion np.concatenate would apply to the two inputs.
    dtype=np.result_type(ddata.dtype, positional_encodings.dtype),
)
X_sequence_features[:, :, :1] = ddata_reshaped
X_sequence_features[:, :, 1:] = positional_encodings_expanded
print("Shape of the new feature matrix for a sequence model:", X_sequence_features.shape)

: 

In [None]:
# Pull the dense expression matrix and the per-cell perturbation labels out of `adata`.
ddata = adata.X.toarray()
labels = adata.obs['perturbation_name'].to_numpy()

# Turn each label string into a list of perturbed targets:
# 'control' -> [] (nothing perturbed), 'A+B' -> ['A', 'B'].
parsed_labels = []
for perturbation in labels:
    if perturbation == 'control':
        parsed_labels.append([])
    else:
        parsed_labels.append(perturbation.split('+'))

In [None]:
# Multi-hot encode the label lists: one column per distinct perturbation target,
# so a cell with an empty list (control) becomes an all-zero row.
mlb = MultiLabelBinarizer()
mlb.fit(parsed_labels)
labels_int = mlb.transform(parsed_labels)

In [None]:
# Hold out 20% of the cells for evaluation; the fixed seed keeps the split reproducible.
# Not stratified: `stratify=labels_int` is not passed.
split_options = dict(test_size=0.2, random_state=67)
X_train, X_test, y_train, y_test = train_test_split(ddata, labels_int, **split_options)

##### Positional embedding: gene chromosome and genomic position

In [5]:
import pyensembl
from tqdm import tqdm

In [6]:
# Ensembl release 109 annotation; download() and index() prepare the local
# pyensembl data files used by the lookups below.
data = pyensembl.EnsemblRelease(109)
data.download()
data.index()

# Identifiers of the genes (columns) in `adata`; they are resolved with
# `gene_by_id`, i.e. treated as Ensembl gene IDs.
gene_names = adata.var['index']

# One record per gene, in the same order as `gene_names`. Genes that pyensembl
# cannot resolve keep None for contig/start/end so row alignment is preserved.
gene_data = []
for gene_id in tqdm(gene_names):
    record = {
        'gene_name': gene_id,
        'contig': None,
        'start': None,
        'end': None
    }
    try:
        gene = data.gene_by_id(gene_id)
    except ValueError:
        # Identifier not present in this Ensembl release: leave the placeholders.
        pass
    else:
        record.update(contig=gene.contig, start=gene.start, end=gene.end)
    gene_data.append(record)

gene_info_df = pd.DataFrame(gene_data)



INFO:pyensembl.sequence_data:Loaded sequence dictionary from /Users/steveyin/Library/Caches/pyensembl/GRCh38/ensembl109/Homo_sapiens.GRCh38.cdna.all.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /Users/steveyin/Library/Caches/pyensembl/GRCh38/ensembl109/Homo_sapiens.GRCh38.ncrna.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /Users/steveyin/Library/Caches/pyensembl/GRCh38/ensembl109/Homo_sapiens.GRCh38.pep.all.fa.gz.pickle
100%|██████████| 19018/19018 [00:00<00:00, 109745.90it/s]


In [7]:
# Chromosome lengths in base pairs for the 22 autosomes plus X and Y
# (GRCh38 values, the assembly used by the Ensembl release loaded above).
# Insertion order matters: it defines the slot order of the positional encoding.
chromosome_lengths = {
    # Autosomes
    '1': 248_956_422,
    '2': 242_193_529,
    '3': 198_295_559,
    '4': 190_214_555,
    '5': 181_538_259,
    '6': 170_805_979,
    '7': 159_345_973,
    '8': 145_138_636,
    '9': 138_394_717,
    '10': 133_797_422,
    '11': 135_086_622,
    '12': 133_275_309,
    '13': 114_364_328,
    '14': 107_043_718,
    '15': 101_991_189,
    '16': 90_338_345,
    '17': 83_257_441,
    '18': 80_373_285,
    '19': 58_617_616,
    '20': 64_444_167,
    '21': 46_709_983,
    '22': 50_818_468,
    # Sex chromosomes
    'X': 156_040_895,
    'Y': 57_227_415,
}

# Chromosome names in table order ('1'..'22', 'X', 'Y') and their 0-based slot index.
chr_names = list(chromosome_lengths)
chromosome_map = dict(zip(chr_names, range(len(chr_names))))

In [8]:
def positional_encoding(row, chr_index_map=None, chr_lengths=None):
    """Encode a gene's genomic location as a per-chromosome vector.

    The vector has one slot per entry of ``chr_index_map``. The slot belonging
    to the gene's chromosome holds ``start / chromosome_length``; every other
    slot is 0. A gene with a missing contig or start, or whose contig is not in
    the map, yields an all-zero vector.

    Parameters
    ----------
    row : mapping-like
        Must support ``row['contig']`` and ``row['start']`` (for example a row
        of ``gene_info_df``).
    chr_index_map : dict, optional
        Contig name -> slot index. Defaults to the notebook-level
        ``chromosome_map``.
    chr_lengths : dict, optional
        Contig name -> chromosome length. Defaults to the notebook-level
        ``chromosome_lengths``.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``len(chr_index_map)``.
    """
    if chr_index_map is None:
        chr_index_map = chromosome_map
    if chr_lengths is None:
        chr_lengths = chromosome_lengths

    # Size the vector from the map rather than a literal 24, so the two cannot
    # drift apart if chromosomes are added to or removed from the tables.
    encoding = np.zeros(len(chr_index_map))

    contig = row['contig']
    start = row['start']

    # Skip unresolved genes (None/NaN fields) and contigs outside the map;
    # checking `start` too keeps a NaN from leaking into the encoding.
    if pd.notna(contig) and pd.notna(start) and contig in chr_index_map:
        # Normalised position along the chromosome (the magnitude of the slot).
        encoding[chr_index_map[contig]] = start / chr_lengths[contig]

    return encoding

# Encode every gene row, then stack the per-gene vectors into one
# (number of genes, encoding width) array; the last line displays it.
encoding_rows = gene_info_df.apply(positional_encoding, axis=1)
positional_encodings = np.array(encoding_rows.tolist())
positional_encodings

array([[0.00011871, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.00035868, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.00074397, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

##### Prepping Model

In [None]:
def _as_float_tensor(array):
    """Copy an array into a new float32 torch tensor."""
    return torch.tensor(array, dtype=torch.float32)


# Features and multi-hot targets are both stored as float32 tensors.
X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor = (
    _as_float_tensor(array) for array in (X_train, y_train, X_test, y_test)
)

# Pair each feature row with its target row.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Mini-batch loaders: the training set is reshuffled every epoch, while the
# test set keeps its original order.
batch_size = 64
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

In [None]:
from modelTRAN import TransMLP  # imported here but not used in this cell

# Hyperparameters
learning_rate = 0.00026
num_epochs = 25

# Network dimensions are taken from the data.
input_size = X_train.shape[1]      # number of genes (feature columns)
num_classes = labels_int.shape[1]  # one output per multi-hot label column

model = modelMLP.MLP(input_size=input_size, num_classes=num_classes)

# BCEWithLogitsLoss applies an independent sigmoid + binary cross-entropy to each
# output, which fits the multi-label (multi-hot) targets produced by
# MultiLabelBinarizer -- a cell can carry zero, one or several perturbations.
criterion = nn.BCEWithLogitsLoss()

# Adam with a small L2 penalty (weight decay) on the model parameters.
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=learning_rate,
    weight_decay=2e-6,
)



