In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import scanpy as sc
import numpy as np
import os
from FACS_Sampling.utils import create_adata

sc.set_figure_params(figsize=(8,8), fontsize=15, )

In [1]:
import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available. Using GPU.")
else:
    device = torch.device("cpu")
    print("GPU not available, using CPU instead.")

GPU not available, using CPU instead.


In [3]:
n_gpus_per_node = torch.cuda.device_count()

In [4]:
n_gpus_per_node

0

In [2]:

# Get the file path from the environment variable
file_path_env = os.getenv('MY_FACS_DATA_PATH')
input_file1 = os.path.join(file_path_env,'sara_data',"adata_ref_sara_2M.h5ad")
input_file2 = os.path.join(file_path_env,'sara_data', 'reps',"random_adata_2_30__0.h5ad")
input_file3 = os.path.join(file_path_env,'sara_data', 'reps',"fsbs_adata_2_30__0.h5ad")

adata_ref = sc.read_h5ad(input_file1)
adata_random = sc.read_h5ad(input_file2)
adata_fsbs =  sc.read_h5ad(input_file3)


In [3]:
label_key = 'population'

In [4]:
adata_random.obs.reset_index(drop=False, inplace=True)

In [5]:
from FACS_Sampling.methods.methods import bin_sample, sample_random
seed = 12345
np.random.seed(seed)
new_seed = np.random.randint(100000)

ps, _ = bin_sample(adata_random, n_bins=30, s_size=100, seed=new_seed)
rs = sample_random(adata_random, s_size=ps.size, seed=new_seed)


In [6]:
ad_random = adata_random[rs].copy()
ad_fsbs = adata_random[ps].copy()



### Encoding labels

In [7]:
from sklearn.preprocessing import LabelEncoder

class CustomLabelEncoder:
    def __init__(self):
        self.label_encoder = LabelEncoder()

    def fit(self, labels):
        self.label_encoder.fit(labels)

    def encode(self, labels):
        # Encode a list of labels and return the encoded results as a list
        return self.label_encoder.transform(labels).tolist()

    def decode(self, encoded_labels):
        # Decode a list of encoded results and return the actual labels as a list
        return self.label_encoder.inverse_transform(encoded_labels).tolist()

# Example usage:
custom_encoder = CustomLabelEncoder()

# Original labels
labels = ad_fsbs.obs['population'].values

# Fit the encoder with the labels
custom_encoder.fit(labels)


# Data prep

In [8]:
fsbs_labels = custom_encoder.encode(ad_fsbs.obs['population'].values)
random_labels = custom_encoder.encode(ad_random.obs['population'].values)

In [9]:
X_train_fsbs, y_train_fsbs = (ad_fsbs.X, fsbs_labels)
X_train_random, y_train_random = (ad_random.X, random_labels)

In [26]:
seed = 43
np.random.seed(seed)

rem_indices = list(set(adata_ref.obs.index) - set(adata_random.obs['index'].values))
validation_indices = np.random.choice(rem_indices, size=50000, replace=False).tolist()

valid_labels = custom_encoder.encode(adata_ref[validation_indices].obs['population'].values.copy())
X_valid, y_valid = (adata_ref[validation_indices].X.copy(), valid_labels)

# Models

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# Define the Encoder
class Encoder(nn.Module):
    def __init__(self):
        super(Encoder, self).__init__()
        self.layer = nn.Sequential(
            nn.Linear(22, 10),
            nn.ReLU(True),
            nn.Linear(10, 2)
        )

    def forward(self, x):
        x = self.layer(x)
        return x

# Define the Decoder
class Decoder(nn.Module):
    def __init__(self):
        super(Decoder, self).__init__()
        self.layer = nn.Sequential(
            nn.Linear(2, 10),
            nn.ReLU(True),
            nn.Linear(10, 22),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.layer(x)
        return x

# Define the Classifier
class Classifier(nn.Module):
    def __init__(self):
        super(Classifier, self).__init__()
        self.fc = nn.Linear(2, 16)

    def forward(self, x):
        x = self.fc(x)
        return x

# Instantiate the models
encoder = Encoder()
decoder = Decoder()
classifier = Classifier()

# Define the loss functions and optimizer
criterion_ae = nn.MSELoss()  # For the autoencoder
criterion_cl = nn.CrossEntropyLoss()  # For the classifier
optimizer = optim.Adam(list(encoder.parameters()) + list(decoder.parameters()) + list(classifier.parameters()), lr=0.001)

# Training Loop (Pseudo-code)
for epoch in range(num_epochs):
    for data in data_loader:
        inputs, labels = data

        # Forward pass through encoder and decoder
        encoded = encoder(inputs)
        decoded = decoder(encoded)

        # Forward pass through classifier
        class_outputs = classifier(encoded)

        # Compute loss
        loss_ae = criterion_ae(decoded, inputs)
        loss_cl = criterion_cl(class_outputs, labels)
        total_loss = loss_ae + loss_cl

        # Backward pass and optimize
        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

# Using the encoder
# new_data = ...  # Your new data
# encoded_representation = encoder(new_data)


In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler

# Assuming you have two dataframes: df_train for training and df_val for validation
# df_train = pd.read_csv('train_data.csv')
# df_val = pd.read_csv('val_data.csv')

# Example preprocessing
scaler = StandardScaler()
df_train_features = scaler.fit_transform(df_train.drop('label_column', axis=1))
df_val_features = scaler.transform(df_val.drop('label_column', axis=1))

df_train_labels = df_train['label_column'].values
df_val_labels = df_val['label_column'].values

# Custom dataset
class CustomDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return torch.tensor(self.features[idx], dtype=torch.float), torch.tensor(self.labels[idx], dtype=torch.long)

# Create datasets
train_dataset = CustomDataset(df_train_features, df_train_labels)
val_dataset = CustomDataset(df_val_features, df_val_labels)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Training Loop (Pseudo-code)
for epoch in range(num_epochs):
    # Training phase
    for inputs, labels in train_loader:
        # Train your model

    # Validation phase
    with torch.no_grad():
        for inputs, labels in val_loader:
            # Validate your model


In [None]:
# 1. Check if CUDA (GPU support) is available
import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available. Using GPU.")
else:
    device = torch.device("cpu")
    print("GPU not available, using CPU instead.")

# 2. Move your model to the chosen device
model = YourModel()
model.to(device)

# 3. When loading data, send the data to the same device
for inputs, labels in dataloader:
    inputs, labels = inputs.to(device), labels.to(device)

    # Forward pass, backward pass, optimize
    optimizer.zero_grad()
    outputs = model(inputs)
    loss = loss_function(outputs, labels)
    loss.backward()
    optimizer.step()


In [27]:
import lightgbm as lgb
clf_ra = lgb.LGBMClassifier(
    num_leaves=31,  # This is just a starting point, adjust based on performance and overfitting
    learning_rate=0.05,
    n_estimators=100,
    num_class=16,  # Set this to the number of classes
    objective='multiclass'  # Specify the multiclass objective
)

clf_fs = lgb.LGBMClassifier(
    num_leaves=31,  # This is just a starting point, adjust based on performance and overfitting
    learning_rate=0.05,
    n_estimators=100,
    num_class=16,  # Set this to the number of classes
    objective='multiclass'  # Specify the multiclass objective
)

# clf_ra = lgb.LGBMClassifier()
# clf_fs = lgb.LGBMClassifier()


## Train data
### Goal is to compare models being traind on different data to see how good the data is affecting the results.

In [28]:
clf_ra.fit(
    X_train_random, y_train_random,
    eval_set=[(X_valid, y_valid)],
    eval_metric='logloss', # You can choose the metric relevant to your problem
    # early_stopping_rounds=10, # Stops training if one metric of one validation data doesn’t improve in last 10 rounds
    # verbose=True
)

clf_fs.fit(
    X_train_fsbs, y_train_fsbs,
    eval_set=[(X_valid, y_valid)],
    eval_metric='logloss', # You can choose the metric relevant to your problem
    # early_stopping_rounds=10, # Stops training if one metric of one validation data doesn’t improve in last 10 rounds
    # verbose=True
)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004688 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5610
[LightGBM] [Info] Number of data points in the train set: 28245, number of used features: 22
[LightGBM] [Info] Start training from score -2.128380
[LightGBM] [Info] Start training from score -2.990964
[LightGBM] [Info] Start training from score -3.581715
[LightGBM] [Info] Start training from score -1.595375
[LightGBM] [Info] Start training from score -4.568499
[LightGBM] [Info] Start training from score -3.469887
[LightGBM] [Info] Start training from score -5.388859
[LightGBM] [Info] Start training from score -4.321746
[LightGBM] [Info] Start training from score -1.453695
[LightGBM] [Info] Start training from score -3.495234
[LightGBM] [Info] Start training from score -1.790168
[LightGBM] [Info] Start training from score -2.972115
[LightGBM] [Info] Start training from score -4.177934
[LightGBM

In [29]:
# clf_fs.fit(X_train_fsbs, y_train_fsbs)

In [30]:
# clf_ra.fit(X_train_random, y_train_random)

In [31]:
# from sklearn.metrics import accuracy_score
# y_pred_fs=clf_ra.predict(X_train_random)
# accuracy=accuracy_score(y_pred_fs, y_train_random)
# print('LightGBM Model accuracy score: {0:0.4f}'.format(accuracy_score(y_train_random, y_pred_fs)))

#### Acc of models on the validation data

In [32]:
y_pred_fs=clf_fs.predict(X_valid)

In [33]:
from sklearn.metrics import accuracy_score
# accuracy_val_fs=accuracy_score(y_pred_fs, y_valid)
print('LightGBM Model accuracy score: {0:0.4f}'.format(accuracy_score(y_valid, y_pred_fs)))

LightGBM Model accuracy score: 0.9248


In [34]:
y_pred_ra=clf_ra.predict(X_valid)

In [35]:
from sklearn.metrics import accuracy_score
print('LightGBM Model accuracy score: {0:0.4f}'.format(accuracy_score(y_valid, y_pred_ra)))

LightGBM Model accuracy score: 0.9216


In [36]:
from sklearn.metrics import classification_report
print(classification_report(y_valid, y_pred_fs))

              precision    recall  f1-score   support

           0       0.95      0.92      0.93      5870
           1       0.93      0.91      0.92      2571
           2       0.80      0.68      0.73      1448
           3       0.96      0.97      0.96     10302
           4       0.84      0.76      0.80       625
           5       0.87      0.83      0.85      1574
           6       0.88      0.82      0.85       236
           7       0.86      0.76      0.80       681
           8       0.95      0.97      0.96     11531
           9       0.91      0.87      0.89      1462
          10       0.90      0.95      0.92      8371
          11       0.85      0.83      0.84      2546
          12       0.96      0.96      0.96       669
          13       0.92      0.88      0.90       609
          14       0.92      0.81      0.86       290
          15       0.92      0.93      0.93      1215

    accuracy                           0.92     50000
   macro avg       0.90   

In [37]:
print(classification_report(y_valid, y_pred_ra))

              precision    recall  f1-score   support

           0       0.94      0.93      0.93      5870
           1       0.92      0.91      0.91      2571
           2       0.79      0.69      0.74      1448
           3       0.96      0.97      0.96     10302
           4       0.83      0.69      0.75       625
           5       0.87      0.83      0.85      1574
           6       0.86      0.69      0.76       236
           7       0.86      0.73      0.79       681
           8       0.95      0.97      0.96     11531
           9       0.89      0.87      0.88      1462
          10       0.90      0.94      0.92      8371
          11       0.85      0.83      0.84      2546
          12       0.92      0.96      0.94       669
          13       0.91      0.85      0.88       609
          14       0.95      0.66      0.78       290
          15       0.92      0.93      0.92      1215

    accuracy                           0.92     50000
   macro avg       0.89   

In [38]:
df_ra = pd.DataFrame(classification_report(y_valid, y_pred_ra, output_dict=True)).T
df_fs = pd.DataFrame(classification_report(y_valid, y_pred_fs, output_dict=True)).T

In [39]:
df_fs - df_ra

Unnamed: 0,precision,recall,f1-score,support
0,0.006805,-0.008177,-0.000874,0.0
1,0.016276,-0.003112,0.006405,0.0
2,0.003605,-0.014503,-0.00683,0.0
3,0.000198,0.002427,0.001297,0.0
4,0.009633,0.0672,0.042888,0.0
5,-0.002655,0.001906,-0.000266,0.0
6,0.02013,0.131356,0.084138,0.0
7,-0.001226,0.0279,0.015505,0.0
8,0.000385,-0.000347,2.7e-05,0.0
9,0.01766,0.0,0.008592,0.0
