In [29]:
import numpy as np
import pandas as pd


from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score, recall_score, roc_curve


import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

from tqdm import tqdm
import matplotlib.pyplot as plt             #visualisation
import seaborn as sns   #visualisation
from torch.utils.tensorboard import SummaryWriter
%matplotlib inline     
sns.set(color_codes=True)

In [30]:
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    raise Exception("Cry about it")


In [31]:
def add_cell_prefix(df, prefix):
    df.index = [prefix + i for i in df.index]
    return df

#Remove extra quotes from the filenames
def extract_cell_name_smartseq(x):
    y = x.split("_")
    return y[len(y)-2]

def get_cell_name_smartseq(file_name):
    return extract_cell_name_smartseq(file_name)

def convert_indexes_to_cell_names_smartseq(df):
    df.index = [get_cell_name_smartseq(x) for x in df.index]
    return df

def get_cell_hypo_or_norm_smartseq(df_meta, cell_name):
    return df_meta[df_meta["Cell name"]==cell_name]["Condition"].values[0]

def seperate_hypo_and_norm_smartseq(df, df_meta):
    df_hypo = df[df.index.map(lambda x: get_cell_hypo_or_norm_smartseq(df_meta, x)=="Hypo")]
    df_norm = df[df.index.map(lambda x: get_cell_hypo_or_norm_smartseq(df_meta, x)=="Norm" or get_cell_hypo_or_norm_smartseq(df_meta, x)=="Normo")]
    return df_hypo, df_norm

def process_df_smartseq(df, df_meta, prefix):
    df = convert_indexes_to_cell_names_smartseq(df)
    _, df_norm = seperate_hypo_and_norm_smartseq(df,df_meta)
    df = add_cell_prefix(df, prefix)
    df_norm = add_cell_prefix(df_norm, prefix)
    return df, df_norm.index

#Remove extra quotes from the filenames
def extract_cell_name_dropseq(x):
    y = x.split("_")
    return y[0]

def get_cell_name_dropseq(file_name):
    return extract_cell_name_dropseq(file_name)

def convert_indexes_to_cell_names_dropseq(df):
    df.index = [get_cell_name_dropseq(x) for x in df.index]
    return df

def get_cell_hypo_or_norm_dropseq(cell_name):
    return cell_name.split("_")[-1]

def seperate_hypo_and_norm_dropseq(df):
    df_hypo = df[df.index.map(lambda x: get_cell_hypo_or_norm_dropseq(x)=="Hypoxia")]
    df_norm = df[df.index.map(lambda x: get_cell_hypo_or_norm_dropseq(x)=="Normoxia")]
    return df_hypo, df_norm

def process_df_dropseq(df, prefix):
    _, df_norm = seperate_hypo_and_norm_dropseq(df)
    df = convert_indexes_to_cell_names_dropseq(df)
    df_norm = convert_indexes_to_cell_names_dropseq(df_norm)
    df = add_cell_prefix(df, prefix)
    df_norm = add_cell_prefix(df_norm, prefix)
    return df, df_norm.index

In [32]:
df_meta = pd.read_csv("Data/SmartSeq/MCF7_SmartS_MetaData.tsv",delimiter="\t", index_col=0)
dffn = pd.read_csv("Data/SmartSeq/MCF7_SmartS_Filtered_Normalised_3000_Data_train.txt",delimiter=" ",index_col=0).T
df2_meta = pd.read_csv("Data/SmartSeq/HCC1806_SmartS_MetaData.tsv",delimiter="\t",index_col=0)
df2fn = pd.read_csv("Data/SmartSeq/HCC1806_SmartS_Filtered_Normalised_3000_Data_train.txt",delimiter=" ",index_col=0).T
df3 = pd.read_csv("Data/DropSeq/MCF7_Filtered_Normalised_3000_Data_train.txt",delimiter=" ",index_col=0).T
df4 = pd.read_csv("Data/DropSeq/HCC1806_Filtered_Normalised_3000_Data_train.txt",delimiter=" ",index_col=0).T

KeyboardInterrupt: 

In [None]:
dffn, dffn_norm_idx = process_df_smartseq(dffn.copy(), df_meta, "MCF7_")
df2fn, df2fn_norm_idx = process_df_smartseq(df2fn.copy(), df2_meta, "HCC1806_")
df3, df3_norm_idx = process_df_dropseq(df3, "MCF7_")
df4, df4_norm_idx = process_df_dropseq(df4, "HCC1806_")

In [None]:
df3

Unnamed: 0,MALAT1,MT-RNR2,NEAT1,H1-5,TFF1,MT-RNR1,H4C3,GDF15,KRT81,MT-CO3,...,MROH1,SKIDA1,MICALL1,RARG,MYO1F,BRWD1-AS2,RPS19BP1,AUNIP,TNK2,SUDS3
MCF7_AAAAACCTATCG,1,0,0,0,4,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
MCF7_AAAACAACCCTA,3,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
MCF7_AAAACACTCTCA,3,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
MCF7_AAAACCAGGCAC,6,2,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
MCF7_AAAACCTAGCTC,4,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MCF7_TTTTCGCGTAGA,0,0,0,0,3,0,7,0,0,0,...,0,0,0,0,0,0,0,0,0,0
MCF7_TTTTCGTCCGCT,1,0,0,0,4,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
MCF7_TTTTCTCCGGCT,0,0,0,1,2,0,4,0,0,0,...,0,0,0,0,0,0,0,0,0,1
MCF7_TTTTGTTCAAAG,0,0,0,0,6,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
df_drop_norm_idx = np.concatenate([df3_norm_idx, df4_norm_idx])
df_smart_norm_idx = np.concatenate([dffn_norm_idx, df2fn_norm_idx])
df_smart_idx = np.concatenate([dffn.index, df2fn.index])

df_all = pd.concat([dffn, df2fn, df3, df4])
df_all = df_all.fillna(0)

df_all_norm_idx = np.concatenate([dffn_norm_idx, df2fn_norm_idx, df3_norm_idx, df4_norm_idx])

df_MCF7_idx = [idx for idx in df_all.index if "MCF7" in idx]


In [8]:
#df_all["mcf"] = ["MCF7" in idx for idx in df_all.index]
#df_all["smart"] = [idx in df_smart_idx for idx in df_all.index]

In [9]:
df_all = df_all.astype(np.int32)

In [10]:
# Free up memory
del dffn, df2fn, df3, df4, dffn_norm_idx, df2fn_norm_idx, df3_norm_idx, df4_norm_idx, df_meta, df2_meta

In [11]:
class NNDataset(Dataset):
    def __init__(self, df, df_norm_idx):
        self.data = df.values  # Convert DataFrame to numpy array
        self.data_norm = df_norm_idx
        self.idx = df.index
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        dat = self.data[index, :]
        x = torch.tensor(dat, dtype=torch.float32, device=device)
        y = torch.tensor(0 if self.idx[index] in self.data_norm else 1,  dtype=torch.float32, device=device)
        return x, y

# Define the architecture of the autoencoder
# Define the architecture of the autoencoder
class Autoencoder2(nn.Module):
    def __init__(self, shrink_sizes, shrink_step_count):
        super(Autoencoder2, self).__init__()

        encoderLayers = []

        for i in range(shrink_step_count):
            encoderLayers.append(nn.Linear(shrink_sizes[i], shrink_sizes[i + 1]))
            encoderLayers.append(nn.BatchNorm1d(shrink_sizes[i + 1]))
            encoderLayers.append(nn.LeakyReLU())
            encoderLayers.append(nn.Dropout(0.2))
        self.encoder = nn.Sequential(*encoderLayers)

        decoderLayers = []
        for i in range(shrink_step_count):
            decoderLayers.append(nn.Linear(shrink_sizes[shrink_step_count - i], shrink_sizes[shrink_step_count - i - 1]))
            decoderLayers.append(nn.LeakyReLU())
        self.decoder = nn.Sequential(*decoderLayers)

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

In [12]:
@torch.no_grad
def measure_model_log(writer, model, data_loader, epoch):
    correct = 0
    total = 0
    y_pred = []
    y_pred_class = []
    y_true = []
    for inputs, labels in data_loader:
        outputs = model(inputs)
        predicted = nn.Sigmoid()(outputs[:, 0])
        total += labels.size(0)
        correct += (predicted.round() == labels).sum().item()
        y_pred_class.extend(predicted.cpu().round().numpy()) # Save Prediction
        y_pred.extend(predicted.cpu().numpy()) # Save Prediction
        labels = labels.data.cpu().numpy()
        y_true.extend(labels) # Save Truth
    if(np.isnan(y_pred_class).any()):
        print("Found nan, skipping evaluation")
        return 
    f1 = f1_score(y_true, y_pred_class)
    recall = recall_score(y_true, y_pred_class)
    accuracy = (correct / total)
    writer.add_scalar("F1 Score", f1, epoch)
    writer.add_scalar("Recall", recall, epoch)
    writer.add_scalar("Accuracy", accuracy, epoch)
    print("F1 Score:", f1)
    print("Recall:", recall)
    print(f'Accuracy on test: {accuracy * 100}%')

    # constant for classes
    classes = ('Normoxia', 'Hypoxia')

    # Build confusion matrix
    cf_matrix = confusion_matrix(y_true, y_pred_class)
    df_cm = pd.DataFrame(cf_matrix / np.sum(cf_matrix, axis=1)[:, None], index = [i for i in classes],
                     columns = [i for i in classes])
    fig = plt.figure(figsize = (12,7))
    sns.heatmap(df_cm, annot=True)
    writer.add_figure("Confusion Matrix", fig, epoch)

    fig = plt.figure(figsize = (12,7))
    fpr, tpr, thresholds = roc_curve(y_true, y_pred)
    plt.plot(fpr, tpr) # ROC curve = TPR vs FPR
    plt.title("Receiver Operating Characteristics")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")

    writer.add_figure("ROC Curve", fig, epoch)


@torch.no_grad
def measure_model(model, data_loader):
    plt.close()
    correct = 0
    total = 0
    y_pred = []
    y_pred_class = []
    y_true = []
    for inputs, labels in data_loader:
        outputs = model(inputs)
        predicted = nn.Sigmoid()(outputs[:, 0])
        total += labels.size(0)
        correct += (predicted +0.5 > labels).sum().item()
        y_pred_class.extend(predicted.cpu().round()) # Save Prediction
        y_pred.extend(predicted.cpu()) # Save Prediction
        labels = labels.data.cpu().numpy()
        y_true.extend(labels) # Save Truth
    
    # constant for classes
    classes = ('Normoxia', 'Hypoxia')

    fpr, tpr, thresholds = roc_curve(y_true, y_pred)
    plt.plot(fpr, tpr) # ROC curve = TPR vs FPR
    plt.title("Receiver Operating Characteristics")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")

    # Build confusion matrix
    cf_matrix = confusion_matrix(y_true, y_pred_class)
    df_cm = pd.DataFrame(cf_matrix / np.sum(cf_matrix, axis=1)[:, None], index = [i for i in classes],
                     columns = [i for i in classes])
    plt.figure(figsize = (12,7))
    sns.heatmap(df_cm, annot=True)
    print("F1 Score:", f1_score(y_true, y_pred_class))
    print("Recall:", recall_score(y_true, y_pred_class))
    print(f'Accuracy of the network on the {total} test images: {(correct / total) * 100}%')


In [13]:
input_size = df_all.shape[1]
learning_rate = 1e-2

middle_size = 3
shrink_step_count = 6

shrink_poly_factor = 4

shrink_sizes = np.linspace(input_size**(1/shrink_poly_factor), middle_size**(1/shrink_poly_factor), shrink_step_count + 1)
print(shrink_sizes)
# We want integer sizes
shrink_sizes = np.round(shrink_sizes**shrink_poly_factor).astype(int)
print(shrink_sizes)

test_amount = 0.2

[9.64253557 8.25479197 6.86704838 5.47930479 4.0915612  2.70381761
 1.31607401]
[8645 4643 2224  901  280   53    3]


In [14]:
df_train, df_test = train_test_split(df_all, test_size=test_amount, stratify=df_all.index.isin(df_all_norm_idx))

In [16]:
#df_train_smart = df_train[df_train["smart"] == 1]
#df_train_drop = df_train[df_train["smart"] == 0]

dataset = NNDataset(df_train, df_all_norm_idx)
#dataset_drop = NNDataset(df_train_drop, df_all_norm_idx)
#dataset_smart = NNDataset(df_train_smart, df_all_norm_idx)

data_loader = DataLoader(dataset, batch_size=128, shuffle=True)
#data_loader_drop = DataLoader(dataset_drop, batch_size=128, shuffle=True)
#data_loader_smart = DataLoader(dataset_smart, batch_size=32, shuffle=True)

In [17]:
dataset_test = NNDataset(df_test, df_all_norm_idx)
test_loader = DataLoader(dataset_test, batch_size=64, shuffle=True)

df_test_smart = df_test[[i in df_smart_idx for i in df_test.index]]
df_test_drop = df_test[[i not in df_smart_idx for i in df_test.index]]

dataset_test_smart = NNDataset(df_test_smart, df_smart_norm_idx)
dataset_test_drop = NNDataset(df_test_drop, df_drop_norm_idx)

test_loader_smart = DataLoader(dataset_test_smart, batch_size=64, shuffle=True)
test_loader_drop = DataLoader(dataset_test_drop, batch_size=64, shuffle=True)

In [18]:
# Define the model
model = Autoencoder2(shrink_sizes, shrink_step_count).to(device)
writer = SummaryWriter()

In [19]:
# Define the loss function
criterion = nn.CrossEntropyLoss()


In [20]:
lr_sgd = 1e-2
lr_adm = 1e-3
lr_ada = 1e-2

# Define the optimizer
#sgd = optim.SGD(model.parameters(), lr=lr_sgd)
adam = optim.Adam(model.parameters(), lr=lr_adm)
#ada = optim.Adagrad(model.parameters(), lr=lr_ada)

  from .autonotebook import tqdm as notebook_tqdm


In [21]:
model

Autoencoder2(
  (encoder): Sequential(
    (0): Linear(in_features=8645, out_features=4643, bias=True)
    (1): BatchNorm1d(4643, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): LeakyReLU(negative_slope=0.01)
    (3): Dropout(p=0.2, inplace=False)
    (4): Linear(in_features=4643, out_features=2224, bias=True)
    (5): BatchNorm1d(2224, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): LeakyReLU(negative_slope=0.01)
    (7): Dropout(p=0.2, inplace=False)
    (8): Linear(in_features=2224, out_features=901, bias=True)
    (9): BatchNorm1d(901, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): LeakyReLU(negative_slope=0.01)
    (11): Dropout(p=0.2, inplace=False)
    (12): Linear(in_features=901, out_features=280, bias=True)
    (13): BatchNorm1d(280, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (14): LeakyReLU(negative_slope=0.01)
    (15): Dropout(p=0.2, inplace=False)
    (16): Linear(in_feat

In [22]:
train_config = [
#    [10, data_loader_smart, adam],
    [100, data_loader, adam]
]

In [23]:
total_epochs = 0
# Training loop
for segment in train_config:
    num_epochs = segment[0]
    data_loaderr = segment[1]
    optimizer = segment[2]
    for epoch in tqdm(range(num_epochs)):
        running_loss = 0.0
        for inputs, labels in data_loaderr:
            # Zero the parameter gradients
            optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs, inputs)
            
            # Backward pass and optimize
            loss.backward()

            # Clip gradients
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()

            # Print statistics
            running_loss += loss.item()

        writer.add_scalar("Loss/train", running_loss/len(data_loader), epoch)
        print(f"Epoch {epoch+1}, Loss: {running_loss/len(data_loader)}", flush=True)
        writer.flush()
        total_epochs += 1

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch 1, Loss: 37344.4945535411


  1%|          | 1/100 [00:24<41:10, 24.96s/it]

Epoch 2, Loss: 32114.529345968494


  2%|▏         | 2/100 [00:49<40:11, 24.61s/it]

Epoch 3, Loss: 31614.756907322095


  3%|▎         | 3/100 [01:13<39:32, 24.46s/it]

Epoch 4, Loss: 31364.73068343453


  4%|▍         | 4/100 [01:37<39:05, 24.43s/it]

Epoch 5, Loss: 31046.025224503228


  5%|▌         | 5/100 [02:02<38:42, 24.45s/it]

Epoch 6, Loss: 31216.121749214504


  6%|▌         | 6/100 [02:27<38:25, 24.52s/it]

Epoch 7, Loss: 30960.198238472316


  7%|▋         | 7/100 [02:51<38:04, 24.56s/it]

Epoch 8, Loss: 31427.950584079907


  8%|▊         | 8/100 [03:16<37:35, 24.52s/it]

Epoch 9, Loss: 31142.943841552733


  9%|▉         | 9/100 [03:40<37:16, 24.58s/it]

Epoch 10, Loss: 30870.537678859546


 10%|█         | 10/100 [04:05<36:55, 24.62s/it]

Epoch 11, Loss: 31262.874714461617


 11%|█         | 11/100 [04:30<36:32, 24.64s/it]

Epoch 12, Loss: 31076.99619167162


 12%|█▏        | 12/100 [04:54<36:08, 24.65s/it]

Epoch 13, Loss: 30939.023745860224


 13%|█▎        | 13/100 [05:19<35:47, 24.69s/it]

Epoch 14, Loss: 31289.32822292162


 14%|█▍        | 14/100 [05:44<35:19, 24.65s/it]

Epoch 15, Loss: 31246.55524265455


 15%|█▌        | 15/100 [06:08<34:46, 24.55s/it]

Epoch 16, Loss: 30877.62824707031


 16%|█▌        | 16/100 [06:33<34:19, 24.52s/it]

Epoch 17, Loss: 30987.63469132133


 17%|█▋        | 17/100 [06:57<33:53, 24.50s/it]

Epoch 18, Loss: 30798.71215050739


 18%|█▊        | 18/100 [07:41<35:00, 25.62s/it]


KeyboardInterrupt: 

In [26]:
outputs = []
cell_labels = []
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs.extend(model.encoder(inputs).cpu().numpy())
        cell_labels.extend(labels.cpu().numpy())

pred_df = pd.DataFrame(outputs)
pred_df["Condition"] = cell_labels

In [28]:
pred_df

Unnamed: 0,0,1,2,Condition
0,0.094960,0.245190,0.376383,1.0
1,0.853019,-0.002924,-0.001846,0.0
2,0.039274,0.340363,0.442429,0.0
3,0.949301,-0.002783,-0.001330,0.0
4,0.856668,-0.002833,-0.000638,0.0
...,...,...,...,...
7343,0.785825,-0.000831,0.133506,0.0
7344,0.000000,0.185379,0.297142,1.0
7345,0.285112,0.305510,0.000000,0.0
7346,0.532311,0.163768,0.000000,0.0


In [27]:
import plotly.express as px
fig = px.scatter_3d(pred_df, x=0, y=1, z=2, color="Condition")
fig.show()

In [32]:
torch.save(model.state_dict(), "autoenc.checkpoint")

In [389]:
import gc

model.cpu()
del model
gc.collect()
torch.cuda.empty_cache()