In [1]:
from google.colab import drive

# Mount Google Drive at /content/drive so files stored there can be accessed
# through the local filesystem path.
drive.mount('/content/drive')

Mounted at /content/drive


In [49]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, TensorDataset, DataLoader

from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
from tqdm import tqdm

In [50]:
# Device that evaluation batches are moved to before the forward pass.
device = 'cpu'
def model_eval(model, data_loader, return_preds = False):
  """Run `model` over every batch of `data_loader` without tracking gradients.

  The model is switched to evaluation mode and is left in that mode when the
  function returns.

  Args:
    model: callable module; invoked as `model(inputs)` for each batch.
    data_loader: iterable yielding `(inputs, labels)` tensor pairs.
    return_preds: when True, return the raw arrays instead of a score.

  Returns:
    If `return_preds` is True, a dict with NumPy arrays under the keys
    'y_true' and 'y_preds' (all batches concatenated in iteration order).
    Otherwise the ROC AUC computed by `roc_auc_score(y_true, y_preds)`.
  """
  model.eval()
  pred_batches, label_batches = [], []
  with torch.no_grad():
    for batch_inputs, batch_labels in data_loader:
      batch_inputs = batch_inputs.to(device)
      batch_labels = batch_labels.to(device)
      pred_batches.append(model(batch_inputs))
      label_batches.append(batch_labels)

  # Stitch the per-batch tensors together and hand them over as NumPy arrays.
  y_preds = torch.cat(pred_batches).cpu().detach().numpy()
  y_true = torch.cat(label_batches).cpu().detach().numpy()

  if not return_preds:
    return roc_auc_score(y_true, y_preds)
  return {'y_true': y_true, 'y_preds': y_preds}

In [42]:
################################
#### Clean and Process Data ####
################################

# File names of the two annotated CSV inputs.
# NOTE(review): these are relative paths, so they resolve against the current
# working directory rather than the /content/drive mount point — confirm the
# files are reachable from where the notebook runs.
data_trost = 'TRs_annotated_cleaned.csv'
data_mistra = 'Mistra_TRs_annotated_cleaned.csv'

# Load each CSV into its own DataFrame.
df_trost = pd.read_csv(filepath_or_buffer=data_trost)
df_mistra = pd.read_csv(filepath_or_buffer=data_mistra)

In [18]:
col_names = df_trost.columns

# Split the columns by dtype *before* anything is dropped: object-dtype columns
# are treated as categorical, everything else as numerical. On a first run the
# categorical list therefore still names the columns removed just below.
categorical = [var for var in df_trost.columns if df_trost[var].dtype=='O']
numerical = [var for var in df_trost.columns if df_trost[var].dtype!='O']

# Remove the columns that are not used further from both frames.
# Rebinding with errors='ignore' (instead of inplace=True) keeps this cell safe
# to re-run: a second execution no longer raises KeyError for columns that are
# already gone.
dropped_columns = ['id', 'location', 'region', 'tissue_simple']
df_trost = df_trost.drop(columns=dropped_columns, errors='ignore')
df_mistra = df_mistra.drop(columns=dropped_columns, errors='ignore')

In [48]:
# Display the names of the columns that were classified as object dtype.
categorical

['id', 'location', 'region', 'gene_type', 'tissue_simple']

In [20]:
# Create a ColumnTransformer to apply different transformations to numeric,
# binary and categorical columns:
#   - numeric columns are standardized,
#   - binary columns are passed through unchanged,
#   - categorical columns are one-hot encoded.
# NOTE(review): numeric_columns, binary_columns and categorical_columns are not
# defined in any visible cell — confirm they exist on a fresh kernel.
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', StandardScaler(), numeric_columns),
        ('binary', 'passthrough', binary_columns),
        # Must be an estimator *instance*: scikit-learn cannot clone/fit the
        # bare OneHotEncoder class.
        ('categorical', OneHotEncoder(), categorical_columns)
    ])

# Define the pipeline with the preprocessing steps
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

In [23]:
# Fit and transform the training data
# NOTE(review): X is not defined in any visible cell — confirm where it is built.
X_trost = pipeline.fit_transform(X)

# ColumnTransformer can return a scipy sparse matrix when sparse transformer
# output (e.g. one-hot columns) dominates; densify so the DataFrame below holds
# plain values.
if hasattr(X_trost, 'toarray'):
    X_trost = X_trost.toarray()

# Convert the processed data back to DataFrame (optional).
# Column labels come from the fitted pipeline, so one-hot encoded columns are
# included and the label count always matches the array width. The default
# '<transformer>__' prefix is stripped so numeric/binary columns keep their
# original names.
feature_names = [name.split('__', 1)[-1] for name in pipeline.get_feature_names_out()]
X_trost = pd.DataFrame(X_trost, columns=feature_names)

In [26]:
# Check the (rows, columns) dimensions of the processed feature frame.
X_trost.shape

(37042, 56)

In [41]:
# Exclude 'label' (presumably the prediction target — confirm) from the list of
# numerical feature columns. The membership check makes the cell idempotent:
# re-running it no longer raises ValueError once 'label' has been removed.
if 'label' in numerical:
    numerical.remove('label')

In [43]:
# Build the Mistra feature frame: numerical columns plus one-hot encoded gene_type.
# The same columns are also dropped from df_mistra in an earlier cell, so a
# strict in-place drop here raises KeyError on a top-to-bottom run;
# errors='ignore' makes this a no-op when they are already gone.
df_mistra = df_mistra.drop(columns=['id', 'location', 'region', 'tissue_simple'], errors='ignore')

# NOTE(review): get_dummies only creates columns for gene types that actually
# occur in df_mistra — confirm the result lines up with the training features.
one_hot_gene_type = pd.get_dummies(df_mistra.gene_type, prefix="gene_type", drop_first=True, dtype=int)

# Combine the features and remove the 'gene_type_intergenic' indicator without
# mutating an intermediate frame in place (still raises KeyError if the column
# is absent, as before).
new_df = pd.concat([df_mistra[numerical], one_hot_gene_type], axis=1).drop(columns=['gene_type_intergenic'])

In [47]:
# Check the (rows, columns) dimensions of the assembled Mistra feature frame.
new_df.shape

(34, 50)

In [None]:
class MLP(nn.Module):
    """Feed-forward network with three hidden stages and one sigmoid output.

    Each hidden stage applies, in order: a bias-free linear layer, ReLU,
    batch normalization and dropout. Hidden widths are 500, 1000 and 500.
    The first layer is lazy, so its input width is inferred from the first
    batch passed through the network.

    Args:
        dropout: drop probability shared by every dropout application.
    """

    def __init__(self, dropout):
        super().__init__()

        # A single Dropout module is reused after every hidden stage.
        self.dropout = nn.Dropout(p=dropout)

        # Stage 1: input width is resolved on the first forward pass.
        self.fc1 = nn.LazyLinear(500, bias=False)
        self.bn1 = nn.BatchNorm1d(500)

        # Stage 2
        self.fc2 = nn.Linear(500, 1000, bias=False)
        self.bn2 = nn.BatchNorm1d(1000)

        # Stage 3
        self.fc3 = nn.Linear(1000, 500, bias=False)
        self.bn3 = nn.BatchNorm1d(500)

        # Output head: one unit per sample.
        self.fc4 = nn.Linear(500, 1)

    def forward(self, x):
        """Return one sigmoid-activated value per row of `x`.

        The trailing singleton dimension of the output layer is squeezed away,
        so the result has one entry per input row.
        """
        hidden_stages = (
            (self.fc1, self.bn1),
            (self.fc2, self.bn2),
            (self.fc3, self.bn3),
        )
        for linear, norm in hidden_stages:
            # Order matters: linear -> ReLU -> batch norm -> dropout.
            x = self.dropout(norm(F.relu(linear(x))))

        return F.sigmoid(self.fc4(x)).squeeze(1)


In [None]:
# Seed PyTorch's global RNG before constructing the network.
torch.manual_seed(1)

# Instantiate the MLP with a dropout probability of 0.5.
model = MLP(dropout=0.5)

# Display the module summary.
model



MLP(
  (dropout): Dropout(p=0.5, inplace=False)
  (fc1): LazyLinear(in_features=0, out_features=500, bias=False)
  (bn1): BatchNorm1d(500, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc2): Linear(in_features=500, out_features=1000, bias=False)
  (bn2): BatchNorm1d(1000, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc3): Linear(in_features=1000, out_features=500, bias=False)
  (bn3): BatchNorm1d(500, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc4): Linear(in_features=500, out_features=1, bias=True)
)