In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import nn, optim
from torch.autograd import Variable

import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
# import mlprepare as mlp 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import learning_curve,  GridSearchCV, cross_validate, KFold, cross_val_score
from sklearn import metrics
import scikitplot as skplt


# VAE augmentation using PYTORCH from 

In [13]:
class customLoss(nn.Module):
    def __init__(self):
        super(customLoss, self).__init__()
        self.mse_loss = nn.MSELoss(reduction="sum")
    
    def forward(self, x_recon, x, mu, logvar):
        loss_MSE = self.mse_loss(x_recon, x)
        loss_KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
        return loss_MSE + loss_KLD

class Autoencoder(nn.Module):
    def __init__(self,D_in=None,H=50,H2=12,latent_dim=3):

        if D_in==None:
            raise ValueError('You need to specify the Input shape.')
        
        #Encoder
        super(Autoencoder,self).__init__()
        self.linear1=nn.Linear(D_in,H)
        self.lin_bn1 = nn.BatchNorm1d(num_features=H)
        self.linear2=nn.Linear(H,H2)
        self.lin_bn2 = nn.BatchNorm1d(num_features=H2)
        self.linear3=nn.Linear(H2,H2)
        self.lin_bn3 = nn.BatchNorm1d(num_features=H2)
        
        # Latent vectors mu and sigma
        self.fc1 = nn.Linear(H2, latent_dim)
        self.bn1 = nn.BatchNorm1d(num_features=latent_dim)
        self.fc21 = nn.Linear(latent_dim, latent_dim)
        self.fc22 = nn.Linear(latent_dim, latent_dim)

        # Sampling vector
        self.fc3 = nn.Linear(latent_dim, latent_dim)
        self.fc_bn3 = nn.BatchNorm1d(latent_dim)
        self.fc4 = nn.Linear(latent_dim, H2)
        self.fc_bn4 = nn.BatchNorm1d(H2)
        
        # Decoder
        self.linear4=nn.Linear(H2,H2)
        self.lin_bn4 = nn.BatchNorm1d(num_features=H2)
        self.linear5=nn.Linear(H2,H)
        self.lin_bn5 = nn.BatchNorm1d(num_features=H)
        self.linear6=nn.Linear(H,D_in)
        self.lin_bn6 = nn.BatchNorm1d(num_features=D_in)
        
        self.relu = nn.ReLU()
        
    def encode(self, x):
        lin1 = self.relu(self.lin_bn1(self.linear1(x)))
        lin2 = self.relu(self.lin_bn2(self.linear2(lin1)))
        lin3 = self.relu(self.lin_bn3(self.linear3(lin2)))

        fc1 = F.relu(self.bn1(self.fc1(lin3)))

        r1 = self.fc21(fc1)
        r2 = self.fc22(fc1)
        
        return r1, r2
    
    def reparameterize(self, mu, logvar):
        if self.training:
            std = logvar.mul(0.5).exp_()
            eps = Variable(std.data.new(std.size()).normal_())
            return eps.mul(std).add_(mu)
        else:
            return mu
        
    def decode(self, z):
        fc3 = self.relu(self.fc_bn3(self.fc3(z)))
        fc4 = self.relu(self.fc_bn4(self.fc4(fc3)))

        lin4 = self.relu(self.lin_bn4(self.linear4(fc4)))
        lin5 = self.relu(self.lin_bn5(self.linear5(lin4)))
        return self.lin_bn6(self.linear6(lin5))
        
    def forward(self, x):
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        return self.decode(z), mu, logvar

class AutoencoderModel:
    def __init__(self,trainloader,testloader,device,D_in,H=50,H2=12,latent_dim=3):
        self.trainloader=trainloader
        self.testloader=testloader
        self.device=device
        self.D_in=D_in
        self.H=H
        self.H2=H2
        self.latent_dim=latent_dim
        self.model=Autoencoder(D_in, H, H2).to(self.device)
        self.optimizer=optim.Adam(self.model.parameters(), lr=1e-3)
        self.loss_mse = customLoss()
    
    def train_model(self,epoch, verbose, interval):
        train_losses = []
        self.model.train()
        train_loss = 0
        for _, data in enumerate(self.trainloader):
            data = data.to(self.device)
            self.optimizer.zero_grad()
            recon_batch, mu, logvar = self.model(data)
            loss = self.loss_mse(recon_batch, data, mu, logvar)
            loss.backward()
            train_loss += loss.item()
            self.optimizer.step()
        if verbose:
            if epoch % interval == 0:        
                print('====> Epoch: {} Average training loss: {:.4f}'.format(
                    epoch, train_loss / len(self.trainloader.dataset)))
                train_losses.append(train_loss / len(self.trainloader.dataset))

    def test_model(self, epoch, verbose, interval):
        test_losses = []
        with torch.no_grad():
            test_loss = 0
            for _, data in enumerate(self.testloader):
                data = data.to(self.device)
                self.optimizer.zero_grad()
                recon_batch, mu, logvar = self.model(data)
                loss = self.loss_mse(recon_batch, data, mu, logvar)
                test_loss += loss.item()
            if verbose:
                if epoch % interval == 0:        
                    print('====> Epoch: {} Average test loss: {:.4f}'.format(
                        epoch, test_loss / len(self.testloader.dataset)))
                test_losses.append(test_loss / len(self.testloader.dataset))

    def fit(self, epochs, verbose=True, interval=200):
        for epoch in range(1, epochs + 1):
            self.train_model(epoch, verbose, interval)
            self.test_model(epoch, verbose, interval)
        return self

    def predict(self, no_samples, target_class):
        with torch.no_grad():
            for batch_idx, data in enumerate(self.trainloader):
                data = data.to(self.device)
                self.optimizer.zero_grad()
                _, mu_, logvar_ = self.model(data)
                if batch_idx==0:
                    mu=mu_
                    logvar=logvar_
                else:
                    mu=torch.cat((mu, mu_), dim=0)
                    logvar=torch.cat((logvar, logvar_), dim=0)
        sigma = torch.exp(logvar/2)
        no_samples = no_samples
        q = torch.distributions.Normal(mu.mean(axis=0), sigma.mean(axis=0))
        z = q.rsample(sample_shape=torch.Size([no_samples]))
        with torch.no_grad():
            pred = self.model.decode(z).cpu().numpy()
        df_fake = pd.DataFrame(pred)
        df_fake['Class']=target_class
        return df_fake

In [15]:
y_18MinION= np.load("/home/alemsara/DirectRNA/datapsUAll/y_18MinION.npy")
X_18MinION_= pd.read_pickle("/home/alemsara/DirectRNA/datapsUAll/X_18MinION.pkl")
X_18MinION_ = X_18MinION_.fillna(0)
X_18MinION_['Level'] = X_18MinION_['Level'].replace('ND',0)
X_18MinION = X_18MinION_.drop(columns = ['Ref' , 'Pos','Base','ModStatus','Level','Cov1','Cov2','Min_cov'
                                           , 'A1', 'C1', 'G1', 'T1', 'A2', 'C2', 'G2', 'T2'])
X_18MinION = X_18MinION.fillna(0)

y_18flongle= np.load("/home/alemsara/DirectRNA/datapsUAll/y_18flongle.npy")
X_18flongle_ = pd.read_pickle("/home/alemsara/DirectRNA/datapsUAll/X_18flongle.pkl")
X_18flongle_ = X_18flongle_.fillna(0)
X_18flongle_['Level'] = X_18flongle_['Level'].replace('ND',0)
X_18flongle = X_18flongle_.drop(columns = ['Ref' , 'Pos','Base','ModStatus','Level','Cov1','Cov2','Min_cov'
                                           , 'A1', 'C1', 'G1', 'T1', 'A2', 'C2', 'G2', 'T2'])
X_18flongle = X_18flongle.fillna(0)

y_28flongle= np.load("/home/alemsara/DirectRNA/datapsUAll/y_28flongle.npy")
X_28flongle_ = pd.read_pickle("/home/alemsara/DirectRNA/datapsUAll/X_28flongle.pkl")
X_28flongle_ = X_28flongle_.fillna(0)
X_28flongle_['Level'] = X_28flongle_['Level'].replace('ND',0)
X_28flongle = X_28flongle_.drop(columns = ['Ref' , 'Pos','Base','ModStatus','Level','Cov1','Cov2','Min_cov'
                                           , 'A1', 'C1', 'G1', 'T1', 'A2', 'C2', 'G2', 'T2'])
X_28flongle = X_28flongle.fillna(0)

y_28MinION= np.load("/home/alemsara/DirectRNA/datapsUAll/y_28MinIONv1.npy")
X_28MinION_= pd.read_pickle("/home/alemsara/DirectRNA/datapsUAll/X_28MinION.pkl")
X_28MinION_ = X_28MinION_.fillna(0)
X_28MinION_['Level'] = X_28MinION_['Level'].replace('ND',0)
X_28MinION = X_28MinION_.drop(columns = ['Ref' , 'Pos','Base','ModStatus','Level','Cov1','Cov2','Min_cov'
                                           , 'A1', 'C1', 'G1', 'T1', 'A2', 'C2', 'G2', 'T2'])
X_28MinION = X_28MinION.fillna(0)


X_train= pd.concat([X_18MinION,X_28MinION])
X_test = pd.concat([X_18flongle,X_28flongle])
y_train = np.concatenate((y_18MinION,y_28MinION))
y_test = np.concatenate((y_18flongle,y_28flongle))

In [18]:
X_train= X_18MinION
X_test = X_18flongle
y_train = y_18MinION
y_test = y_18flongle
X_train_fraud = X_train.iloc[np.where(y_train==1)[0]]
X_test_fraud = X_test.iloc[np.where(y_test==1)[0]]

In [19]:
X_18MinION.shape

(396, 15)

In [20]:
from torch.utils.data import Dataset, DataLoader
class DataBuilder(Dataset):
    def __init__(self, dataset):
        self.x = dataset.values
        self.x = torch.from_numpy(self.x).to(torch.float)
        self.len=self.x.shape[0]
    def __getitem__(self,index):      
        return self.x[index]
    def __len__(self):
        return self.len

traindata_set=DataBuilder(X_train_fraud)
testdata_set=DataBuilder(X_test_fraud)

trainloader=DataLoader(dataset=traindata_set,batch_size=1024)
testloader=DataLoader(dataset=testdata_set,batch_size=1024)



In [21]:


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
D_in = traindata_set.x.shape[1]
H = 50
H2 = 12
autoenc_model = AutoencoderModel(trainloader, testloader, device, D_in, H, H2, latent_dim=3)



In [26]:
autoenc_model_fit = autoenc_model.fit(12000)

====> Epoch: 200 Average training loss: 130.7960
====> Epoch: 200 Average test loss: 1106.8752
====> Epoch: 400 Average training loss: 124.6550
====> Epoch: 400 Average test loss: 1120.8710
====> Epoch: 600 Average training loss: 118.4296
====> Epoch: 600 Average test loss: 1138.8858
====> Epoch: 800 Average training loss: 112.9318
====> Epoch: 800 Average test loss: 1132.6501
====> Epoch: 1000 Average training loss: 107.2976
====> Epoch: 1000 Average test loss: 1158.1762
====> Epoch: 1200 Average training loss: 102.4158
====> Epoch: 1200 Average test loss: 1162.9567
====> Epoch: 1400 Average training loss: 96.9689
====> Epoch: 1400 Average test loss: 1172.5975
====> Epoch: 1600 Average training loss: 92.2373
====> Epoch: 1600 Average test loss: 1183.2899
====> Epoch: 1800 Average training loss: 87.7009
====> Epoch: 1800 Average test loss: 1190.5964
====> Epoch: 2000 Average training loss: 83.8914
====> Epoch: 2000 Average test loss: 1209.1267
====> Epoch: 2200 Average training loss: 7

In [27]:
pd.DataFrame(y_train).value_counts()

0    354
1     42
dtype: int64

In [28]:
df_fake = autoenc_model_fit.predict(no_samples=42,target_class=1)

In [29]:
df_fake.columns.values[:-1] = X_train.columns
df_fake['Class'] = np.round(df_fake['Class']).astype(int)


In [30]:
X_train_augmented = X_train.append(df_fake.iloc[:,:-1]).reset_index(drop=True)
X_train_augmented.head()
X_train_augmented=X_train_augmented.fillna(0)
y_train_augmented = np.append(y_train,df_fake.iloc[:,-1])


In [31]:
X_train_augmented.to_pickle("/home/alemsara/DirectRNA/datapsUAll/X_train_augmented.pkl")
np.save("/home/alemsara/DirectRNA/datapsUAll/y_train_augmented", y_train_augmented)
# X_train.to_pickle("/home/alemsara/DirectRNA/datapsUAll/X_train.pkl")
# np.save("/home/alemsara/DirectRNA/datapsUAll/y_train", y_train)
# X_test.to_pickle("/home/alemsara/DirectRNA/datapsUAll/X_test.pkl")
# np.save("/home/alemsara/DirectRNA/datapsUAll/y_test", y_test)
# X_test_.to_pickle("/home/alemsara/DirectRNA/datapsUAll/X_test_.pkl")
# X_train_.to_pickle("/home/alemsara/DirectRNA/datapsUAll/X_train_.pkl")
