In [1]:
from __future__ import print_function
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
#from tqdm import tqdm
import sklearn.preprocessing
from torch.utils.data import Dataset
# Notebook configuration.
# `online` selects where the competition CSVs are read from: the Kaggle input
# mount when True, a local directory otherwise.
# (sklearn.preprocessing and Dataset are already imported above; the repeated
# imports that used to sit here were redundant.)
online=False
root_dir="/kaggle/input/lish-moa/" if online else "/data/lish-moa/"

# Row count passed as `validation_split` when the 'train' split should span
# the first 23814 rows of the training CSVs.
n_train=23814

In [54]:

        
        
    
class MoADataset(Dataset):
    '''
    Dataset over the MoA CSV files stored under ``root_dir``.

    Nice reference : https://pytorch.org/docs/stable/_modules/torchvision/datasets/mnist.html#MNIST

    Parameters
    ----------
    root_dir : str
        Prefix that is concatenated directly with the CSV file names, so it
        has to end with a path separator.
    form : str
        'train'      -> rows [:validation_split] of train_features.csv and
                        train_targets_scored.csv
        'validation' -> rows [validation_split:] of the same two files
        anything else -> test_features.csv, without targets
    transform : callable or None
        Applied to the float32 feature vector of each item when truthy.
    validation_split : int
        Row position separating the 'train' rows from the 'validation' rows.

    Attributes
    ----------
    features : DataFrame with 'cp_type'/'cp_dose' ordinal-encoded.
    targets  : DataFrame of targets, or None when no targets were loaded.
    weights_dict : {row position: 1}; only set when targets were loaded.
    '''

    def __init__(self, root_dir, form='train',transform=None,validation_split=20000):
        self.form=form
        self.root_dir = root_dir
        self.transform = transform

        if form in ('train','validation'):
            if form=='train':
                rows=slice(None,validation_split)
            else:
                rows=slice(validation_split,None)
            # Encode the complete frame first and slice afterwards: fitting the
            # encoder on a slice mutated an .iloc slice (SettingWithCopyWarning)
            # and could give 'train' and 'validation' different codes whenever
            # one of them lacked a category.
            encoded=MoADataset.preprocess_features(pd.read_csv(root_dir+"train_features.csv"))
            self.features=encoded.iloc[rows]
            self.targets=pd.read_csv(root_dir+"train_targets_scored.csv").iloc[rows]
            self.weights_dict={i:1 for i in range(len(self.targets))}
        else:
            # NOTE(review): the test file gets its own encoder fit; the codes
            # match the training ones only if both files contain the same
            # categories - confirm.
            self.features=MoADataset.preprocess_features(pd.read_csv(root_dir+"test_features.csv"))
            self.targets=None

    def __len__(self):
        '''Number of rows in this split.'''
        return len(self.features)

    def __getitem__(self, idx):
        '''
        Return ``(features, targets, idx)`` when targets were loaded, otherwise
        just ``features``. The first column of each frame is skipped and the
        remaining values are converted to float32 arrays.
        '''
        if torch.is_tensor(idx):
            idx = idx.tolist()

        features=np.array(self.features.iloc[idx,1:]).astype(np.float32)
        if self.transform:
            features = self.transform(features)

        # Decide on the presence of targets rather than on the literal string
        # 'test': any form value that was routed to the test branch in
        # __init__ has targets=None and must not try to index them.
        if self.targets is None:
            return features

        targets=np.array(self.targets.iloc[idx,1:]).astype(np.float32)
        return features,targets,idx

    @staticmethod
    def preprocess_features(df):
        '''
        Ordinal-encode the 'cp_type' and 'cp_dose' columns of ``df``.

        The frame is modified in place and also returned. The encoder is
        fitted on the frame it is given.
        '''
        categorical=['cp_type','cp_dose']
        enc=sklearn.preprocessing.OrdinalEncoder()
        df[categorical]=enc.fit_transform(df[categorical])
        return df
    
# torchvision transform pipeline; none of the datasets below receive it
# (they all run with the default transform=None).
transform=transforms.Compose([transforms.ToTensor()])

# 'train' split widened to the first n_train rows; batches of 100, the
# trailing partial batch is dropped.
training_full_data=MoADataset(root_dir="data/lish-moa/",form='train',validation_split=n_train)
train_full_loader=torch.utils.data.DataLoader(
    dataset=training_full_data,
    batch_size=100,
    drop_last=True,
)

# 'train' split with the default validation_split; same batching.
training_data=MoADataset(root_dir="data/lish-moa/",form='train')
train_loader=torch.utils.data.DataLoader(
    dataset=training_data,
    batch_size=100,
    drop_last=True,
)

# 'validation' split served as one batch holding every row.
validation_data=MoADataset(root_dir="data/lish-moa/",form='validation')
validation_loader=torch.utils.data.DataLoader(
    dataset=validation_data,
    batch_size=len(validation_data),
    drop_last=True,
)

# Test features served as one batch holding every row.
test_data=MoADataset(root_dir="data/lish-moa/",form='test')
test_loader=torch.utils.data.DataLoader(
    dataset=test_data,
    batch_size=len(test_data),
)

   

In [106]:
# some code from https://github.com/pytorch/examples/blob/master/mnist/main.py

# Column positions inside one feature vector, grouped the way Net slices them.
cat_inds=list(range(3))
g_inds=list(range(3,775))
c_inds=list(range(775,875))

# Positions of the target columns; its length sets the width of Net's output.
t_inds=list(range(206))

class Net(nn.Module):
    '''
    Fully connected network with one branch per feature group.

    Input column layout (second axis of x):
    0       - cp_type
    1       - cp_time
    2       - cp_dose
    3-774   - g-s
    775-874 - c-s

    The g-s and c-s columns each pass through their own stack of Linear+ReLU
    layers down to 20 values. Those are concatenated with the three raw
    categorical columns and fed to a final stack that emits one value per
    entry of t_inds. Outputs go through a sigmoid rescaled to stay within
    0.0005 .. 0.9995.
    '''
    def __init__(self):
        super().__init__()

        cat_width=3
        g_width=20
        c_width=20
        g_dims=[len(g_inds),400,200,50,50,g_width]
        c_dims=[len(c_inds),50,50,50,50,c_width]
        head_dims=[cat_width+g_width+c_width,80,40,len(t_inds)]

        # Layers are created g -> c -> end, so parameter initialisation draws
        # from the RNG in a fixed order.
        self.g_layers=nn.ModuleList(
            [nn.Linear(n_in,n_out) for n_in,n_out in zip(g_dims[:-1],g_dims[1:])]
        )
        self.c_layers=nn.ModuleList(
            [nn.Linear(n_in,n_out) for n_in,n_out in zip(c_dims[:-1],c_dims[1:])]
        )
        self.end_layers=nn.ModuleList(
            [nn.Linear(n_in,n_out) for n_in,n_out in zip(head_dims[:-1],head_dims[1:])]
        )

    def forward(self,x):
        '''Map a 2-D batch x (rows = samples) to per-target values.'''
        cat=x[:,cat_inds]
        g=x[:,g_inds]
        c=x[:,c_inds]

        for g_layer in self.g_layers:
            g=torch.relu(g_layer(g))

        for c_layer in self.c_layers:
            c=torch.relu(c_layer(c))

        merged=torch.cat([cat,g,c],dim=1)
        *hidden_layers,output_layer=self.end_layers
        for hidden in hidden_layers:
            merged=torch.relu(hidden(merged))

        # Rescaled sigmoid keeps log(output) and log(1-output) finite.
        return torch.sigmoid(output_layer(merged))*0.999+0.0005
    

class LSTM_FE(nn.Module):
    '''
    Feed-forward stack whose layer widths come from ``linear_dims``.

    Parameters
    ----------
    linear_dims : sequence of int, optional
        Consecutive pairs become ``nn.Linear(linear_dims[i], linear_dims[i+1])``.
        With fewer than two entries no layer is built.
    lstm_dims : sequence of int, optional
        Accepted for interface compatibility; nothing reads it.

    NOTE(review): despite the class name no recurrent layer is built here -
    confirm what ``lstm_dims`` is meant to configure.
    '''
    def __init__(self,linear_dims=None,lstm_dims=None):
        # super() must refer to this class; naming Net here raised TypeError
        # as soon as the class was instantiated.
        super().__init__()

        dims=list(linear_dims) if linear_dims is not None else []

        # Only the layers described by linear_dims are created. The previous
        # loop over `end_dims` referred to a name that is not defined in this
        # class, and its plain Python list would not have registered the
        # layers as parameters anyway.
        self.linear_layers=nn.ModuleList(
            [nn.Linear(n_in,n_out) for n_in,n_out in zip(dims[:-1],dims[1:])]
        )

    def forward(self,x):
        '''
        Apply the linear layers with ReLU between them (no ReLU after the last
        one), then the same rescaled sigmoid used by Net. With no layers the
        input itself goes through the rescaled sigmoid.
        '''
        last=len(self.linear_layers)-1
        for position,layer in enumerate(self.linear_layers):
            x=layer(x)
            if position<last:
                x=torch.relu(x)
        return torch.sigmoid(x)*0.999+0.0005
    


        
def train(model, device, train_loader, optimizer, epoch):
    '''
    Run one optimisation pass over ``train_loader``.

    Each batch is a ``(data, target, idx)`` triple. The loss is the binary
    cross-entropy averaged over every element of ``target``; it assumes the
    model output lies strictly between 0 and 1. A tqdm bar shows the epoch,
    the batch number, the running mean loss and the current batch loss.
    ``epoch`` is only used for that label. Returns nothing.
    '''
    model.train()
    running_mean=0
    progress=tqdm(enumerate(train_loader),position=0,leave=True,total=len(train_loader))
    for step,(data,target,idx) in progress:
        data=data.to(device)
        target=target.to(device)
        idx=idx.to(device)  # moved alongside the batch; not used below

        optimizer.zero_grad()
        output=model(data)

        # Element-wise BCE, summed and divided by the number of target cells.
        cell_losses=-(torch.log(output)*target+torch.log(1-output)*(1-target))
        loss=torch.sum(cell_losses)/(target.numel())

        loss.backward()
        optimizer.step()

        batch_loss=loss.item()
        # Incremental mean over the batches seen so far in this pass.
        running_mean=step/(step+1)*running_mean+1/(step+1)*batch_loss
        progress.set_description_str(f"Epoch:{epoch},Iteration:{step} ")
        progress.set_postfix_str(f"Epoch loss: {running_mean},Loss: {batch_loss}")
        
def validation(model, device, validation_loader, optimizer):
    '''
    Score ``model`` on every batch of ``validation_loader``.

    The loss is the binary cross-entropy summed over all batches and divided
    by the total number of target elements, so a loader that serves the data
    in several batches gives the same figure as one that serves it in a single
    batch. (Previously only the first batch was scored.)

    Parameters
    ----------
    model : module whose output lies strictly between 0 and 1.
    device : device the batches are moved to.
    validation_loader : iterable of ``(data, target, idx)`` triples.
    optimizer : unused; kept so existing call sites keep working.

    Returns
    -------
    float : the mean loss, which is also printed.

    Raises
    ------
    ValueError : if the loader yields no batches.
    '''
    model.eval()

    loss_sum=0.0
    n_cells=0
    n_samples=0
    with torch.no_grad():
        for data,target,_ in validation_loader:
            data,target=data.to(device),target.to(device)
            output=model(data)
            # Kept as a tensor so the single-batch result is numerically the
            # same as the original sum/numel computation.
            loss_sum=loss_sum+torch.sum(-torch.log(output)*target-torch.log(1-output)*(1-target))
            n_cells+=target.numel()
            n_samples+=target.shape[0]

    if n_cells==0:
        raise ValueError("validation_loader yielded no data")

    loss=(loss_sum/n_cells).item()
    print(f"Validated {n_samples} samples with a loss of {loss}")
    return loss
        
def test_output(model, device, test_loader):
    
    model.eval()
    with torch.no_grad():
        data=test_loader.__iter__().__next__()
        data.to(device)
        output=model(data)
        return output



def test_output_save(model, device, test_loader,fname='sub.CSV'):
    output=test_output(model,device,test_loader)

    sig_id_df=pd.read_csv(root_dir+"test_features.csv")['sig_id']
    main_df=pd.DataFrame(np.array(output).astype(str),
                 columns=pd.read_csv(root_dir+"train_targets_scored.csv").columns[1:])
    
    header=np.array(pd.read_csv(root_dir+"train_targets_scored.csv").columns,dtype=str)
    header=header.reshape(1,header.shape[0])
    
    data=np.concatenate((sig_id_df.values.reshape(sig_id_df.array.shape[0],1),main_df.values),axis=1).astype(str)
    
    whole=np.concatenate([header,data],axis=0)
    np.savetxt(fname,whole,fmt="%s",delimiter=',')
    return whole
    
    


In [107]:
# Write the predictions with test_output_save's default file name; the
# previous `fname=` with no value was a syntax error.
# NOTE(review): this cell sits above the cell that creates `model` and
# `device`, so on a fresh kernel it has to run after training.
test_output_save(model,device,test_loader)

array([['sig_id', '5-alpha_reductase_inhibitor',
        '11-beta-hsd1_inhibitor', ..., 'vitamin_b',
        'vitamin_d_receptor_agonist', 'wnt_inhibitor'],
       ['id_0004d9e33', '0.00094318355', '0.0016934909', ...,
        '0.0017184615', '0.0060428977', '0.00407704'],
       ['id_001897cda', '0.00070032815', '0.0005325907', ...,
        '0.0006041755', '0.0041928813', '0.0005038914'],
       ...,
       ['id_ffb710450', '0.0005976581', '0.00086395093', ...,
        '0.0005422598', '0.0011519161', '0.0015247206'],
       ['id_ffbb869f2', '0.0005612652', '0.0006645365', ...,
        '0.00051207165', '0.00075335894', '0.0010716119'],
       ['id_ffd5800b6', '0.0007048904', '0.0013147753', ...,
        '0.00071444316', '0.0025332312', '0.0023975582']], dtype='<U47')

In [42]:

# Fresh network trained on the CPU with Adam. StepLR multiplies the learning
# rate by 0.8 after every 100 calls to scheduler.step().
device=torch.device("cpu")
model=Net()
optimizer=optim.Adam(model.parameters(),lr=1e-3)
scheduler=StepLR(optimizer,step_size=100,gamma=0.8)

epochs=6
# NOTE(review): train_full_loader is built with validation_split=n_train, so
# it appears to contain the rows that validation_loader scores as well -
# confirm whether the reported validation loss is meant to be held-out.
for epoch in range(1,epochs+1):
    train(model,device,train_full_loader,optimizer,epoch)
    validation(model,device,validation_loader,optimizer)
    scheduler.step()  # one scheduler step per epoch
    


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


Validated 3814 samples with a loss of 0.023953478783369064


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


Validated 3814 samples with a loss of 0.023279456421732903


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


Validated 3814 samples with a loss of 0.021333934739232063


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


Validated 3814 samples with a loss of 0.01984424516558647


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


Validated 3814 samples with a loss of 0.01927703060209751


In [44]:
# One more pass over train_loader followed by a validation pass.
# epoch is always 0 here; train only uses it for the progress-bar label.
for epoch in range(1):
    train(model,device,train_loader,optimizer,epoch=epoch)
    validation(model,device,validation_loader,optimizer=optimizer)
    scheduler.step()  # one scheduler step per epoch

HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


Validated 3814 samples with a loss of 0.018615854904055595


In [45]:
# Another single pass over train_loader, then validation.
# epoch is always 0 here; train only uses it for the progress-bar label.
for epoch in range(1):
    train(model,device,train_loader,optimizer,epoch=epoch)
    validation(model,device,validation_loader,optimizer=optimizer)
    scheduler.step()  # one scheduler step per epoch

HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


Validated 3814 samples with a loss of 0.018321024253964424


In [46]:
# A further single pass over train_loader, then validation.
# epoch is always 0 here; train only uses it for the progress-bar label.
for epoch in range(1):
    train(model,device,train_loader,optimizer,epoch=epoch)
    validation(model,device,validation_loader,optimizer=optimizer)
    scheduler.step()  # one scheduler step per epoch

HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


Validated 3814 samples with a loss of 0.018139513209462166
