## Load packages

In [2]:
import pandas as pd
import numpy as np
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

from sklearn import metrics
from joblib import dump, load

## Set seed

In [3]:
# One seed shared by NumPy and PyTorch so reruns draw the same random numbers.
random_seed = 8022022  # arbitrary value; any integer works

np.random.seed(random_seed)
torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)

# cuDNN: request deterministic algorithms and switch off the benchmark auto-tuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

## Setting outcome

In [4]:
# Name of the outcome being predicted. It is not read again in the visible cells;
# test_metrics writes its predicted labels under the same literal column name.
outcome = "was_preterm"

## Loading data

In [5]:
# Per-specimen metadata, reduced to the identifier/covariate columns used below
# and ordered by specimen id.
metadata_feature = ['participant_id','project','collect_wk']
metadata = pd.read_csv('./input/metadata/metadata.csv', index_col='specimen')
metadata_selected = metadata.loc[:, metadata_feature].sort_index()

In [6]:
# Community state type label of each specimen, ordered by specimen id.
CST = (
    pd.read_csv('./input/community_state_types/cst_valencia.csv', index_col='specimen')
    .loc[:, 'CST']
    .sort_index()
)

In [7]:
# Alpha-diversity table indexed by specimen, ordered by specimen id.
alpha = pd.read_csv(
    './input/alpha_diversity/alpha_diversity.csv',
    sep=',',
    index_col='specimen',
).sort_index()

In [8]:
# Family-level relative abundances, restricted to the families listed in the
# feature-selection file and ordered by specimen id.
taxonomy = pd.read_csv('./input/taxonomy/taxonomy_relabd.family.csv', index_col='specimen')
tax_feature = pd.read_csv('./selected_feature/tax_family_preterm.csv')['feature_selected'].tolist()
taxonomy_selected = taxonomy.loc[:, tax_feature].sort_index()

In [9]:
# Phylotype relative abundances, restricted to the phylotypes listed in the
# feature-selection file and ordered by specimen id.
phylotype = pd.read_csv('./input/phylotypes/phylotype_relabd.1e_1.csv', index_col='specimen')
phylo_feature = pd.read_csv('./selected_feature/phylo_.1_preterm.csv')['feature_selected'].tolist()
phylotype_selected = phylotype.loc[:, phylo_feature].sort_index()

In [10]:
# Put all per-specimen tables side by side; pd.concat lines the rows up on the
# shared specimen index.
mydata = pd.concat(
    [metadata_selected, CST, alpha, taxonomy_selected, phylotype_selected],
    axis='columns',
).copy()

In [11]:
# Store the two label columns as pandas categoricals (tensor_generator reads
# the integer codes of CST), then show the resulting column types.
for categorical_column in ("project", "CST"):
    mydata[categorical_column] = mydata[categorical_column].astype('category')

mydata.dtypes

participant_id                      object
project                           category
collect_wk                         float64
CST                               category
shannon                            float64
inv_simpson                        float64
bwpd                               float64
phylo_entropy                      float64
quadratic                          float64
unrooted_pd                        float64
rooted_pd                          float64
Prevotellaceae                     float64
Corynebacteriaceae                 float64
unclassified Clostridiales         float64
Staphylococcaceae                  float64
Bifidobacteriaceae                 float64
Lachnospiraceae                    float64
Fusobacteriaceae                   float64
unclassified Corynebacteriales     float64
Ruminococcaceae                    float64
Mycoplasmataceae                   float64
unclassified Tissierellia          float64
Veillonellaceae                    float64
Lactobacill

In [12]:
# Preview the first five rows of the merged table.
mydata.head(5)

Unnamed: 0_level_0,participant_id,project,collect_wk,CST,shannon,inv_simpson,bwpd,phylo_entropy,quadratic,unrooted_pd,...,pt__00092,pt__00096,pt__00007,pt__00009,pt__00041,pt__00006,pt__00021,pt__00039,pt__00126,pt__00052
specimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A00001-05,A00001,A,33.0,III,1.0,1.0,0.0,-0.0,0.0,0.0,...,0.0,0.0,0.003675,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A00002-01,A00002,A,38.0,III,1.96362,1.81277,2.62894,1.31887,0.876314,3.94341,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A00003-02,A00003,A,30.0,II,1.0,1.0,0.0,-0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A00004-08,A00004,A,27.0,III,1.0,1.0,0.0,-0.0,0.0,0.0,...,0.0,0.0,0.000514,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A00004-12,A00004,A,29.0,III,6.94884,4.07385,2.78896,3.13422,1.2199,15.5185,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000183


## Define functions

In [13]:
def tensor_generator(data, feature_columns=None):
    """Convert a specimen table into the float32 tensor the model consumes.

    The returned tensor holds one row per row of ``data``. Its leading columns
    are the feature values; its LAST column is an integer participant code
    (stored as float32) that the model strips off and uses to group rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``participant_id`` column plus the feature columns.
    feature_columns : sequence of str, optional
        Columns to use as features, in order. When omitted, every column after
        the first two is used (the original positional behaviour). Passing the
        list explicitly makes the feature set independent of column order and
        lets it be matched to what a saved model was trained on.

    Returns
    -------
    torch.Tensor
        float32 tensor of shape ``(len(data), n_features + 1)``.

    Notes
    -----
    ``CST`` and ``participant_id`` are encoded with pandas category codes, which
    depend on the categories present in (or declared on) ``data``.
    NOTE(review): if a table lacks a CST level seen at training time the codes
    shift — confirm the categories match the training data.
    """
    if feature_columns is None:
        feature_columns = data.columns[2:]

    X_data = data[list(feature_columns)].copy()
    if 'CST' in X_data.columns:
        # astype is a no-op for an existing categorical and makes plain string
        # columns work too (mirrors how participant_id is handled below).
        X_data['CST'] = X_data['CST'].astype('category').cat.codes
    X_feature = X_data.to_numpy().astype('float32')

    # Cast the codes to float32 explicitly: stacking float32 with int32 codes
    # (more than 32767 participants) would otherwise promote everything to float64.
    X_group = (
        data['participant_id']
        .astype('category')
        .cat.codes
        .to_numpy()
        .astype('float32')
        .reshape(-1, 1)
    )

    input_X = torch.from_numpy(np.hstack((X_feature, X_group)))
    return input_X

In [14]:
# Try the tensor conversion on the full table; `a` is not referenced again in the visible cells.
a = tensor_generator(mydata)

In [15]:
# Columns after the first two, i.e. the slice tensor_generator uses as model features by default.
mydata.columns[2:]

Index(['collect_wk', 'CST', 'shannon', 'inv_simpson', 'bwpd', 'phylo_entropy',
       'quadratic', 'unrooted_pd', 'rooted_pd', 'Prevotellaceae',
       'Corynebacteriaceae', 'unclassified Clostridiales', 'Staphylococcaceae',
       'Bifidobacteriaceae', 'Lachnospiraceae', 'Fusobacteriaceae',
       'unclassified Corynebacteriales', 'Ruminococcaceae', 'Mycoplasmataceae',
       'unclassified Tissierellia', 'Veillonellaceae', 'Lactobacillaceae',
       'Bacteroidales', 'Alcaligenaceae', 'pt__00004', 'pt__00090',
       'pt__00056', 'pt__00042', 'pt__00079', 'pt__00024', 'pt__00071',
       'pt__00092', 'pt__00096', 'pt__00007', 'pt__00009', 'pt__00041',
       'pt__00006', 'pt__00021', 'pt__00039', 'pt__00126', 'pt__00052'],
      dtype='object')

In [19]:
class MLP(nn.Module):
    """Two-hidden-layer perceptron whose per-row outputs are averaged per group.

    Parameters
    ----------
    input_dim : int
        Number of feature columns (the trailing group-id column is not counted).
    hidden_dim1, hidden_dim2 : int
        Widths of the first and second hidden layers.
    drop_out : float
        Dropout probability applied after each hidden layer.
    """

    def __init__(self, input_dim, hidden_dim1,hidden_dim2,drop_out):
        super(MLP, self).__init__()

        self.fc1 = nn.Linear(input_dim, hidden_dim1)
        self.fc2 = nn.Linear(hidden_dim1,hidden_dim2)
        self.fc3 = nn.Linear(hidden_dim2,2)
        self.dropout = nn.Dropout(drop_out)

    def forward(self, x):
        """Score every row, then average the class probabilities within each group.

        Parameters
        ----------
        x : torch.Tensor
            2-D tensor; all columns but the last are features, the last column
            holds a non-negative integer group id per row.

        Returns
        -------
        torch.Tensor
            Shape ``(max_group_id + 1, 2)``. Row ``g`` is the mean of the
            two-class softmax outputs of the rows whose group id is ``g``;
            a group id with no rows yields a row of zeros.
        """
        X_feature = x[:,:-1]
        X_group = x[:,-1].long()

        X_feature = self.dropout(torch.tanh(self.fc1(X_feature)))
        X_feature = self.dropout(torch.tanh(self.fc2(X_feature)))
        # NOTE(review): tanh bounds the logits to [-1, 1], so each probability
        # stays within roughly [0.12, 0.88]; presumably saved weights were
        # trained with this activation, so it is kept as-is.
        X_feature = torch.tanh(self.fc3(X_feature))
        X_feature = F.softmax(X_feature, dim = 1)

        n_rows = X_feature.shape[0]
        if n_rows == 0:
            # Nothing to aggregate (and max() of an empty tensor would raise).
            return X_feature

        # Group-membership matrix: M[g, i] = 1 when row i belongs to group g.
        # Built with the activations' dtype/device so the matmul below also
        # works on CUDA or in double precision.
        n_groups = int(X_group.max()) + 1
        M = torch.zeros(n_groups, n_rows, dtype=X_feature.dtype, device=X_feature.device)
        M[X_group, torch.arange(n_rows, device=X_feature.device)] = 1
        # L1-normalising each row turns the matrix product into a per-group mean.
        M = F.normalize(M, p=1, dim=1)
        X_feature = torch.mm(M, X_feature)

        return X_feature

In [20]:
# Use the GPU when CUDA reports one as usable, otherwise stay on the CPU,
# and say which of the two was picked.
is_cuda = torch.cuda.is_available()
device = torch.device("cuda" if is_cuda else "cpu")
print("GPU is available" if is_cuda else "GPU not available, CPU used")

GPU not available, CPU used


In [21]:
def test_metrics(model, data):
    """Run ``model`` on ``data`` and tabulate one prediction per participant.

    Parameters
    ----------
    model : torch.nn.Module
        Model whose output has one row per participant code and whose column 0
        is reported as the probability. The model is switched to eval mode.
    data : pandas.DataFrame
        Specimen table accepted by ``tensor_generator`` (needs ``participant_id``).

    Returns
    -------
    pandas.DataFrame
        Columns ``participant`` (participant ids in category order),
        ``was_preterm`` (1 when the probability exceeds 0.5, else 0) and
        ``probability``.

    NOTE(review): column 0 of the model output is taken as the preterm
    probability — confirm that index 0 is the positive class used in training.
    """
    val_X = tensor_generator(data)

    # Run the input on whatever device the model's weights live on.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        val_X = val_X.to(first_param.device)

    model.eval()
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        out = model(val_X)

    # .cpu() is required before .numpy() when the model ran on a GPU.
    predicted_props = out[:,0].cpu().numpy()
    predicted_labels = 1*(predicted_props >0.5)

    # Same category order that tensor_generator uses for the group codes, so
    # row g of the model output lines up with participants[g].
    participants = list(data['participant_id'].astype('category').cat.categories)

    result_tab = pd.DataFrame(data = {'participant': participants,
                                     'was_preterm':list(predicted_labels),
                                     'probability':list(predicted_props)})

    return result_tab


## Loading models

In [22]:
# joblib.load unpickles the file, which can execute arbitrary code: only load artifacts from a trusted source.
# NOTE(review): the loaded object is used as the model below; unpickling it presumably
# requires the MLP class to be defined in this session first — confirm.
best_model = load('test_submission.save')

## Results

In [23]:
# Score every specimen with the loaded model and tabulate per-participant predictions.
# NOTE(review): the saved output of this cell is a RuntimeError (3578x41 vs 40x10):
# mydata.columns[2:] supplies 41 feature columns while the loaded model's first layer
# apparently takes 40 — TODO confirm the training-time feature set and align the columns.
test_metrics(best_model, mydata)

RuntimeError: mat1 and mat2 shapes cannot be multiplied (3578x41 and 40x10)