In [11]:
# This serves as a template which will guide you through the implementation of this task.  It is advised
# to first read the whole template and get a sense of the overall structure of the code before trying to fill in any of the TODO gaps
# First, we import necessary libraries:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, ElasticNet
from sklearn.datasets import make_regression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator, TransformerMixin

In [12]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(torch.cuda.is_available())

True


In [13]:
def load_data():
    """
    This function loads the data from the csv files and returns it as numpy arrays.

    input: None
    
    output: x_pretrain: np.ndarray, the features of the pretraining set
            y_pretrain: np.ndarray, the labels of the pretraining set
            x_train: np.ndarray, the features of the training set
            y_train: np.ndarray, the labels of the training set
            x_test: np.ndarray, the features of the test set
    """
    x_pretrain = pd.read_csv("pretrain_features.csv.zip", index_col="Id", compression='zip').drop("smiles", axis=1).to_numpy()
    y_pretrain = pd.read_csv("pretrain_labels.csv.zip", index_col="Id", compression='zip').to_numpy().squeeze(-1)
    x_train = pd.read_csv("train_features.csv.zip", index_col="Id", compression='zip').drop("smiles", axis=1).to_numpy()
    y_train = pd.read_csv("train_labels.csv.zip", index_col="Id", compression='zip').to_numpy().squeeze(-1)
    x_test = pd.read_csv("test_features.csv.zip", index_col="Id", compression='zip').drop("smiles", axis=1)
    return x_pretrain, y_pretrain, x_train, y_train, x_test

In [44]:
class Net(nn.Module):
    """
    The model class, which defines our feature extractor used in pretraining.
    """
    def __init__(self,in_features):
        """
        The constructor of the model.
        """
        super().__init__()
        # TODO: Define the architecture of the model. It should be able to be trained on pretraing data 
        # and then used to extract features from the training and test data.
        self.fc1 = nn.Linear(in_features,512)
        self.act1 = nn.LeakyReLU()
        self.dropout1 = nn.Dropout(0.05)
        self.fc2 = nn.Linear(512,256)
        self.act2 = nn.PReLU()
        self.dropout2 = nn.Dropout(0.25)
        self.out = nn.Linear(256,1)

    def forward(self, x):
        """
        The forward pass of the model.

        input: x: torch.Tensor, the input to the model

        output: x: torch.Tensor, the output of the model
        """
        # TODO: Implement the forward pass of the model, in accordance with the architecture 
        # defined in the constructor.
        x = self.fc1(x)
        x = self.act1(x)
        x = self.dropout1(x)
        x = self.fc2(x)
        x = self.act2(x)
        x = self.dropout2(x)
        x = self.out(x)
        x = x.squeeze()
        return x

In [45]:
def make_feature_extractor(x, y, batch_size=256, eval_size=2500):
    """
    This function trains the feature extractor on the pretraining data and returns a function which
    can be used to extract features from the training and test data.

    input: x: np.ndarray, the features of the pretraining set
              y: np.ndarray, the labels of the pretraining set
                batch_size: int, the batch size used for training
                eval_size: int, the size of the validation set
            
    output: make_features: function, a function which can be used to extract features from the training and test data
    """
    # Pretraining data loading
    in_features = x.shape[-1]
    x_tr, x_val, y_tr, y_val = train_test_split(x, y, test_size=eval_size, random_state=0, shuffle=True)
    x_tr, x_val = torch.tensor(x_tr, dtype=torch.float), torch.tensor(x_val, dtype=torch.float)
    y_tr, y_val = torch.tensor(y_tr, dtype=torch.float), torch.tensor(y_val, dtype=torch.float)

    # model declaration
    model = Net(in_features)
    model.train()
    model.to(device)
    # TODO: Implement the training loop. The model should be trained on the pretraining data. Use validation set 
    # to monitor the loss.
    loss_function = nn.MSELoss()
    n_epochs = 300
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    
    for epoch in range(n_epochs):
        train_loss = 0
        model.train()
        for i in range(0,len(x_tr),batch_size):
            X = x_tr[i:i+batch_size].to(device)
            y = y_tr[i:i+batch_size].to(device)
            optimizer.zero_grad()
            output = model(X)
            loss = loss_function(output, y)
            train_loss += loss.detach()
            loss.backward()
            optimizer.step()
            pass
        train_loss = train_loss/len(x_tr)
        
        val_loss = 0
        model.eval()
        with torch.no_grad():
            for j in range(0,len(x_val),batch_size):
                X = x_val[j:j+batch_size].to(device)
                y = y_val[j:j+batch_size].to(device)
                output = model(X)
                loss = loss_function(output, y)
                val_loss += loss.detach()
                pass
            val_loss = val_loss/len(x_val)
        
        print('Epoch {}, Train Loss {}, Validation Loss {}'.format(
                   epoch+1, train_loss, val_loss))

    def make_features(x):
        """
        This function extracts features from the training and test data, used in the actual pipeline 
        after the pretraining.

        input: x: np.ndarray, the features of the training or test set

        output: features: np.ndarray, the features extracted from the training or test set, propagated
        further in the pipeline
        """
        model.eval()
        # TODO: Implement the feature extraction, a part of a pretrained model used later in the pipeline.

        fe_model = torch.nn.Sequential(*(list(model.children())[:-4]))
        print(fe_model)
        with torch.no_grad():
            x = torch.tensor(x, dtype=torch.float).to(device)
            output = fe_model(x)
            features = output.detach().cpu().numpy()
            features = np.reshape(features,(-1,512))
        return features

    return make_features

In [46]:
def make_pretraining_class(feature_extractors):
    """
    The wrapper function which makes pretraining API compatible with sklearn pipeline
    
    input: feature_extractors: dict, a dictionary of feature extractors

    output: PretrainedFeatures: class, a class which implements sklearn API
    """

    class PretrainedFeatures(BaseEstimator, TransformerMixin):
        """
        The wrapper class for Pretraining pipeline.
        """
        def __init__(self, *, feature_extractor=None, mode=None):
            self.feature_extractor = feature_extractor
            self.mode = mode

        def fit(self, X=None, y=None):
            return self

        def transform(self, X):
            assert self.feature_extractor is not None
            X_new = feature_extractors[self.feature_extractor](X)
            return X_new
        
    return PretrainedFeatures

In [47]:
def get_regression_model():
    """
    This function returns the regression model used in the pipeline.

    input: None

    output: model: sklearn compatible model, the regression model
    """
    # TODO: Implement the regression model. It should be able to be trained on the features extracted
    # by the feature extractor.
    model = ElasticNet(alpha=0.002, l1_ratio=0.0001, max_iter=100000, random_state=0)
    return model

In [48]:
# Main function. You don't have to change this
if __name__ == '__main__':
    # Load data
    np.random.seed(0)
    x_pretrain, y_pretrain, x_train, y_train, x_test = load_data()
    print("Data loaded!")
    # Utilize pretraining data by creating feature extractor which extracts lumo energy 
    # features from available initial features
    feature_extractor =  make_feature_extractor(x_pretrain, y_pretrain)
    PretrainedFeatureClass = make_pretraining_class({"pretrain": feature_extractor})
    PretrainedFeature = PretrainedFeatureClass(feature_extractor="pretrain") 
    
    # regression model
    regression_model = get_regression_model()

    y_pred = np.zeros(x_test.shape[0])
    # TODO: Implement the pipeline. It should contain feature extraction and regression. You can optionally
    # use other sklearn tools, such as StandardScaler, FunctionTransformer, etc.
    pipe = Pipeline([("feature_extractor", PretrainedFeature), ("scaler", StandardScaler()), ("regression", regression_model)])
    #pretrained_features = pipe.named_steps['feature_extractor']
    #x_train_transformed = pretrained_features.transform(x_train)
    #print(x_train_transformed.shape)
    pipe.fit(x_train, y_train)
    y_reg_t = pipe.predict(x_train)
    print(mean_squared_error(y_train, y_reg_t, squared=True))
    print(mean_squared_error(y_train, y_reg_t, squared=False))
    print([y_train, y_reg_t])
    y_pred = pipe.predict(torch.tensor(x_test.values, dtype=torch.float))

    assert y_pred.shape == (x_test.shape[0],)
    y_pred = pd.DataFrame({"y": y_pred}, index=x_test.index)
    y_pred.to_csv("results.csv", index_label="Id")
    print("Predictions saved, all done!")

Data loaded!
Epoch 1, Train Loss 0.0019100598292425275, Validation Loss 0.00019057873578276485
Epoch 2, Train Loss 0.0002906440058723092, Validation Loss 0.00014485309657175094
Epoch 3, Train Loss 0.00024217774625867605, Validation Loss 0.00012431360664777458
Epoch 4, Train Loss 0.00021270429715514183, Validation Loss 0.00011072911729570478
Epoch 5, Train Loss 0.00019273134239483625, Validation Loss 0.00010174219642067328
Epoch 6, Train Loss 0.00017806258983910084, Validation Loss 9.483483881922439e-05
Epoch 7, Train Loss 0.0001673249207669869, Validation Loss 8.967838948592544e-05
Epoch 8, Train Loss 0.00015571637777611613, Validation Loss 8.60333675518632e-05
Epoch 9, Train Loss 0.00014673463010694832, Validation Loss 8.247762161772698e-05
Epoch 10, Train Loss 0.00014065482537262142, Validation Loss 7.989040750544518e-05
Epoch 11, Train Loss 0.00013506242248695344, Validation Loss 7.666087913094088e-05
Epoch 12, Train Loss 0.00013119980576448143, Validation Loss 7.423415809171274e-05

Epoch 101, Train Loss 4.3758871470345184e-05, Validation Loss 3.0044486265978776e-05
Epoch 102, Train Loss 4.384389831102453e-05, Validation Loss 2.9995295335538685e-05
Epoch 103, Train Loss 4.347513095126487e-05, Validation Loss 2.983631748065818e-05
Epoch 104, Train Loss 4.332085154601373e-05, Validation Loss 2.974091694341041e-05
Epoch 105, Train Loss 4.293278107070364e-05, Validation Loss 2.9629050914081745e-05
Epoch 106, Train Loss 4.206275843898766e-05, Validation Loss 2.9517552320612594e-05
Epoch 107, Train Loss 4.219762922730297e-05, Validation Loss 2.9369124604272656e-05
Epoch 108, Train Loss 4.219973197905347e-05, Validation Loss 2.936588498414494e-05
Epoch 109, Train Loss 4.1680024878587574e-05, Validation Loss 2.919603866757825e-05
Epoch 110, Train Loss 4.136880306759849e-05, Validation Loss 2.9086213544360362e-05
Epoch 111, Train Loss 4.144551348872483e-05, Validation Loss 2.892263546527829e-05
Epoch 112, Train Loss 4.117645221413113e-05, Validation Loss 2.8763406589860097

Epoch 199, Train Loss 2.856481478374917e-05, Validation Loss 2.269851574965287e-05
Epoch 200, Train Loss 2.8554151867865585e-05, Validation Loss 2.2651729523204267e-05
Epoch 201, Train Loss 2.8645325073739514e-05, Validation Loss 2.2610145606449805e-05
Epoch 202, Train Loss 2.8482747438829392e-05, Validation Loss 2.259878056065645e-05
Epoch 203, Train Loss 2.8286911401664838e-05, Validation Loss 2.2560669094673358e-05
Epoch 204, Train Loss 2.8214142730575986e-05, Validation Loss 2.2539008568855934e-05
Epoch 205, Train Loss 2.810373371175956e-05, Validation Loss 2.2484158762381412e-05
Epoch 206, Train Loss 2.8066428058082238e-05, Validation Loss 2.2404494302463718e-05
Epoch 207, Train Loss 2.7974436306976713e-05, Validation Loss 2.2430142053053714e-05
Epoch 208, Train Loss 2.809642865031492e-05, Validation Loss 2.2318305127555504e-05
Epoch 209, Train Loss 2.772937841655221e-05, Validation Loss 2.2275587980402634e-05
Epoch 210, Train Loss 2.7596681320574135e-05, Validation Loss 2.2235550

Epoch 297, Train Loss 2.1816138541908003e-05, Validation Loss 1.9317256374051794e-05
Epoch 298, Train Loss 2.1630292394547723e-05, Validation Loss 1.9272551071480848e-05
Epoch 299, Train Loss 2.1791396648040973e-05, Validation Loss 1.9260161934653297e-05
Epoch 300, Train Loss 2.176372981921304e-05, Validation Loss 1.9203389456379227e-05
Sequential(
  (0): Linear(in_features=1000, out_features=512, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Dropout(p=0.05, inplace=False)
)
Sequential(
  (0): Linear(in_features=1000, out_features=512, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Dropout(p=0.05, inplace=False)
)
2.3698007208306032e-08
0.00015394157076081182
[array([2.05287228, 1.32552973, 1.83729442, 1.38860147, 0.99185146,
       1.18184813, 1.46986438, 1.78053514, 2.95969531, 2.22497755,
       1.60914091, 2.169881  , 2.62993076, 2.08208275, 1.65118902,
       2.25876781, 1.8219338 , 1.9153037 , 2.08050356, 1.84710994,
       1.71622686, 1.49220775, 1.88388241, 1

  x = torch.tensor(x, dtype=torch.float).to(device)
