Finetuning for CH4 uptake at high pressure

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os
import sys
import scipy
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader, random_split
from torch import nn, Tensor

import yaml

For this part, import your labels, PXRD, and MOFid data however you want. As long as the input dictionary used to create the train/test loaders has the form {id : [2THETA, Intensity, label]}, it should be fine.

In [2]:
# Read the CoRE-2019 train/test tables and keep their MOF names as arrays.
core_df_train = pd.read_csv("core2019/CoRE2019_traindata.csv")
train_ids = np.array(core_df_train['MOFname'].values)

core_df_test = pd.read_csv("core2019/CoRE2019_testdata.csv")
test_ids = np.array(core_df_test['MOFname'].values)

# Names from both splits (train first, then test).
availableIDs = np.array([*train_ids, *test_ids])

# Stack both splits into one frame and drop columns whose name starts with
# "Unnamed".
stacked_df = pd.concat([core_df_train, core_df_test], ignore_index=True)
is_unnamed_column = stacked_df.columns.str.contains('^Unnamed')
concat_df = stacked_df.loc[:, ~is_unnamed_column]

# These are the chemically viable MOFs in CoRE-2019 using MOFChecker.
good_mofs = np.load('core2019/good_mofs.npy')
print('Number of valid MOFs: {}'.format(len(good_mofs)))

# Keep only the rows whose MOF name is in the viable list.
is_viable = concat_df['MOFname'].isin(good_mofs)
concat_df = concat_df[is_viable]

concat_df

Number of valid MOFs: 4944


Unnamed: 0,ASA [m^2/cm^3],CellV [A^3],Df,Di,Dif,NASA [m^2/cm^3],POAV [cm^3/g],POAVF,PONAV [cm^3/g],PONAVF,...,pure_methane_widomHOA,pure_uptake_CO2_298.00_15000,pure_uptake_CO2_298.00_1600000,pure_uptake_methane_298.00_580000,pure_uptake_methane_298.00_6500000,logKH_CO2,logKH_CH4,CH4DC,CH4HPSTP,CH4LPSTP
0,0.000,878.884,3.37000,4.71855,4.71855,516.355,0.000000,0.00000,0.205652,0.22676,...,-15.040840,2.651632,6.959545,1.517200,3.493840,-3.578895,-5.362861,60.807239,107.480722,46.673482
2,1241.530,19186.400,4.32260,11.39486,11.33724,216.420,0.430017,0.40814,0.106519,0.10110,...,-18.634414,0.452906,8.814420,3.103085,8.424232,-4.353803,-4.770093,140.903866,223.073480,82.169614
3,1283.550,18921.000,4.51080,11.27344,11.27344,224.215,0.419845,0.41114,0.104997,0.10282,...,-18.644738,0.428606,8.653067,3.220068,8.341120,-4.388713,-4.755377,139.911701,227.886826,87.975125
6,2163.580,20019.000,7.83744,9.47418,9.45292,0.000,1.074720,0.64186,0.000000,0.00000,...,-10.818739,0.374226,21.311157,5.547521,17.698172,-4.611687,-4.979908,202.459007,294.894019,92.435013
7,641.266,2633.040,4.14331,4.82084,4.80387,0.000,0.136436,0.24024,0.000000,0.00000,...,-16.444655,2.787635,4.347951,1.327258,2.501588,-2.858049,-5.258425,57.689781,122.892268,65.202487
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9510,0.000,4420.630,3.25313,7.72066,7.71275,1175.020,0.000000,0.00000,0.296972,0.39664,...,-17.082221,0.503959,6.581669,3.290080,5.694224,-4.448349,-4.887817,89.584728,212.181724,122.596996
9513,0.000,1621.880,3.68666,4.54107,4.49534,1035.710,0.000000,0.00000,0.965610,0.50278,...,-14.868498,0.682418,18.646886,4.918994,16.056714,-4.343718,-4.982866,161.795318,233.252514,71.457196
9514,0.000,8839.200,3.10525,5.11749,5.00698,362.772,0.000000,0.00000,0.098806,0.14858,...,-17.976946,1.033496,3.691148,1.341559,2.358048,-3.126807,-5.202050,42.645374,98.928637,56.283263
9516,2131.480,5977.390,7.51137,9.81493,9.63023,0.000,1.076570,0.69702,0.000000,0.00000,...,-10.904238,2.477879,21.261138,3.965880,15.038594,-2.431352,-5.088533,200.008929,271.645514,71.636585


In [3]:
# Restrict the candidate IDs to the MOF names still present in concat_df.
availableIDs = concat_df.loc[:, 'MOFname'].to_numpy()

In [4]:
# Build {MOF name: [2-theta values, intensities, CH4 uptake label]} for every
# MOF in availableIDs whose stored PXRD pattern can be loaded.
uptake_column = 'pure_uptake_methane_298.00_6500000'

# Build the name -> label lookup once instead of re-filtering the frame for
# every MOF. Keeping the first row per name matches the former `.values[0]`.
uptake_by_name = (
    concat_df.drop_duplicates(subset='MOFname')
    .set_index('MOFname')[uptake_column]
)

xrd_uptake = dict()
skipped_mofs = dict()  # MOF name -> repr of the error that excluded it

for mof_name in availableIDs:
    try:
        pattern = np.load("core2019/core_xrd/{}.npy".format(mof_name))
        uptake = uptake_by_name.loc[mof_name]

        # Row 0 of the stored array is used as 2-theta, row 1 as intensity.
        two_theta_ex = pattern[0].tolist()
        intensity_ex = pattern[1].tolist()

        # Left-pad the pattern so the 2-theta axis starts at 0: integer angles
        # 0 .. int(first angle) - 1 are prepended with zero intensity.
        theta_fill = int(two_theta_ex[0])
        two_theta_ex = list(range(theta_fill)) + two_theta_ex
        intensity_ex = [0] * theta_fill + intensity_ex

        xrd_uptake[mof_name] = [two_theta_ex, intensity_ex, uptake]
    except Exception as err:
        # Still best-effort (a missing or malformed pattern only excludes that
        # MOF), but the reason is recorded rather than silently discarded, and
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        skipped_mofs[mof_name] = repr(err)

print('Loaded {} PXRD patterns; skipped {} MOFs (see skipped_mofs).'.format(len(xrd_uptake), len(skipped_mofs)))

In [5]:
# Make the sibling `../src` directory importable from this notebook.
# os.path.abspath() only resolves the name against the current working
# directory and never raises NameError, so the former try/except guard could
# not trigger and has been removed.
current_file_path = os.path.abspath('ft_uptake_high_p.ipynb')
notebook_dir = os.path.dirname(current_file_path)

parent_dir = os.path.abspath(os.path.join(notebook_dir, '../src'))
if parent_dir not in sys.path:
    # Put it first so it wins over any other entry on sys.path.
    sys.path.insert(0, parent_dir)

Create train/test loaders

In [6]:
from xraypro.preprocess_new import preprocessPXRD

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Wrap the {id: [2theta, intensity, label]} dictionary in the project's PXRD
# preprocessor, then split it into train/test loaders (15% held out for test,
# batches of 32).
pxrd_preprocessor = preprocessPXRD(
    xrd_uptake,
    'BW20K_mofids',
    pickle_file = 'core2019/new_representation/uptake_high_p.pickle',
    two_theta_bounds = (0, 90),
)
train_loader, test_loader = pxrd_preprocessor.createLoader(test_ratio = 0.15, batch_size = 32)

File found. Loading data...
File found. Loading data...
The random seed is:  0
Total size: 3904, Train size: 3318, Test size: 585


Finetune model for label

In [8]:
from xraypro.xraypro import loadModel
from xraypro.run import finetune, runTest

In [9]:
# Load the model in 'cgcnn' mode and switch it to its regression variant.
pretrained_model = loadModel(mode = 'cgcnn')
model = pretrained_model.regressionMode()

Loaded pre-trained model with success.
/home/sartaaj/Desktop/xraypro/src/SSL/pretrained/cgcnn/model_t.pth


In [10]:
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU
# so the notebook still runs on machines without a GPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [11]:
import pickle

# NOTE(review): the label column used above ends in `_6500000` while this
# directory label says "64 bar" — confirm which pressure is intended.
label = 'CH4 Uptake at 64 bar'
file_path = f'core2019/ft/{label}'

# Create the directory the pickles are written to. Previously
# `<cwd>/ft/<label>` was created while the files were opened under
# `core2019/ft/<label>`, so open() failed whenever that folder did not exist.
new_dir_path = os.path.join(os.getcwd(), file_path)
os.makedirs(new_dir_path, exist_ok = True)

# Persist both loaders next to the fine-tuning outputs.
for loader_name, loader in (('train_loader', train_loader), ('test_loader', test_loader)):
    with open(os.path.join(file_path, f'{loader_name}.pickle'), 'wb') as handle:
        pickle.dump(loader, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [12]:
# Fine-tune on the uptake label; `model` is rebound to whatever finetune returns.
# save_path is presumably where finetune writes its checkpoint — TODO confirm.
checkpoint_path = os.path.join(file_path, 'ft_uptake_high_p.h5')
model = finetune(model, train_loader, test_loader, save_path = checkpoint_path)

  return F.conv1d(input, weight, bias, self.stride,
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Epoch: 1, Batch: 17, Loss: 14.545324906562138, Val Loss: 7.934391021728516, SRCC_test = 0.7494705115672856
Epoch: 2, Batch: 17, Loss: 7.815681881117589, Val Loss: 6.207213732931349, SRCC_test = 0.7879398826979471
Epoch: 3, Batch: 17, Loss: 6.484653251842388, Val Loss: 6.782466676500109, SRCC_test = 0.8228046594982077
Epoch: 4, Batch: 17, Loss: 5.9387316449174605, Val Loss: 5.714982125494215, SRCC_test = 0.8447580645161289
Epoch: 5, Batch: 17, Loss: 6.10560528051506, Val Loss: 5.560086356268989, SRCC_test = 0.8477517106549364
Epoch: 6, Batch: 17, Loss: 4.8844666122232825, Val Loss: 5.472069753540887, SRCC_test = 0.8433732486151839
Epoch: 7, Batch: 17, Loss: 4.64577627297744, Val Loss: 4.853808376524183, SRCC_test = 0.8518246985988921
Epoch: 8, Batch: 17, Loss: 5.480188313039761, Val Loss: 6.459626701143053, SRCC_test = 0.8590746171391332
Epoch: 9, Batch: 17, Loss: 4.392026994992229, Val Loss: 4.077734225326115, SRCC_test = 0.8618035190615835
Epoch: 10, Batch: 17, Loss: 4.070973076866668

KeyboardInterrupt: 

In [13]:
# Evaluate on the held-out loader, passing the same path that was given to
# finetune as save_path.
test_checkpoint_path = os.path.join(file_path, 'ft_uptake_high_p.h5')
test_outputs = runTest(model, test_loader, save_path = test_checkpoint_path)
predictions_test, actual_test = test_outputs

In [14]:
from sklearn.metrics import mean_absolute_error

In [15]:
# Import spearmanr explicitly: a bare `import scipy` does not guarantee that
# the scipy.stats submodule is loaded on every SciPy version.
from scipy.stats import spearmanr

# Spearman rank correlation (first element of the result) and mean absolute
# error between test predictions and ground truth.
srcc_test = spearmanr(predictions_test, actual_test)[0]
mae_test = mean_absolute_error(actual_test, predictions_test)

print(f'The SRCC is: {srcc_test}')
print(f'The MAE is {mae_test}')

The SRCC is: 0.9011246468038808
The MAE is 1.3655787706375122
