Finetuning for CH4 uptake at high pressure

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os
import sys
import scipy
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader, random_split
from torch import nn, Tensor

import yaml

For this part, import your labels, PXRD, and MOFid data however you like. As long as the input dictionary used to create the train/test loaders has the form {id : [2THETA, Intensity, label]}, it should work.

In [2]:
# Read the ARABG table, then keep only the columns whose name does not
# start with 'Unnamed'.
concat_df = pd.read_csv("arabg/ARABG_alldata.csv")
unnamed_mask = concat_df.columns.str.contains('^Unnamed')
concat_df = concat_df.loc[:, ~unnamed_mask]

concat_df

Unnamed: 0,ASA [m^2/cm^3],CellV [A^3],Df,Di,Dif,NASA [m^2/cm^3],POAV [cm^3/g],POAVF,PONAV [cm^3/g],PONAVF,...,pure_methane_widomHOA,pure_uptake_CO2_298.00_15000,pure_uptake_CO2_298.00_1600000,pure_uptake_methane_298.00_580000,pure_uptake_methane_298.00_6500000,logKH_CO2,logKH_CH4,CH4DC,CH4HPSTP,CH4LPSTP
0,2193.29,3210.62,8.76651,9.90110,9.90110,0.00000,0.999867,0.68710,0.000000,0.00000,...,-10.387895,0.383712,19.113279,3.178975,13.712341,-4.607606,-5.172092,201.947449,262.895293,60.947843
1,1563.86,3121.05,7.03051,9.10523,9.02246,0.00000,0.441127,0.53404,0.000000,0.00000,...,-10.397690,0.477182,8.934587,1.409886,5.950109,-4.455274,-5.570723,153.349492,200.969474,47.619982
2,1178.75,3237.39,8.21591,9.64594,9.64594,36.22360,0.252564,0.41742,0.022133,0.03658,...,-12.272472,0.576841,5.403326,1.115493,3.882429,-4.360261,-5.655964,127.583555,179.019025,51.435470
3,1219.24,3095.31,5.82069,7.56734,7.43701,27.93740,0.334506,0.38486,0.021138,0.02432,...,-12.338432,4.093729,9.763483,1.467913,5.299856,-3.140348,-5.471460,123.001583,170.120148,47.118565
4,1997.41,3216.15,5.94302,7.98801,7.97335,0.00000,0.693584,0.58328,0.000000,0.00000,...,-12.866890,0.550473,13.959525,3.273874,9.845145,-4.467378,-5.061310,154.177517,230.990332,76.812815
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
382,1444.78,8583.13,21.24884,22.24210,22.24210,0.00000,1.337630,0.74298,0.000000,0.00000,...,-13.833288,2.525638,17.467504,2.406949,11.759585,-3.482228,-5.105427,144.933267,182.232589,37.299322
383,1443.50,8599.77,19.92180,20.96820,20.90468,0.00000,1.574600,0.71272,0.000000,0.00000,...,-15.729773,0.858988,16.494019,4.035562,16.133689,-4.160335,-4.836743,152.777713,203.739651,50.961938
384,1383.48,8617.44,19.60137,20.41008,20.41008,0.00000,1.116970,0.66452,0.000000,0.00000,...,-15.373081,0.608031,10.540085,2.029908,9.853569,-4.336095,-5.144194,129.858945,163.551823,33.692878
385,1442.29,8576.84,20.89040,21.56847,21.56847,20.31780,1.589560,0.73598,0.024967,0.01156,...,-14.669824,0.575775,15.123684,3.356310,14.966197,-4.314139,-4.939298,149.972028,193.327534,43.355506


In [3]:
# Every MOF name listed in the table, as a NumPy array.
availableIDs = concat_df.loc[:, 'MOFname'].to_numpy()

In [4]:
# Build {MOF name: [2theta, intensity, CH4 uptake label]} from the per-MOF
# PXRD arrays stored as arabg/ARABG_XRD/<MOF name>.npy.
xrd_uptake = dict()
skipped_ids = dict()  # MOF name -> repr of the error that caused it to be skipped

# Look the columns up once, outside the best-effort block: a missing column is
# a setup error and should raise here instead of silently emptying the dataset.
uptake_column = concat_df['pure_uptake_methane_298.00_6500000']
mof_names = concat_df['MOFname']

for mof_id in availableIDs:
    try:
        pattern = np.load("arabg/ARABG_XRD/{}.npy".format(mof_id))
        ch4_uptake = uptake_column[mof_names == mof_id].values[0]

        # Row 0 is read as the 2theta axis and row 1 as the intensities.
        two_theta_ex = pattern[0].tolist()
        intensity_ex = pattern[1].tolist()

        # Number of integer 2theta positions missing below the first measured angle.
        theta_fill = int(two_theta_ex[0])
    except Exception as err:
        # Best-effort: a MOF whose pattern cannot be loaded is left out, but the
        # reason is recorded (and KeyboardInterrupt/SystemExit still propagate).
        skipped_ids[mof_id] = repr(err)
        continue

    # Prepend 0, 1, ..., theta_fill - 1 with zero intensity so the pattern
    # starts at 2theta = 0.
    two_theta_ex = list(range(theta_fill)) + two_theta_ex
    intensity_ex = [0] * theta_fill + intensity_ex

    xrd_uptake[mof_id] = [two_theta_ex, intensity_ex, ch4_uptake]

if skipped_ids:
    print(f'Skipped {len(skipped_ids)} of {len(availableIDs)} MOFs; see skipped_ids for the reasons.')

In [5]:
# Put the sibling ../src directory on sys.path so modules located there can be
# imported. The notebook name is resolved against the current working
# directory, so this assumes the kernel was started in the notebook's folder.
# (os.path.abspath on a string literal cannot raise NameError, so no handler
# is needed around it.)
current_file_path = os.path.abspath('ft_arabg.ipynb')

parent_dir = os.path.abspath(os.path.join(os.path.dirname(current_file_path), '../src'))
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)

Create train/test loaders

In [6]:
from xraypro.preprocess_new import preprocessPXRD

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Wrap the {id: [2theta, intensity, label]} dictionary together with the MOFid
# directory, then split it into train and test loaders.
# NOTE(review): pickle_file looks like a cache of the processed data — confirm.
train_loader, test_loader = preprocessPXRD(
    xrd_uptake,
    'arabg/ARABG_mofids',
    pickle_file='arabg/new_representation/uptake_high_p.pickle',
    two_theta_bounds=(0, 25),
).createLoader(test_ratio=0.5, batch_size=32)

File found. Loading data...
File found. Loading data...
The random seed is:  0
Total size: 350, Train size: 175, Test size: 175


Finetune model for label

In [9]:
from xraypro.xraypro import loadModel
from xraypro.run import finetune, runTest

In [10]:
# Load the model in 'cgcnn' mode, then switch it to regression mode.
pretrained_model = loadModel(mode='cgcnn')
model = pretrained_model.regressionMode()

Loaded pre-trained model with success.
/home/sartaaj/Desktop/xraypro/src/SSL/pretrained/cgcnn/model_t.pth


In [11]:
# Use the first CUDA device when one is available; otherwise fall back to the
# CPU so the notebook still runs on machines without a GPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [13]:
import pickle

# All artefacts for this target are written under arabg/ft/<label>.
# NOTE(review): the label column used earlier ends in 6500000 (presumably Pa,
# i.e. 65 bar) while this label says 64 bar — confirm which is intended.
label = 'CH4 Uptake at 64 bar'
file_path = f'arabg/ft/{label}'

# Create the output folder (resolved against the current working directory).
new_dir_path = os.path.join(os.getcwd(), 'arabg/ft', label)
os.makedirs(new_dir_path, exist_ok = True)

# Pickle both loaders so the same split can be reloaded later.
for pickle_name, loader in (
    ('train_loader.pickle', train_loader),
    ('test_loader.pickle', test_loader),
):
    with open(f'{file_path}/{pickle_name}', 'wb') as handle:
        pickle.dump(loader, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [15]:
# Fine-tune on the train loader; the checkpoint path is handed to finetune as save_path.
checkpoint_path = os.path.join(file_path, 'ft_uptake_high_p.h5')
model = finetune(model, train_loader, test_loader, save_path=checkpoint_path)

  return F.conv1d(input, weight, bias, self.stride,
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Epoch: 1, Batch: 4, Loss: 68.89228897094726, Val Loss: 36.49527435302734, SRCC_test = 0.546041055718475
Epoch: 2, Batch: 4, Loss: 19.972186279296874, Val Loss: 15.438066291809083, SRCC_test = 0.6334310850439882
Epoch: 3, Batch: 4, Loss: 15.701948547363282, Val Loss: 13.42382640838623, SRCC_test = 0.7481671554252198
Epoch: 4, Batch: 4, Loss: 13.148514556884766, Val Loss: 11.07529182434082, SRCC_test = 0.8159090909090908
Epoch: 5, Batch: 4, Loss: 10.531471252441406, Val Loss: 9.026131916046143, SRCC_test = 0.8260263929618767
Epoch: 6, Batch: 4, Loss: 9.47649564743042, Val Loss: 9.391464805603027, SRCC_test = 0.8315249266862169
Epoch: 7, Batch: 4, Loss: 9.014867401123047, Val Loss: 10.61583833694458, SRCC_test = 0.8415689149560116
Epoch: 8, Batch: 4, Loss: 8.906636142730713, Val Loss: 8.602376556396484, SRCC_test = 0.8447214076246334
Epoch: 9, Batch: 4, Loss: 7.753448104858398, Val Loss: 8.183556365966798, SRCC_test = 0.8538123167155425
Epoch: 10, Batch: 4, Loss: 8.589826726913453, Val Lo

In [16]:
# Run the model on the test loader, passing the same checkpoint path used for fine-tuning.
checkpoint_path = os.path.join(file_path, 'ft_uptake_high_p.h5')
predictions_test, actual_test = runTest(model, test_loader, save_path=checkpoint_path)

In [17]:
from sklearn.metrics import mean_absolute_error

In [18]:
# Spearman rank correlation and mean absolute error on the test predictions.
srcc_test = scipy.stats.spearmanr(predictions_test, actual_test)[0]
mae_test = mean_absolute_error(actual_test, predictions_test)

print(f'The SRCC is: {srcc_test}')
print(f'The MAE is {mae_test}')

The SRCC is: 0.9558713230985584
The MAE is 1.1703370809555054
