# Use the CoRE dataset for the GCMC case study

The QMOF dataset contains little data on gas adsorption properties, so this case study uses the CoRE MOF dataset instead.

In [None]:
# Enable IPython's autoreload extension in mode 2 so edits to imported modules are picked up without restarting the kernel.
%load_ext autoreload 
%autoreload 2

In [None]:
import os
import time

from pycm import ConfusionMatrix
from sklearn.model_selection import train_test_split

from gpt3forchem.api_wrappers import query_gpt3, fine_tune, extract_prediction
from gpt3forchem.baselines import XGBClassificationBaseline
from gpt3forchem.data import get_core_mof_data, discretize
from gpt3forchem.input import create_single_property_forward_prompts

In [None]:
# Load the CoRE MOF table; the following cells select its 'features*' and 'outputs*' columns.
data = get_core_mof_data()

In [None]:
# Column names of `data` that start with 'features'; these are the inputs given to the baseline model.
FEATURES = [column for column in data.columns if column.startswith('features')]

In [None]:
# Column names of `data` that start with 'outputs'.
outputs = [column for column in data.columns if column.startswith('outputs')]

In [None]:
# Display the list of available output columns.
outputs

['outputs.pure_CO2_kH',
 'outputs.pure_CO2_widomHOA',
 'outputs.pure_methane_kH',
 'outputs.pure_methane_widomHOA',
 'outputs.pure_uptake_CO2_298.00_15000',
 'outputs.pure_uptake_CO2_298.00_1600000',
 'outputs.pure_uptake_methane_298.00_580000',
 'outputs.pure_uptake_methane_298.00_6500000',
 'outputs.logKH_CO2',
 'outputs.logKH_CH4',
 'outputs.CH4DC',
 'outputs.CH4HPSTP',
 'outputs.CH4LPSTP']

In [None]:
# Bin the 'outputs.logKH_CO2' column into classes.
# NOTE(review): the call appears to add an 'outputs.logKH_CO2_cat' column to `data` in place
# (the next cell reads that column) — confirm in gpt3forchem.data.discretize.
discretize(data, 'outputs.logKH_CO2')

In [None]:
# Stratified 80/20 split on the binned CO2 Henry coefficient.
# The seed is fixed so that every Restart & Run All produces the same train/test rows;
# without it the recorded metrics cannot be reproduced and a previously fine-tuned model
# could end up being evaluated on rows it was trained on.
train, test = train_test_split(
    data,
    train_size=0.8,
    stratify=data['outputs.logKH_CO2_cat'],
    random_state=42,
)

In [None]:
# Fit the XGBClassificationBaseline on the 'features*' columns to predict the binned label.
# NOTE(review): the meaning of the `None` constructor argument is not visible in this notebook —
# check gpt3forchem.baselines.XGBClassificationBaseline.
baseline = XGBClassificationBaseline(None)
baseline.fit(train[FEATURES], train['outputs.logKH_CO2_cat'])

In [None]:
# Baseline predictions for the held-out rows, using the same feature columns as in training.
predictions_baseline = baseline.predict(test[FEATURES])

In [None]:
# Confusion matrix for the baseline: true binned labels of the test rows vs. baseline predictions.
cm = ConfusionMatrix(
    actual_vector=test['outputs.logKH_CO2_cat'].to_list(),
    predict_vector=predictions_baseline,
)

In [None]:
# print() is used deliberately: it renders the matrix together with the overall statistics shown below.
print(cm)

Predict          large            medium           small            very large       very small       
Actual
large            20               35               6                2                0                

medium           13               124              32               0                0                

small            0                56               73               1                0                

very large       4                8                3                5                0                

very small       0                0                2                0                2                





Overall Statistics : 

95% CI                                                            (0.53108,0.62954)
ACC Macro                                                         0.83212
ARI                                                               0.14258
AUNP                                                              0.65749
AUNU                                 

### GPT-3

In [None]:
# Build the fine-tuning prompts for the training rows, with each MOF represented by its
# 'clean_mofid' column. The mapping presumably supplies the human-readable property name
# used in the prompt text — confirm in gpt3forchem.input.
train_prompts = create_single_property_forward_prompts(
    train,
    'outputs.logKH_CO2_cat',
    {'outputs.logKH_CO2_cat': 'Henry coefficient'},
    representation_col='clean_mofid',
)
                                                       

In [None]:
# Build the evaluation prompts for the held-out rows with exactly the same settings as the
# training prompts: 'clean_mofid' as the MOF representation and the same property-name mapping.
test_prompts = create_single_property_forward_prompts(
    test,
    'outputs.logKH_CO2_cat',
    {'outputs.logKH_CO2_cat': 'Henry coefficient'},
    representation_col='clean_mofid',
)
                                                       

In [None]:
# Write the prompt frames as JSON Lines files (one record per line) for the fine-tuning call.
# File names carry a second-resolution timestamp and the number of training prompts.
train_size = len(train_prompts)

# DataFrame.to_json does not create missing parent directories, so make sure the
# output folder exists; otherwise this cell fails on a fresh checkout.
os.makedirs("run_files", exist_ok=True)

filename_base = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
train_filename = f"run_files/{filename_base}_train_prompts_mof_kh_{train_size}.jsonl"
test_filename = f"run_files/{filename_base}_test_prompts__mof_kh_{train_size}.jsonl"

train_prompts.to_json(train_filename, orient="records", lines=True)
test_prompts.to_json(test_filename, orient="records", lines=True)


In [None]:
# Launch the fine-tuning job on the files written above. The call returns the name of the
# fine-tuned model; keep it in a variable so the name produced by this run is not lost once
# the cell output is cleared. The bare name on the last line keeps the displayed output unchanged.
fine_tuned_model = fine_tune(train_filename, test_filename)
fine_tuned_model

Fine-tune ft-Q4wHLn1KA8ytEslLVvdmn4l5 has the status "running" and will not be logged
🎉 wandb sync completed successfully


'ada:ft-lsmoepfl-2022-09-12-15-33-35'

In [None]:
# Query the fine-tuned model with the held-out prompts.
# NOTE(review): the model name is hard-coded from one specific fine-tuning run; if the
# fine-tuning cell is re-run, this string must be replaced with the newly returned name.
completion = query_gpt3('ada:ft-lsmoepfl-2022-09-12-15-33-35', test_prompts)

In [None]:
# One integer class label per entry in completion['choices'], extracted by position.
predictions = [
    int(extract_prediction(completion, position))
    for position, _ in enumerate(completion['choices'])
]

In [None]:
# Ground-truth labels: the integer that precedes the first "@" in each completion string.
true = (
    test_prompts['completion']
    .apply(lambda text: int(text.partition("@")[0]))
    .values
)

In [None]:
# Confusion matrix for the fine-tuned model: true integer labels vs. predicted integer labels.
# Note that this rebinds `cm`, replacing the baseline matrix computed earlier.
cm = ConfusionMatrix(actual_vector=true, predict_vector=predictions)

In [None]:
# print() is used deliberately: it renders the matrix together with the overall statistics shown below.
print(cm)

Predict   0         1         2         3         4         
Actual
0         1         3         0         0         0         

1         0         76        50        4         0         

2         0         40        108       20        1         

3         0         6         38        17        2         

4         0         1         8         6         5         





Overall Statistics : 

95% CI                                                            (0.48652,0.58602)
ACC Macro                                                         0.81451
ARI                                                               0.10726
AUNP                                                              0.63061
AUNU                                                              0.62547
Bangdiwala B                                                      0.32891
Bennett S                                                         0.42034
CBA                                                               0