## Import packages

In [1]:
import os
import json

import pandas as pd

from sklearn.model_selection import train_test_split

## Set Params

In [2]:
# Input paths: Excel workbooks holding the labelled compositions.
# The first is split into train/eval subsets; the second is the separate test set.
all_compos_dataset_file_path = 'orig-data/training.xlsx'
all_compos_test_dataset_file_path = 'orig-data/test.xlsx'

# Output paths: CSV files, one per split (train / eval / test).
train_compo_file_path = 'train_compo.csv'
eval_compo_file_path = 'eval_compo.csv'
test_compo_file_path = 'test_compo.csv'

# Output path for the LLM fine-tuning data (currently disabled).
# output_JSONdata_RFT_file_path = 'dataRFT/LiSEI_train.jsonl'

## Prepare Datasets

In [3]:
# Read the training workbook and keep only the 'composition' and 'class'
# columns; everything else in the sheet is dropped.
all_compos_df = pd.read_excel(all_compos_dataset_file_path).loc[:, ['composition', 'class']]

# Quick sanity check: (rows, columns) followed by the first three rows.
print(all_compos_df.shape)
all_compos_df.head(3)

(50, 2)


Unnamed: 0,composition,class
0,Li6NBr3,True
1,Li2S,True
2,Li7PN4,True


In [4]:
# Split the labelled compositions into train and eval subsets (20% eval),
# stratified on 'class' so both subsets keep the same label ratio.
# The fixed random_state makes the split reproducible.
train_Compo_df, eval_Compo_df = train_test_split(
    all_compos_df,
    test_size=0.2,
    stratify=all_compos_df['class'],
    random_state=42,
)

# Sort the training rows by 'class' in descending order.
train_Compo_df = train_Compo_df.sort_values(by='class', ascending=False)

# Sorting does not change the shape, so this reports the size of the split.
print(train_Compo_df.shape)
train_Compo_df.head(3)

(40, 2)


Unnamed: 0,composition,class
7,Li7La3Zr2O12,True
16,Li2PNO2,True
4,LiMgN,True


In [10]:
# Display the whole evaluation subset for inspection.
# NOTE(review): this cell's execution count (10) is out of sequence with its
# neighbours; re-run the notebook top to bottom to confirm the output shown
# still matches the current split.
eval_Compo_df

Unnamed: 0,composition,class
25,Li2TiSiO5,False
33,Li6ZnGe2O8,False
14,LiSmS2,True
47,Li7P3S11,False
26,LiAlSiO4,False
15,LiTaO3,True
9,Li2TiO3,True
13,Li(BH)6,True
29,Li3ErCl6,False
38,LiMnPO4,False


In [5]:
# Read the test workbook in full.
all_compos_test_df = pd.read_excel(all_compos_test_dataset_file_path)

# Keep only the 'composition' and 'class' columns for the test table.
test_Compo_df = all_compos_test_df.loc[:, ['composition', 'class']]

# Quick sanity check: (rows, columns) followed by the first three rows.
print(test_Compo_df.shape)
test_Compo_df.head(3)

(17, 2)


Unnamed: 0,composition,class
0,LiCaAlN2,True
1,LiSiB6,True
2,LiB12PC,True


In [6]:
# Write each split to its CSV file, omitting the DataFrame index so the files
# contain only the data columns.
for frame, out_path in (
    (train_Compo_df, train_compo_file_path),
    (eval_Compo_df, eval_compo_file_path),
    (test_Compo_df, test_compo_file_path),
):
    frame.to_csv(out_path, index=False)

## Prepare datasets for finetuning

In [7]:
# prepare JSON compositions table
#
# NOTE: this cell is disabled (every statement is commented out). If enabled,
# it would build five records, each pairing a differently shuffled copy of the
# training table (random_state 0..4) with the evaluation table, both
# serialized as JSON records, and collect them into `train_prompt_df`.

# records = []

# for i in range(5):
#     shuffled_df = train_Compo_df.sample(frac=1, random_state=i).reset_index(drop=True)
#     train_compo_json = shuffled_df.to_json(orient="records")
#     # Append eval
#     eval_compo_json = eval_Compo_df.to_json(orient="records")
    
#     records.append({"train_compositions_table": train_compo_json,
#                     "eval_compositions_table": eval_compo_json,})

# train_prompt_df = pd.DataFrame(records)
# train_prompt_df

In [8]:
# # Prepare prompt and datasets for RFT
#
# NOTE: this cell is disabled (every statement is commented out). If enabled,
# it would fill the template below with each row's "train_compositions_table"
# value and store the result in a new "prompt" column of `train_prompt_df`,
# which must already exist when the cell runs.

# template = """<|im_start|>system
# You are an expert solid state chemist studying the stability of interfaces of materials with metallic lithium.<|im_end|>
# <|im_start|>user
# Formulate general rules that could be used to predict the stability of a new material based on the observations in compositions table.
# The stable composition is "True" in "class" column, while the unstable composition is "False" in "class" column. Compositions table is shown below:
# {train_compositions_table}
# Show your work in <think> </think> tags. And return the final general rules in <answer> </answer> tags.<|im_end|>
# <|im_start|>assistant
# Let me solve this step by step.
# <think>
# """

# def format_row(row):
#     return template.format(
#         train_compositions_table=row["train_compositions_table"],
#         # target=row["target"]
#     )

# train_prompt_df["prompt"] = train_prompt_df.apply(format_row, axis=1)

# print(train_prompt_df.iloc[0]["prompt"])

# train_prompt_df.head(3)

In [9]:
# NOTE: this cell is disabled (every statement is commented out). If enabled,
# it would write `train_prompt_df` to "train_prompt.csv" without the index.

# train_prompt_file_path = "train_prompt.csv"

# train_prompt_df.to_csv(train_prompt_file_path, index=False)