In [1]:
import pandas as pd
import random
import re
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [None]:
##new data (zeodb) for later use
df_zeodb_af = pd.read_csv('.../zeodb_affinity_full.csv')

In [2]:
df_ori = pd.read_excel('.../ZEOSYN.xlsx')
df_zeos_reduced = pd.read_csv('.../data_reduced_1.csv')
df_zeos_reduced.fillna('0.0', inplace=True)
mask_numeric = df_zeos_reduced.apply(
    lambda col: pd.to_numeric(col, errors='coerce').notna().all()
)
mask_numeric['Code'] = True

df_zeos_reduced = df_zeos_reduced.loc[:, mask_numeric]
print("Remaining columns:", df_zeos_reduced.columns.tolist())

rows_before_cleaning = df_ori.shape[0]
df = df_ori.dropna(subset=['Si'],how="all")
rows_after_cleaning = df.shape[0]
print(f"Number of rows before cleaning: {rows_before_cleaning}")
print(f"Number of rows after cleaning: {rows_after_cleaning}")
print(f"Columns before cleaning: {df_zeos_reduced.shape[1]}")

Remaining columns: ['Code', 'deriv_dist_46', 'deriv_dist_57', 'cum_skewness', 'deriv_dist_55', 'deriv_dist_67', 'deriv_dist_74', 'ring_size_2', 'deriv_dist_53', 'ring_size_3', 'volume', 'a', 'poc_2_sa_a2', 'c', 'num_atoms', 'chan_1_vol_a3', 'deriv_dist_31', 'unitcell_vol', 'deriv_dist_38', 'chan_0_di', 'b', 'deriv_dist_72', 'beta', 'cum_variance', 'ponav_frac', 'cum_kurtosis', 'deriv_dist_35', 'deriv_dist_34', 'chan_0_sa_a2', 'largest_free_sphere_izc', 'deriv_dist_50', 'deriv_dist_56', 'ovlpvfract', 'td_10', 'deriv_dist_42', 'deriv_dist_43', 'nav_cm3_g']
Number of rows before cleaning: 30164
Number of rows after cleaning: 23961
Columns before cleaning: 37


## Match zeolite code with zeolite descriptor

In [3]:
zeo_codes = set(df_zeos_reduced["Code"].dropna().unique())

all_zeolites = df[["Code1", "Code2", "Code3"]].values.flatten()
all_zeolites = {z for z in all_zeolites if pd.notna(z)}

# Find zeolites NOT in df_zeos_reduced["Code"] 
zeo_not_in_descriptors = all_zeolites - zeo_codes

# Function to clean zeolite names by removing leading special characters and slashes
def clean_zeolite_name(name):
    if pd.isna(name):
        return name
    name = re.sub(r'^[-*]+', '', name)  # Remove leading symbols (*, -, etc.)
    name = name.split('/')[0].strip()  # Keep only the first part if separated by '/'
    return name

# Apply cleaning to unmatched zeolites
cleaned_zeo_not_in_descriptors = {clean_zeolite_name(z) for z in zeo_not_in_descriptors}

# Update dataset: Replace old values with cleaned values
for col in ["Code1", "Code2", "Code3"]:
    df[col] = df[col].apply(lambda x: clean_zeolite_name(x) if pd.notna(x) and x in zeo_not_in_descriptors else x)

# Recalculate unmatched zeolites after cleaning
all_zeolites_updated = df[["Code1", "Code2", "Code3"]].values.flatten()
all_zeolites_updated = {z for z in all_zeolites_updated if pd.notna(z)}
zeo_still_not_in_descriptors = all_zeolites_updated - zeo_codes

print("Cleaned unmatched zeolites:", sorted(cleaned_zeo_not_in_descriptors))
print("Cleaned length:", len(cleaned_zeo_not_in_descriptors))
print("Remaining unmatched zeolites after cleaning:", sorted(zeo_still_not_in_descriptors))
print("Remaining unmatched count:", len(zeo_still_not_in_descriptors))


Cleaned unmatched zeolites: ['AEI', 'AFX', 'ASU-12', 'ASU-14', 'ASU-16', 'BEA', 'CHA', 'CLO', 'CTH', 'ERI', 'EUO', 'FAU', 'FDU-4', 'IFT', 'IFU', 'IM-14', 'IRY', 'ISV', 'ITN', 'ITQ-21', 'ITV', 'LIT', 'MEL', 'MFI', 'MOR', 'MRE', 'NUD-1', 'PKU-17', 'RUT', 'SFV', 'STF', 'STO', 'SU-65', 'SU-67', 'SU-74', 'SU-77', 'SU-79', 'SU-M', 'SU-MB', 'SVR', 'SVY', 'SYSU-3', 'TON', 'UOE']
Cleaned length: 44
Remaining unmatched zeolites after cleaning: ['ASU-12', 'ASU-14', 'ASU-16', 'FDU-4', 'IM-14', 'ITQ-21', 'NUD-1', 'PKU-17', 'SU-65', 'SU-67', 'SU-74', 'SU-77', 'SU-79', 'SU-M', 'SU-MB', 'SYSU-3']
Remaining unmatched count: 16


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].apply(lambda x: clean_zeolite_name(x) if pd.notna(x) and x in zeo_not_in_descriptors else x)


In [4]:
df_with_zeo_feats = df[
    df[["Code1", "Code2", "Code3"]].apply(
        lambda row: any(z in zeo_codes for z in row if pd.notna(z)) and
                    all(z not in zeo_still_not_in_descriptors for z in row if pd.notna(z)), 
        axis=1
    )
]

print('Num. synthesis routes with zeolite features:', len(df_with_zeo_feats))

#Add zeolite-free syntheses
df_zeo_free = df[
    df['Code1'].isna() & df['Code2'].isna() & df['Code3'].isna()
]

df_all_zeo_feats = pd.concat([df_with_zeo_feats, df_zeo_free])
print('Num. synthesis routes without products:', len(df_zeo_free))
print('Num. total synthesis routes:', len(df_with_zeo_feats))

df_all_zeo_feats[["Code1", "Code2", "Code3"]]


Num. synthesis routes with zeolite features: 18378
Num. synthesis routes without products: 5371
Num. total synthesis routes: 18378


Unnamed: 0,Code1,Code2,Code3
0,AFI,,
1,AFI,,
2,AFI,,
3,AFI,,
5,BPH,,
...,...,...,...
30015,,,
30016,,,
30026,,,
30084,,,


In [5]:
def calculate_yield(row):
    count_non_nan = row.notna().sum()
    if count_non_nan > 0:
        return 1/count_non_nan
    else:
        return 0

def augment_data_correct_gel_composition(df_all_zeo_feats):
    augmented_rows = []
    
    # gel_composition_columns = df_all_zeo_feats.loc[:, 'Si':'OH'].columns
    all_zeolite_codes = df_all_zeo_feats[['Code1', 'Code2', 'Code3']].stack().dropna().unique()
    
    for index, row in df_all_zeo_feats.iterrows():
        # gel_composition = row[gel_composition_columns]
        osda_smiles = row[['osda1 smiles', 'osda2 smiles', 'osda3 smiles']]
        zeolite_codes = [row['Code1'], row['Code2'], row['Code3']]
        cryst_time = row['cryst_time']
        cryst_temp = row['cryst_temp']
        
        # Calculate the yield based on non-NaN values in the zeolite codes
        calculated_yield = calculate_yield(pd.Series(zeolite_codes))
            
        # Ensure every row contributes, even if all zeolite codes are NaN
        if all(pd.isnull(zeolite_codes)):
            valid_zeolite_codes = [random.choice(all_zeolite_codes)]  # Assign a random zeolite
            calculated_yield = 0
        else:
            # Process and filter zeolite codes, cleaning symbols
            valid_zeolite_codes = [code for code in zeolite_codes if pd.notnull(code)]
        
        for zeolite_code in valid_zeolite_codes:
            augmented_row = osda_smiles.tolist() + [zeolite_code, calculated_yield, cryst_time, cryst_temp]
            augmented_rows.append(augmented_row)
    
    augmented_columns = ['osda1_smiles', 'osda2_smiles', 'osda3_smiles', 'zeolite_code', 'yield', 'cryst_time', 'cryst_temp']
    
    augmented_df = pd.DataFrame(augmented_rows, columns=augmented_columns)
    yield_counts = augmented_df['yield'].round(2).value_counts().sort_index()
    print("\nYield counts:")
    print(yield_counts)
    
    return augmented_df

augmented_df_gel_correct = augment_data_correct_gel_composition(df_all_zeo_feats)
print(augmented_df_gel_correct.head())



Yield counts:
0.00     5371
0.33      504
0.50     4106
1.00    16157
Name: yield, dtype: int64
                                        osda1_smiles  osda2_smiles  \
0  C[N+]12CCCC[C@@H]1[C@H]1C[C@@H](C2)[C@@H]2CCCC...           NaN   
1  C[N+]12CCCC[C@@H]1[C@H]1C[C@@H](C2)[C@@H]2CCCC...           NaN   
2                    C[N+](C)(C)C12CC3CC(CC(C3)C1)C2           NaN   
3                    C[N+](C)(C)C12CC3CC(CC(C3)C1)C2           NaN   
4                                   CC[N+](CC)(CC)CC  C[N+](C)(C)C   

  osda3_smiles zeolite_code  yield  cryst_time  cryst_temp  
0          NaN          AFI    1.0       504.0       150.0  
1          NaN          AFI    1.0       168.0       175.0  
2          NaN          AFI    1.0       144.0       150.0  
3          NaN          AFI    1.0         NaN         NaN  
4          NaN          BPH    1.0        72.0       125.0  


In [6]:
df_with_osda = augmented_df_gel_correct[
    augmented_df_gel_correct[['osda1_smiles','osda2_smiles','osda3_smiles']].notna().any(axis=1)
]

##if temperature OR time is missing, drop that row
df_with_osda = df_with_osda[
    df_with_osda['cryst_time'].notna() & df_with_osda['cryst_temp'].notna()
]
df_with_osda.head()

Unnamed: 0,osda1_smiles,osda2_smiles,osda3_smiles,zeolite_code,yield,cryst_time,cryst_temp
0,C[N+]12CCCC[C@@H]1[C@H]1C[C@@H](C2)[C@@H]2CCCC...,,,AFI,1.0,504.0,150.0
1,C[N+]12CCCC[C@@H]1[C@H]1C[C@@H](C2)[C@@H]2CCCC...,,,AFI,1.0,168.0,175.0
2,C[N+](C)(C)C12CC3CC(CC(C3)C1)C2,,,AFI,1.0,144.0,150.0
4,CC[N+](CC)(CC)CC,C[N+](C)(C)C,,BPH,1.0,72.0,125.0
5,CC[N+](CC)(CC)CC,C[N+](C)(C)C,,UFI,1.0,96.0,150.0


In [7]:
from rdkit import Chem
def is_valid_smiles_or_empty(smiles):
    if pd.isna(smiles):
        return True  # Consider empty, NaN, or '0' as v
    return Chem.MolFromSmiles(smiles) is not None

df_valid = df_with_osda[
    df_with_osda['osda1_smiles'].apply(is_valid_smiles_or_empty) &
    df_with_osda['osda2_smiles'].apply(is_valid_smiles_or_empty) &
    df_with_osda['osda3_smiles'].apply(is_valid_smiles_or_empty)
]

unique_osdas_valid = set(df_valid['osda1_smiles'].value_counts().keys()) |\
                        set(df_valid['osda2_smiles'].value_counts().keys()) |\
                        set(df_valid['osda3_smiles'].value_counts().keys())

print("df_valid", len(unique_osdas_valid))

[09:50:20] SMILES Parse Error: syntax error while parsing: {CCC[N+]12CCN(CC1)CC2}
[09:50:20] SMILES Parse Error: Failed parsing SMILES '{CCC[N+]12CCN(CC1)CC2}' for input: '{CCC[N+]12CCN(CC1)CC2}'
[09:50:20] SMILES Parse Error: syntax error while parsing: {CCC[N+]12CCN(CC1)CC2}
[09:50:20] SMILES Parse Error: Failed parsing SMILES '{CCC[N+]12CCN(CC1)CC2}' for input: '{CCC[N+]12CCN(CC1)CC2}'
[09:50:20] SMILES Parse Error: syntax error while parsing: {CCC[N+]12CCN(CC1)CC2}
[09:50:20] SMILES Parse Error: Failed parsing SMILES '{CCC[N+]12CCN(CC1)CC2}' for input: '{CCC[N+]12CCN(CC1)CC2}'
[09:50:20] SMILES Parse Error: syntax error while parsing: {CCC[N+]12CCN(CC1)CC2}
[09:50:20] SMILES Parse Error: Failed parsing SMILES '{CCC[N+]12CCN(CC1)CC2}' for input: '{CCC[N+]12CCN(CC1)CC2}'
[09:50:20] SMILES Parse Error: syntax error while parsing: {CCC[N+]12CCN(CC1)CC2}
[09:50:20] SMILES Parse Error: Failed parsing SMILES '{CCC[N+]12CCN(CC1)CC2}' for input: '{CCC[N+]12CCN(CC1)CC2}'
[09:50:20] SMILES Pa

df_valid 792


In [8]:
columns_to_extract = ["osda1_smiles", "osda2_smiles", "osda3_smiles", "zeolite_code"]
#gel_comp = df_valid[['Si', 'Al', 'H2O','sda1', 'sda2', 'sda3', 'OH']].copy()
# gel_comp = df_valid.loc[:, 'Si':'OH'].copy()
new_combined = df_valid[columns_to_extract]
new_combined['osda_combined'] = new_combined[['osda1_smiles', 'osda2_smiles', 'osda3_smiles']].fillna("").agg('.'.join, axis=1)
new_combined['osda_combined'] = new_combined['osda_combined'].str.replace(r'\.+', '.', regex=True).str.strip('.')
new_combined.drop(columns=['osda1_smiles', 'osda2_smiles', 'osda3_smiles'], inplace=True)
new_combined = pd.concat([new_combined, df_valid.drop(columns=columns_to_extract)], axis=1)
# new_combined = pd.concat([new_combined, df_valid.drop(columns=columns_to_extract)], axis=1)
# new_combined.drop(columns=['zeolite_code'], inplace=True)
print("New Combined: ", new_combined.shape)
new_combined.head()

New Combined:  (21184, 5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_combined['osda_combined'] = new_combined[['osda1_smiles', 'osda2_smiles', 'osda3_smiles']].fillna("").agg('.'.join, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_combined['osda_combined'] = new_combined['osda_combined'].str.replace(r'\.+', '.', regex=True).str.strip('.')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_combined.drop(co

Unnamed: 0,zeolite_code,osda_combined,yield,cryst_time,cryst_temp
0,AFI,C[N+]12CCCC[C@@H]1[C@H]1C[C@@H](C2)[C@@H]2CCCC...,1.0,504.0,150.0
1,AFI,C[N+]12CCCC[C@@H]1[C@H]1C[C@@H](C2)[C@@H]2CCCC...,1.0,168.0,175.0
2,AFI,C[N+](C)(C)C12CC3CC(CC(C3)C1)C2,1.0,144.0,150.0
4,BPH,CC[N+](CC)(CC)CC.C[N+](C)(C)C,1.0,72.0,125.0
5,UFI,CC[N+](CC)(CC)CC.C[N+](C)(C)C,1.0,96.0,150.0


In [9]:
def assign_class(yield_value):
    if pd.isna(yield_value):
        return 0
    elif yield_value == 1:
        return 1
    elif yield_value == 1/2:
        return 1
    elif yield_value == 0.333333333:
        return 1
    else:
        return 0

new_combined['class'] = new_combined['yield'].apply(assign_class)
new_combined.drop(columns=['yield'], inplace=True)
print(new_combined['class'].value_counts())

1    16434
0     4750
Name: class, dtype: int64


In [None]:
##if one row has multiple osdas separated by dot (.) --> split it
def augment_osda_pairs(df):
    records = []  
    for _, row in df.iterrows():
        osdas = row["osda_combined"].split(".")
        for osda in osdas:
            rec = {
                "osda": osda.strip(),
                "class": row["class"],
                "zeolite_code": row["zeolite_code"]
            }
            records.append(rec)
    return pd.DataFrame(records)

new_combined2 = augment_osda_pairs(new_combined)

In [10]:
new_combined.drop_duplicates(subset=['osda_combined','class','zeolite_code','cryst_temp','cryst_time'], inplace=True)

In [None]:
##for adding zeodb data --> if you don't want to add, then skip
from_zeodb = df_zeodb_af[['Host', 'SMILES']]

from_zeodb['class'] = 1.0
from_zeodb.columns.tolist()

new_combined_3 = pd.concat([new_combined2, from_zeodb.rename(columns={'Host': 'zeolite_code', 'SMILES': 'osda', 'class':'class'})], ignore_index=True)

#drop duplicates if any (same OSDA, same zeolite)
new_combined_3.drop_duplicates(subset=['osda', 'zeolite_code'], inplace=True)
print("Final dataset shape:", new_combined_3.shape)

new_combined_3

In [None]:
# new_combined_3.to_csv('.../all_data_ori.csv')

In [None]:
#save to csv for false data generation (refer to to artificial_data_gen.py)
new_combined_3.to_csv(".../want_to_gen.csv")

## After Generation

In [1]:
#take the file from artificial_data_gen

new_combined_gen = pd.read_csv(".../after_gen.csv")
new_combined_gen = new_combined_gen.merge(df_zeos_reduced, left_on='zeolite_code', right_on='Code', how='left')

NameError: name 'pd' is not defined

In [11]:
#only take the rows with class = 1.0
new_combined_gen = new_combined[new_combined['class'] == 1.0]
new_combined_gen = new_combined_gen.merge(df_zeos_reduced, left_on='zeolite_code', right_on='Code', how='left')

In [12]:
#normalizing zeolite descriptors

minmax = MinMaxScaler()
desc_start = 'deriv_dist_46'
desc_end = 'nav_cm3_g'

desc_cols = new_combined_gen.loc[:, desc_start:desc_end].columns
new_combined_gen[desc_cols] = minmax.fit_transform(new_combined_gen[desc_cols])

In [13]:
def stratified_split(data, labels, train_size=0.9, val_size=0.05, test_size=0.05, random_state=42):
    """
    Splits data into train, validation, and test sets with stratified sampling.
    Shuffles within each split while keeping rows intact.
    """
    assert abs(train_size + val_size + test_size - 1.0) < 1e-8, "Train, val, and test sizes must sum to 1.0"
    
    from sklearn.model_selection import train_test_split
    import pandas as pd

    # Separate data by class
    data_by_class = {cls: data[labels == cls] for cls in labels.unique()}
    
    train_data, train_labels = [], []
    val_data, val_labels = [], []
    test_data, test_labels = [], []
    
    for cls, cls_data in data_by_class.items():
        cls_train_size = train_size
        cls_val_size = val_size
        cls_test_size = test_size
        cls_test_adjusted = 1 - cls_train_size
            
        cls_train, cls_remaining = train_test_split(
            cls_data, test_size=cls_test_adjusted, random_state=random_state
        )
        cls_val, cls_test = train_test_split(
            cls_remaining, test_size=cls_test_size / (cls_val_size + cls_test_size), random_state=random_state
        )
            
        train_data.append(cls_train)
        val_data.append(cls_val)
        test_data.append(cls_test)
        train_labels += [cls] * len(cls_train)
        val_labels += [cls] * len(cls_val)
        test_labels += [cls] * len(cls_test)
    
    # Concatenate splits
    train_data = pd.concat(train_data)
    val_data = pd.concat(val_data)
    test_data = pd.concat(test_data)

    # Shuffle each set but keep rows intact
    train_data, train_labels = train_data.sample(frac=1, random_state=random_state), pd.Series(train_labels, index=train_data.index)
    val_data, val_labels = val_data.sample(frac=1, random_state=random_state), pd.Series(val_labels, index=val_data.index)
    test_data, test_labels = test_data.sample(frac=1, random_state=random_state), pd.Series(test_labels, index=test_data.index)

    # Reset indices to keep alignment
    train_data, train_labels = train_data.reset_index(drop=True), train_labels.reset_index(drop=True)
    val_data, val_labels = val_data.reset_index(drop=True), val_labels.reset_index(drop=True)
    test_data, test_labels = test_data.reset_index(drop=True), test_labels.reset_index(drop=True)
    
    return train_data, val_data, test_data, train_labels, val_labels, test_labels

train_data, val_data, test_data, train_labels, val_labels, test_labels = stratified_split(new_combined_gen, new_combined_gen['class'])

print("Class Distribution in Training Data:")
print(pd.Series(train_labels).value_counts(normalize=False))
print("Class Distribution in Validation Data:")
print(pd.Series(val_labels).value_counts(normalize=False))
print("Class Distribution in Test Data:")
print(pd.Series(test_labels).value_counts(normalize=False))

##use below code if you want to split train and val only (after separating test set manually)
# def stratified_train_val_split(data, labels, train_size=0.9, val_size=0.1, random_state=42):
#     """
#     Splits data into train and val sets with stratified sampling.
#     Assumes test set has already been separated manually.
#     """
#     from sklearn.model_selection import train_test_split
#     import pandas as pd

#     assert abs(train_size + val_size - 1.0) < 1e-8, "Train and val sizes must sum to 1.0"

#     data_by_class = {cls: data[labels == cls] for cls in labels.unique()}

#     train_data, val_data = [], []
#     train_labels, val_labels = [], []

#     for cls, cls_data in data_by_class.items():

#         # split each class into train and val
#         cls_train, cls_val = train_test_split(
#             cls_data,
#             test_size=val_size,
#             random_state=random_state
#         )

#         train_data.append(cls_train)
#         val_data.append(cls_val)

#         train_labels += [cls] * len(cls_train)
#         val_labels += [cls] * len(cls_val)

#     # combine
#     train_data = pd.concat(train_data)
#     val_data = pd.concat(val_data)

#     # shuffle
#     train_data = train_data.sample(frac=1, random_state=random_state)
#     val_data = val_data.sample(frac=1, random_state=random_state)

#     # align labels with shuffled indices
#     train_labels = pd.Series(train_labels, index=train_data.index).reset_index(drop=True)
#     val_labels = pd.Series(val_labels, index=val_data.index).reset_index(drop=True)

#     # reset dataframe indices
#     train_data = train_data.reset_index(drop=True)
#     val_data = val_data.reset_index(drop=True)

#     return train_data, val_data, train_labels, val_labels

# train_data, val_data, train_labels, val_labels = stratified_train_val_split(train_val_set, train_val_set['class'])
# print("Class Distribution in Training Data:")
# print(train_labels.value_counts(normalize=False))
# print("Class Distribution in Validation Data:")
# print(val_labels.value_counts(normalize=False))


Class Distribution in Training Data:
1    6090
dtype: int64
Class Distribution in Validation Data:
1    338
dtype: int64
Class Distribution in Test Data:
1    339
dtype: int64


In [14]:
train_data = train_data.loc[:, ~train_data.columns.duplicated()]
val_data = val_data.loc[:, ~val_data.columns.duplicated()]
test_set = test_data.loc[:, ~test_data.columns.duplicated()]

print(train_data.shape) 
print(val_data.shape)
print(test_set.shape)

(6090, 42)
(338, 42)
(339, 42)


In [15]:
log_scaling = lambda x: np.log1p(x)
stdscale = StandardScaler()

# TRAIN
time_T_tr = train_data['cryst_time']
temp_T_tr = train_data['cryst_temp']
# yield_tr = train_data['class']
zeo_desc_tr = train_data.loc[:, 'deriv_dist_46':'nav_cm3_g']
smiles_tr = train_data['osda_combined']
zeo_code_tr = train_data['zeolite_code']

# Fit scalers on train
norm_time_tr = log_scaling(time_T_tr.values.reshape(-1, 1))
norm_temp_tr = log_scaling(temp_T_tr.values.reshape(-1, 1))

smiles_tr = np.array(smiles_tr).reshape(-1, 1)

# Combine time and temperature
# time_temp_tr_concat = np.hstack([norm_time_tr, norm_temp_tr])

chemprop_feat_train = pd.DataFrame(
    np.hstack([zeo_desc_tr]),
    columns=list(zeo_desc_tr)
)


chemprop_input_train = pd.DataFrame(
    np.hstack([smiles_tr, time_temp_tr_concat]),
    columns=['osda_combined'] + ['norm_cryst_time', 'norm_cryst_temp']
)

# Validation

time_T_v = val_data['cryst_time']
temp_T_v = val_data['cryst_temp']
# yield_v = val_data['class']
zeo_desc_v = val_data.loc[:, 'deriv_dist_46':'nav_cm3_g']
smiles_v = val_data['osda_combined']
zeo_code_v = val_data['zeolite_code']


norm_time_v = log_scaling(time_T_v.values.reshape(-1, 1))
norm_temp_v = log_scaling(temp_T_v.values.reshape(-1, 1))
# yield_v = np.array(yield_v).reshape(-1, 1)
smiles_v = np.array(smiles_v).reshape(-1, 1)

# Combine time and temperature
# time_temp_v_concat = np.hstack([norm_time_v, norm_temp_v])

chemprop_feat_val = pd.DataFrame(
    np.hstack([zeo_desc_v]),
    columns=list(zeo_desc_v)
)

chemprop_input_val = pd.DataFrame(
    np.hstack([smiles_v, time_temp_v_concat]),
    columns=['osda_combined', 'norm_cryst_time', 'norm_cryst_temp']
)

# Test
time_T_te = test_data['cryst_time']
temp_T_te = test_data['cryst_temp']
# yield_te = test_data['class']
zeo_desc_te = test_data.loc[:, 'deriv_dist_46':'nav_cm3_g']
smiles_te = test_data['osda_combined']
zeo_code_te = test_data['zeolite_code']

norm_time_te = log_scaling(time_T_te.values.reshape(-1, 1))
norm_temp_te = log_scaling(temp_T_te.values.reshape(-1, 1))
# yield_te = np.array(yield_te).reshape(-1, 1)
smiles_te = np.array(smiles_te).reshape(-1, 1)

# Combine time and temperature
# time_temp_te_concat = np.hstack([norm_time_te, norm_temp_te])

chemprop_feat_test = pd.DataFrame(
    np.hstack([zeo_desc_te]),
    columns=list(zeo_desc_te)
)

chemprop_input_test = pd.DataFrame(
    np.hstack([smiles_te, time_temp_te_concat]),
    columns=['osda_combined', 'norm_cryst_time', 'norm_cryst_temp']
)

chemprop_input_train.to_csv('.../input_train.csv', index=False)
chemprop_input_val.to_csv('.../input_val.csv', index=False)
chemprop_input_test.to_csv('.../input_test.csv', index=False)

zeo_code_tr.to_csv('.../zeo_code_train.csv', index=False)
zeo_code_v.to_csv('.../zeo_code_val.csv', index=False)
zeo_code_te.to_csv('.../zeo_code_test.csv', index=False)


In [16]:
def drop_string_columns(df):
    columns_to_drop = [col for col in df.columns if df[col].apply(lambda x: isinstance(x, str)).any()]
    df.drop(columns=columns_to_drop, inplace=True, errors='ignore')
    return df

chemprop_feat_train_clean = drop_string_columns(chemprop_feat_train).fillna(0)
chemprop_feat_val_clean = drop_string_columns(chemprop_feat_val).fillna(0)  
chemprop_feat_test_clean = drop_string_columns(chemprop_feat_test).fillna(0)

print("Test shape: ", chemprop_feat_test_clean.shape)

chemprop_feat_train_clean.to_csv('.../feat_train.csv', index=False)
chemprop_feat_val_clean.to_csv('.../feat_val.csv', index=False)
chemprop_feat_test_clean.to_csv('.../feat_test.csv', index=False)

Test shape:  (339, 36)
