# File for dataset preparation 

In [1]:
import sqlite3
import pandas as pd

## 1. SQL data retrieval

In [2]:
# Open the SQLite database and list the tables it contains.
conn = sqlite3.connect('cagey.db')
try:
    cursor = conn.cursor()
    # sqlite_master is SQLite's schema catalogue; rows with type='table' are the tables.
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    tables = cursor.fetchall()
finally:
    # Release the connection even when the query fails.
    conn.close()

print("Tables in the database:", tables)

Tables in the database: [('precursors',), ('reactions',), ('nmr_spectra',), ('nmr_aldehyde_peaks',), ('nmr_imine_peaks',), ('mass_spectra',), ('mass_spectrum_peaks',), ('mass_spectrum_topology_assignments',), ('turbidity_dissolved_references',), ('turbidity_measurements',), ('turbidities',)]


In [3]:
# Load every table of the database into its own DataFrame, keyed by table name.
conn = sqlite3.connect('cagey.db')
try:
    tables_query = "SELECT name FROM sqlite_master WHERE type='table';"
    tables_df = pd.read_sql(tables_query, conn)
    table_names = tables_df['name'].tolist()

    tables_dataframes = {}

    for table in table_names:
        # Table names cannot be bound as SQL parameters, so quote the identifier
        # (doubling embedded quotes) to keep unusual names from breaking the query.
        quoted_table = '"' + table.replace('"', '""') + '"'
        query = f"SELECT * FROM {quoted_table}"
        df = pd.read_sql(query, conn)
        tables_dataframes[table] = df
        print(f"Table {table} loaded successfully with {len(df)} rows.")
finally:
    # Release the connection even when one of the reads fails.
    conn.close()


Table precursors loaded successfully with 55 rows.
Table reactions loaded successfully with 450 rows.
Table nmr_spectra loaded successfully with 449 rows.
Table nmr_aldehyde_peaks loaded successfully with 751 rows.
Table nmr_imine_peaks loaded successfully with 8897 rows.
Table mass_spectra loaded successfully with 401 rows.
Table mass_spectrum_peaks loaded successfully with 710 rows.
Table mass_spectrum_topology_assignments loaded successfully with 541 rows.
Table turbidity_dissolved_references loaded successfully with 402 rows.
Table turbidity_measurements loaded successfully with 6824 rows.
Table turbidities loaded successfully with 402 rows.


In [4]:
# Sanity check of what was loaded: container type and the first cell of each table.
for table_name, df in tables_dataframes.items():
    print(f"Table: {table_name}, Type: {type(df)}")
    if df.empty:
        # iloc[0, 0] raises IndexError on a table without rows or columns.
        print("First value in first row: <table is empty>")
        continue
    first_value = df.iloc[0, 0]
    print(f"First value in first row: {first_value}, Type: {type(first_value)}")


Table: precursors, Type: <class 'pandas.core.frame.DataFrame'>
First value in first row: Di1, Type: <class 'str'>
Table: reactions, Type: <class 'pandas.core.frame.DataFrame'>
First value in first row: 1, Type: <class 'numpy.int64'>
Table: nmr_spectra, Type: <class 'pandas.core.frame.DataFrame'>
First value in first row: 1, Type: <class 'numpy.int64'>
Table: nmr_aldehyde_peaks, Type: <class 'pandas.core.frame.DataFrame'>
First value in first row: 1, Type: <class 'numpy.int64'>
Table: nmr_imine_peaks, Type: <class 'pandas.core.frame.DataFrame'>
First value in first row: 1, Type: <class 'numpy.int64'>
Table: mass_spectra, Type: <class 'pandas.core.frame.DataFrame'>
First value in first row: 1, Type: <class 'numpy.int64'>
Table: mass_spectrum_peaks, Type: <class 'pandas.core.frame.DataFrame'>
First value in first row: 1, Type: <class 'numpy.int64'>
Table: mass_spectrum_topology_assignments, Type: <class 'pandas.core.frame.DataFrame'>
First value in first row: 1, Type: <class 'numpy.int64'

In [88]:
# Display the precursors table (columns: name, smiles).
tables_dataframes['precursors']

Unnamed: 0,name,smiles
0,Di1,O=Cc1cccc(C=O)c1
1,Di2,CC(C)(C)c1cc(C=O)c(O)c(C=O)c1
2,Di3,O=Cc1cc2sc(C=O)cc2s1
3,Di4,O=Cc1ccc(C=O)cc1
4,Di5,O=Cc1c(F)c(F)c(C=O)c(F)c1F
5,Di6,O=Cc1ccc(-c2ccc(C=O)cc2)cc1
6,Di7,O=Cc1c2ccccc2c(C=O)c2ccccc12
7,Di8,O=Cc1ccc2ccc3ccc(C=O)nc3c2n1
8,Di9,O=Cc1ccsc1C=O
9,Di10,O=Cc1ccccc1C=O


# Threshold around 55% in turbidity

## 1.1. Building few-shot examples and test set

In [5]:
# Pair every reaction with the SMILES strings of its two building blocks.
building_blocks_pairs = (
    tables_dataframes['reactions'][['experiment', 'di_name', 'tri_name']]
    .rename(columns={'experiment': 'experiment_number'})
)

# One name -> SMILES lookup, built once, replaces a scan of the precursors
# table for every row. When a name is repeated the first entry is used.
smiles_by_name = (
    tables_dataframes['precursors']
    .dropna(subset=['name'])
    .drop_duplicates(subset='name', keep='first')
    .set_index('name')['smiles']
    .to_dict()
)

# Names without a matching precursor keep an empty string.
building_blocks_pairs['di_smiles'] = building_blocks_pairs['di_name'].map(
    lambda name: smiles_by_name.get(name, "")
)
building_blocks_pairs['tri_smiles'] = building_blocks_pairs['tri_name'].map(
    lambda name: smiles_by_name.get(name, "")
)


In [6]:
# Remove pandas' display limits so every row and column of a frame is rendered.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# *** NEW PREP DATA ***

In [7]:
import json

# JSON file whose entries are counted below.
file_path = '/Users/davidsegura/git/Leveraging_LLMs_causality/cage_SAD_llm/data/precursor_combination_map_to_exp_data.json'  

# State the encoding explicitly so the platform default is not used for a JSON file.
with open(file_path, 'r', encoding='utf-8') as file:
    json_data = json.load(file)

count = len(json_data)

print(f"Number of instances in the JSON: {count}")

Number of instances in the JSON: 366


In [8]:
import os

In [1]:
import json
import pandas as pd
import os

file_path = '/Users/davidsegura/git/Leveraging_LLMs_causality/cage_SAD_llm/data/precursor_combination_map_to_exp_data.json'  # Replace with your JSON file path
base_dir = '/Users/davidsegura/git/Leveraging_LLMs_causality/cage_SAD_llm'

with open(file_path, 'r') as file:
    json_data = json.load(file)

# One summary record per JSON entry: combination, plate and mean turbidity (None when unavailable).
rows = []

for key, value in json_data.items():
    precursor_combination = value['precursor_combination']
    plate_number = value['plate_number']
    turbidity_data_path = value['turbidity_data_path']
    # The stored path starts with '/', which would make os.path.join discard base_dir.
    full_turbidity_path = os.path.join(base_dir, turbidity_data_path.lstrip('/'), 'turbidity_data.csv')
    tag = f"[{precursor_combination}]"

    # Stays None unless a mean is computed below.
    mean_turbidity = None

    print(f"{tag} Trying to access file: {full_turbidity_path}")

    if not os.path.exists(full_turbidity_path):
        print(f"{tag} File does not exist: {full_turbidity_path}")
    else:
        try:
            measurements = pd.read_csv(full_turbidity_path)
            print(f"{tag} Original Columns in {full_turbidity_path}: {measurements.columns}")

            # Headers are matched after trimming whitespace and lower-casing.
            measurements.columns = measurements.columns.str.strip().str.lower()
            print(f"{tag} Normalized Columns in {full_turbidity_path}: {measurements.columns}")

            if 'turbidity' not in measurements.columns:
                print(f"{tag} 'turbidity' column not found in: {full_turbidity_path}")
            else:
                mean_turbidity = measurements['turbidity'].mean()
                print(f"{tag} Mean turbidity found: {mean_turbidity}")
        except Exception as e:
            print(f"{tag} Error reading file {full_turbidity_path}: {e}")
            mean_turbidity = None

    rows.append({
        'precursor_combination': precursor_combination,
        'plate_number': plate_number,
        'turbidity': mean_turbidity,
    })

turbidity_df = pd.DataFrame(rows)

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

print(turbidity_df)

turbidity_df.to_csv('mean_turbidity_data.csv', index=False)

[A1] Trying to access file: /Users/davidsegura/git/Leveraging_LLMs_causality/cage_SAD_llm/turbidity/AB-02-005/01/solubility_study_1/turbidity_data.csv
[A1] Original Columns in /Users/davidsegura/git/Leveraging_LLMs_causality/cage_SAD_llm/turbidity/AB-02-005/01/solubility_study_1/turbidity_data.csv: Index(['Timestamp', 'Time (s)', 'Time (min)', 'Time (hour)', 'Turbidity'], dtype='object')
[A1] Normalized Columns in /Users/davidsegura/git/Leveraging_LLMs_causality/cage_SAD_llm/turbidity/AB-02-005/01/solubility_study_1/turbidity_data.csv: Index(['timestamp', 'time (s)', 'time (min)', 'time (hour)', 'turbidity'], dtype='object')
[A1] Mean turbidity found: 89.3897306681133
[A2] Trying to access file: /Users/davidsegura/git/Leveraging_LLMs_causality/cage_SAD_llm/turbidity/AB-02-005/01/solubility_study_2/turbidity_data.csv
[A2] Original Columns in /Users/davidsegura/git/Leveraging_LLMs_causality/cage_SAD_llm/turbidity/AB-02-005/01/solubility_study_2/turbidity_data.csv: Index(['Timestamp', 'Ti

In [39]:
import json
import os
import re

import pandas as pd

file_path = '/Users/davidsegura/git/Leveraging_LLMs_causality/cage_SAD_llm/data/precursor_combination_map_to_exp_data.json'  # Replace with your JSON file path
base_dir = '/Users/davidsegura/git/Leveraging_LLMs_causality/cage_SAD_llm'


def _zero_padded_variant(data_path):
    """Return `data_path` with its single trailing digit zero-padded, else None.

    'solubility_study_1' becomes 'solubility_study_01'. A path that ends in two
    or more digits, or in no digit at all, has no padded variant: padding only
    its last digit would turn '..._12' into '..._02', i.e. a different study.
    """
    trimmed = data_path.rstrip('/')
    match = re.search(r'(?<!\d)\d\Z', trimmed)
    if match is None:
        return None
    return trimmed[:match.start()] + '0' + match.group()


def _mean_turbidity(csv_path, label):
    """Return the mean of the 'turbidity' column of `csv_path`, or None.

    Column names are matched after stripping whitespace and lower-casing.
    Progress and failures are printed with the `label` prefix; a read or
    parse error yields None so one bad file does not stop the whole run.
    """
    try:
        measurements = pd.read_csv(csv_path)
        print(f"{label} Original Columns in {csv_path}: {measurements.columns}")

        measurements.columns = measurements.columns.str.strip().str.lower()
        print(f"{label} Normalized Columns in {csv_path}: {measurements.columns}")

        if 'turbidity' not in measurements.columns:
            print(f"{label} 'turbidity' column not found in: {csv_path}")
            return None

        mean_value = measurements['turbidity'].mean()
        print(f"{label} Mean turbidity found: {mean_value}")
        return mean_value
    except Exception as e:
        print(f"{label} Error reading file {csv_path}: {e}")
        return None


with open(file_path, 'r', encoding='utf-8') as file:
    json_data = json.load(file)

# One summary record per JSON entry: combination, plate and mean turbidity (None when unavailable).
rows = []

for value in json_data.values():
    precursor_combination = value['precursor_combination']
    plate_number = value['plate_number']
    turbidity_data_path = value['turbidity_data_path']
    label = f"[{precursor_combination}]"

    # The stored path starts with '/', which would make os.path.join discard base_dir.
    full_turbidity_path = os.path.join(base_dir, turbidity_data_path.lstrip('/'), 'turbidity_data.csv')
    print(f"{label} Trying to access file: {full_turbidity_path}")

    mean_turbidity = None
    if os.path.exists(full_turbidity_path):
        mean_turbidity = _mean_turbidity(full_turbidity_path, label)
    else:
        # Second attempt: the same directory name with a zero-padded study number.
        padded_data_path = _zero_padded_variant(turbidity_data_path)
        if padded_data_path is None:
            print(f"{label} File does not exist: {full_turbidity_path}")
        else:
            full_modified_turbidity_path = os.path.join(base_dir, padded_data_path.lstrip('/'), 'turbidity_data.csv')
            print(f"{label} Trying to access modified file path: {full_modified_turbidity_path}")
            if os.path.exists(full_modified_turbidity_path):
                mean_turbidity = _mean_turbidity(full_modified_turbidity_path, label)
            else:
                print(f"{label} File does not exist: {full_turbidity_path} or {full_modified_turbidity_path}")

    rows.append({'precursor_combination': precursor_combination, 'plate_number': plate_number, 'turbidity': mean_turbidity})

turbidity_df = pd.DataFrame(rows)

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

print(turbidity_df)

turbidity_df.to_csv('mean_turbidity_data.csv', index=False)


[A1] Trying to access file: /Users/davidsegura/git/Leveraging_LLMs_causality/cage_SAD_llm/turbidity/AB-02-005/01/solubility_study_1/turbidity_data.csv
[A1] Original Columns in /Users/davidsegura/git/Leveraging_LLMs_causality/cage_SAD_llm/turbidity/AB-02-005/01/solubility_study_1/turbidity_data.csv: Index(['Timestamp', 'Time (s)', 'Time (min)', 'Time (hour)', 'Turbidity'], dtype='object')
[A1] Normalized Columns in /Users/davidsegura/git/Leveraging_LLMs_causality/cage_SAD_llm/turbidity/AB-02-005/01/solubility_study_1/turbidity_data.csv: Index(['timestamp', 'time (s)', 'time (min)', 'time (hour)', 'turbidity'], dtype='object')
[A1] Mean turbidity found: 89.3897306681133
[A2] Trying to access file: /Users/davidsegura/git/Leveraging_LLMs_causality/cage_SAD_llm/turbidity/AB-02-005/01/solubility_study_2/turbidity_data.csv
[A2] Original Columns in /Users/davidsegura/git/Leveraging_LLMs_causality/cage_SAD_llm/turbidity/AB-02-005/01/solubility_study_2/turbidity_data.csv: Index(['Timestamp', 'Ti

In [None]:
# Display the per-combination summary frame.
turbidity_df

In [43]:
# Save the summary frame. No index=False is passed, so the row index is written
# as an unnamed first column and comes back as 'Unnamed: 0' when the file is re-read.
turbidity_df.to_csv('/Users/davidsegura/git/Leveraging_LLMs_causality/cage_SAD_llm/data/new_turbidity_data.csv')

### Here I manually added the remaining ones

In [95]:
# Reload the turbidity table from disk.
# NOTE: the variable is spelled 'turbitiy_final' here; the cells that use it rely on this spelling.
turbitiy_final = pd.read_csv('/Users/davidsegura/git/Leveraging_LLMs_causality/cage_SAD_llm/data/new_turbidity_data.csv')

In [None]:
# Display the reloaded turbidity table.
turbitiy_final

In [None]:
# Readings at or above this value are labelled 'turbid', the rest 'dissolved'.
TURBIDITY_THRESHOLD = 55.2


def _turbidity_label(value, threshold=TURBIDITY_THRESHOLD):
    """Return 'turbid' or 'dissolved' for one reading, or None when it is missing."""
    if pd.isna(value):
        # NaN >= threshold evaluates to False, which would silently label a
        # missing reading as 'dissolved'; keep it missing instead.
        return None
    return 'turbid' if value >= threshold else 'dissolved'


turbitiy_final['turbidity_label'] = turbitiy_final['turbidity'].apply(_turbidity_label)

# Binary encoding used in this notebook: turbid -> 0, dissolved -> 1.
# A missing label maps to NaN rather than to one of the two classes.
turbitiy_final['turbidity_binary'] = turbitiy_final['turbidity_label'].map({'turbid': 0, 'dissolved': 1})

turbitiy_final


In [98]:
# Class balance: number of rows carrying each turbidity label.
label_counts = turbitiy_final['turbidity_label'].value_counts()

label_counts

turbidity_label
dissolved    193
turbid       173
Name: count, dtype: int64

In [99]:
# List the column names of the labelled table.
turbitiy_final.columns

Index(['Unnamed: 0', 'precursor_combination', 'plate_number', 'turbidity',
       'turbidity_label', 'turbidity_binary'],
      dtype='object')

In [100]:
# Save the labelled table under a new file name. The row index is written as well
# (no index=False), adding one more unnamed column when this file is re-read.
turbitiy_final.to_csv('/Users/davidsegura/git/Leveraging_LLMs_causality/cage_SAD_llm/data/new_1_turbidity_data.csv')

In [101]:
# Rows that share all four fields below with at least one other row;
# keep=False flags every member of such a group, not only the repeats.
duplicate_key = ['precursor_combination', 'plate_number', 'turbidity', 'turbidity_label']
is_repeated = turbitiy_final.duplicated(subset=duplicate_key, keep=False)
duplicate_rows = turbitiy_final.loc[is_repeated]

print(duplicate_rows)

Empty DataFrame
Columns: [Unnamed: 0, precursor_combination, plate_number, turbidity, turbidity_label, turbidity_binary]
Index: []


In [102]:
# Second and later occurrences of the same (combination, plate, turbidity) triple.
measurement_key = ['precursor_combination', 'plate_number', 'turbidity']
duplicates = turbitiy_final.loc[turbitiy_final.duplicated(subset=measurement_key)]
duplicates

Unnamed: 0.1,Unnamed: 0,precursor_combination,plate_number,turbidity,turbidity_label,turbidity_binary


In [103]:
# Reload the labelled table from disk under the name 'turbidity_final'.
turbidity_final = pd.read_csv('/Users/davidsegura/git/Leveraging_LLMs_causality/cage_SAD_llm/data/new_1_turbidity_data.csv')

In [None]:
# A combination code such as 'A10' is one letter (tri precursor) followed by a number (di precursor).
combination_codes = turbidity_final['precursor_combination']
tri_codes = combination_codes.str[0]
di_codes = combination_codes.str[1:].astype(int)

# Letters A..U map to TriA..TriU and numbers 1..34 to Di1..Di34; anything else maps to NaN.
tri_mapping = {letter: f'Tri{letter}' for letter in map(chr, range(65, 65 + 21))}
di_mapping = {number: f'Di{number}' for number in range(1, 34 + 1)}

turbidity_final['tri_name'] = tri_codes.map(tri_mapping)
turbidity_final['di_name'] = di_codes.map(di_mapping)

turbidity_final

In [None]:
# Split the precursors table into di and tri lookups, renamed to match the merge keys.
precursors = tables_dataframes['precursors']
di_precursors = precursors[precursors['name'].str.startswith('Di')].rename(
    columns={'name': 'di_name', 'smiles': 'di_smiles'}
)
tri_precursors = precursors[precursors['name'].str.startswith('Tri')].rename(
    columns={'name': 'tri_name', 'smiles': 'tri_smiles'}
)

# Left merges keep every measurement row. validate='many_to_one' makes pandas raise
# MergeError if a precursor name occurs more than once, instead of silently
# multiplying measurement rows.
turbidity_final = turbidity_final.merge(
    di_precursors[['di_name', 'di_smiles']], on='di_name', how='left', validate='many_to_one'
)
turbidity_final = turbidity_final.merge(
    tri_precursors[['tri_name', 'tri_smiles']], on='tri_name', how='left', validate='many_to_one'
)

turbidity_final

In [107]:
# Repeated occurrences of the same precursor pair, compared on both SMILES and names.
pair_key = ['di_smiles', 'tri_smiles', 'di_name', 'tri_name']
duplicates = turbidity_final.loc[turbidity_final.duplicated(subset=pair_key)]
duplicates

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,precursor_combination,plate_number,turbidity,turbidity_label,turbidity_binary,tri_name,di_name,di_smiles,tri_smiles


In [109]:
# Repeated occurrences of the same (di SMILES, tri SMILES) pair.
repeated_pair = turbidity_final.duplicated(subset=['di_smiles', 'tri_smiles'])
duplicates = turbidity_final.loc[repeated_pair]
duplicates

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,precursor_combination,plate_number,turbidity,turbidity_label,turbidity_binary,tri_name,di_name,di_smiles,tri_smiles


In [110]:
# Overwrite the labelled file with the table that now carries precursor names and SMILES
# (the row index is written too, as no index=False is passed).
turbidity_final.to_csv('/Users/davidsegura/git/Leveraging_LLMs_causality/cage_SAD_llm/data/new_1_turbidity_data.csv')

# Features augmentation

In [111]:
import pubchempy as pcp

In [None]:
# Compound properties copied from each PubChem record, in output-column order.
_COMPOUND_FEATURES = (
    'charge', 'complexity', 'covalent_unit_count',
    'defined_atom_stereo_count', 'defined_bond_stereo_count', 'exact_mass',
    'h_bond_acceptor_count', 'h_bond_donor_count', 'heavy_atom_count', 'xlogp',
    'molecular_weight', 'rotatable_bond_count', 'tpsa',
)


def get_compound_features(smiles):
    """Look up PubChem properties for one SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES string passed to ``pcp.get_compounds(smiles, 'smiles')``.

    Returns
    -------
    dict
        One entry per name in ``_COMPOUND_FEATURES``, read from the first
        compound returned. Every value is None when the SMILES is missing
        (not a string, or blank), when no compound is returned, or when the
        lookup raises; a message is printed in each of those cases.
    """
    empty_features = dict.fromkeys(_COMPOUND_FEATURES)

    # A missing SMILES (None, NaN, blank) cannot match anything; skip the remote call.
    if not isinstance(smiles, str) or not smiles.strip():
        print(f"Skipping PubChem lookup for missing SMILES: {smiles!r}")
        return empty_features

    try:
        compounds = pcp.get_compounds(smiles, 'smiles')
        if not compounds:
            print(f"Error retrieving data for SMILES '{smiles}': no compound returned")
            return empty_features
        compound = compounds[0]
        return {name: getattr(compound, name) for name in _COMPOUND_FEATURES}
    except Exception as e:
        # Best-effort lookup: one failing compound must not abort a whole column.
        print(f"Error retrieving data for SMILES '{smiles}': {e}")
        return empty_features

def _features_per_row(smiles_column):
    """Return one feature row per entry of `smiles_column`.

    get_compound_features is called once per distinct SMILES and the result is
    reused for every row carrying that SMILES, so the number of lookups equals
    the number of distinct values rather than the number of rows.
    """
    cache = {}

    def lookup(smiles):
        # Non-string entries (e.g. NaN) share a single cache slot.
        cache_key = smiles if isinstance(smiles, str) else None
        if cache_key not in cache:
            cache[cache_key] = get_compound_features(smiles)
        return cache[cache_key]

    return smiles_column.apply(lookup).apply(pd.Series)


df_di_features = _features_per_row(turbidity_final['di_smiles'])
df_tri_features = _features_per_row(turbidity_final['tri_smiles'])

# Prefix the feature columns so di and tri properties stay distinguishable.
turbidity_final = pd.concat([turbidity_final, df_di_features.add_prefix('di_'), df_tri_features.add_prefix('tri_')], axis=1)

turbidity_final

In [113]:
# Display the table with the di_/tri_ feature columns appended.
turbidity_final

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,precursor_combination,plate_number,turbidity,turbidity_label,turbidity_binary,tri_name,di_name,di_smiles,tri_smiles,di_cactvs_fingerprint,di_charge,di_complexity,di_covalent_unit_count,di_defined_atom_stereo_count,di_defined_bond_stereo_count,di_exact_mass,di_h_bond_acceptor_count,di_h_bond_donor_count,di_heavy_atom_count,di_xlogp,di_molecular_weight,di_rotatable_bond_count,di_tpsa,tri_cactvs_fingerprint,tri_charge,tri_complexity,tri_covalent_unit_count,tri_defined_atom_stereo_count,tri_defined_bond_stereo_count,tri_exact_mass,tri_h_bond_acceptor_count,tri_h_bond_donor_count,tri_heavy_atom_count,tri_xlogp,tri_molecular_weight,tri_rotatable_bond_count,tri_tpsa
0,0,0,A1,1,89.389731,turbid,0,TriA,Di1,O=Cc1cccc(C=O)c1,Nc1nc(N)nc(N)n1,0000037180703000000000000000000000000000000000...,0,117.0,1,0,0,134.03677943,2,0,10,1.2,134.13,2,34.1,0000037180438000000000000000000000000000000000...,0.0,63.3,1.0,0.0,0.0,126.06539422,6.0,3.0,9.0,-1.4,126.12,0.0,117.0
1,1,1,A2,1,95.423764,turbid,0,TriA,Di2,CC(C)(C)c1cc(C=O)c(O)c(C=O)c1,Nc1nc(N)nc(N)n1,00000371C0703000000000000000000000000000000000...,0,224.0,1,0,0,206.094294304,3,1,15,2.7,206.24,3,54.4,0000037180438000000000000000000000000000000000...,0.0,63.3,1.0,0.0,0.0,126.06539422,6.0,3.0,9.0,-1.4,126.12,0.0,117.0
2,2,2,A3,1,85.608188,turbid,0,TriA,Di3,O=Cc1cc2sc(C=O)cc2s1,Nc1nc(N)nc(N)n1,0000037180703000600000000000000000000000000122...,0,185.0,1,0,0,195.96527171,4,0,12,2.3,196.3,2,90.6,0000037180438000000000000000000000000000000000...,0.0,63.3,1.0,0.0,0.0,126.06539422,6.0,3.0,9.0,-1.4,126.12,0.0,117.0
3,3,3,A4,1,100.885813,turbid,0,TriA,Di4,O=Cc1ccc(C=O)cc1,Nc1nc(N)nc(N)n1,0000037180703000000000000000000000000000000000...,0,107.0,1,0,0,134.03677943,2,0,10,0.9,134.13,2,34.1,0000037180438000000000000000000000000000000000...,0.0,63.3,1.0,0.0,0.0,126.06539422,6.0,3.0,9.0,-1.4,126.12,0.0,117.0
4,4,4,A5,1,104.259992,turbid,0,TriA,Di5,O=Cc1c(F)c(F)c(C=O)c(F)c1F,Nc1nc(N)nc(N)n1,00000371007031C0000000000000000000000000000000...,0,195.0,1,0,0,205.99909195,6,0,14,1.2,206.09,2,34.1,0000037180438000000000000000000000000000000000...,0.0,63.3,1.0,0.0,0.0,126.06539422,6.0,3.0,9.0,-1.4,126.12,0.0,117.0
5,5,5,A6,1,74.239664,turbid,0,TriA,Di6,O=Cc1ccc(-c2ccc(C=O)cc2)cc1,Nc1nc(N)nc(N)n1,00000371C0703000000000000000000000000000000000...,0,207.0,1,0,0,210.068079557,2,0,16,3.3,210.23,3,34.1,0000037180438000000000000000000000000000000000...,0.0,63.3,1.0,0.0,0.0,126.06539422,6.0,3.0,9.0,-1.4,126.12,0.0,117.0
6,6,6,A7,1,79.863862,turbid,0,TriA,Di7,O=Cc1c2ccccc2c(C=O)c2ccccc12,Nc1nc(N)nc(N)n1,00000371C0783000000000000000000000000000000000...,0,269.0,1,0,0,234.068079557,2,0,18,3.3,234.25,2,34.1,0000037180438000000000000000000000000000000000...,0.0,63.3,1.0,0.0,0.0,126.06539422,6.0,3.0,9.0,-1.4,126.12,0.0,117.0
7,7,7,A8,1,103.7091,turbid,0,TriA,Di8,O=Cc1ccc2ccc3ccc(C=O)nc3c2n1,Nc1nc(N)nc(N)n1,00000371C0733000000000000000000000000000000000...,0,302.0,1,0,0,236.058577502,4,0,18,2.0,236.22,2,59.9,0000037180438000000000000000000000000000000000...,0.0,63.3,1.0,0.0,0.0,126.06539422,6.0,3.0,9.0,-1.4,126.12,0.0,117.0
8,8,8,A9,3,70.603576,turbid,0,TriA,Di9,O=Cc1ccsc1C=O,Nc1nc(N)nc(N)n1,0000037180603000400000000000000000000000000120...,0,124.0,1,0,0,139.99320054,3,0,9,0.9,140.16,2,62.4,0000037180438000000000000000000000000000000000...,0.0,63.3,1.0,0.0,0.0,126.06539422,6.0,3.0,9.0,-1.4,126.12,0.0,117.0
9,9,9,A10,3,104.687332,turbid,0,TriA,Di10,O=Cc1ccccc1C=O,Nc1nc(N)nc(N)n1,0000037180703000000000000000000000000000000000...,0,115.0,1,0,0,134.03677943,2,0,10,1.2,134.13,2,34.1,0000037180438000000000000000000000000000000000...,0.0,63.3,1.0,0.0,0.0,126.06539422,6.0,3.0,9.0,-1.4,126.12,0.0,117.0


In [114]:
# Overwrite the labelled file with the feature-augmented table (row index included).
turbidity_final.to_csv('/Users/davidsegura/git/Leveraging_LLMs_causality/cage_SAD_llm/data/new_1_turbidity_data.csv')

TriU and TriQ cannot be feature-augmented

Replacing TriU from: O=Cc1ccc(C=Cc2cc(C=Cc3ccc(C=O)cc3)cc(C=Cc3ccc(C=O)cc3)c2)cc1 to: C1=CC(=CC=C1/C=C/C2=CC(=CC(=C2)/C=C/C3=CC=C(C=C3)C=O)/C=C/C4=CC=C(C=C4)C=O)C=O


In [None]:
# Replacement SMILES string for the TriU precursor.
new_tri_smiles = 'C1=CC(=CC=C1/C=C/C2=CC(=CC(=C2)/C=C/C3=CC=C(C=C3)C=O)/C=C/C4=CC=C(C=C4)C=O)C=O'

# Overwrite tri_smiles on every row whose tri precursor is TriU.
is_triu = turbidity_final['tri_name'] == 'TriU'
turbidity_final.loc[is_triu, 'tri_smiles'] = new_tri_smiles

turbidity_final

In [117]:
# Go back to the pre-augmentation columns so the features can be recomputed.
index_columns = ['Unnamed: 0.1', 'Unnamed: 0']
measurement_columns = [
    'precursor_combination', 'plate_number',
    'turbidity', 'turbidity_label', 'turbidity_binary',
]
precursor_columns = ['tri_name', 'di_name', 'di_smiles', 'tri_smiles']

kept = index_columns + measurement_columns + precursor_columns

turbidity_final = turbidity_final.loc[:, kept]

In [119]:
# Recompute the PubChem features for both precursor columns (di first, then tri).
df_di_features, df_tri_features = (
    turbidity_final[smiles_column].apply(get_compound_features).apply(pd.Series)
    for smiles_column in ('di_smiles', 'tri_smiles')
)

# Append the two feature blocks side by side, prefixed by precursor kind.
prefixed_blocks = [df_di_features.add_prefix('di_'), df_tri_features.add_prefix('tri_')]
turbidity_final = pd.concat([turbidity_final, *prefixed_blocks], axis=1)

turbidity_final

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,precursor_combination,plate_number,turbidity,turbidity_label,turbidity_binary,tri_name,di_name,di_smiles,tri_smiles,di_cactvs_fingerprint,di_charge,di_complexity,di_covalent_unit_count,di_defined_atom_stereo_count,di_defined_bond_stereo_count,di_exact_mass,di_h_bond_acceptor_count,di_h_bond_donor_count,di_heavy_atom_count,di_xlogp,di_molecular_weight,di_rotatable_bond_count,di_tpsa,tri_cactvs_fingerprint,tri_charge,tri_complexity,tri_covalent_unit_count,tri_defined_atom_stereo_count,tri_defined_bond_stereo_count,tri_exact_mass,tri_h_bond_acceptor_count,tri_h_bond_donor_count,tri_heavy_atom_count,tri_xlogp,tri_molecular_weight,tri_rotatable_bond_count,tri_tpsa
0,0,0,A1,1,89.389731,turbid,0,TriA,Di1,O=Cc1cccc(C=O)c1,Nc1nc(N)nc(N)n1,0000037180703000000000000000000000000000000000...,0,117.0,1,0,0,134.03677943,2,0,10,1.2,134.13,2,34.1,0000037180438000000000000000000000000000000000...,0.0,63.3,1.0,0.0,0.0,126.06539422,6.0,3.0,9.0,-1.4,126.12,0.0,117.0
1,1,1,A2,1,95.423764,turbid,0,TriA,Di2,CC(C)(C)c1cc(C=O)c(O)c(C=O)c1,Nc1nc(N)nc(N)n1,00000371C0703000000000000000000000000000000000...,0,224.0,1,0,0,206.094294304,3,1,15,2.7,206.24,3,54.4,0000037180438000000000000000000000000000000000...,0.0,63.3,1.0,0.0,0.0,126.06539422,6.0,3.0,9.0,-1.4,126.12,0.0,117.0
2,2,2,A3,1,85.608188,turbid,0,TriA,Di3,O=Cc1cc2sc(C=O)cc2s1,Nc1nc(N)nc(N)n1,0000037180703000600000000000000000000000000122...,0,185.0,1,0,0,195.96527171,4,0,12,2.3,196.3,2,90.6,0000037180438000000000000000000000000000000000...,0.0,63.3,1.0,0.0,0.0,126.06539422,6.0,3.0,9.0,-1.4,126.12,0.0,117.0
3,3,3,A4,1,100.885813,turbid,0,TriA,Di4,O=Cc1ccc(C=O)cc1,Nc1nc(N)nc(N)n1,0000037180703000000000000000000000000000000000...,0,107.0,1,0,0,134.03677943,2,0,10,0.9,134.13,2,34.1,0000037180438000000000000000000000000000000000...,0.0,63.3,1.0,0.0,0.0,126.06539422,6.0,3.0,9.0,-1.4,126.12,0.0,117.0
4,4,4,A5,1,104.259992,turbid,0,TriA,Di5,O=Cc1c(F)c(F)c(C=O)c(F)c1F,Nc1nc(N)nc(N)n1,00000371007031C0000000000000000000000000000000...,0,195.0,1,0,0,205.99909195,6,0,14,1.2,206.09,2,34.1,0000037180438000000000000000000000000000000000...,0.0,63.3,1.0,0.0,0.0,126.06539422,6.0,3.0,9.0,-1.4,126.12,0.0,117.0
5,5,5,A6,1,74.239664,turbid,0,TriA,Di6,O=Cc1ccc(-c2ccc(C=O)cc2)cc1,Nc1nc(N)nc(N)n1,00000371C0703000000000000000000000000000000000...,0,207.0,1,0,0,210.068079557,2,0,16,3.3,210.23,3,34.1,0000037180438000000000000000000000000000000000...,0.0,63.3,1.0,0.0,0.0,126.06539422,6.0,3.0,9.0,-1.4,126.12,0.0,117.0
6,6,6,A7,1,79.863862,turbid,0,TriA,Di7,O=Cc1c2ccccc2c(C=O)c2ccccc12,Nc1nc(N)nc(N)n1,00000371C0783000000000000000000000000000000000...,0,269.0,1,0,0,234.068079557,2,0,18,3.3,234.25,2,34.1,0000037180438000000000000000000000000000000000...,0.0,63.3,1.0,0.0,0.0,126.06539422,6.0,3.0,9.0,-1.4,126.12,0.0,117.0
7,7,7,A8,1,103.7091,turbid,0,TriA,Di8,O=Cc1ccc2ccc3ccc(C=O)nc3c2n1,Nc1nc(N)nc(N)n1,00000371C0733000000000000000000000000000000000...,0,302.0,1,0,0,236.058577502,4,0,18,2.0,236.22,2,59.9,0000037180438000000000000000000000000000000000...,0.0,63.3,1.0,0.0,0.0,126.06539422,6.0,3.0,9.0,-1.4,126.12,0.0,117.0
8,8,8,A9,3,70.603576,turbid,0,TriA,Di9,O=Cc1ccsc1C=O,Nc1nc(N)nc(N)n1,0000037180603000400000000000000000000000000120...,0,124.0,1,0,0,139.99320054,3,0,9,0.9,140.16,2,62.4,0000037180438000000000000000000000000000000000...,0.0,63.3,1.0,0.0,0.0,126.06539422,6.0,3.0,9.0,-1.4,126.12,0.0,117.0
9,9,9,A10,3,104.687332,turbid,0,TriA,Di10,O=Cc1ccccc1C=O,Nc1nc(N)nc(N)n1,0000037180703000000000000000000000000000000000...,0,115.0,1,0,0,134.03677943,2,0,10,1.2,134.13,2,34.1,0000037180438000000000000000000000000000000000...,0.0,63.3,1.0,0.0,0.0,126.06539422,6.0,3.0,9.0,-1.4,126.12,0.0,117.0


In [120]:
# (rows, columns) of the feature-augmented table.
turbidity_final.shape

(366, 39)

In [123]:
# Overwrite the labelled file with the re-augmented table (row index included).
turbidity_final.to_csv('/Users/davidsegura/git/Leveraging_LLMs_causality/cage_SAD_llm/data/new_1_turbidity_data.csv')

In [122]:
# Repeated (di SMILES, tri SMILES) pairs after the TriU replacement.
smiles_pair = ['di_smiles', 'tri_smiles']
duplicates = turbidity_final.loc[turbidity_final.duplicated(subset=smiles_pair)]
duplicates

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,precursor_combination,plate_number,turbidity,turbidity_label,turbidity_binary,tri_name,di_name,di_smiles,tri_smiles,di_cactvs_fingerprint,di_charge,di_complexity,di_covalent_unit_count,di_defined_atom_stereo_count,di_defined_bond_stereo_count,di_exact_mass,di_h_bond_acceptor_count,di_h_bond_donor_count,di_heavy_atom_count,di_xlogp,di_molecular_weight,di_rotatable_bond_count,di_tpsa,tri_cactvs_fingerprint,tri_charge,tri_complexity,tri_covalent_unit_count,tri_defined_atom_stereo_count,tri_defined_bond_stereo_count,tri_exact_mass,tri_h_bond_acceptor_count,tri_h_bond_donor_count,tri_heavy_atom_count,tri_xlogp,tri_molecular_weight,tri_rotatable_bond_count,tri_tpsa


### Adding the number of aromatic rings

In [124]:
# Load the supplementary table; its 'precursor' and 'nbr_atomatic_ring' columns are used below.
info_arom = pd.read_csv('/Users/davidsegura/git/Leveraging_LLMs_causality/cage_SAD_llm/data/data_supplmentary_hardness_arom_nbd.csv')

In [129]:
# precursor code -> value of the 'nbr_atomatic_ring' column (column name as spelled in the file).
ring_counts = info_arom.set_index('precursor')['nbr_atomatic_ring']
aromatic_ring_mapping = ring_counts.to_dict()

# Drop the 'Di' / 'Tri' prefix so the remaining code can be looked up in the mapping.
di_codes = turbidity_final['di_name'].str.slice(2)
tri_codes = turbidity_final['tri_name'].str.slice(3)

turbidity_final['di_arom_group'] = di_codes.map(aromatic_ring_mapping)
turbidity_final['tri_arom_group'] = tri_codes.map(aromatic_ring_mapping)

turbidity_final.to_csv('/Users/davidsegura/git/Leveraging_LLMs_causality/cage_SAD_llm/data/new_1_turbidity_data.csv')