In [1]:
import pandas as pd
import re
import os

In [2]:
# Get REPO_ROOT assuming you're 2 folders down from the root
SCRIPT_DIR = os.getcwd()
REPO_ROOT = os.path.abspath(os.path.join(SCRIPT_DIR, "..", ".."))

# Path to the CSV
csv_path = os.path.join(REPO_ROOT, "data", "organism_w_llama.csv")

In [3]:
df = pd.read_csv(csv_path)
df.shape

(35042, 53)

In [4]:
#manual curation   # 14 of 277 manually curated descriptions flagged by gpt model
corrections = {
    "Antibacterial activity against Mycobacterium tuberculosis NHN382 clinical isolate harboring Kat G S315t mutant assessed as parasite growth inhibition after 6 days by microplate Alamar Blue assay": "false",
    "Antibacterial activity against Mycobacterium tuberculosis TN587 clinical isolate harboring Kat G 3315T mutant assessed as parasite growth inhibition after 6 days by microplate Alamar Blue assay": "false",
    "Antimycobacterial activity against Mycobacterium tuberculosis 19 harbouring KatG Ser315Thr mutant and wild type InhA assessed as reduction in bacterial growth incubated for 7 days by resazurin dye based assay": "false",
    "Antimycobacterial activity against Mycobacterium tuberculosis 71A harbouring KatG Ser315Thr mutant and wild type InhA assessed as reduction in bacterial growth incubated for 7 days by resazurin dye based assay": "false",
    "Antimycobacterial activity against Mycobacterium tuberculosis 91 harbouring KatG Ser315Thr mutant and wild type InhA assessed as reduction in bacterial growth incubated for 7 days by resazurin dye based assay": "false",
    "Antimycobacterial activity against Mycobacterium tuberculosis BRF14 harbouring KatG Ser315Thr mutant and wild type InhA assessed as reduction in bacterial growth incubated for 7 days by resazurin dye based assay": "false",
    "Antimycobacterial activity against Mycobacterium tuberculosis BRF16 harbouring KatG Ser315Thr mutant and wild type InhA assessed as reduction in bacterial growth incubated for 7 days by resazurin dye based assay": "false",
    "Antimycobacterial activity against Mycobacterium tuberculosis BRF45 harbouring KatG Ser315Thr mutant and wild type InhA assessed as reduction in bacterial growth incubated for 7 days by resazurin dye based assay": "false",
    "Antimycobacterial activity against Mycobacterium tuberculosis BRF57 harbouring KatG Ser315Thr mutant and wild type InhA assessed as reduction in bacterial growth incubated for 7 days by resazurin dye based assay": "false",
    "Antimycobacterial activity against Mycobacterium tuberculosis clinical isolate 581-17 harboring gyrA QRDR c270t (A90V) mutant and wild type gyrB": "false",
    "Antimycobacterial activity against extensively drug-resistant Mycobacterium tuberculosis isolate 2301 horboring rpoB S531L, katG R463L/E217G, gyrA D94N/S95T, rrs A1401G by CLSI method": "xdr",
    "Antimycobacterial activity against multidrug (INH and RIF)-resistant Mycobacterium tuberculosis clinical isolate 1 assessed as inhibition of bacterial growth incubated for 7 days by rapid direct susceptibility test technique": "mdr",
    "Antimycobacterial activity against multidrug (isoniazid and rifampicin) resistant Mycobacterium tuberculosis 1 incubated for 7 days by rapid direct susceptibility test": "mdr",
    "Antimycobacterial activity against multidrug (isoniazid and rifampicin) resistant Mycobacterium tuberculosis 2 incubated for 7 days by rapid direct susceptibility test": "mdr",
    "Antimycobacterial activity against drug-resistant Mycobacterium tuberculosis isolate MDR1 assessed as inhibition of mycobacterial growth incubated for 9 days": "mdr",
    "Antimycobacterial activity against drug-resistant Mycobacterium tuberculosis isolate MDR2 assessed as inhibition of mycobacterial growth incubated for 9 days": "mdr",
}


In [5]:
14*100/277

5.054151624548736

In [6]:
# Apply corrections across all matching rows
df["mentions_resistance"] = df.apply(
    lambda row: corrections.get(row["assay_description"], row["mentions_resistance"]),
    axis=1
)

In [7]:
corrections_drugs = {
    'isoniazide-rifampicin-streptomycin-ethambutol': 'isoniazide, rifampicin, streptomycin, ethambutol',
    'isoniazide-rifampicin-ethambutol': 'isoniazide, rifampicin, ethambutol',
    'multiple drugs': 'multiple, drugs',
    'isoniazide-streptomycin': 'isoniazide, streptomycin',
    'isoniazide-rifampicin': 'isoniazide, rifampicin',
}


In [8]:
df['resistant_to'] = df['resistant_to'].replace(corrections_drugs)

In [9]:
#keep only checkerboard = false
df["checkerboard"].value_counts()

checkerboard
False    34830
True       212
Name: count, dtype: int64

In [10]:
df = df[df["checkerboard"] == False]

In [11]:
# Mapping to normalize values
normalize_map = {
    "False": "False",
    "True": "True",
    "R": "resistant",
    "mdr": "MDR",
    "MDR": "MDR",
    "xdr": "XDR",
    "mdr/xdr": "MDR/XDR",
    "Rifampicin-sensitive": "sensitive"
}

# Apply normalization
df["mentions_resistance_normalized"] = df["mentions_resistance"].map(normalize_map)

# Check counts
df["mentions_resistance_normalized"].value_counts()

mentions_resistance_normalized
False        28163
resistant     3395
MDR           2628
XDR            369
MDR/XDR        155
True            68
sensitive        3
Name: count, dtype: int64

In [12]:
# Now count unique canonical SMILES per category
unique_smiles_counts = (
    df.groupby("mentions_resistance_normalized")["canonical_smiles"]
    .nunique()
    .sort_values(ascending=False)
)

print(unique_smiles_counts)

mentions_resistance_normalized
False        18415
resistant     1181
MDR           1140
XDR            191
True            39
MDR/XDR         13
sensitive        3
Name: canonical_smiles, dtype: int64


In [13]:
unique_assay_descriptions_count = (
    df.groupby("mentions_resistance_normalized")["assay_description"]
    .nunique()
    .sort_values(ascending=False)
)
unique_assay_descriptions_count

mentions_resistance_normalized
False        2419
resistant     703
MDR           412
XDR            79
True           22
MDR/XDR        15
sensitive       2
Name: assay_description, dtype: int64

In [14]:
df["resistant_to"].value_counts()

resistant_to
False                                                                                      31181
isoniazid                                                                                    671
rifampicin                                                                                   369
isoniazid and rifampicin                                                                     123
streptomycin                                                                                 122
                                                                                           ...  
rifampicin/ethambutol                                                                          1
isoniazid, rifampicin, ethambutol, pyrazinamide, streptomycin, kanamycin                       1
qcrB                                                                                           1
streptomycin, isoniazid, rifampicin, ethambutol, pyrazinamide, ethionamide, capreomycin        1
isoniazid/ethambu

In [15]:
# Function to normalize the resistant_to field
def normalize_resistant_to(value):
    if not isinstance(value, str):
        return []
    parts = re.split(r"\s*(?:,|/| and | & | \+ )\s*", value.lower())
    return sorted(set(p.strip() for p in parts if p.strip()))

# Apply to the column
df['resistant_to_normalized'] = df['resistant_to'].apply(normalize_resistant_to)

In [16]:
df['resistant_to_normalized'].value_counts()

resistant_to_normalized
[false]                                                                                         31181
[isoniazid]                                                                                       673
[rifampicin]                                                                                      379
[isoniazid, rifampicin]                                                                           221
[streptomycin]                                                                                    158
                                                                                                ...  
[ethambutol, rifampicin]                                                                            1
[ethambutol, isoniazid, kanamycin, pyrazinamide, rifampicin, streptomycin]                          1
[qcrb]                                                                                              1
[capreomycin, ethambutol, ethionamide, isoniazid, pyrazina

In [17]:
# remove rows with data_validity_comment == 'Outside typical range'
df = df[df.data_validity_comment != 'Outside typical range']

# remove rows with standard_flag == 0
df = df[df.standard_flag != 0]

df['standard_units'].unique()


array(['ug.mL-1', 'nM'], dtype=object)

In [18]:
import sys
sys.path.append('../..')
from utils import convert_to_uM

# add column with mic in uM
mic_um = convert_to_uM(df, "canonical_smiles")

df['MIC_uM'] = mic_um

In [19]:
mentions_resistant_descriptions = df[df['mentions_resistance_normalized'] == 'resistant']
# get only a row per each unique descriptions
mentions_resistant_descriptions = mentions_resistant_descriptions.drop_duplicates(subset=['assay_description'])

In [20]:
df_unique_descriptions = df.drop_duplicates(subset="assay_description")

In [21]:
resistant__to_descriptions = df.drop_duplicates(subset=['assay_description'])

counts_resistant_to = resistant__to_descriptions['resistant_to_normalized'].value_counts()

In [22]:
resistance_df = df[df['mentions_resistance_normalized'] != 'False']
df.shape, resistance_df.shape

((34790, 56), (6664, 56))

In [23]:
resistance_df.mentions_resistance_normalized.value_counts()

mentions_resistance_normalized
resistant    3395
MDR          2625
XDR           369
MDR/XDR       155
True           68
sensitive       3
Name: count, dtype: int64

In [24]:
mdr_df = df[
    (df['mentions_resistance_normalized'] == 'MDR') |
    (df['mentions_resistance_normalized'] == 'XDR') |
    (df['mentions_resistance_normalized'] == 'MDR/XDR')
]

mdr_df.shape

(3149, 56)

In [25]:
mdr_df.canonical_smiles.nunique()

1234

In [26]:
r_df = df[df['mentions_resistance_normalized'] == 'resistant']
r_df.shape, r_df.canonical_smiles.nunique()

((3395, 56), 1181)

In [27]:
drugs_resistant_values = r_df.resistant_to_normalized.value_counts()

In [28]:
# Step 1: Identify rows in r_df where resistant_to_normalized is a list with >1 drug
multi_drug_rows = r_df[r_df['resistant_to_normalized'].apply(
    lambda x: isinstance(x, list) and len(x) > 1
)].copy()  # copy() avoids SettingWithCopyWarning

# Step 2: Convert the column from list to tuple to make it hashable for deduplication
multi_drug_rows['resistant_to_normalized'] = multi_drug_rows['resistant_to_normalized'].apply(tuple)

# Step 3: Also ensure mdr_df has the same structure (convert list to tuple if needed)
mdr_df = mdr_df.copy()
mdr_df['resistant_to_normalized'] = mdr_df['resistant_to_normalized'].apply(
    lambda x: tuple(x) if isinstance(x, list) else x
)

# Step 4: Concatenate and drop duplicates
mdr_df = pd.concat([mdr_df, multi_drug_rows])

# Step 5: Remove multi-drug rows from r_df to fully "move" them
r_df = r_df.drop(multi_drug_rows.index).reset_index(drop=True)

In [29]:
mdr_df.shape, r_df.shape,

((4507, 56), (2037, 56))

In [30]:
mdr_df.canonical_smiles.nunique(), r_df.canonical_smiles.nunique()

(1712, 780)

In [31]:
mdr_df

Unnamed: 0.1,Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,...,value,mtb_strain,mentions_resistance,resistant_to,mutant,mutant_type,checkerboard,mentions_resistance_normalized,resistant_to_normalized,MIC_uM
888,1275,,,1061878,[],CHEMBL745787,Minimum inhibitory concentration against (MDR-...,F,,,...,2.00,Mycobacterium tuberculosis,mdr,False,False,False,False,MDR,"(false,)",4.236192
1430,2256,,,1720616,[],CHEMBL863126,Antimycobacterial activity against multidrug r...,F,,,...,6.25,multidrug resistant Mycobacterium tuberculosis,mdr,False,False,False,False,MDR,"(false,)",19.772262
1432,2258,,,1720678,[],CHEMBL863126,Antimycobacterial activity against multidrug r...,F,,,...,0.78,multidrug resistant Mycobacterium tuberculosis,mdr,False,False,False,False,MDR,"(false,)",2.260283
1434,2260,,,1720681,[],CHEMBL863126,Antimycobacterial activity against multidrug r...,F,,,...,0.78,multidrug resistant Mycobacterium tuberculosis,mdr,False,False,False,False,MDR,"(false,)",2.260283
1436,2262,,,1720684,[],CHEMBL863126,Antimycobacterial activity against multidrug r...,F,,,...,0.39,multidrug resistant Mycobacterium tuberculosis,mdr,False,False,False,False,MDR,"(false,)",1.130141
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35037,54,,,15777196,[],CHEMBL3637834,GSK_TB: Minimum inhibitory concentration again...,F,,,...,5.00,clinical isolates,R,isoniazid and rifampicin,False,False,False,resistant,"(isoniazid, rifampicin)",5.000000
35038,55,,,15777197,[],CHEMBL3637834,GSK_TB: Minimum inhibitory concentration again...,F,,,...,1.30,clinical isolates,R,isoniazid and rifampicin,False,False,False,resistant,"(isoniazid, rifampicin)",1.300000
35039,56,,,15777198,[],CHEMBL3637834,GSK_TB: Minimum inhibitory concentration again...,F,,,...,0.90,clinical isolates,R,isoniazid and rifampicin,False,False,False,resistant,"(isoniazid, rifampicin)",0.900000
35040,57,,,15777199,[],CHEMBL3637834,GSK_TB: Minimum inhibitory concentration again...,F,,,...,1.30,clinical isolates,R,isoniazid and rifampicin,False,False,False,resistant,"(isoniazid, rifampicin)",1.300000


In [32]:
import sys
sys.path.append('../..')
from utils import merge_duplicates, add_log_column

# Merge duplicates in mdr_df
mdr_df = merge_duplicates(mdr_df)
# Merge duplicates in r_df
r_df = merge_duplicates(r_df)

# data without duplicates
mdr_df_filtered_data_no_duplicates = mdr_df.drop_duplicates(subset=['canonical_smiles'])
r_df_filtered_data_no_duplicates = r_df.drop_duplicates(subset=['canonical_smiles'])


# add log column to mdr_df
mdr_df = add_log_column(mdr_df_filtered_data_no_duplicates)
# add log column to r_df
r_df = add_log_column(r_df_filtered_data_no_duplicates)

In [None]:
# Resolve path to repo and subfolder
SCRIPT_DIR = os.getcwd()
REPO_ROOT = os.path.abspath(os.path.join(SCRIPT_DIR, "..", ".."))
save_path = os.path.join(REPO_ROOT, "data", "res_mdr_xdr")

# Ensure folder exists
os.makedirs(save_path, exist_ok=True)

# Save dataframes
mdr_df.to_csv(os.path.join(save_path, "mdr_no_duplicates.csv"), index=False)
r_df.to_csv(os.path.join(save_path, "resistant_df_no_duplicates.csv"), index=False)