#**Antiviral libraries**

##**1. Prepare the environment**

In [1]:
!pip install rdkit
import pandas as pd
from google.colab import drive
drive.mount("/content/drive")
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Draw, PandasTools, Descriptors, Descriptors3D, rdMolDescriptors, Scaffolds
from rdkit.Chem.Scaffolds.MurckoScaffold import GetScaffoldForMol
PandasTools.RenderImagesInAllDataFrames(images = True) # to molecules visualization
from rdkit.Chem.Draw import IPythonConsole
import numpy as np

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


##**2. Load data sets**

In [2]:
# Save filtered DataFrames with new columns
hrv= pd.read_csv('/content/drive/MyDrive/antivirals_machine_learning/Notebooks/Libraries/actives_per_target_per_consensus/cleaned_libraries_actives/ordered_libraries_actives/Libraries_upper_0.5/Individual_libraries/filtered_presence/hrv_filtered_with_presence.csv')
iavm2= pd.read_csv('/content/drive/MyDrive/antivirals_machine_learning/Notebooks/Libraries/actives_per_target_per_consensus/cleaned_libraries_actives/ordered_libraries_actives/Libraries_upper_0.5/Individual_libraries/filtered_presence/iavm2_filtered_with_presence.csv')
iavneu= pd.read_csv('/content/drive/MyDrive/antivirals_machine_learning/Notebooks/Libraries/actives_per_target_per_consensus/cleaned_libraries_actives/ordered_libraries_actives/Libraries_upper_0.5/Individual_libraries/filtered_presence/iavneu_filtered_with_presence.csv')
iavpoly= pd.read_csv('/content/drive/MyDrive/antivirals_machine_learning/Notebooks/Libraries/actives_per_target_per_consensus/cleaned_libraries_actives/ordered_libraries_actives/Libraries_upper_0.5/Individual_libraries/filtered_presence/iavpoly_filtered_with_presence.csv')
ibv= pd.read_csv('/content/drive/MyDrive/antivirals_machine_learning/Notebooks/Libraries/actives_per_target_per_consensus/cleaned_libraries_actives/ordered_libraries_actives/Libraries_upper_0.5/Individual_libraries/filtered_presence/ibv_filtered_with_presence.csv')
sars2= pd.read_csv('/content/drive/MyDrive/antivirals_machine_learning/Notebooks/Libraries/actives_per_target_per_consensus/cleaned_libraries_actives/ordered_libraries_actives/Libraries_upper_0.5/Individual_libraries/filtered_presence/sars2_filtered_with_presence.csv')
sars= pd.read_csv('/content/drive/MyDrive/antivirals_machine_learning/Notebooks/Libraries/actives_per_target_per_consensus/cleaned_libraries_actives/ordered_libraries_actives/Libraries_upper_0.5/Individual_libraries/filtered_presence/sars_filtered_with_presence.csv')

In [3]:
print(hrv.columns)

Index(['Canonical_SMILES_STD', 'ID', 'DB', 'consensus_hrv',
       'HRV_Protease_Quartile_fp', 'HRV_Protease_Quartile_prop',
       'Presence_in_training_set', 'Presence_in_retraining_set'],
      dtype='object')


In [4]:
hrv['Target'] = 'HRV_Protease'
iavm2['Target'] = 'IAV_M2 proton channel'
iavneu['Target'] = 'IAV_Neuraminidase'
iavpoly['Target'] = 'IAV_Polymerase (PA)'
ibv['Target'] = 'IBV_Neuraminidase'
sars2['Target'] = 'SARS-CoV-2_Mpro'
sars['Target'] = 'SARs-CoV_Mpro'

In [5]:
# Rename columns
hrv = hrv.rename(columns={'consensus_hrv': 'Consensus', 'HRV_Protease_Quartile_fp': 'Quartile_fp', "HRV_Protease_Quartile_prop": 'Quartile_prop'})
iavm2 = iavm2.rename(columns={'consensus_iavm2': 'Consensus', 'IAV_M2 proton channel_Quartile_fp': 'Quartile_fp', "IAV_M2 proton channel_Quartile_prop": 'Quartile_prop'})
iavneu = iavneu.rename(columns={'consensus_iavneu': 'Consensus', 'IAV_Neuraminidase_Quartile_fp': 'Quartile_fp', "IAV_Neuraminidase_Quartile_prop": 'Quartile_prop'})
iavpoly = iavpoly.rename(columns={'consensus_iavpoly': 'Consensus', 'IAV_Polymerase (PA)_Quartile_fp': 'Quartile_fp', "IAV_Polymerase (PA)_Quartile_prop": 'Quartile_prop'})
ibv = ibv.rename(columns={'consensus_ibv': 'Consensus', 'IBV_Neuraminidase_Quartile_fp': 'Quartile_fp', "IBV_Neuraminidase_Quartile_prop": 'Quartile_prop'})
sars2 = sars2.rename(columns={'consensus_sars2': 'Consensus', 'SARS-CoV-2_Mpro_Quartile_fp': 'Quartile_fp', "SARS-CoV-2_Mpro_Quartile_prop": 'Quartile_prop'})
sars = sars.rename(columns={'consensus_sars': 'Consensus', 'SARS-CoV_Mpro_Quartile_fp': 'Quartile_fp', "SARS-CoV_Mpro_Quartile_prop": 'Quartile_prop'})

##**3. Generate scaffolds**

In [None]:
# Dictionary according to each loaded DataFrame
dataframes = {
    "HRV_protease_library.csv": hrv,
    "IAV_M2_proton_channel_library.csv": iavm2,
    "IAV_Neuraminidase_library.csv": iavneu,
    "IAV_Polymerase (PA)_library.csv": iavpoly,
    "IBV_Neuraminidase_library.csv": ibv,
    "SARS-CoV-2_library.csv": sars2,
    "SARS-CoV_library.csv": sars
}

# Set groups of 3 to optimize memory
df_keys = list(dataframes.keys())
batch_size = 3  # Change it to 2 or 1

for i in range(0, len(df_keys), batch_size):
    batch = df_keys[i:i+batch_size]  # subset of 3

    for filename in batch:
        df = dataframes[filename]  # Obtain DataFrame

        # Add molecule and scaffolds
        PandasTools.AddMoleculeColumnToFrame(df, smilesCol="Canonical_SMILES_STD", molCol="MOL")
        PandasTools.AddMurckoToFrame(df, molCol="MOL", MurckoCol="SCAFFOLD")

        # Relocate 'SCAFFOLD' column
        col_order = df.columns.tolist()
        col_order.insert(col_order.index("Canonical_SMILES_STD") + 1, col_order.pop(col_order.index("SCAFFOLD")))
        df = df[col_order]  # Reorder

        # Delate mol columns
        df.drop(columns=["MOL", "SCA_MOL"], errors="ignore", inplace=True)

        # Save DataFrame
        df.to_csv(filename, index=False)

        print(f"Saved: {filename}")

    print("Freeing memory...\n")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=["MOL", "SCA_MOL"], errors="ignore", inplace=True)


Saved: HRV_protease_library.csv


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=["MOL", "SCA_MOL"], errors="ignore", inplace=True)


Saved: IAV_M2_proton_channel_library.csv


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=["MOL", "SCA_MOL"], errors="ignore", inplace=True)


Saved: IAV_Neuraminidase_library.csv
Freeing memory...



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=["MOL", "SCA_MOL"], errors="ignore", inplace=True)


Saved: IAV_Polymerase (PA)_library.csv


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=["MOL", "SCA_MOL"], errors="ignore", inplace=True)


Saved: IBV_Neuraminidase_library.csv


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=["MOL", "SCA_MOL"], errors="ignore", inplace=True)
