# Basic filtering
In this section, basic filtering, standardization, and aggregation of duplicates are carried out for each dataset. After that, rd_filters is applied.

In [4]:
import pandas as pd
import numpy as np

## Reading the files into separate dataframes

In [6]:
# Load the three raw tables (actives, inactives, inconclusive), one dataframe each.
df_act, df_inact, df_inc = (
    pd.read_csv(csv_path)
    for csv_path in (
        './raw_data/actives.csv',
        './raw_data/inactives.csv',
        './raw_data/inconclusive.csv',
    )
)
df_act

Unnamed: 0,Smiles,Standard_Type,Standard_Relation,Standard_Value,Standard_Units,pChEMBL_Value,BAO_Label,Target_Name
0,Cc1cc(C)n(C(=O)CSc2nc3ccccc3o2)n1,Potency,'=',28183.8,nM,4.55,assay format,Beta-lactamase AmpC
1,CCOc1ccc2ccccc2c1C(=O)N[C@@H]1C(=O)N2[C@@H](C(...,Potency,'=',5011.9,nM,5.30,assay format,Beta-lactamase AmpC
2,Nc1nc2c(s1)CCc1c-2cnn1-c1ccccc1,Potency,'=',89125.1,nM,4.05,assay format,Beta-lactamase AmpC
3,CC(C)[C@H](NC(=O)OC(C)(C)C)c1nnc(S(=O)(=O)Cc2c...,Potency,'=',1800.0,nM,5.75,assay format,Beta-lactamase AmpC
4,CO[C@@]1(NC(=O)C2SC(=C(C(N)=O)C(=O)O)S2)C(=O)N...,Potency,'=',17782.8,nM,4.75,assay format,Beta-lactamase AmpC
...,...,...,...,...,...,...,...,...
160,N#C/C(C(=O)Nc1ccc(Cl)cc1)=C(/S)Nc1ccccc1,Potency,'=',39810.7,nM,4.40,assay format,Beta-lactamase AmpC
161,CC1(C)S[C@@H]2[C@H](NC(=O)[C@H](N)c3ccccc3)C(=...,Potency,'=',63095.7,nM,4.20,assay format,Beta-lactamase AmpC
162,O=C1c2ccccc2C(=O)N1OS(=O)(=O)c1ccccc1,Potency,'=',44668.4,nM,4.35,assay format,Beta-lactamase AmpC
163,Cn1cnnc1SCC(=O)Nc1nc(-c2ccccc2)cs1,Potency,'=',79432.8,nM,4.10,assay format,Beta-lactamase AmpC


## Main filtering
This filtering is going to select:
1. 'Target_Name' == 'Beta-lactamase AmpC'
2. 'BAO_Label' == 'assay format'
3. 'Standard_Relation' == '='
4. 'Standard_Type' == 'IC50' | 'Potency' | 'Ki'

Then, rows with missing values in the Smiles or pChEMBL_Value columns are removed.

In [7]:
# Filter each raw dataset in turn; the generator is consumed in order, so the
# printed reports appear as Actives, Inactives, Inconclusives.
df_act_fil, df_inact_fil, df_inc_fil = (
    main_filtering(raw_frame, label)
    for raw_frame, label in (
        (df_act, 'Actives'),
        (df_inact, 'Inactives'),
        (df_inc, 'Inconclusives'),
    )
)


= = = = = = = = = = = = = = = =
There are 165 molecules in the Actives dataset before filtering
164
There are 164 molecules in the Actives dataset after filtering
= = = = = = = = = = = = = = = =
There are 8951 molecules in the Inactives dataset before filtering
8884
There are 8884 molecules in the Inactives dataset after filtering
= = = = = = = = = = = = = = = =
There are 52677 molecules in the Inconclusives dataset before filtering
52664
There are 52664 molecules in the Inconclusives dataset after filtering


## Using the rd_filters by Pat Walters
This part was done in the bash terminal. An attempted code implementation is shown below. 
The program was executed in the *processed* folder. The final output files follow this naming pattern:
* ./procesed/{actives/inactives/inconclusives}/{actives/inactives/inconclusives}_filtered_lactamase.smi

In [9]:
# TODO: Automate rd_filters. There may be problems when converting the dataframe into a .smi file that prevent rd_filters from parsing it

# import subprocess

# datasets = [df_act_fil, df_inact_fil, df_inc_fil]
# dataset_name = ['actives', 'inactives', 'inconclusives']

# for name, dataset in zip(dataset_name, datasets):
#     command_folder = f'mkdir ./procesed/{name}'
#     subprocess.call(command_folder, shell=True)
#     dataset['Smiles'].to_csv(f'./procesed/{name}/{name}_for_rd.smi', sep='\t' , header=False, index=False)
    
#     command_rd_filters = " ".join(['rd_filters', 'filter', '--in', f'./procesed/{name}/{name}_for_rd.smi', '--prefix', 'rd_filtered_lactamase'])
#     subprocess.call(command_rd_filters, shell=True)


---

# Supplementary code
Used instead of importing

In [2]:
import pandas as pd
import numpy as np


def basic_filtering(df):
    """Select the Beta-lactamase AmpC measurements and drop incomplete rows.

    A row is kept only when all of the following hold:

    * 'Target_Name' equals 'Beta-lactamase AmpC'
    * 'BAO_Label' equals 'assay format'
    * 'Standard_Relation' equals the literal string "'='" (quotes included)
    * 'Standard_Type' is one of 'IC50', 'Potency' or 'Ki'
    * 'Smiles' and 'pChEMBL_Value' are neither null nor the empty string

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'Target_Name', 'BAO_Label',
        'Standard_Relation', 'Standard_Type', 'Smiles', 'Standard_Value'
        and 'pChEMBL_Value'.

    Returns
    -------
    pandas.DataFrame
        The surviving rows restricted to the columns 'Smiles',
        'Standard_Value' and 'pChEMBL_Value'; index labels of the input
        are kept.
    """
    kept_columns = ['Smiles', 'Standard_Value', 'pChEMBL_Value']
    accepted_types = ['IC50', 'Potency', 'Ki']

    row_selector = (
        df['Target_Name'].eq('Beta-lactamase AmpC')
        & df['BAO_Label'].eq('assay format')
        & df['Standard_Relation'].eq("'='")
        & df['Standard_Type'].isin(accepted_types)
    )
    candidates = df.loc[row_selector, kept_columns]

    # Now remove rows with missing values: a field counts as missing when it
    # is null or an empty string.
    smiles = candidates['Smiles']
    pchembl = candidates['pChEMBL_Value']
    is_complete = (
        smiles.notnull() & smiles.ne('')
        & pchembl.notnull() & pchembl.ne('')
    )

    return candidates[is_complete]


def main_filtering(df, name):
    """Run ``basic_filtering`` on ``df`` and report the row counts.

    Prints a separator line, then the number of rows before and after
    filtering, labelling both messages with ``name``.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset handed to ``basic_filtering``.
    name : str
        Label of the dataset used in the printed messages.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``basic_filtering(df)``.
    """
    print('= = = = = = = = = = = = = = = =')
    print(f'There are {len(df)} molecules in the {name} dataset before filtering')

    df_bf = basic_filtering(df)
    print(f'There are {len(df_bf)} molecules in the {name} dataset after filtering')

    return df_bf




