In [None]:
!pip install rdkit
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import MACCSkeys, AllChem
from rdkit.Chem import rdMolDescriptors, Descriptors
from scipy.spatial.distance import pdist

Collecting rdkit
  Downloading rdkit-2023.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.7/29.7 MB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2023.3.3


In [None]:
# Load the four curated compound tables from the DIFACQUIM GitHub repository
# (one CSV per dataset, read in the order foodb, fda, unpda, purch).
foodb, fda, unpda, purch = (
    pd.read_csv(url)
    for url in (
        "https://raw.githubusercontent.com/DIFACQUIM/Food_chemicals_characterization/main/foodb_curated.csv",
        "https://raw.githubusercontent.com/DIFACQUIM/Food_chemicals_characterization/main/fda_curated.csv",
        "https://raw.githubusercontent.com/DIFACQUIM/Food_chemicals_characterization/main/unpda_curated.csv",
        'https://raw.githubusercontent.com/DIFACQUIM/Food_chemicals_characterization/main/purch_curated.csv',
    )
)

In [None]:
# Show which columns each raw table provides.
for table in (foodb, fda, unpda, purch):
    print(table.columns)

Index(['ID', 'SMILES', 'SMILES_chiral', 'SMILES_no_chiral'], dtype='object')
Index(['ID', 'DATABASE_NAME', 'DRUG_GROUPS', 'GENERIC_NAME', 'SYNONYMS',
       'SMILES', 'SMILES_chiral', 'SMILES_no_chiral'],
      dtype='object')
Index(['ID', 'SMILES_chiral', 'SMILES_no_chiral', 'NPL_score'], dtype='object')


In [None]:
# Keep only the identifier and the stereo-aware SMILES of every table.
foodb = foodb.loc[:, ['ID', "SMILES_chiral"]]
fda = fda.loc[:, ['ID', "SMILES_chiral"]]
unpda = unpda.loc[:, ['ID', "SMILES_chiral"]]
purch = purch.loc[:, ['ID', "SMILES_chiral"]]

In [None]:
# Give all four tables the same two column names (assigned by position).
for table in (foodb, fda, unpda, purch):
    table.columns = ['ID', 'SMILES']

In [None]:
# Tag every compound with the name of the dataset it comes from.
labelled_tables = (
    (foodb, 'FooDB'),
    (fda, 'FDA'),
    (unpda, 'UNPD-A'),
    (purch, 'FooDB commercially available'),
)
for table, label in labelled_tables:
    table['DATASET'] = label

In [None]:
# Preview the first five rows of each table.
for table in (foodb, fda, unpda, purch):
    print(table.head(5))

          ID                                            SMILES       DATASET
0  MUS882023  C[C@@]12CCCC1[C@@H]1CCC3C[C@H](O)CC[C@]3(C)C1CC2  QuimfraganDB
1  AMB882023                          O=C1CCCC/C=C/CCCCCCCCCO1  QuimfraganDB
2  TRA882023              CC(=O)c1cc2c(cc1C)C(C)(C)C(C)C2C(C)C  QuimfraganDB
3  CIS882023                                        CC/C=C\CCO  QuimfraganDB
4  CYA882023                       O=c1[nH]c(=O)[nH]c(=O)[nH]1  QuimfraganDB
        ID                                             SMILES DATASET
0  DB00006  CC[C@H](C)C(NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](C...     FDA
1  DB00007  CCNC(=O)C1CCCN1C(=O)[C@H](CCCN=C(N)N)NC(=O)[C@...     FDA
2  DB00014  CC(C)CC(NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H](Cc...     FDA
3  DB00027  CC(C)CC(NC(=O)CNC(=O)[C@@H](NC=O)C(C)C)C(=O)N[...     FDA
4  DB00035  N=C(N)NCCCC(NC(=O)[C@@H]1CCCN1C(=O)[C@@H]1CSSC...     FDA
                ID                                             SMILES DATASET
0  UNPD_subset_A_1  Cc1cc2c(c(O[C@@H]3O[

In [None]:
# Optional quick-test mode: the statements inside the string literal below would keep
# only the first 5 rows of each table. They are wrapped in a triple-quoted string, so
# they are NOT run as code; remove the quotes to work on the small sample.
"""
foodb = foodb.head(5)
fda = fda.head(5)
unpda = unpda.head(5)
purch = purch.head(5)
"""

'# selection of a test database\nfrag = frag.head(5)\nfda = fda.head(5)\nunpda = unpda.head(5)'

In [None]:
# Stack the four datasets into a single table with a fresh 0..n-1 row index.
data = pd.concat([foodb, fda, unpda, purch], ignore_index=True)
print(data.shape)
data

In [None]:
# Compute the RDKit descriptors for every compound in `data`.
#
# Each SMILES is parsed into an RDKit molecule exactly once and the resulting
# molecules are reused for all descriptors (parsing is the expensive step, and it
# was previously repeated for every descriptor column).
mols = [Chem.MolFromSmiles(smiles) for smiles in data["SMILES"]]

# Chem.MolFromSmiles returns None for a SMILES it cannot parse; stop here with a
# readable message instead of failing inside a descriptor function.
unparsable = [smiles for smiles, mol in zip(data["SMILES"], mols) if mol is None]
if unparsable:
    raise ValueError(
        f"{len(unparsable)} SMILES could not be parsed by RDKit, e.g. {unparsable[:5]}"
    )

# Output column name -> RDKit descriptor function. The insertion order of this
# mapping is the order in which the columns are appended to `data`.
descriptor_functions = {
    "HBA": Descriptors.NumHAcceptors,
    "HBD": Descriptors.NumHDonors,
    "LOGP": Descriptors.MolLogP,
    "TPSA": Descriptors.TPSA,
    "MW": Descriptors.MolWt,
    "CSP3": Descriptors.FractionCSP3,
    "HEAVY": Descriptors.HeavyAtomCount,
    "RING": Descriptors.RingCount,
    "HETATOMS": Descriptors.NumHeteroatoms,
    "ROTBONDS": Descriptors.NumRotatableBonds,
    "CARBOALICYCLIC": Descriptors.NumAliphaticCarbocycles,
    "HETEROALICYCLIC": Descriptors.NumAliphaticHeterocycles,
    "CARBOAROMATIC": Descriptors.NumAromaticCarbocycles,
    "HETEROAROMATIC": Descriptors.NumAromaticHeterocycles,
    "AROMATIC": Descriptors.NumAromaticRings,
}
for column, descriptor in descriptor_functions.items():
    data[column] = [descriptor(mol) for mol in mols]

In [None]:
# Check the descriptor columns and list the distinct dataset labels.
print(data.columns)
set(data['DATASET'])

In [None]:
# Save the combined table, including the RDKit descriptor columns, without the row index.
data.to_csv('data_rdkit.csv', index = False)

In [None]:
# NOTE(review): 'data_rdkit_moe.csv' is not written anywhere in this notebook. Presumably
# it is 'data_rdkit.csv' extended outside the notebook with the extra columns that show up
# afterwards (a_acid, a_aro, ..., b_rotR, chiral) -- confirm the origin of this file and
# make sure it is present before running this cell.
data = pd.read_csv("data_rdkit_moe.csv")

In [None]:
# Split the combined table back into one DataFrame per source dataset, each with
# its descriptors. `.copy()` makes every subset an independent frame, so the cells
# that later rename or add columns on them do not raise SettingWithCopyWarning.
foodb = data[data["DATASET"] == 'FooDB'].copy()
fda = data[data["DATASET"] == 'FDA'].copy()
unpda = data[data["DATASET"] == 'UNPD-A'].copy()
purch = data[data["DATASET"] == 'FooDB commercially available'].copy()
print(foodb.head(5))
print(fda.head(5))
print(unpda.head(5))
print(purch.head(5))

In [None]:
# Inspect the column names of the FooDB subset before renaming them.
foodb.columns

Index(['ID', 'SMILES', 'DATASET', 'HBA', 'HBD', 'LOGP', 'TPSA', 'MW', 'CSP3',
       'HEAVY', 'RING', 'HETATOMS', 'ROTBONDS', 'CARBOALICYCLIC',
       'HETEROALICYCLIC', 'CARBOAROMATIC', 'HETEROAROMATIC', 'AROMATIC',
       'a_acid', 'a_aro', 'a_base', 'a_nBr', 'a_nCl', 'a_nF', 'a_nI', 'a_nN',
       'a_nO', 'b_rotR', 'chiral'],
      dtype='object')

In [None]:
# Final column names, shared by all four tables. They are assigned by POSITION:
# the leading columns keep their names, while the trailing ones
# (a_acid, a_aro, a_base, a_nBr, a_nCl, a_nF, a_nI, a_nN, a_nO, b_rotR, chiral)
# receive the readable names listed at the end.
descriptor_columns = [
    'ID', 'SMILES', 'DATASET', 'HBA', 'HBD', 'LOGP', 'TPSA', 'MW', 'CSP3',
    'HEAVY', 'RING', 'HETATOMS', 'ROTBONDS', 'CARBOALICYCLIC',
    'HETEROALICYCLIC', 'CARBOAROMATIC', 'HETEROAROMATIC', 'AROMATIC',
    'ACID_ATOMS', 'AROM_ATOMS', 'BASIC_ATOMS', 'BROMINE', 'CHLORINE',
    'FLUORINE', 'IODINE', 'NITROGEN', 'OXYGEN', 'FRACROTBOND',
    'CHIRALCENTERS',
]
for table in (foodb, fda, unpda, purch):
    table.columns = descriptor_columns

In [None]:
# Total halogen count per compound = Br + Cl + F + I.
# `assign` returns a new DataFrame instead of writing into a frame that may be a
# slice of `data`, which avoids pandas' SettingWithCopyWarning.
# skipna=False keeps the semantics of a plain `+`: a missing count yields NaN.
halogen_columns = ['BROMINE', 'CHLORINE', 'FLUORINE', 'IODINE']
foodb = foodb.assign(HALOGEN=foodb[halogen_columns].sum(axis=1, skipna=False))
fda = fda.assign(HALOGEN=fda[halogen_columns].sum(axis=1, skipna=False))
unpda = unpda.assign(HALOGEN=unpda[halogen_columns].sum(axis=1, skipna=False))
purch = purch.assign(HALOGEN=purch[halogen_columns].sum(axis=1, skipna=False))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  frag['HALOGEN'] = frag['BROMINE'] + frag['CHLORINE'] + frag['FLUORINE'] + frag['IODINE']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fda['HALOGEN'] = fda['BROMINE'] + fda['CHLORINE'] + fda['FLUORINE'] + fda['IODINE']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unpda['HALOGEN'] = unpda['BROMIN

In [None]:
# Write one descriptor CSV per dataset (row index omitted).
descriptor_exports = (
    (foodb, 'foodb_descriptors.csv'),
    (fda, 'fda_descriptors.csv'),
    (unpda, 'unpda_descriptors.csv'),
    (purch, 'purch_descriptors.csv'),
)
for table, filename in descriptor_exports:
    table.to_csv(filename, index=False)

In [None]:
# Replace each per-dataset table with its summary statistics
# (count, mean, std, min, quartiles, max of every numeric column).
# The FooDB summary is computed from `foodb`; no `frag` variable exists in this
# notebook, so referring to it raised a NameError on a fresh kernel.
# NOTE: the names are rebound to the summaries, so this cell must be run once only.
foodb = foodb.describe()
fda = fda.describe()
unpda = unpda.describe()
purch = purch.describe()

In [None]:
# Collect the four per-dataset tables (at this point their describe() summaries) in a
# list. The list holds the DataFrame objects themselves, not their names; the position
# in the list determines the numeric suffix that add_suffix_to_column_names appends.
dataframes = [foodb, fda, unpda, purch]

In [None]:
# Helper that tags the columns of each DataFrame with its 1-based position in a list.

def add_suffix_to_column_names(dataframes):
    """Append ``_<position>`` to every column name of each DataFrame, in place.

    Parameters
    ----------
    dataframes : iterable of pandas.DataFrame
        Frames to relabel. The first frame gets the suffix ``_1``, the second
        ``_2``, and so on.

    Returns
    -------
    None
        The frames are modified in place. Calling the function again on the same
        frames appends a further suffix.
    """
    position = 0
    for frame in dataframes:
        position += 1
        frame.columns = [f"{name}_{position}" for name in frame.columns]

In [None]:
# Call the function on every DataFrame in the `dataframes` list
# (modifies the column names of those frames in place).
add_suffix_to_column_names(dataframes)

In [None]:
# Show the UNPD-A summary table; as the third entry of `dataframes`, its columns carry the `_3` suffix.
unpda

Unnamed: 0,HBA_3,HBD_3,LOGP_3,TPSA_3,MW_3,CSP3_3,HEAVY_3,RING_3,HETATOMS_3,ROTBONDS_3,...,BASIC_ATOMS_3,BROMINE_3,CHLORINE_3,FLUORINE_3,IODINE_3,NITROGEN_3,OXYGEN_3,FRACROTBOND_3,CHIRALCENTERS_3,HALOGEN_3
count,14994.0,14994.0,14994.0,14994.0,14994.0,14994.0,14994.0,14994.0,14994.0,14994.0,...,14994.0,14994.0,14994.0,14994.0,14994.0,14994.0,14994.0,14994.0,14994.0,14994.0
mean,5.575764,2.505669,2.940461,90.781102,371.937464,0.518697,26.377151,3.094838,6.019875,4.742497,...,0.0,0.054155,0.038882,0.001267,0.002734,0.484661,5.376484,0.188474,3.806389,0.097039
std,4.952482,3.173353,3.017177,82.736251,196.43152,0.308217,13.903451,2.192149,5.081213,6.018119,...,0.0,0.397084,0.343235,0.053539,0.068761,1.214339,5.053385,0.202243,5.129296,0.542209
min,0.0,0.0,-18.5284,0.0,16.042999,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,0.0,1.464075,40.459999,246.306,0.25,18.0,2.0,3.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.052632,0.0,0.0
50%,4.0,2.0,2.86835,69.669998,330.2935,0.521739,24.0,3.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.125,2.0,0.0
75%,7.0,3.0,4.324475,112.05,445.59499,0.8,32.0,4.0,7.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.235294,6.0,0.0
max,53.0,36.0,24.432199,877.35999,1887.281,1.0,135.0,21.0,53.0,59.0,...,0.0,7.0,10.0,3.0,3.0,18.0,53.0,0.967213,43.0,10.0


In [None]:
# Place the four summary tables side by side; rows are aligned on the statistic
# labels (count, mean, ...). Note that this rebinds the name `data`.
data = pd.concat([foodb, fda, unpda, purch], axis="columns")

In [None]:
# Alphabetically sorted list of the suffixed descriptor names.
column_names = sorted(data.columns)

In [None]:
# Show the sorted column names.
column_names

['ACID_ATOMS_1',
 'ACID_ATOMS_2',
 'ACID_ATOMS_3',
 'AROMATIC_1',
 'AROMATIC_2',
 'AROMATIC_3',
 'AROM_ATOMS_1',
 'AROM_ATOMS_2',
 'AROM_ATOMS_3',
 'BASIC_ATOMS_1',
 'BASIC_ATOMS_2',
 'BASIC_ATOMS_3',
 'BROMINE_1',
 'BROMINE_2',
 'BROMINE_3',
 'CARBOALICYCLIC_1',
 'CARBOALICYCLIC_2',
 'CARBOALICYCLIC_3',
 'CARBOAROMATIC_1',
 'CARBOAROMATIC_2',
 'CARBOAROMATIC_3',
 'CHIRALCENTERS_1',
 'CHIRALCENTERS_2',
 'CHIRALCENTERS_3',
 'CHLORINE_1',
 'CHLORINE_2',
 'CHLORINE_3',
 'CSP3_1',
 'CSP3_2',
 'CSP3_3',
 'FLUORINE_1',
 'FLUORINE_2',
 'FLUORINE_3',
 'FRACROTBOND_1',
 'FRACROTBOND_2',
 'FRACROTBOND_3',
 'HALOGEN_1',
 'HALOGEN_2',
 'HALOGEN_3',
 'HBA_1',
 'HBA_2',
 'HBA_3',
 'HBD_1',
 'HBD_2',
 'HBD_3',
 'HEAVY_1',
 'HEAVY_2',
 'HEAVY_3',
 'HETATOMS_1',
 'HETATOMS_2',
 'HETATOMS_3',
 'HETEROALICYCLIC_1',
 'HETEROALICYCLIC_2',
 'HETEROALICYCLIC_3',
 'HETEROAROMATIC_1',
 'HETEROAROMATIC_2',
 'HETEROAROMATIC_3',
 'IODINE_1',
 'IODINE_2',
 'IODINE_3',
 'LOGP_1',
 'LOGP_2',
 'LOGP_3',
 'MW_1',
 'MW

In [None]:
# Reorder the columns alphabetically so the variants of each descriptor
# (suffixes _1, _2, ...) end up next to each other.
data_sorted = data.reindex(columns=sorted(data.columns))

In [None]:
# Show the summary statistics with the columns in sorted order.
data_sorted

Unnamed: 0,ACID_ATOMS_1,ACID_ATOMS_2,ACID_ATOMS_3,AROMATIC_1,AROMATIC_2,AROMATIC_3,AROM_ATOMS_1,AROM_ATOMS_2,AROM_ATOMS_3,BASIC_ATOMS_1,...,OXYGEN_3,RING_1,RING_2,RING_3,ROTBONDS_1,ROTBONDS_2,ROTBONDS_3,TPSA_1,TPSA_2,TPSA_3
count,143.0,2324.0,14994.0,143.0,2324.0,14994.0,143.0,2324.0,14994.0,143.0,...,14994.0,143.0,2324.0,14994.0,143.0,2324.0,14994.0,143.0,2324.0,14994.0
mean,0.0,0.11833,0.0,0.454545,1.537005,1.278045,2.412587,7.634251,6.189876,0.006993,...,5.376484,0.804196,2.77926,3.094838,2.734266,5.983649,4.742497,27.803916,95.714527,90.781102
std,0.0,0.711041,0.0,0.699369,1.310216,1.494913,3.54911,6.735489,7.427357,0.083624,...,5.053385,0.921257,1.977462,2.192149,2.216936,7.633089,6.018119,21.747263,106.283935,82.736251
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,1.0,2.0,1.0,2.0,1.0,17.07,43.369999,40.459999
50%,0.0,0.0,0.0,0.0,1.0,1.0,0.0,6.0,6.0,0.0,...,4.0,1.0,3.0,3.0,2.0,4.0,3.0,20.23,74.599998,69.669998
75%,0.0,0.0,0.0,1.0,2.0,2.0,6.0,12.0,12.0,0.0,...,7.0,1.0,4.0,4.0,4.0,7.0,6.0,33.754999,110.7675,112.05
max,0.0,12.0,0.0,3.0,10.0,15.0,14.0,60.0,84.0,1.0,...,53.0,4.0,30.0,21.0,10.0,149.0,59.0,181.62,1690.64,877.35999


In [None]:
# Same table rounded to three decimals. The result is only displayed, not assigned,
# so `data_sorted` itself keeps full precision.
data_sorted.round(3)

Unnamed: 0,ACID_ATOMS_1,ACID_ATOMS_2,ACID_ATOMS_3,AROMATIC_1,AROMATIC_2,AROMATIC_3,AROM_ATOMS_1,AROM_ATOMS_2,AROM_ATOMS_3,BASIC_ATOMS_1,...,OXYGEN_3,RING_1,RING_2,RING_3,ROTBONDS_1,ROTBONDS_2,ROTBONDS_3,TPSA_1,TPSA_2,TPSA_3
count,143.0,2324.0,14994.0,143.0,2324.0,14994.0,143.0,2324.0,14994.0,143.0,...,14994.0,143.0,2324.0,14994.0,143.0,2324.0,14994.0,143.0,2324.0,14994.0
mean,0.0,0.118,0.0,0.455,1.537,1.278,2.413,7.634,6.19,0.007,...,5.376,0.804,2.779,3.095,2.734,5.984,4.742,27.804,95.715,90.781
std,0.0,0.711,0.0,0.699,1.31,1.495,3.549,6.735,7.427,0.084,...,5.053,0.921,1.977,2.192,2.217,7.633,6.018,21.747,106.284,82.736
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,1.0,2.0,1.0,2.0,1.0,17.07,43.37,40.46
50%,0.0,0.0,0.0,0.0,1.0,1.0,0.0,6.0,6.0,0.0,...,4.0,1.0,3.0,3.0,2.0,4.0,3.0,20.23,74.6,69.67
75%,0.0,0.0,0.0,1.0,2.0,2.0,6.0,12.0,12.0,0.0,...,7.0,1.0,4.0,4.0,4.0,7.0,6.0,33.755,110.768,112.05
max,0.0,12.0,0.0,3.0,10.0,15.0,14.0,60.0,84.0,1.0,...,53.0,4.0,30.0,21.0,10.0,149.0,59.0,181.62,1690.64,877.36


In [None]:
# Save the summary table; index=True writes the statistic labels (count, mean, ...)
# as the first CSV column.
data_sorted.to_csv("data_sorted_stat.csv", index=True)

In [None]:
# Read the summary back. No index column is specified, so the statistic labels come
# back as an ordinary first column (unnamed in the CSV header) rather than as the index.
statistics = pd.read_csv("data_sorted_stat.csv")

In [None]:
# Show the table as re-read from disk.
statistics

Unnamed: 0.1,Unnamed: 0,ACID_ATOMS_1,ACID_ATOMS_2,ACID_ATOMS_3,AROMATIC_1,AROMATIC_2,AROMATIC_3,AROM_ATOMS_1,AROM_ATOMS_2,AROM_ATOMS_3,...,OXYGEN_3,RING_1,RING_2,RING_3,ROTBONDS_1,ROTBONDS_2,ROTBONDS_3,TPSA_1,TPSA_2,TPSA_3
0,count,143.0,2324.0,14994.0,143.0,2324.0,14994.0,143.0,2324.0,14994.0,...,14994.0,143.0,2324.0,14994.0,143.0,2324.0,14994.0,143.0,2324.0,14994.0
1,mean,0.0,0.11833,0.0,0.454545,1.537005,1.278045,2.412587,7.634251,6.189876,...,5.376484,0.804196,2.77926,3.094838,2.734266,5.983649,4.742497,27.803916,95.714527,90.781102
2,std,0.0,0.711041,0.0,0.699369,1.310216,1.494913,3.54911,6.735489,7.427357,...,5.053385,0.921257,1.977462,2.192149,2.216936,7.633089,6.018119,21.747263,106.283935,82.736251
3,min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,1.0,2.0,1.0,2.0,1.0,17.07,43.369999,40.459999
5,50%,0.0,0.0,0.0,0.0,1.0,1.0,0.0,6.0,6.0,...,4.0,1.0,3.0,3.0,2.0,4.0,3.0,20.23,74.599998,69.669998
6,75%,0.0,0.0,0.0,1.0,2.0,2.0,6.0,12.0,12.0,...,7.0,1.0,4.0,4.0,4.0,7.0,6.0,33.754999,110.7675,112.05
7,max,0.0,12.0,0.0,3.0,10.0,15.0,14.0,60.0,84.0,...,53.0,4.0,30.0,21.0,10.0,149.0,59.0,181.62,1690.64,877.35999


In [None]:
# Name the first column (the statistic labels) 'statistic'.
first_column = statistics.columns[0]
statistics = statistics.rename(columns={first_column: 'statistic'})

In [None]:
# Statistic labels as a plain list, and the whole table (label column included)
# as a NumPy array.
stat = statistics['statistic'].tolist()
array = statistics.to_numpy()
print(stat)
print(array)

['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']
[['count' 143.0 2324.0 14994.0 143.0 2324.0 14994.0 143.0 2324.0 14994.0
  143.0 2324.0 14994.0 143.0 2324.0 14994.0 143.0 2324.0 14994.0 143.0
  2324.0 14994.0 143.0 2324.0 14994.0 143.0 2324.0 14994.0 143.0 2324.0
  14994.0 143.0 2324.0 14994.0 143.0 2324.0 14994.0 143.0 2324.0 14994.0
  143.0 2324.0 14994.0 143.0 2324.0 14994.0 143.0 2324.0 14994.0 143.0
  2324.0 14994.0 143.0 2324.0 14994.0 143.0 2324.0 14994.0 143.0 2324.0
  14994.0 143.0 2324.0 14994.0 143.0 2324.0 14994.0 143.0 2324.0 14994.0
  143.0 2324.0 14994.0 143.0 2324.0 14994.0 143.0 2324.0 14994.0 143.0
  2324.0 14994.0]
 ['mean' 0.0 0.1183304647160068 0.0 0.4545454545454545 1.5370051635111877
  1.2780445511537948 2.4125874125874125 7.634251290877797
  6.189875950380152 0.0069930069930069 0.052065404475043 0.0
  0.0139860139860139 0.0180722891566265 0.0541549953314659
  0.2517482517482518 0.5107573149741824 0.9595838335334134
  0.3636363636363636 1.031841652323

In [None]:
# Swap rows and columns: afterwards the first row holds the statistic labels and each
# following row holds the statistics of one descriptor.
# Note: running this cell a second time transposes the array back.
array = array.T
array

array([['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'],
       [143.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
       [2324.0, 0.1183304647160068, 0.7110405155894923, 0.0, 0.0, 0.0,
        0.0, 12.0],
       [14994.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
       [143.0, 0.4545454545454545, 0.6993686562075336, 0.0, 0.0, 0.0,
        1.0, 3.0],
       [2324.0, 1.5370051635111877, 1.3102164132227458, 0.0, 0.0, 1.0,
        2.0, 10.0],
       [14994.0, 1.2780445511537948, 1.4949130548079794, 0.0, 0.0, 1.0,
        2.0, 15.0],
       [143.0, 2.4125874125874125, 3.5491095318992203, 0.0, 0.0, 0.0,
        6.0, 14.0],
       [2324.0, 7.634251290877797, 6.735489292738116, 0.0, 0.0, 6.0,
        12.0, 60.0],
       [14994.0, 6.189875950380152, 7.427357460277811, 0.0, 0.0, 6.0,
        12.0, 84.0],
       [143.0, 0.0069930069930069, 0.083624201000709, 0.0, 0.0, 0.0, 0.0,
        1.0],
       [2324.0, 0.052065404475043, 0.2662700428358466, 0.0, 0.0, 0.0,
        0.0, 4.0],
       [14994.0, 

In [None]:
# Rebuild a DataFrame from the transposed array, with one column per statistic.
# Row 0 contains the statistic labels themselves (it was the 'statistic' column before
# transposing); the remaining rows hold one suffixed descriptor each.
df = pd.DataFrame(array, columns = stat)
df

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,count,mean,std,min,25%,50%,75%,max
1,143.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2324.0,0.11833,0.711041,0.0,0.0,0.0,0.0,12.0
3,14994.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,143.0,0.454545,0.699369,0.0,0.0,0.0,1.0,3.0
...,...,...,...,...,...,...,...,...
77,2324.0,5.983649,7.633089,0.0,2.0,4.0,7.0,149.0
78,14994.0,4.742497,6.018119,0.0,1.0,3.0,6.0,59.0
79,143.0,27.803916,21.747263,0.0,17.07,20.23,33.754999,181.62
80,2324.0,95.714527,106.283935,0.0,43.369999,74.599998,110.7675,1690.64


In [None]:
# Discard row 0 (the repeated statistic labels) and renumber the rows from 0.
df2 = df.iloc[1:].reset_index(drop=True)
df2

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,143.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2324.0,0.11833,0.711041,0.0,0.0,0.0,0.0,12.0
2,14994.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,143.0,0.454545,0.699369,0.0,0.0,0.0,1.0,3.0
4,2324.0,1.537005,1.310216,0.0,0.0,1.0,2.0,10.0
...,...,...,...,...,...,...,...,...
76,2324.0,5.983649,7.633089,0.0,2.0,4.0,7.0,149.0
77,14994.0,4.742497,6.018119,0.0,1.0,3.0,6.0,59.0
78,143.0,27.803916,21.747263,0.0,17.07,20.23,33.754999,181.62
79,2324.0,95.714527,106.283935,0.0,43.369999,74.599998,110.7675,1690.64


In [None]:
# Attach the descriptor name to each row. `column_names` is in the same sorted order
# as the rows of `df2`. `assign` builds a new DataFrame, so `df2` is left untouched
# (a plain `df3 = df2` would only alias it and the new column would leak into `df2`).
# pandas raises ValueError if the number of names differs from the number of rows.
df3 = df2.assign(descriptor=column_names)

In [None]:
# Check that the 'descriptor' column was appended after the statistic columns.
df3.columns

Index(['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max',
       'descriptor'],
      dtype='object')

In [None]:
# Move 'descriptor' to the front; the statistic columns keep their order.
df3 = df3[['descriptor', 'count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']]

In [None]:
# Show the final table: one row per suffixed descriptor, one column per statistic.
df3

Unnamed: 0,descriptor,count,mean,std,min,25%,50%,75%,max
0,ACID_ATOMS_1,143.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ACID_ATOMS_2,2324.0,0.11833,0.711041,0.0,0.0,0.0,0.0,12.0
2,ACID_ATOMS_3,14994.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,AROMATIC_1,143.0,0.454545,0.699369,0.0,0.0,0.0,1.0,3.0
4,AROMATIC_2,2324.0,1.537005,1.310216,0.0,0.0,1.0,2.0,10.0
...,...,...,...,...,...,...,...,...,...
76,ROTBONDS_2,2324.0,5.983649,7.633089,0.0,2.0,4.0,7.0,149.0
77,ROTBONDS_3,14994.0,4.742497,6.018119,0.0,1.0,3.0,6.0,59.0
78,TPSA_1,143.0,27.803916,21.747263,0.0,17.07,20.23,33.754999,181.62
79,TPSA_2,2324.0,95.714527,106.283935,0.0,43.369999,74.599998,110.7675,1690.64


In [None]:
# Export the per-descriptor statistics table without the row index.
df3.to_csv('foodb_statistics.csv', index = False)