### **Libraries Used :**

---



In [None]:
import pandas as pd
import numpy as np

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
bitter_train = pd.read_csv("/content/drive/MyDrive/Capstone/DataSets/bitter-train.tsv", sep='\t',encoding='utf-8', header=0)
bitter_test = pd.read_csv("/content/drive/MyDrive/Capstone/DataSets/bitter-test.tsv", sep='\t',encoding='utf-8', header=0)
sweet_train = pd.read_csv("/content/drive/MyDrive/Capstone/DataSets/sweet-train.tsv", sep='\t',encoding='utf-8', header=0)
sweet_test = pd.read_csv("/content/drive/MyDrive/Capstone/DataSets/sweet-test.tsv", sep='\t',encoding='utf-8', header=0)

In [None]:
bitter_train['Taste'].unique()

array(['Sweet', 'Bitter', 'Tasteless'], dtype=object)

In [None]:
bitter_test['Taste'].unique()

array(['Non-bitter', 'Bitter'], dtype=object)

In [None]:
sweet_train['Taste'].unique()

array(['Sweet', 'Bitter', 'Tasteless'], dtype=object)

In [None]:
sweet_test['Taste'].unique()

array(['Sweet', 'Bitter', 'Tasteless'], dtype=object)

## **Installing Mordred**

In [None]:
!which python 

/usr/local/bin/python


In [None]:
!python --version

Python 3.7.10


In [None]:
!conda --version

conda 4.9.2


In [None]:
!pip install -q condacolab
import condacolab
condacolab.install()

✨🍰✨ Everything looks OK!


In [None]:
!conda --version

conda 4.9.2


In [None]:
!conda install -c rdkit -c mordred-descriptor mordred

Collecting package metadata (current_repodata.json): - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / done
Solving environment: \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - done

## Package Plan ##

  environment location: /usr/local

  added / updated specs:
    - mordred


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    boost-1.74.0               |  

In [None]:
from rdkit import Chem
from mordred import Calculator, descriptors
calcAll = Calculator(descriptors)
calc2d = Calculator(descriptors, ignore_3D=True)

In [None]:
len(calcAll.descriptors)

1826

In [None]:
len(calc2d.descriptors)

1613

In [None]:
mol = Chem.MolFromSmiles('CC[N](Cc1ccc(cc1)Cl)(CC(=O)Nc1c(C)cccc1C)CC')

In [None]:
mol

In [None]:
calc2d(mol)

TypeError: ignored

## **Generating Features for Bitter Train:**

In [None]:
mols = [Chem.MolFromSmiles(smi) for smi in bitter_train['SMILES']]

In [None]:
type(mols)

list

In [None]:
mols

[<rdkit.Chem.rdchem.Mol at 0x7fa299e41580>,
 <rdkit.Chem.rdchem.Mol at 0x7fa299e41530>,
 <rdkit.Chem.rdchem.Mol at 0x7fa299e414e0>,
 <rdkit.Chem.rdchem.Mol at 0x7fa299e415d0>,
 <rdkit.Chem.rdchem.Mol at 0x7fa299e41620>,
 <rdkit.Chem.rdchem.Mol at 0x7fa299e41670>,
 <rdkit.Chem.rdchem.Mol at 0x7fa299e416c0>,
 <rdkit.Chem.rdchem.Mol at 0x7fa299e41710>,
 <rdkit.Chem.rdchem.Mol at 0x7fa299e41760>,
 <rdkit.Chem.rdchem.Mol at 0x7fa299e417b0>,
 <rdkit.Chem.rdchem.Mol at 0x7fa299e41800>,
 <rdkit.Chem.rdchem.Mol at 0x7fa299e41850>,
 <rdkit.Chem.rdchem.Mol at 0x7fa299e418a0>,
 <rdkit.Chem.rdchem.Mol at 0x7fa299e418f0>,
 <rdkit.Chem.rdchem.Mol at 0x7fa299e41940>,
 <rdkit.Chem.rdchem.Mol at 0x7fa299e41990>,
 <rdkit.Chem.rdchem.Mol at 0x7fa299e419e0>,
 <rdkit.Chem.rdchem.Mol at 0x7fa299e41a30>,
 <rdkit.Chem.rdchem.Mol at 0x7fa299e41a80>,
 <rdkit.Chem.rdchem.Mol at 0x7fa299e41ad0>,
 <rdkit.Chem.rdchem.Mol at 0x7fa299eb68f0>,
 <rdkit.Chem.rdchem.Mol at 0x7fa299eb6530>,
 <rdkit.Chem.rdchem.Mol at 0x7fa

In [None]:
noneIndices = []
withoutNoneIndices = []
molesWithoutNone = []
for i in range(len(mols)):
  if mols[i]==None:
    noneIndices.append(i)
  else:
    molesWithoutNone.append(mols[i])
    withoutNoneIndices.append(i)

In [None]:
withoutNoneIndices

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


In [None]:
df_Bitter_Train = calc2d.pandas(molesWithoutNone)

  1%|          | 23/2233 [00:19<55:10,  1.50s/it]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 28%|██▊       | 620/2233 [04:55<50:05,  1.86s/it]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 2233/2233 [16:10<00:00,  2.30it/s]


In [None]:
df_Bitter_Train

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,VE1_A,VE2_A,VE3_A,VR1_A,VR2_A,VR3_A,nAromAtom,nAromBond,nAtom,nHeavyAtom,nSpiro,nBridgehead,nHetero,nH,nB,nC,nN,nO,nS,nP,nF,nCl,nBr,nI,nX,ATS0dv,ATS1dv,ATS2dv,ATS3dv,ATS4dv,...,JGI7,JGI8,JGI9,JGI10,JGT10,Diameter,Radius,TopoShapeIndex,PetitjeanIndex,Vabc,VAdjMat,MWC01,MWC02,MWC03,MWC04,MWC05,MWC06,MWC07,MWC08,MWC09,MWC10,TMWC10,SRW02,SRW03,SRW04,SRW05,SRW06,SRW07,SRW08,SRW09,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,17.213262,16.059815,0,0,28.766735,2.52242,4.95823,28.766735,1.250728,4.052767,4.072546,0.177067,2.237177,157.468009,6.846435,5.892131,0,0,45,23,0,0,11,22,0,12,0,11,0,0,0,0,0,0,0,408.000000,311.000000,532.000000,760.000000,588.000000,...,0.020266,0.014066,0.007946,0.000000,0.628961,10,5,1.0,0.5,288.08786,5.584963,24.0,4.795791,5.686975,6.602588,7.513709,8.434464,9.350189,10.272738,11.190528,12.113908,122.960890,3.891820,0.0,5.262690,2.397895,6.849066,4.94876,8.525360,7.195187,10.247042,72.317821,342.116212,7.602582,1110,43,120.0,147.0,10.451389,5.291667
1,17.213262,16.059815,0,0,28.766735,2.52242,4.95823,28.766735,1.250728,4.052767,4.072546,0.177067,2.237177,157.468009,6.846435,5.892131,0,0,42,23,0,0,11,19,0,12,0,8,0,0,0,3,0,0,3,334.814815,281.444444,477.111111,591.111111,423.333333,...,0.020266,0.014066,0.007946,0.000000,0.628961,10,5,1.0,0.5,307.350383,5.584963,24.0,4.795791,5.686975,6.602588,7.513709,8.434464,9.350189,10.272738,11.190528,12.113908,122.960890,3.891820,0.0,5.262690,2.397895,6.849066,4.94876,8.525360,7.195187,10.247042,72.317821,396.014551,9.428918,1110,43,120.0,147.0,10.451389,5.291667
2,17.920577,15.130873,1,3,27.11314,2.54412,5.088241,27.11314,1.129714,4.067565,2.97276,0.123865,1.96496,30261.323793,1260.888491,11.193094,0,0,57,24,0,0,8,33,0,16,3,4,1,0,0,0,0,0,0,274.444444,215.333333,264.666667,233.666667,226.333333,...,0.018924,0.011938,0.009102,0.007943,0.659261,15,8,0.875,0.466667,356.959519,5.584963,24.0,4.762174,5.549076,6.444131,7.257708,8.172164,9.003685,9.935035,10.780060,11.725590,121.629623,3.891820,0.0,5.262690,0.000000,6.829794,0.00000,8.509363,0.000000,10.256922,58.750589,363.219178,6.372266,1807,27,116.0,128.0,11.069444,5.250000
3,15.315311,13.970681,1,1,25.367667,2.287533,4.575067,25.367667,1.207984,3.915999,4.057263,0.193203,2.142446,121.014646,5.762602,5.537849,6,6,39,21,0,0,7,18,0,14,2,5,0,0,0,0,0,0,0,330.000000,271.000000,348.000000,335.000000,376.000000,...,0.014757,0.012078,0.008125,0.007448,0.489651,11,6,0.833333,0.454545,277.569695,5.392317,21.0,4.574711,5.351858,6.175867,6.978214,7.805882,8.618485,9.447229,10.264966,11.093995,112.311208,3.761200,0.0,5.017280,0.000000,6.447306,0.00000,7.951207,0.000000,9.493487,53.670480,294.121572,7.541579,1000,27,96.0,105.0,8.916667,4.888889
4,8.829880,8.715591,0,0,14.640475,2.418289,4.836579,14.640475,1.22004,3.392014,3.170872,0.264239,1.336328,42.564698,3.547058,3.933347,0,0,24,12,0,0,6,12,0,6,0,6,0,0,0,0,0,0,0,212.000000,155.000000,254.000000,346.000000,191.000000,...,0.000000,0.000000,0.000000,0.000000,0.664926,6,3,1.0,0.5,152.717273,4.584963,12.0,4.110874,4.962845,5.849325,6.722630,7.610358,8.487764,9.375092,10.253827,11.140687,92.513402,3.218876,0.0,4.574711,0.000000,6.163315,0.00000,7.845024,0.000000,9.569063,43.370989,180.063388,7.502641,178,20,60.0,71.0,6.145833,2.722222
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2228,6.473351,6.191587,0,0,11.142106,2.200122,4.400244,11.142106,1.238012,3.089776,2.850317,0.316702,0.94207,28.079758,3.119973,3.229688,6,6,18,9,0,0,2,9,0,7,1,1,0,0,0,0,0,0,0,114.000000,104.000000,123.000000,100.000000,46.000000,...,0.000000,0.000000,0.000000,0.000000,0.438078,5,3,0.666667,0.4,118.249507,4.169925,9.0,3.713572,4.465908,5.262690,6.033086,6.834109,7.608374,8.409831,9.185125,9.986495,79.499191,2.944439,0.0,4.143135,0.000000,5.556828,0.00000,7.062192,0.000000,8.606851,37.313444,123.068414,6.837134,88,9,40.0,43.0,3.472222,2.166667
2229,6.473351,6.127583,0,0,11.189957,2.193993,4.387987,11.189957,1.243329,3.089765,2.856388,0.317376,0.944197,28.049653,3.116628,3.228616,6,6,17,9,0,0,2,8,0,7,0,2,0,0,0,0,0,0,0,122.000000,104.000000,128.000000,106.000000,58.000000,...,0.000000,0.000000,0.000000,0.000000,0.475391,6,3,1.0,0.5,116.042974,4.169925,9.0,3.713572,4.465908,5.252273,6.028279,6.816736,7.598399,8.386401,9.169831,9.957265,79.388664,2.944439,0.0,4.143135,0.000000,5.556828,0.00000,7.055313,0.000000,8.590258,37.289972,124.052429,7.297202,90,9,40.0,43.0,3.472222,2.166667
2230,10.635111,9.197041,0,0,18.814625,2.30725,4.614501,18.814625,1.343902,3.558646,3.48721,0.249086,1.585574,63.60762,4.543401,4.489206,12,12,23,14,0,0,2,9,0,12,1,1,0,0,0,0,0,0,0,190.000000,188.000000,244.000000,236.000000,188.000000,...,0.012153,0.000000,0.000000,0.000000,0.279611,8,4,1.0,0.5,170.927139,4.906891,15.0,4.234107,5.043425,5.866468,6.693324,7.524561,8.356790,9.191056,10.025307,10.860786,96.795823,3.433987,0.0,4.672829,0.000000,6.137727,0.00000,7.690286,0.000000,9.282754,45.217583,183.068414,7.959496,307,18,68.0,77.0,3.833333,3.222222
2231,8.106344,7.838945,0,0,13.165395,2.242743,4.296119,13.165395,1.196854,3.28912,2.958768,0.268979,1.180083,49.082458,4.462042,3.988812,5,5,23,11,0,0,2,12,0,9,0,2,0,0,0,0,0,0,0,148.000000,133.000000,139.000000,69.000000,80.000000,...,0.000000,0.000000,0.000000,0.000000,0.513992,7,4,0.75,0.428571,150.634944,4.459432,11.0,3.931826,4.672829,5.484797,6.255750,7.069023,7.854381,8.666130,9.459619,10.269900,85.664256,3.135494,0.0,4.369448,2.397895,5.739793,4.59512,7.172425,6.529419,8.645235,53.584828,152.083730,6.612336,173,9,50.0,53.0,4.583333,2.500000


In [None]:
temp_df = bitter_train.iloc[withoutNoneIndices]

In [None]:
temp_df

Unnamed: 0,Name,Taste,Reference,SMILES,Canonical SMILES,Bitter
0,Sucrose,Sweet,Rojas et al. (2017),OCC1OC(C(C1O)O)(CO)OC1OC(CO)C(C(C1O)O)O,OCC1OC(C(C1O)O)(CO)OC1OC(CO)C(C(C1O)O)O,False
1,"Sucralose / 4,1',6'-Trichloro-galactosucrose",Sweet,Rojas et al. (2017),ClCC1OC(C(C1O)O)(CCl)OC1OC(CO)C(C(C1O)O)Cl,ClCC1OC(C(C1O)O)(CCl)OC1OC(CO)C(C(C1O)O)Cl,False
2,Alitame,Sweet,Rojas et al. (2017),CC(COCNC1C(C)(C)SC1(C)C)NCOCC(CC(=O)O)N,CC(COCNC1C(C)(C)SC1(C)C)NCOCC(CC(=O)O)N,False
3,Aspartame/Aspartyl-phenylalanine methylester,Sweet,Rojas et al. (2017),COC(=O)C(NC(=O)C(CC(=O)O)N)Cc1ccccc1,COC(=O)C(NC(=O)C(CC(=O)O)N)Cc1ccccc1,False
4,Tagatose,Sweet,Rojas et al. (2017),OCC1(O)OCC(C(C1O)O)O,OCC1(O)OCC(C(C1O)O)O,False
...,...,...,...,...,...,...
2252,6-Methyl-2-pyridinemethanol,Bitter,The Good Scents Company Database,CC1=NC(=CC=C1)CO,OCc1cccc(n1)C,True
2253,4-hydroxybenzyl alcohol,Bitter,The Good Scents Company Database,C1=CC(=CC=C1CO)O,OCc1ccc(cc1)O,True
2254,4-Benzoylpyridine,Bitter,The Good Scents Company Database,C1=CC=C(C=C1)C(=O)C2=CC=NC=C2,O=C(c1ccncc1)c1ccccc1,True
2255,4-(5-Methyl-2-furyl)-2-butanone,Bitter,The Good Scents Company Database,CC1=CC=C(O1)CCC(=O)C,CC(=O)CCc1ccc(o1)C,True


In [None]:
df_Bitter_Train['Name'] = temp_df['Name']
df_Bitter_Train['Taste'] = temp_df['Taste']
df_Bitter_Train['Bitter'] = temp_df['Bitter']

In [None]:
df_Bitter_Train

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,VE1_A,VE2_A,VE3_A,VR1_A,VR2_A,VR3_A,nAromAtom,nAromBond,nAtom,nHeavyAtom,nSpiro,nBridgehead,nHetero,nH,nB,nC,nN,nO,nS,nP,nF,nCl,nBr,nI,nX,ATS0dv,ATS1dv,ATS2dv,ATS3dv,ATS4dv,...,JGI10,JGT10,Diameter,Radius,TopoShapeIndex,PetitjeanIndex,Vabc,VAdjMat,MWC01,MWC02,MWC03,MWC04,MWC05,MWC06,MWC07,MWC08,MWC09,MWC10,TMWC10,SRW02,SRW03,SRW04,SRW05,SRW06,SRW07,SRW08,SRW09,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2,Name,Taste,Bitter
0,17.213262,16.059815,0,0,28.766735,2.52242,4.95823,28.766735,1.250728,4.052767,4.072546,0.177067,2.237177,157.468009,6.846435,5.892131,0,0,45,23,0,0,11,22,0,12,0,11,0,0,0,0,0,0,0,408.000000,311.000000,532.000000,760.000000,588.000000,...,0.000000,0.628961,10,5,1.0,0.5,288.08786,5.584963,24.0,4.795791,5.686975,6.602588,7.513709,8.434464,9.350189,10.272738,11.190528,12.113908,122.960890,3.891820,0.0,5.262690,2.397895,6.849066,4.94876,8.525360,7.195187,10.247042,72.317821,342.116212,7.602582,1110,43,120.0,147.0,10.451389,5.291667,Sucrose,Sweet,False
1,17.213262,16.059815,0,0,28.766735,2.52242,4.95823,28.766735,1.250728,4.052767,4.072546,0.177067,2.237177,157.468009,6.846435,5.892131,0,0,42,23,0,0,11,19,0,12,0,8,0,0,0,3,0,0,3,334.814815,281.444444,477.111111,591.111111,423.333333,...,0.000000,0.628961,10,5,1.0,0.5,307.350383,5.584963,24.0,4.795791,5.686975,6.602588,7.513709,8.434464,9.350189,10.272738,11.190528,12.113908,122.960890,3.891820,0.0,5.262690,2.397895,6.849066,4.94876,8.525360,7.195187,10.247042,72.317821,396.014551,9.428918,1110,43,120.0,147.0,10.451389,5.291667,"Sucralose / 4,1',6'-Trichloro-galactosucrose",Sweet,False
2,17.920577,15.130873,1,3,27.11314,2.54412,5.088241,27.11314,1.129714,4.067565,2.97276,0.123865,1.96496,30261.323793,1260.888491,11.193094,0,0,57,24,0,0,8,33,0,16,3,4,1,0,0,0,0,0,0,274.444444,215.333333,264.666667,233.666667,226.333333,...,0.007943,0.659261,15,8,0.875,0.466667,356.959519,5.584963,24.0,4.762174,5.549076,6.444131,7.257708,8.172164,9.003685,9.935035,10.780060,11.725590,121.629623,3.891820,0.0,5.262690,0.000000,6.829794,0.00000,8.509363,0.000000,10.256922,58.750589,363.219178,6.372266,1807,27,116.0,128.0,11.069444,5.250000,Alitame,Sweet,False
3,15.315311,13.970681,1,1,25.367667,2.287533,4.575067,25.367667,1.207984,3.915999,4.057263,0.193203,2.142446,121.014646,5.762602,5.537849,6,6,39,21,0,0,7,18,0,14,2,5,0,0,0,0,0,0,0,330.000000,271.000000,348.000000,335.000000,376.000000,...,0.007448,0.489651,11,6,0.833333,0.454545,277.569695,5.392317,21.0,4.574711,5.351858,6.175867,6.978214,7.805882,8.618485,9.447229,10.264966,11.093995,112.311208,3.761200,0.0,5.017280,0.000000,6.447306,0.00000,7.951207,0.000000,9.493487,53.670480,294.121572,7.541579,1000,27,96.0,105.0,8.916667,4.888889,Aspartame/Aspartyl-phenylalanine methylester,Sweet,False
4,8.829880,8.715591,0,0,14.640475,2.418289,4.836579,14.640475,1.22004,3.392014,3.170872,0.264239,1.336328,42.564698,3.547058,3.933347,0,0,24,12,0,0,6,12,0,6,0,6,0,0,0,0,0,0,0,212.000000,155.000000,254.000000,346.000000,191.000000,...,0.000000,0.664926,6,3,1.0,0.5,152.717273,4.584963,12.0,4.110874,4.962845,5.849325,6.722630,7.610358,8.487764,9.375092,10.253827,11.140687,92.513402,3.218876,0.0,4.574711,0.000000,6.163315,0.00000,7.845024,0.000000,9.569063,43.370989,180.063388,7.502641,178,20,60.0,71.0,6.145833,2.722222,Tagatose,Sweet,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2228,6.473351,6.191587,0,0,11.142106,2.200122,4.400244,11.142106,1.238012,3.089776,2.850317,0.316702,0.94207,28.079758,3.119973,3.229688,6,6,18,9,0,0,2,9,0,7,1,1,0,0,0,0,0,0,0,114.000000,104.000000,123.000000,100.000000,46.000000,...,0.000000,0.438078,5,3,0.666667,0.4,118.249507,4.169925,9.0,3.713572,4.465908,5.262690,6.033086,6.834109,7.608374,8.409831,9.185125,9.986495,79.499191,2.944439,0.0,4.143135,0.000000,5.556828,0.00000,7.062192,0.000000,8.606851,37.313444,123.068414,6.837134,88,9,40.0,43.0,3.472222,2.166667,2-[2-(methylsulfanyl)ethyl]pyridine,Bitter,True
2229,6.473351,6.127583,0,0,11.189957,2.193993,4.387987,11.189957,1.243329,3.089765,2.856388,0.317376,0.944197,28.049653,3.116628,3.228616,6,6,17,9,0,0,2,8,0,7,0,2,0,0,0,0,0,0,0,122.000000,104.000000,128.000000,106.000000,58.000000,...,0.000000,0.475391,6,3,1.0,0.5,116.042974,4.169925,9.0,3.713572,4.465908,5.252273,6.028279,6.816736,7.598399,8.386401,9.169831,9.957265,79.388664,2.944439,0.0,4.143135,0.000000,5.556828,0.00000,7.055313,0.000000,8.590258,37.289972,124.052429,7.297202,90,9,40.0,43.0,3.472222,2.166667,TERPIN,Bitter,True
2230,10.635111,9.197041,0,0,18.814625,2.30725,4.614501,18.814625,1.343902,3.558646,3.48721,0.249086,1.585574,63.60762,4.543401,4.489206,12,12,23,14,0,0,2,9,0,12,1,1,0,0,0,0,0,0,0,190.000000,188.000000,244.000000,236.000000,188.000000,...,0.000000,0.279611,8,4,1.0,0.5,170.927139,4.906891,15.0,4.234107,5.043425,5.866468,6.693324,7.524561,8.356790,9.191056,10.025307,10.860786,96.795823,3.433987,0.0,4.672829,0.000000,6.137727,0.00000,7.690286,0.000000,9.282754,45.217583,183.068414,7.959496,307,18,68.0,77.0,3.833333,3.222222,MAGNESIUM SULFATE,Bitter,True
2231,8.106344,7.838945,0,0,13.165395,2.242743,4.296119,13.165395,1.196854,3.28912,2.958768,0.268979,1.180083,49.082458,4.462042,3.988812,5,5,23,11,0,0,2,12,0,9,0,2,0,0,0,0,0,0,0,148.000000,133.000000,139.000000,69.000000,80.000000,...,0.000000,0.513992,7,4,0.75,0.428571,150.634944,4.459432,11.0,3.931826,4.672829,5.484797,6.255750,7.069023,7.854381,8.666130,9.459619,10.269900,85.664256,3.135494,0.0,4.369448,2.397895,5.739793,4.59512,7.172425,6.529419,8.645235,53.584828,152.083730,6.612336,173,9,50.0,53.0,4.583333,2.500000,Quinine,Bitter,True


In [None]:
df_Bitter_Train.to_csv('bitter_Train_New.csv', index=False)

## **Generating Features for Bitter Test:**

In [None]:
mols = [Chem.MolFromSmiles(smi) for smi in bitter_test['SMILES']]

In [None]:
type(mols)

list

In [None]:
mols

[<rdkit.Chem.rdchem.Mol at 0x7fd109e489e0>,
 <rdkit.Chem.rdchem.Mol at 0x7fd109e48990>,
 <rdkit.Chem.rdchem.Mol at 0x7fd109e48940>,
 <rdkit.Chem.rdchem.Mol at 0x7fd109e48a30>,
 <rdkit.Chem.rdchem.Mol at 0x7fd109e48a80>,
 <rdkit.Chem.rdchem.Mol at 0x7fd109e48ad0>,
 <rdkit.Chem.rdchem.Mol at 0x7fd109e48b20>,
 <rdkit.Chem.rdchem.Mol at 0x7fd109e48b70>,
 <rdkit.Chem.rdchem.Mol at 0x7fd109e48bc0>,
 <rdkit.Chem.rdchem.Mol at 0x7fd109e48c10>,
 <rdkit.Chem.rdchem.Mol at 0x7fd109e48c60>,
 <rdkit.Chem.rdchem.Mol at 0x7fd109e48cb0>,
 <rdkit.Chem.rdchem.Mol at 0x7fd109e48d00>,
 <rdkit.Chem.rdchem.Mol at 0x7fd109e48d50>,
 <rdkit.Chem.rdchem.Mol at 0x7fd109e48da0>,
 <rdkit.Chem.rdchem.Mol at 0x7fd109e48df0>,
 <rdkit.Chem.rdchem.Mol at 0x7fd109e48e40>,
 <rdkit.Chem.rdchem.Mol at 0x7fd109e48e90>,
 <rdkit.Chem.rdchem.Mol at 0x7fd109e48ee0>,
 <rdkit.Chem.rdchem.Mol at 0x7fd109e48f30>,
 <rdkit.Chem.rdchem.Mol at 0x7fd109e48f80>,
 <rdkit.Chem.rdchem.Mol at 0x7fd109dd9030>,
 <rdkit.Chem.rdchem.Mol at 0x7fd

In [None]:
noneIndices = []
withoutNoneIndices = []
molesWithoutNone = []
for i in range(len(mols)):
  if mols[i]==None:
    noneIndices.append(i)
  else:
    molesWithoutNone.append(mols[i])
    withoutNoneIndices.append(i)

In [None]:
noneIndices

[]

In [None]:
df_Bitter_Test = calc2d.pandas(molesWithoutNone)

 55%|█████▍    | 94/171 [01:14<00:55,  1.38it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 171/171 [01:52<00:00,  1.52it/s]


In [None]:
df_Bitter_Test

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,VE1_A,VE2_A,VE3_A,VR1_A,VR2_A,VR3_A,nAromAtom,nAromBond,nAtom,nHeavyAtom,nSpiro,nBridgehead,nHetero,nH,nB,nC,nN,nO,nS,nP,nF,nCl,nBr,nI,nX,ATS0dv,ATS1dv,ATS2dv,ATS3dv,ATS4dv,...,JGI7,JGI8,JGI9,JGI10,JGT10,Diameter,Radius,TopoShapeIndex,PetitjeanIndex,Vabc,VAdjMat,MWC01,MWC02,MWC03,MWC04,MWC05,MWC06,MWC07,MWC08,MWC09,MWC10,TMWC10,SRW02,SRW03,SRW04,SRW05,SRW06,SRW07,SRW08,SRW09,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,8.829880,8.715591,0,0,14.640475,2.418289,4.836579,14.640475,1.220040,3.392014,3.170872,0.264239,1.336328,42.564698,3.547058,3.933347,0,0,24,12,0,0,6,12,0,6,0,6,0,0,0,0,0,0,0,212.0,155.0,254.0,346.0,191.0,...,0.000000,0.000000,0.000000,0.000000,0.664926,6,3,1.000000,0.500000,152.717273,4.584963,12.0,4.110874,4.962845,5.849325,6.722630,7.610358,8.487764,9.375092,10.253827,11.140687,92.513402,3.218876,0.0,4.574711,0.0,6.163315,0.0,7.845024,0.0,9.569063,43.370989,180.063388,7.502641,178,20,60.0,71.0,6.145833,2.722222
1,8.761080,8.651650,0,0,14.708146,2.377683,4.755367,14.708146,1.225679,3.384121,3.178505,0.264875,1.338733,43.594643,3.632887,3.957256,0,0,24,12,0,0,6,12,0,6,0,6,0,0,0,0,0,0,0,210.0,148.0,240.0,328.0,206.0,...,0.000000,0.000000,0.000000,0.000000,0.549705,6,4,0.500000,0.333333,152.717273,4.584963,12.0,4.077537,4.919981,5.783825,6.645091,7.511525,8.375860,9.242711,10.107774,10.974643,91.638947,3.218876,0.0,4.532599,0.0,6.070738,0.0,7.709308,0.0,9.396405,42.927926,180.063388,7.502641,182,19,58.0,68.0,6.055556,2.777778
2,8.761080,8.651650,0,0,14.708146,2.377683,4.755367,14.708146,1.225679,3.384121,3.178505,0.264875,1.338733,43.594643,3.632887,3.957256,0,0,24,12,0,0,6,12,0,6,0,6,0,0,0,0,0,0,0,210.0,148.0,240.0,328.0,206.0,...,0.000000,0.000000,0.000000,0.000000,0.549705,6,4,0.500000,0.333333,152.717273,4.584963,12.0,4.077537,4.919981,5.783825,6.645091,7.511525,8.375860,9.242711,10.107774,10.974643,91.638947,3.218876,0.0,4.532599,0.0,6.070738,0.0,7.709308,0.0,9.396405,42.927926,180.063388,7.502641,182,19,58.0,68.0,6.055556,2.777778
3,8.163363,8.029752,0,0,13.137460,2.364871,4.729742,13.137460,1.194315,3.303305,3.092493,0.281136,1.224288,35.727761,3.247978,3.671238,0,0,23,11,0,0,5,12,0,6,0,5,0,0,0,0,0,0,0,182.0,135.0,216.0,272.0,138.0,...,0.000000,0.000000,0.000000,0.000000,0.560975,5,3,0.666667,0.400000,143.927046,4.459432,11.0,4.007333,4.844187,5.707110,6.561031,7.424762,8.281724,9.145375,10.003107,10.866605,88.841233,3.135494,0.0,4.465908,0.0,6.008813,0.0,7.647309,0.0,9.331052,41.588577,164.068473,7.133412,140,17,54.0,63.0,5.805556,2.444444
4,6.611250,7.282959,0,0,11.763639,2.188901,4.377802,11.763639,1.176364,3.134263,2.855225,0.285523,1.049151,30.472997,3.047300,3.416841,0,0,20,10,0,0,5,10,0,5,0,5,0,0,0,0,0,0,0,178.0,109.0,149.0,212.0,141.0,...,0.000000,0.000000,0.000000,0.000000,0.545783,6,3,1.000000,0.500000,136.351061,4.169925,9.0,3.713572,4.465908,5.241747,6.018593,6.801283,7.582229,8.366603,9.148359,9.933095,80.271389,2.944439,0.0,4.143135,0.0,5.556828,0.0,7.048386,0.0,8.575273,38.268062,150.052823,7.502641,125,12,40.0,43.0,5.833333,2.555556
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166,19.981978,16.397806,0,0,33.251113,2.526538,5.053076,33.251113,1.278889,4.186542,4.120980,0.158499,2.371602,312.886434,12.034094,6.701352,12,12,44,26,0,0,8,18,0,18,0,8,0,0,0,0,0,0,0,450.0,417.0,627.0,806.0,708.0,...,0.017778,0.012689,0.011776,0.009632,0.516379,13,7,0.857143,0.461538,312.881419,5.807355,28.0,4.934474,5.820083,6.727432,7.630947,8.542666,9.452737,10.367253,11.281107,12.197904,130.954603,4.043051,0.0,5.398163,0.0,6.970730,0.0,8.646993,0.0,10.378385,61.437322,362.100168,8.229549,1633,49,138.0,168.0,10.222222,5.833333
167,15.913028,13.077290,1,0,26.247606,2.316776,4.633552,26.247606,1.249886,3.945703,3.825724,0.182177,2.083685,176.194516,8.390215,5.913526,12,12,33,21,0,0,5,12,0,16,1,4,0,0,0,0,0,0,0,348.0,310.0,424.0,398.0,383.0,...,0.015972,0.007716,0.007500,0.008617,0.447955,12,6,1.000000,0.500000,259.890613,5.459432,22.0,4.634729,5.433722,6.270988,7.089243,7.926964,8.751000,9.588571,10.414903,11.252664,114.362784,3.806662,0.0,5.081404,0.0,6.541030,0.0,8.083637,0.0,9.668714,54.181448,282.077181,8.547793,1064,28,102.0,114.0,7.416667,4.722222
168,17.884519,14.823609,1,0,30.336232,2.328747,4.657494,30.336232,1.264010,4.074059,4.563638,0.190152,2.393589,132.574159,5.523923,5.762611,12,12,40,24,0,0,6,16,0,18,1,5,0,0,0,0,0,0,0,404.0,357.0,493.0,502.0,451.0,...,0.014254,0.008642,0.007625,0.008499,0.452254,13,7,0.857143,0.461538,303.272810,5.643856,25.0,4.762174,5.579730,6.428105,7.263330,8.113427,8.952735,9.802617,10.643208,11.492753,122.038079,3.931826,0.0,5.209486,0.0,6.685861,0.0,8.252967,0.0,9.866357,57.946497,326.103396,8.152585,1535,35,116.0,132.0,8.777778,5.583333
169,15.913028,13.206444,1,0,26.180249,2.317144,4.634288,26.180249,1.246679,3.945707,3.854448,0.183545,2.091165,167.692428,7.985354,5.864069,12,12,33,21,0,0,5,12,0,16,1,4,0,0,0,0,0,0,0,348.0,310.0,425.0,405.0,393.0,...,0.011068,0.008802,0.007578,0.008605,0.443369,11,6,0.833333,0.454545,259.890613,5.459432,22.0,4.634729,5.433722,6.274762,7.090910,7.933438,8.754792,9.597030,10.420822,11.262732,114.402937,3.806662,0.0,5.081404,0.0,6.541030,0.0,8.086103,0.0,9.674389,54.189588,282.077181,8.547793,1050,28,102.0,114.0,7.416667,4.722222


In [None]:
temp_df = bitter_test.iloc[withoutNoneIndices]

In [None]:
temp_df

Unnamed: 0,Name,Taste,Reference,SMILES,Canonical SMILES,In Bitter Domain,Bitter
0,D-Fructose,Non-bitter,Wiener et al. (2017) - Phyto-Dictionary,OC[C@]1(O)[C@@H](O)[C@H](O)[C@@H](CO1)O,OC[C@@]1(O)OC[C@H]([C@H]([C@@H]1O)O)O,1.0,False
1,D-Glucose,Non-bitter,Wiener et al. (2017) - Phyto-Dictionary,OC[C@@H]1[C@@H](O)[C@H](O)[C@@H](O)[C@H](O1)O,OC[C@H]1O[C@H](O)[C@@H]([C@H]([C@@H]1O)O)O,1.0,False
2,D-Mannose,Non-bitter,Wiener et al. (2017) - Phyto-Dictionary,OC[C@@H]1[C@@H](O)[C@H](O)[C@H](O)[C@H](O1)O,OC[C@H]1O[C@H](O)[C@H]([C@H]([C@@H]1O)O)O,1.0,False
3,L-Rhamnose,Non-bitter,Wiener et al. (2017) - Phyto-Dictionary,C[C@H]1[C@H](O)[C@@H](O)[C@@H](O)[C@H](O1)O,O[C@H]1[C@H](C)O[C@@H]([C@@H]([C@@H]1O)O)O,1.0,False
4,D-Ribulose,Non-bitter,Wiener et al. (2017) - Phyto-Dictionary,OCC(=O)[C@H](O)[C@H](O)CO,OC[C@H]([C@H](C(=O)CO)O)O,1.0,False
...,...,...,...,...,...,...,...
166,,Bitter,Wiener et al. (2017) - UNIMI,c1cc(O)c(O)cc1C[C@@H](C2=O)COc(c23)c(OC)c(OC)c...,COc1c2OC[C@H](C(=O)c2c(c(c1OC)O)O)Cc1ccc(c(c1)O)O,1.0,True
167,,Bitter,Wiener et al. (2017) - UNIMI,c1cccc(c1C([O-])=O)NC(=O)/C=C/c2ccc(O)cc2,O=C(Nc1ccccc1C(=O)[O-])/C=C/c1ccc(cc1)O,1.0,True
168,,Bitter,Wiener et al. (2017) - UNIMI,c1cccc(c1C([O-])=O)NC(=O)/C=C/c2cc(OC)c(cc2)OC,COc1cc(/C=C/C(=O)Nc2ccccc2C(=O)[O-])ccc1OC,1.0,True
169,,Bitter,Wiener et al. (2017) - UNIMI,c1cccc(c1C([O-])=O)NC(=O)/C=C/c2cc(O)ccc2,O=C(Nc1ccccc1C(=O)[O-])/C=C/c1cccc(c1)O,1.0,True


In [None]:
df_Bitter_Test['Name'] = temp_df['Name']
df_Bitter_Test['Taste'] = temp_df['Taste']
df_Bitter_Test['Bitter'] = temp_df['Bitter']

In [None]:
df_Bitter_Test

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,VE1_A,VE2_A,VE3_A,VR1_A,VR2_A,VR3_A,nAromAtom,nAromBond,nAtom,nHeavyAtom,nSpiro,nBridgehead,nHetero,nH,nB,nC,nN,nO,nS,nP,nF,nCl,nBr,nI,nX,ATS0dv,ATS1dv,ATS2dv,ATS3dv,ATS4dv,...,JGI10,JGT10,Diameter,Radius,TopoShapeIndex,PetitjeanIndex,Vabc,VAdjMat,MWC01,MWC02,MWC03,MWC04,MWC05,MWC06,MWC07,MWC08,MWC09,MWC10,TMWC10,SRW02,SRW03,SRW04,SRW05,SRW06,SRW07,SRW08,SRW09,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2,Name,Taste,Bitter
0,8.829880,8.715591,0,0,14.640475,2.418289,4.836579,14.640475,1.220040,3.392014,3.170872,0.264239,1.336328,42.564698,3.547058,3.933347,0,0,24,12,0,0,6,12,0,6,0,6,0,0,0,0,0,0,0,212.0,155.0,254.0,346.0,191.0,...,0.000000,0.664926,6,3,1.000000,0.500000,152.717273,4.584963,12.0,4.110874,4.962845,5.849325,6.722630,7.610358,8.487764,9.375092,10.253827,11.140687,92.513402,3.218876,0.0,4.574711,0.0,6.163315,0.0,7.845024,0.0,9.569063,43.370989,180.063388,7.502641,178,20,60.0,71.0,6.145833,2.722222,D-Fructose,Non-bitter,False
1,8.761080,8.651650,0,0,14.708146,2.377683,4.755367,14.708146,1.225679,3.384121,3.178505,0.264875,1.338733,43.594643,3.632887,3.957256,0,0,24,12,0,0,6,12,0,6,0,6,0,0,0,0,0,0,0,210.0,148.0,240.0,328.0,206.0,...,0.000000,0.549705,6,4,0.500000,0.333333,152.717273,4.584963,12.0,4.077537,4.919981,5.783825,6.645091,7.511525,8.375860,9.242711,10.107774,10.974643,91.638947,3.218876,0.0,4.532599,0.0,6.070738,0.0,7.709308,0.0,9.396405,42.927926,180.063388,7.502641,182,19,58.0,68.0,6.055556,2.777778,D-Glucose,Non-bitter,False
2,8.761080,8.651650,0,0,14.708146,2.377683,4.755367,14.708146,1.225679,3.384121,3.178505,0.264875,1.338733,43.594643,3.632887,3.957256,0,0,24,12,0,0,6,12,0,6,0,6,0,0,0,0,0,0,0,210.0,148.0,240.0,328.0,206.0,...,0.000000,0.549705,6,4,0.500000,0.333333,152.717273,4.584963,12.0,4.077537,4.919981,5.783825,6.645091,7.511525,8.375860,9.242711,10.107774,10.974643,91.638947,3.218876,0.0,4.532599,0.0,6.070738,0.0,7.709308,0.0,9.396405,42.927926,180.063388,7.502641,182,19,58.0,68.0,6.055556,2.777778,D-Mannose,Non-bitter,False
3,8.163363,8.029752,0,0,13.137460,2.364871,4.729742,13.137460,1.194315,3.303305,3.092493,0.281136,1.224288,35.727761,3.247978,3.671238,0,0,23,11,0,0,5,12,0,6,0,5,0,0,0,0,0,0,0,182.0,135.0,216.0,272.0,138.0,...,0.000000,0.560975,5,3,0.666667,0.400000,143.927046,4.459432,11.0,4.007333,4.844187,5.707110,6.561031,7.424762,8.281724,9.145375,10.003107,10.866605,88.841233,3.135494,0.0,4.465908,0.0,6.008813,0.0,7.647309,0.0,9.331052,41.588577,164.068473,7.133412,140,17,54.0,63.0,5.805556,2.444444,L-Rhamnose,Non-bitter,False
4,6.611250,7.282959,0,0,11.763639,2.188901,4.377802,11.763639,1.176364,3.134263,2.855225,0.285523,1.049151,30.472997,3.047300,3.416841,0,0,20,10,0,0,5,10,0,5,0,5,0,0,0,0,0,0,0,178.0,109.0,149.0,212.0,141.0,...,0.000000,0.545783,6,3,1.000000,0.500000,136.351061,4.169925,9.0,3.713572,4.465908,5.241747,6.018593,6.801283,7.582229,8.366603,9.148359,9.933095,80.271389,2.944439,0.0,4.143135,0.0,5.556828,0.0,7.048386,0.0,8.575273,38.268062,150.052823,7.502641,125,12,40.0,43.0,5.833333,2.555556,D-Ribulose,Non-bitter,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166,19.981978,16.397806,0,0,33.251113,2.526538,5.053076,33.251113,1.278889,4.186542,4.120980,0.158499,2.371602,312.886434,12.034094,6.701352,12,12,44,26,0,0,8,18,0,18,0,8,0,0,0,0,0,0,0,450.0,417.0,627.0,806.0,708.0,...,0.009632,0.516379,13,7,0.857143,0.461538,312.881419,5.807355,28.0,4.934474,5.820083,6.727432,7.630947,8.542666,9.452737,10.367253,11.281107,12.197904,130.954603,4.043051,0.0,5.398163,0.0,6.970730,0.0,8.646993,0.0,10.378385,61.437322,362.100168,8.229549,1633,49,138.0,168.0,10.222222,5.833333,,Bitter,True
167,15.913028,13.077290,1,0,26.247606,2.316776,4.633552,26.247606,1.249886,3.945703,3.825724,0.182177,2.083685,176.194516,8.390215,5.913526,12,12,33,21,0,0,5,12,0,16,1,4,0,0,0,0,0,0,0,348.0,310.0,424.0,398.0,383.0,...,0.008617,0.447955,12,6,1.000000,0.500000,259.890613,5.459432,22.0,4.634729,5.433722,6.270988,7.089243,7.926964,8.751000,9.588571,10.414903,11.252664,114.362784,3.806662,0.0,5.081404,0.0,6.541030,0.0,8.083637,0.0,9.668714,54.181448,282.077181,8.547793,1064,28,102.0,114.0,7.416667,4.722222,,Bitter,True
168,17.884519,14.823609,1,0,30.336232,2.328747,4.657494,30.336232,1.264010,4.074059,4.563638,0.190152,2.393589,132.574159,5.523923,5.762611,12,12,40,24,0,0,6,16,0,18,1,5,0,0,0,0,0,0,0,404.0,357.0,493.0,502.0,451.0,...,0.008499,0.452254,13,7,0.857143,0.461538,303.272810,5.643856,25.0,4.762174,5.579730,6.428105,7.263330,8.113427,8.952735,9.802617,10.643208,11.492753,122.038079,3.931826,0.0,5.209486,0.0,6.685861,0.0,8.252967,0.0,9.866357,57.946497,326.103396,8.152585,1535,35,116.0,132.0,8.777778,5.583333,,Bitter,True
169,15.913028,13.206444,1,0,26.180249,2.317144,4.634288,26.180249,1.246679,3.945707,3.854448,0.183545,2.091165,167.692428,7.985354,5.864069,12,12,33,21,0,0,5,12,0,16,1,4,0,0,0,0,0,0,0,348.0,310.0,425.0,405.0,393.0,...,0.008605,0.443369,11,6,0.833333,0.454545,259.890613,5.459432,22.0,4.634729,5.433722,6.274762,7.090910,7.933438,8.754792,9.597030,10.420822,11.262732,114.402937,3.806662,0.0,5.081404,0.0,6.541030,0.0,8.086103,0.0,9.674389,54.189588,282.077181,8.547793,1050,28,102.0,114.0,7.416667,4.722222,,Bitter,True


In [None]:
df_Bitter_Test.to_csv('bitter_Test_New.csv',index=False)

## **Generating Features for Sweet Train:**

In [None]:
mols = [Chem.MolFromSmiles(smi) for smi in sweet_train['SMILES']]

In [None]:
type(mols)

list

In [None]:
mols

[<rdkit.Chem.rdchem.Mol at 0x7fa284d9cd00>,
 <rdkit.Chem.rdchem.Mol at 0x7fa284d9cd50>,
 <rdkit.Chem.rdchem.Mol at 0x7fa284d9cda0>,
 <rdkit.Chem.rdchem.Mol at 0x7fa284d9ccb0>,
 <rdkit.Chem.rdchem.Mol at 0x7fa284d9cc60>,
 <rdkit.Chem.rdchem.Mol at 0x7fa284d9cc10>,
 <rdkit.Chem.rdchem.Mol at 0x7fa284d9cbc0>,
 <rdkit.Chem.rdchem.Mol at 0x7fa284d9cb70>,
 <rdkit.Chem.rdchem.Mol at 0x7fa284d9cb20>,
 <rdkit.Chem.rdchem.Mol at 0x7fa284d9cad0>,
 <rdkit.Chem.rdchem.Mol at 0x7fa284d9ca80>,
 <rdkit.Chem.rdchem.Mol at 0x7fa284d9ca30>,
 <rdkit.Chem.rdchem.Mol at 0x7fa284d9c9e0>,
 <rdkit.Chem.rdchem.Mol at 0x7fa284d9c990>,
 <rdkit.Chem.rdchem.Mol at 0x7fa284d9c940>,
 <rdkit.Chem.rdchem.Mol at 0x7fa284d9c8f0>,
 <rdkit.Chem.rdchem.Mol at 0x7fa284d9c8a0>,
 <rdkit.Chem.rdchem.Mol at 0x7fa284d9c850>,
 <rdkit.Chem.rdchem.Mol at 0x7fa284d9c800>,
 <rdkit.Chem.rdchem.Mol at 0x7fa284d9c7b0>,
 <rdkit.Chem.rdchem.Mol at 0x7fa284d9c760>,
 <rdkit.Chem.rdchem.Mol at 0x7fa284d9c710>,
 <rdkit.Chem.rdchem.Mol at 0x7fa

In [None]:
noneIndices = []
withoutNoneIndices = []
molesWithoutNone = []
for i in range(len(mols)):
  if mols[i]==None:
    noneIndices.append(i)
  else:
    molesWithoutNone.append(mols[i])
    withoutNoneIndices.append(i)

In [None]:
noneIndices

[300,
 301,
 302,
 303,
 304,
 305,
 306,
 307,
 308,
 309,
 310,
 462,
 470,
 471,
 480,
 482,
 1568,
 1681,
 1916]

In [None]:
df_Sweet_Train = calc2d.pandas(molesWithoutNone)

  1%|          | 18/2186 [00:18<58:08,  1.61s/it]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 36%|███▋      | 797/2186 [05:47<36:47,  1.59s/it]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 2186/2186 [16:40<00:00,  2.19it/s]


In [None]:
df_Sweet_Train

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,VE1_A,VE2_A,VE3_A,VR1_A,VR2_A,VR3_A,nAromAtom,nAromBond,nAtom,nHeavyAtom,nSpiro,nBridgehead,nHetero,nH,nB,nC,nN,nO,nS,nP,nF,nCl,nBr,nI,nX,ATS0dv,ATS1dv,ATS2dv,ATS3dv,ATS4dv,...,JGI7,JGI8,JGI9,JGI10,JGT10,Diameter,Radius,TopoShapeIndex,PetitjeanIndex,Vabc,VAdjMat,MWC01,MWC02,MWC03,MWC04,MWC05,MWC06,MWC07,MWC08,MWC09,MWC10,TMWC10,SRW02,SRW03,SRW04,SRW05,SRW06,SRW07,SRW08,SRW09,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,17.213262,16.059815,0,0,28.766735,2.52242,4.95823,28.766735,1.250728,4.052767,4.072546,0.177067,2.237177,157.468009,6.846435,5.892131,0,0,45,23,0,0,11,22,0,12,0,11,0,0,0,0,0,0,0,408.000000,311.000000,532.000000,760.000000,588.000000,...,0.020266,0.014066,0.007946,0.000000,0.628961,10,5,1.0,0.5,288.08786,5.584963,24.0,4.795791,5.686975,6.602588,7.513709,8.434464,9.350189,10.272738,11.190528,12.113908,122.960890,3.891820,0.0,5.262690,2.397895,6.849066,4.94876,8.525360,7.195187,10.247042,72.317821,342.116212,7.602582,1110,43,120.0,147.0,10.451389,5.291667
1,17.213262,16.059815,0,0,28.766735,2.52242,4.95823,28.766735,1.250728,4.052767,4.072546,0.177067,2.237177,157.468009,6.846435,5.892131,0,0,42,23,0,0,11,19,0,12,0,8,0,0,0,3,0,0,3,334.814815,281.444444,477.111111,591.111111,423.333333,...,0.020266,0.014066,0.007946,0.000000,0.628961,10,5,1.0,0.5,307.350383,5.584963,24.0,4.795791,5.686975,6.602588,7.513709,8.434464,9.350189,10.272738,11.190528,12.113908,122.960890,3.891820,0.0,5.262690,2.397895,6.849066,4.94876,8.525360,7.195187,10.247042,72.317821,396.014551,9.428918,1110,43,120.0,147.0,10.451389,5.291667
2,15.315311,13.970681,1,1,25.367667,2.287533,4.575067,25.367667,1.207984,3.915999,4.057263,0.193203,2.142446,121.014646,5.762602,5.537849,6,6,39,21,0,0,7,18,0,14,2,5,0,0,0,0,0,0,0,330.000000,271.000000,348.000000,335.000000,376.000000,...,0.014757,0.012078,0.008125,0.007448,0.489651,11,6,0.833333,0.454545,277.569695,5.392317,21.0,4.574711,5.351858,6.175867,6.978214,7.805882,8.618485,9.447229,10.264966,11.093995,112.311208,3.761200,0.0,5.017280,0.000000,6.447306,0.00000,7.951207,0.000000,9.493487,53.670480,294.121572,7.541579,1000,27,96.0,105.0,8.916667,4.888889
3,8.829880,8.715591,0,0,14.640475,2.418289,4.836579,14.640475,1.22004,3.392014,3.170872,0.264239,1.336328,42.564698,3.547058,3.933347,0,0,24,12,0,0,6,12,0,6,0,6,0,0,0,0,0,0,0,212.000000,155.000000,254.000000,346.000000,191.000000,...,0.000000,0.000000,0.000000,0.000000,0.664926,6,3,1.0,0.5,152.717273,4.584963,12.0,4.110874,4.962845,5.849325,6.722630,7.610358,8.487764,9.375092,10.253827,11.140687,92.513402,3.218876,0.0,4.574711,0.000000,6.163315,0.00000,7.845024,0.000000,9.569063,43.370989,180.063388,7.502641,178,20,60.0,71.0,6.145833,2.722222
4,17.372180,15.638061,0,0,28.650077,2.478404,4.876167,28.650077,1.245656,4.0524,3.877075,0.168568,2.18799,200.516278,8.718099,6.133805,0,0,45,23,0,0,11,22,0,12,0,11,0,0,0,0,0,0,0,408.000000,309.000000,518.000000,711.000000,478.000000,...,0.021089,0.017343,0.013681,0.005045,0.636724,11,6,0.833333,0.454545,288.08786,5.584963,24.0,4.795791,5.673323,6.575076,7.467942,8.371242,9.267193,10.170725,11.068184,11.971710,122.361186,3.891820,0.0,5.262690,2.397895,6.836259,4.94876,8.493105,7.181592,10.192007,72.204129,342.116212,7.602582,1220,41,120.0,145.0,10.451389,5.166667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2181,6.473351,6.191587,0,0,11.142106,2.200122,4.400244,11.142106,1.238012,3.089776,2.850317,0.316702,0.94207,28.079758,3.119973,3.229688,6,6,18,9,0,0,2,9,0,7,1,1,0,0,0,0,0,0,0,114.000000,104.000000,123.000000,100.000000,46.000000,...,0.000000,0.000000,0.000000,0.000000,0.438078,5,3,0.666667,0.4,118.249507,4.169925,9.0,3.713572,4.465908,5.262690,6.033086,6.834109,7.608374,8.409831,9.185125,9.986495,79.499191,2.944439,0.0,4.143135,0.000000,5.556828,0.00000,7.062192,0.000000,8.606851,37.313444,123.068414,6.837134,88,9,40.0,43.0,3.472222,2.166667
2182,6.473351,6.127583,0,0,11.189957,2.193993,4.387987,11.189957,1.243329,3.089765,2.856388,0.317376,0.944197,28.049653,3.116628,3.228616,6,6,17,9,0,0,2,8,0,7,0,2,0,0,0,0,0,0,0,122.000000,104.000000,128.000000,106.000000,58.000000,...,0.000000,0.000000,0.000000,0.000000,0.475391,6,3,1.0,0.5,116.042974,4.169925,9.0,3.713572,4.465908,5.252273,6.028279,6.816736,7.598399,8.386401,9.169831,9.957265,79.388664,2.944439,0.0,4.143135,0.000000,5.556828,0.00000,7.055313,0.000000,8.590258,37.289972,124.052429,7.297202,90,9,40.0,43.0,3.472222,2.166667
2183,10.635111,9.197041,0,0,18.814625,2.30725,4.614501,18.814625,1.343902,3.558646,3.48721,0.249086,1.585574,63.60762,4.543401,4.489206,12,12,23,14,0,0,2,9,0,12,1,1,0,0,0,0,0,0,0,190.000000,188.000000,244.000000,236.000000,188.000000,...,0.012153,0.000000,0.000000,0.000000,0.279611,8,4,1.0,0.5,170.927139,4.906891,15.0,4.234107,5.043425,5.866468,6.693324,7.524561,8.356790,9.191056,10.025307,10.860786,96.795823,3.433987,0.0,4.672829,0.000000,6.137727,0.00000,7.690286,0.000000,9.282754,45.217583,183.068414,7.959496,307,18,68.0,77.0,3.833333,3.222222
2184,8.106344,7.838945,0,0,13.165395,2.242743,4.296119,13.165395,1.196854,3.28912,2.958768,0.268979,1.180083,49.082458,4.462042,3.988812,5,5,23,11,0,0,2,12,0,9,0,2,0,0,0,0,0,0,0,148.000000,133.000000,139.000000,69.000000,80.000000,...,0.000000,0.000000,0.000000,0.000000,0.513992,7,4,0.75,0.428571,150.634944,4.459432,11.0,3.931826,4.672829,5.484797,6.255750,7.069023,7.854381,8.666130,9.459619,10.269900,85.664256,3.135494,0.0,4.369448,2.397895,5.739793,4.59512,7.172425,6.529419,8.645235,53.584828,152.083730,6.612336,173,9,50.0,53.0,4.583333,2.500000


In [None]:
temp_df = sweet_train.iloc[withoutNoneIndices]

In [None]:
temp_df

In [None]:
df_Sweet_Train['Name'] = temp_df['Name']
df_Sweet_Train['Taste'] = temp_df['Taste']
df_Sweet_Train['Sweet'] = temp_df['Sweet']

In [None]:
df_Sweet_Train

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,VE1_A,VE2_A,VE3_A,VR1_A,VR2_A,VR3_A,nAromAtom,nAromBond,nAtom,nHeavyAtom,nSpiro,nBridgehead,nHetero,nH,nB,nC,nN,nO,nS,nP,nF,nCl,nBr,nI,nX,ATS0dv,ATS1dv,ATS2dv,ATS3dv,ATS4dv,...,JGI10,JGT10,Diameter,Radius,TopoShapeIndex,PetitjeanIndex,Vabc,VAdjMat,MWC01,MWC02,MWC03,MWC04,MWC05,MWC06,MWC07,MWC08,MWC09,MWC10,TMWC10,SRW02,SRW03,SRW04,SRW05,SRW06,SRW07,SRW08,SRW09,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2,Name,Taste,Sweet
0,17.213262,16.059815,0,0,28.766735,2.52242,4.95823,28.766735,1.250728,4.052767,4.072546,0.177067,2.237177,157.468009,6.846435,5.892131,0,0,45,23,0,0,11,22,0,12,0,11,0,0,0,0,0,0,0,408.000000,311.000000,532.000000,760.000000,588.000000,...,0.000000,0.628961,10,5,1.0,0.5,288.08786,5.584963,24.0,4.795791,5.686975,6.602588,7.513709,8.434464,9.350189,10.272738,11.190528,12.113908,122.960890,3.891820,0.0,5.262690,2.397895,6.849066,4.94876,8.525360,7.195187,10.247042,72.317821,342.116212,7.602582,1110,43,120.0,147.0,10.451389,5.291667,Sucrose,Sweet,True
1,17.213262,16.059815,0,0,28.766735,2.52242,4.95823,28.766735,1.250728,4.052767,4.072546,0.177067,2.237177,157.468009,6.846435,5.892131,0,0,42,23,0,0,11,19,0,12,0,8,0,0,0,3,0,0,3,334.814815,281.444444,477.111111,591.111111,423.333333,...,0.000000,0.628961,10,5,1.0,0.5,307.350383,5.584963,24.0,4.795791,5.686975,6.602588,7.513709,8.434464,9.350189,10.272738,11.190528,12.113908,122.960890,3.891820,0.0,5.262690,2.397895,6.849066,4.94876,8.525360,7.195187,10.247042,72.317821,396.014551,9.428918,1110,43,120.0,147.0,10.451389,5.291667,"Sucralose / 4,1',6'-Trichloro-galactosucrose",Sweet,True
2,15.315311,13.970681,1,1,25.367667,2.287533,4.575067,25.367667,1.207984,3.915999,4.057263,0.193203,2.142446,121.014646,5.762602,5.537849,6,6,39,21,0,0,7,18,0,14,2,5,0,0,0,0,0,0,0,330.000000,271.000000,348.000000,335.000000,376.000000,...,0.007448,0.489651,11,6,0.833333,0.454545,277.569695,5.392317,21.0,4.574711,5.351858,6.175867,6.978214,7.805882,8.618485,9.447229,10.264966,11.093995,112.311208,3.761200,0.0,5.017280,0.000000,6.447306,0.00000,7.951207,0.000000,9.493487,53.670480,294.121572,7.541579,1000,27,96.0,105.0,8.916667,4.888889,Aspartame/Aspartyl-phenylalanine methylester,Sweet,True
3,8.829880,8.715591,0,0,14.640475,2.418289,4.836579,14.640475,1.22004,3.392014,3.170872,0.264239,1.336328,42.564698,3.547058,3.933347,0,0,24,12,0,0,6,12,0,6,0,6,0,0,0,0,0,0,0,212.000000,155.000000,254.000000,346.000000,191.000000,...,0.000000,0.664926,6,3,1.0,0.5,152.717273,4.584963,12.0,4.110874,4.962845,5.849325,6.722630,7.610358,8.487764,9.375092,10.253827,11.140687,92.513402,3.218876,0.0,4.574711,0.000000,6.163315,0.00000,7.845024,0.000000,9.569063,43.370989,180.063388,7.502641,178,20,60.0,71.0,6.145833,2.722222,Tagatose,Sweet,True
4,17.372180,15.638061,0,0,28.650077,2.478404,4.876167,28.650077,1.245656,4.0524,3.877075,0.168568,2.18799,200.516278,8.718099,6.133805,0,0,45,23,0,0,11,22,0,12,0,11,0,0,0,0,0,0,0,408.000000,309.000000,518.000000,711.000000,478.000000,...,0.005045,0.636724,11,6,0.833333,0.454545,288.08786,5.584963,24.0,4.795791,5.673323,6.575076,7.467942,8.371242,9.267193,10.170725,11.068184,11.971710,122.361186,3.891820,0.0,5.262690,2.397895,6.836259,4.94876,8.493105,7.181592,10.192007,72.204129,342.116212,7.602582,1220,41,120.0,145.0,10.451389,5.166667,Isomaltulose/Palatinose,Sweet,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2181,6.473351,6.191587,0,0,11.142106,2.200122,4.400244,11.142106,1.238012,3.089776,2.850317,0.316702,0.94207,28.079758,3.119973,3.229688,6,6,18,9,0,0,2,9,0,7,1,1,0,0,0,0,0,0,0,114.000000,104.000000,123.000000,100.000000,46.000000,...,0.000000,0.438078,5,3,0.666667,0.4,118.249507,4.169925,9.0,3.713572,4.465908,5.262690,6.033086,6.834109,7.608374,8.409831,9.185125,9.986495,79.499191,2.944439,0.0,4.143135,0.000000,5.556828,0.00000,7.062192,0.000000,8.606851,37.313444,123.068414,6.837134,88,9,40.0,43.0,3.472222,2.166667,2-(2-Ethoxyethyl)pyridine,Bitter,False
2182,6.473351,6.127583,0,0,11.189957,2.193993,4.387987,11.189957,1.243329,3.089765,2.856388,0.317376,0.944197,28.049653,3.116628,3.228616,6,6,17,9,0,0,2,8,0,7,0,2,0,0,0,0,0,0,0,122.000000,104.000000,128.000000,106.000000,58.000000,...,0.000000,0.475391,6,3,1.0,0.5,116.042974,4.169925,9.0,3.713572,4.465908,5.252273,6.028279,6.816736,7.598399,8.386401,9.169831,9.957265,79.388664,2.944439,0.0,4.143135,0.000000,5.556828,0.00000,7.055313,0.000000,8.590258,37.289972,124.052429,7.297202,90,9,40.0,43.0,3.472222,2.166667,4-BENZYLPYRIDINE,Bitter,False
2183,10.635111,9.197041,0,0,18.814625,2.30725,4.614501,18.814625,1.343902,3.558646,3.48721,0.249086,1.585574,63.60762,4.543401,4.489206,12,12,23,14,0,0,2,9,0,12,1,1,0,0,0,0,0,0,0,190.000000,188.000000,244.000000,236.000000,188.000000,...,0.000000,0.279611,8,4,1.0,0.5,170.927139,4.906891,15.0,4.234107,5.043425,5.866468,6.693324,7.524561,8.356790,9.191056,10.025307,10.860786,96.795823,3.433987,0.0,4.672829,0.000000,6.137727,0.00000,7.690286,0.000000,9.282754,45.217583,183.068414,7.959496,307,18,68.0,77.0,3.833333,3.222222,TRIDECANAL,Bitter,False
2184,8.106344,7.838945,0,0,13.165395,2.242743,4.296119,13.165395,1.196854,3.28912,2.958768,0.268979,1.180083,49.082458,4.462042,3.988812,5,5,23,11,0,0,2,12,0,9,0,2,0,0,0,0,0,0,0,148.000000,133.000000,139.000000,69.000000,80.000000,...,0.000000,0.513992,7,4,0.75,0.428571,150.634944,4.459432,11.0,3.931826,4.672829,5.484797,6.255750,7.069023,7.854381,8.666130,9.459619,10.269900,85.664256,3.135494,0.0,4.369448,2.397895,5.739793,4.59512,7.172425,6.529419,8.645235,53.584828,152.083730,6.612336,173,9,50.0,53.0,4.583333,2.500000,L-phenylalanine,Bitter,False


In [None]:
df_Sweet_Train.to_csv('sweet_Train_New.csv',index=False)

## **Generating Features for Sweet Test:**

In [None]:
mols = [Chem.MolFromSmiles(smi) for smi in sweet_test['SMILES']]

In [None]:
type(mols)

In [None]:
mols

In [None]:
noneIndices = []
withoutNoneIndices = []
molesWithoutNone = []
for i in range(len(mols)):
  if mols[i]==None:
    noneIndices.append(i)
  else:
    molesWithoutNone.append(mols[i])
    withoutNoneIndices.append(i)

In [None]:
noneIndices

In [None]:
df_Sweet_Test = calc2d.pandas(molesWithoutNone)

In [None]:
df_Sweet_Test

In [None]:
temp_df = sweet_test.iloc[withoutNoneIndices]

In [None]:
temp_df

In [None]:
df_Sweet_Test['Name'] = temp_df['Name']
df_Sweet_Test['Taste'] = temp_df['Taste']
df_Sweet_Test['Sweet'] = temp_df['Sweet']

In [None]:
df_Sweet_Test

In [None]:
df_Sweet_Test.to_csv('sweet_Test_New.csv',index=False)

In [None]:
df_Sweet_Train



---



---



---



## **MODELS**
---


In [None]:
bitter_train = pd.read_csv("/content/BitterTrain.csv", encoding='cp1252')
bitter_test = pd.read_csv("/content/BitterTest.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
bitter_train = bitter_train.fillna(0)

In [None]:
bitter_test = bitter_test.fillna(0)

In [None]:
bitter_train = bitter_train.drop(bitter_train.columns[[1147, 1148]], axis=1)

In [None]:
bitter_test = bitter_test.drop(bitter_test.columns[[1147, 1148]], axis=1)

In [None]:
bitter_train

Unnamed: 0,Name,Taste,Bitter,Name.1,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,nH,nB,nC,nN,nO,nS,nP,nF,nCl,nBr,nI,nX,ATS0m,ATS1m,ATS2m,ATS3m,ATS4m,ATS5m,ATS6m,ATS7m,ATS8m,ATS0v,ATS1v,ATS2v,ATS3v,ATS4v,ATS5v,...,VE1_D,VE2_D,VE3_D,VR1_D,VR2_D,VR3_D,TopoPSA,VABC,VAdjMat,MWC2,MWC3,MWC4,MWC5,MWC6,MWC7,MWC8,MWC9,MWC10,TWC,SRW2,SRW3,SRW4,SRW5,SRW6,SRW7,SRW8,SRW9,SRW10,TSRW,MW,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,Sucrose,Sweet,False,AUTOGEN_molecule_1,0,-4.3105,18.580410,68.7741,44.611446,0,0,45,23,22,0,12,0,11,0,0,0,0,0,0,0,4569.170871,4431.452224,7114.671704,9413.865359,8120.238308,7276.770528,6578.022258,5961.156114,4399.062485,8146.343745,10735.803720,16705.063590,19942.507530,19284.912590,17534.110760,...,2.684680e-02,1.167252e-03,-8.320500,1.618695e+02,7.037806,11.699619,189.53,288.087851,5.523562,4.795791,5.686975,6.602588,7.513709,8.434464,9.350189,10.272738,11.190528,12.113908,122.960890,3.891820,0.0,5.262690,2.397895,6.849066,4.94876,8.525360,7.195187,10.247042,10.489217,342.116212,7.602582,45.298746,1.969511,29.320458,29.320458,0.000000,1110,43,-3.277,120
1,"Sucralose / 4,1',6'-Trichloro-galactosucrose",Sweet,False,AUTOGEN_molecule_2,0,-0.6967,0.485391,77.9271,46.745067,0,0,42,23,19,0,12,0,8,0,0,0,3,0,0,3,7568.326176,5083.949131,8110.887324,11909.663100,10752.308410,9566.843748,8835.602606,8037.380222,7310.746603,8915.833056,10967.562050,17213.656780,20663.867840,19992.504010,17766.478060,...,2.684680e-02,1.167252e-03,-8.320500,1.618695e+02,7.037806,11.699619,128.84,307.350374,5.523562,4.795791,5.686975,6.602588,7.513709,8.434464,9.350189,10.272738,11.190528,12.113908,122.960890,3.891820,0.0,5.262690,2.397895,6.849066,4.94876,8.525360,7.195187,10.247042,10.489217,396.014551,9.428918,45.298746,1.969511,29.320458,21.851311,0.000000,1110,43,-0.734,120
2,Alitame,Sweet,False,AUTOGEN_molecule_3,1,-0.3908,0.152725,95.1429,59.572169,0,0,57,24,33,0,16,3,4,1,0,0,0,0,0,0,4982.059799,4762.826650,7112.351239,6262.950338,5803.021582,5156.299055,4010.222294,4292.396619,4734.474871,9994.316858,12728.093930,18303.361410,20380.866260,19163.103760,17269.072270,...,1.603734e-01,6.682226e-03,-4.392600,2.679675e+02,11.165310,13.418077,131.14,356.959507,5.584963,4.762174,5.549076,6.444131,7.257708,8.172164,9.003685,9.935035,10.780060,11.725590,121.629623,3.891820,0.0,5.262690,0.000000,6.829794,0.00000,8.509363,0.000000,10.256922,10.452389,363.219178,6.372266,46.302152,1.929256,22.105073,10.719972,8.482492,1807,27,0.775,116
3,Aspartame/Aspartyl-phenylalanine methylester,Sweet,False,AUTOGEN_molecule_4,1,-1.8303,3.349998,46.9315,42.852274,6,6,39,21,18,0,14,2,5,0,0,0,0,0,0,0,3710.218949,3616.851009,4967.348867,5232.341659,5146.402186,5628.940448,5039.273318,4384.347258,3546.935836,8057.326256,9810.835761,13414.539880,14935.465050,14511.800190,14228.206850,...,6.945707e-02,3.307480e-03,-5.600797,1.664503e+02,7.926203,10.740863,118.72,277.569687,5.392317,4.574711,5.351858,6.175867,6.978214,7.805882,8.618485,9.447229,10.264966,11.093995,112.311208,3.761200,0.0,5.017280,0.000000,6.447306,0.00000,7.951207,0.000000,9.493487,9.738259,294.121572,7.541579,40.865946,1.945997,17.972641,12.463725,5.508916,1000,27,1.743,96
4,Tagatose,Sweet,False,AUTOGEN_molecule_5,0,-2.5636,6.572045,36.3608,23.373516,0,0,24,12,12,0,6,0,6,0,0,0,0,0,0,0,2413.585500,2231.853104,3579.783866,4575.372616,2920.173451,1269.486060,406.279953,37.334304,1.016064,4212.451005,5449.915491,8324.041459,9656.045420,7524.925029,3984.916942,...,1.141617e-02,9.510000e-04,-5.367269,1.051832e+02,8.765268,5.586845,110.38,152.717268,4.584963,4.110874,4.962845,5.849325,6.722630,7.610358,8.487764,9.375092,10.253827,11.140687,92.513402,3.218876,0.0,4.574711,0.000000,6.163315,0.00000,7.845024,0.000000,9.569063,9.768412,180.063388,7.502641,23.000999,1.916750,15.355991,15.355991,0.000000,178,20,-1.740,60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2247,6-Methyl-2-pyridinemethanol,Bitter,True,AUTOGEN_molecule_617,0,-0.0092,0.000085,13.0093,20.223137,6,6,18,9,9,0,7,1,1,0,0,0,0,0,0,0,1471.157473,1507.208565,1912.304319,1698.267489,1049.188149,576.050586,100.914912,4.064256,0.000000,3704.076962,4485.758236,5793.074073,5504.236825,4039.507402,2480.287731,...,7.999138e-02,8.887931e-03,-2.273253,4.743521e+01,5.270579,3.473428,33.12,118.249503,4.169925,3.713572,4.465908,5.262690,6.033086,6.834109,7.608374,8.409831,9.185125,9.986495,79.499191,2.944439,0.0,4.143135,0.000000,5.556828,0.00000,7.062192,0.000000,8.606851,8.851091,123.068414,6.837134,17.549446,1.949938,5.429766,2.454019,2.975746,88,9,1.262,40
2248,4-hydroxybenzyl alcohol,Bitter,True,AUTOGEN_molecule_618,0,-0.9090,0.826281,9.8414,19.258344,6,6,17,9,8,0,7,0,2,0,0,0,0,0,0,0,1529.913361,1499.073337,1909.160039,1732.115297,962.891800,487.278042,314.468289,34.286112,1.016064,3646.069822,4422.524276,5868.274122,5567.387234,3514.742515,1654.006792,...,5.682378e-02,6.313753e-03,-2.581020,5.878058e+01,6.531176,3.666430,40.46,116.042970,4.169925,3.713572,4.465908,5.252273,6.028279,6.816736,7.598399,8.386401,9.169831,9.957265,79.388664,2.944439,0.0,4.143135,0.000000,5.556828,0.00000,7.055313,0.000000,8.590258,8.836955,124.052429,7.297202,17.549594,1.949955,4.961358,4.961358,0.000000,90,9,0.813,40
2249,4-Benzoylpyridine,Bitter,True,AUTOGEN_molecule_619,0,-0.2638,0.069590,5.3460,29.023137,12,12,23,14,9,0,12,1,1,0,0,0,0,0,0,0,2192.478078,2368.773387,3106.717467,3088.803862,2511.695970,2108.841516,1428.072624,806.656076,251.000925,5821.661471,7059.582633,9609.951504,9692.815256,7808.659973,6608.815445,...,7.220000e-16,5.150000e-17,-48.810998,3.000048e+06,214289.114600,20.879794,29.96,170.927133,4.807355,4.234107,5.043425,5.866468,6.693324,7.524561,8.356790,9.191056,10.025307,10.860786,96.795823,3.433987,0.0,4.672829,0.000000,6.137727,0.00000,7.690286,0.000000,9.282754,9.514068,183.068414,7.959496,28.405439,2.028960,5.566157,2.571148,2.995009,307,18,3.648,68
2250,4-(5-Methyl-2-furyl)-2-butanone,Bitter,True,AUTOGEN_molecule_620,0,0.3219,0.103620,27.1230,25.445516,5,5,23,11,12,0,9,0,2,0,0,0,0,0,0,0,1822.505859,1875.889991,2533.152998,1810.708633,1423.475015,1400.060527,1010.827701,482.770558,124.071696,4617.438603,5673.163117,7518.441277,6774.918701,5687.751552,4526.318695,...,1.206707e-01,1.097006e-02,-2.326159,5.602929e+01,5.093572,4.428462,30.21,150.634939,4.459432,3.931826,4.672829,5.484797,6.255750,7.069023,7.854381,8.666130,9.459619,10.269900,85.664256,3.135494,0.0,4.369448,2.397895,5.739793,4.59512,7.172425,6.529419,8.645235,9.011646,152.083730,6.612336,21.308457,1.937132,5.377103,5.377103,0.000000,173,9,1.752,50


In [None]:
bitter_test

Unnamed: 0,Name,Taste,Bitter,Name.1,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,nH,nB,nC,nN,nO,nS,nP,nF,nCl,nBr,nI,nX,ATS0m,ATS1m,ATS2m,ATS3m,ATS4m,ATS5m,ATS6m,ATS7m,ATS8m,ATS0v,ATS1v,ATS2v,ATS3v,ATS4v,ATS5v,...,VE1_D,VE2_D,VE3_D,VR1_D,VR2_D,VR3_D,TopoPSA,VABC,VAdjMat,MWC2,MWC3,MWC4,MWC5,MWC6,MWC7,MWC8,MWC9,MWC10,TWC,SRW2,SRW3,SRW4,SRW5,SRW6,SRW7,SRW8,SRW9,SRW10,TSRW,MW,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,D-Fructose,Non-bitter,False,AUTOGEN_molecule_1,0,-2.5636,6.572045,36.3608,23.373516,0,0,24,12,12,0,6,0,6,0,0,0,0,0,0,0,2413.585500,2231.853104,3579.783866,4575.372616,2920.173451,1269.486060,406.279953,37.334304,1.016064,4212.451005,5449.915491,8324.041459,9656.045420,7524.925029,3984.916942,...,1.141617e-02,9.510000e-04,-5.367269,1.051832e+02,8.765268,5.586845,110.38,152.717268,4.584963,4.110874,4.962845,5.849325,6.722630,7.610358,8.487764,9.375092,10.253827,11.140687,92.513402,3.218876,0.0,4.574711,0.0,6.163315,0.0,7.845024,0.0,9.569063,9.768412,180.063388,7.502641,23.000999,1.916750,15.355991,15.355991,0.000000,178,20,-1.740,60
1,D-Glucose,Non-bitter,False,AUTOGEN_molecule_2,0,-2.5134,6.317180,35.9234,23.373516,0,0,24,12,12,0,6,0,6,0,0,0,0,0,0,0,2413.585500,2231.853104,3414.837893,4294.218727,3051.849376,1549.623885,439.550001,38.350368,1.016064,4212.451005,5449.915491,8186.978460,9336.155630,7466.877023,4273.722989,...,8.965070e-02,7.470892e-03,-2.894201,6.635284e+01,5.529404,5.033984,110.38,152.717268,4.584963,4.077537,4.919981,5.783825,6.645091,7.511525,8.375860,9.242711,10.107774,10.974643,91.638947,3.218876,0.0,4.532599,0.0,6.070738,0.0,7.709308,0.0,9.396405,9.604610,180.063388,7.502641,23.078977,1.923248,15.422349,15.422349,0.000000,182,19,-1.697,58
2,D-Mannose,Non-bitter,False,AUTOGEN_molecule_3,0,-2.5134,6.317180,35.9234,23.373516,0,0,24,12,12,0,6,0,6,0,0,0,0,0,0,0,2413.585500,2231.853104,3414.837893,4294.218727,3051.849376,1549.623885,439.550001,38.350368,1.016064,4212.451005,5449.915491,8186.978460,9336.155630,7466.877023,4273.722989,...,8.965070e-02,7.470892e-03,-2.894201,6.635284e+01,5.529404,5.033984,110.38,152.717268,4.584963,4.077537,4.919981,5.783825,6.645091,7.511525,8.375860,9.242711,10.107774,10.974643,91.638947,3.218876,0.0,4.532599,0.0,6.070738,0.0,7.709308,0.0,9.396405,9.604610,180.063388,7.502641,23.078977,1.923248,15.422349,15.422349,0.000000,182,19,-1.697,58
3,L-Rhamnose,Non-bitter,False,AUTOGEN_molecule_4,0,-2.0134,4.053780,34.5453,22.571516,0,0,23,11,12,0,6,0,5,0,0,0,0,0,0,0,2157.617499,2035.669211,3192.452048,3845.070673,2407.533493,803.194950,106.966944,4.064256,0.000000,3996.060228,5179.908975,7782.389185,8785.951606,6677.750227,3321.615544,...,6.660000e-16,6.060000e-17,-38.439545,5.000018e+06,454547.089700,16.967447,90.15,143.927041,4.459432,4.007333,4.844187,5.707110,6.561031,7.424762,8.281724,9.145375,10.003107,10.866605,88.841233,3.135494,0.0,4.465908,0.0,6.008813,0.0,7.647309,0.0,9.331052,9.539788,164.068474,7.133412,21.063169,1.914834,12.869416,12.869416,0.000000,140,17,-0.994,54
4,D-Ribulose,Non-bitter,False,AUTOGEN_molecule_5,0,-2.5027,6.263507,31.5993,19.477930,0,0,20,10,10,0,5,0,5,0,0,0,0,0,0,0,2011.321250,1675.026925,2214.183411,2760.936836,2020.690432,1152.444156,402.215697,37.334304,1.016064,3510.375838,4224.189338,5623.475391,5956.412682,4776.904303,2972.370623,...,7.770000e-16,7.770000e-17,-34.790890,3.000019e+06,300001.905800,14.914129,97.99,136.351057,4.321928,3.713572,4.465908,5.241747,6.018593,6.801283,7.582229,8.366603,9.148359,9.933095,80.271389,2.944439,0.0,4.143135,0.0,5.556828,0.0,7.048386,0.0,8.575273,8.824237,150.052823,7.502641,18.311102,1.831110,12.085422,12.085422,0.000000,125,12,-3.167,40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166,0,Bitter,True,AUTOGEN_molecule_167,0,-2.1746,4.728885,44.9435,50.098274,12,12,44,26,18,0,18,0,8,0,0,0,0,0,0,0,4662.787338,4800.301136,7096.305837,9163.016189,8877.665461,7580.809254,5934.187877,5017.691637,4327.168078,9913.937847,12464.179000,18395.581890,22023.626840,21823.542360,20381.351190,...,2.205120e-01,8.481230e-03,-3.930689,3.114271e+02,11.977964,14.927029,125.68,312.881409,5.700440,4.934474,5.820083,6.727432,7.630947,8.542666,9.452737,10.367253,11.281107,12.197904,130.954603,4.043051,0.0,5.398163,0.0,6.970730,0.0,8.646993,0.0,10.378385,10.576815,362.100167,8.229549,52.405345,2.015590,21.484941,21.484941,0.000000,1633,49,0.095,138
167,0,Bitter,True,AUTOGEN_molecule_168,0,-1.6516,2.727783,25.1911,40.469516,12,12,33,21,12,0,16,1,4,0,0,0,0,0,0,0,3540.486757,3564.674974,4965.081493,4956.331084,4501.090560,4045.778518,3626.570811,2668.023835,2803.413024,8258.152642,9945.555168,13608.290670,14383.262570,12569.931610,10889.073800,...,1.432766e-01,6.822697e-03,-4.080254,1.814326e+02,8.639649,10.921857,89.46,259.890605,5.392317,4.634729,5.433722,6.270988,7.089243,7.926964,8.751000,9.588571,10.414903,11.252664,114.362784,3.806662,0.0,5.081404,0.0,6.541030,0.0,8.083637,0.0,9.668714,9.902087,282.076633,8.547777,41.983572,1.999218,12.963296,9.907891,3.055405,1064,28,2.975,102
168,0,Bitter,True,AUTOGEN_molecule_169,0,-1.2949,1.676766,37.2581,47.458688,12,12,40,24,16,0,18,1,5,0,0,0,0,0,0,0,4089.047256,4185.575389,5704.474785,6196.881659,5980.952680,5103.439199,4313.124866,3505.346535,3580.969888,9445.912200,11345.413130,15395.157880,17097.569440,16611.045790,15095.723460,...,4.516750e-02,1.881979e-03,-7.433706,4.729745e+02,19.707271,14.781700,87.69,303.272801,5.584963,4.762174,5.579730,6.428105,7.263330,8.113427,8.952735,9.802617,10.643208,11.492753,122.038078,3.931826,0.0,5.209486,0.0,6.685861,0.0,8.252967,0.0,9.866357,10.092661,326.102848,8.152571,47.879283,1.994970,16.037180,12.980349,3.056831,1535,35,2.839,116
169,0,Bitter,True,AUTOGEN_molecule_170,0,-1.6516,2.727783,25.1911,40.469516,12,12,33,21,12,0,16,1,4,0,0,0,0,0,0,0,3540.486757,3564.674974,4965.081493,4956.331084,4666.036533,4087.091398,3628.602939,2922.675952,2565.883803,8258.152642,9945.555168,13608.290670,14383.262570,12706.994610,11074.586450,...,1.149516e-01,5.473885e-03,-4.542813,1.973474e+02,9.397497,11.098428,89.46,259.890605,5.392317,4.634729,5.433722,6.274762,7.090910,7.933438,8.754792,9.597030,10.420822,11.262732,114.402937,3.806662,0.0,5.081404,0.0,6.541030,0.0,8.086103,0.0,9.674389,9.906981,282.076633,8.547777,41.983151,1.999198,12.967464,9.912138,3.055326,1050,28,2.975,102


In [None]:
bitter_train.to_csv('bitterTrainCleaned.csv', index=False)

In [None]:
bitter_test.to_csv('bitterTestCleaned.csv', index=False)

In [None]:
y_Train = bitter_train['Bitter']

In [None]:
x_Train = bitter_train.drop(['Bitter', 'Taste','Name','Name.1'], axis=1)

In [None]:
x_Train

Unnamed: 0,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,nH,nB,nC,nN,nO,nS,nP,nF,nCl,nBr,nI,nX,ATS0m,ATS1m,ATS2m,ATS3m,ATS4m,ATS5m,ATS6m,ATS7m,ATS8m,ATS0v,ATS1v,ATS2v,ATS3v,ATS4v,ATS5v,ATS6v,ATS7v,ATS8v,ATS0e,...,VE1_D,VE2_D,VE3_D,VR1_D,VR2_D,VR3_D,TopoPSA,VABC,VAdjMat,MWC2,MWC3,MWC4,MWC5,MWC6,MWC7,MWC8,MWC9,MWC10,TWC,SRW2,SRW3,SRW4,SRW5,SRW6,SRW7,SRW8,SRW9,SRW10,TSRW,MW,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,0,-4.3105,18.580410,68.7741,44.611446,0,0,45,23,22,0,12,0,11,0,0,0,0,0,0,0,4569.170871,4431.452224,7114.671704,9413.865359,8120.238308,7276.770528,6578.022258,5961.156114,4399.062485,8146.343745,10735.803720,16705.063590,19942.507530,19284.912590,17534.110760,16403.945420,15033.972430,11540.936470,385.161276,...,2.684680e-02,1.167252e-03,-8.320500,1.618695e+02,7.037806,11.699619,189.53,288.087851,5.523562,4.795791,5.686975,6.602588,7.513709,8.434464,9.350189,10.272738,11.190528,12.113908,122.960890,3.891820,0.0,5.262690,2.397895,6.849066,4.94876,8.525360,7.195187,10.247042,10.489217,342.116212,7.602582,45.298746,1.969511,29.320458,29.320458,0.000000,1110,43,-3.277,120
1,0,-0.6967,0.485391,77.9271,46.745067,0,0,42,23,19,0,12,0,8,0,0,0,3,0,0,3,7568.326176,5083.949131,8110.887324,11909.663100,10752.308410,9566.843748,8835.602606,8037.380222,7310.746603,8915.833056,10967.562050,17213.656780,20663.867840,19992.504010,17766.478060,16710.435390,15421.851270,12177.149890,361.177611,...,2.684680e-02,1.167252e-03,-8.320500,1.618695e+02,7.037806,11.699619,128.84,307.350374,5.523562,4.795791,5.686975,6.602588,7.513709,8.434464,9.350189,10.272738,11.190528,12.113908,122.960890,3.891820,0.0,5.262690,2.397895,6.849066,4.94876,8.525360,7.195187,10.247042,10.489217,396.014551,9.428918,45.298746,1.969511,29.320458,21.851311,0.000000,1110,43,-0.734,120
2,1,-0.3908,0.152725,95.1429,59.572169,0,0,57,24,33,0,16,3,4,1,0,0,0,0,0,0,4982.059799,4762.826650,7112.351239,6262.950338,5803.021582,5156.299055,4010.222294,4292.396619,4734.474871,9994.316858,12728.093930,18303.361410,20380.866260,19163.103760,17269.072270,14710.322680,14277.887290,15164.506090,435.113189,...,1.603734e-01,6.682226e-03,-4.392600,2.679675e+02,11.165310,13.418077,131.14,356.959507,5.584963,4.762174,5.549076,6.444131,7.257708,8.172164,9.003685,9.935035,10.780060,11.725590,121.629623,3.891820,0.0,5.262690,0.000000,6.829794,0.00000,8.509363,0.000000,10.256922,10.452389,363.219178,6.372266,46.302152,1.929256,22.105073,10.719972,8.482492,1807,27,0.775,116
3,1,-1.8303,3.349998,46.9315,42.852274,6,6,39,21,18,0,14,2,5,0,0,0,0,0,0,0,3710.218949,3616.851009,4967.348867,5232.341659,5146.402186,5628.940448,5039.273318,4384.347258,3546.935836,8057.326256,9810.835761,13414.539880,14935.465050,14511.800190,14228.206850,13960.678770,12918.097010,10546.991010,313.661428,...,6.945707e-02,3.307480e-03,-5.600797,1.664503e+02,7.926203,10.740863,118.72,277.569687,5.392317,4.574711,5.351858,6.175867,6.978214,7.805882,8.618485,9.447229,10.264966,11.093995,112.311208,3.761200,0.0,5.017280,0.000000,6.447306,0.00000,7.951207,0.000000,9.493487,9.738259,294.121572,7.541579,40.865946,1.945997,17.972641,12.463725,5.508916,1000,27,1.743,96
4,0,-2.5636,6.572045,36.3608,23.373516,0,0,24,12,12,0,6,0,6,0,0,0,0,0,0,0,2413.585500,2231.853104,3579.783866,4575.372616,2920.173451,1269.486060,406.279953,37.334304,1.016064,4212.451005,5449.915491,8324.041459,9656.045420,7524.925029,3984.916942,1424.048102,319.445983,31.083744,205.974960,...,1.141617e-02,9.510000e-04,-5.367269,1.051832e+02,8.765268,5.586845,110.38,152.717268,4.584963,4.110874,4.962845,5.849325,6.722630,7.610358,8.487764,9.375092,10.253827,11.140687,92.513402,3.218876,0.0,4.574711,0.000000,6.163315,0.00000,7.845024,0.000000,9.569063,9.768412,180.063388,7.502641,23.000999,1.916750,15.355991,15.355991,0.000000,178,20,-1.740,60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2247,0,-0.0092,0.000085,13.0093,20.223137,6,6,18,9,9,0,7,1,1,0,0,0,0,0,0,0,1471.157473,1507.208565,1912.304319,1698.267489,1049.188149,576.050586,100.914912,4.064256,0.000000,3704.076962,4485.758236,5793.074073,5504.236825,4039.507402,2480.287731,930.532688,124.334978,0.000000,136.803140,...,7.999138e-02,8.887931e-03,-2.273253,4.743521e+01,5.270579,3.473428,33.12,118.249503,4.169925,3.713572,4.465908,5.262690,6.033086,6.834109,7.608374,8.409831,9.185125,9.986495,79.499191,2.944439,0.0,4.143135,0.000000,5.556828,0.00000,7.062192,0.000000,8.606851,8.851091,123.068414,6.837134,17.549446,1.949938,5.429766,2.454019,2.975746,88,9,1.262,40
2248,0,-0.9090,0.826281,9.8414,19.258344,6,6,17,9,8,0,7,0,2,0,0,0,0,0,0,0,1529.913361,1499.073337,1909.160039,1732.115297,962.891800,487.278042,314.468289,34.286112,1.016064,3646.069822,4422.524276,5868.274122,5567.387234,3514.742515,1654.006792,672.058760,226.194750,31.083744,133.234756,...,5.682378e-02,6.313753e-03,-2.581020,5.878058e+01,6.531176,3.666430,40.46,116.042970,4.169925,3.713572,4.465908,5.252273,6.028279,6.816736,7.598399,8.386401,9.169831,9.957265,79.388664,2.944439,0.0,4.143135,0.000000,5.556828,0.00000,7.055313,0.000000,8.590258,8.836955,124.052429,7.297202,17.549594,1.949955,4.961358,4.961358,0.000000,90,9,0.813,40
2249,0,-0.2638,0.069590,5.3460,29.023137,12,12,23,14,9,0,12,1,1,0,0,0,0,0,0,0,2192.478078,2368.773387,3106.717467,3088.803862,2511.695970,2108.841516,1428.072624,806.656076,251.000925,5821.661471,7059.582633,9609.951504,9692.815256,7808.659973,6608.815445,5339.993259,3288.496267,1140.391663,174.505720,...,7.220000e-16,5.150000e-17,-48.810998,3.000048e+06,214289.114600,20.879794,29.96,170.927133,4.807355,4.234107,5.043425,5.866468,6.693324,7.524561,8.356790,9.191056,10.025307,10.860786,96.795823,3.433987,0.0,4.672829,0.000000,6.137727,0.00000,7.690286,0.000000,9.282754,9.514068,183.068414,7.959496,28.405439,2.028960,5.566157,2.571148,2.995009,307,18,3.648,68
2250,0,0.3219,0.103620,27.1230,25.445516,5,5,23,11,12,0,9,0,2,0,0,0,0,0,0,0,1822.505859,1875.889991,2533.152998,1810.708633,1423.475015,1400.060527,1010.827701,482.770558,124.071696,4617.438603,5673.163117,7518.441277,6774.918701,5687.751552,4526.318695,3600.100351,2235.379898,1027.711823,175.189644,...,1.206707e-01,1.097006e-02,-2.326159,5.602929e+01,5.093572,4.428462,30.21,150.634939,4.459432,3.931826,4.672829,5.484797,6.255750,7.069023,7.854381,8.666130,9.459619,10.269900,85.664256,3.135494,0.0,4.369448,2.397895,5.739793,4.59512,7.172425,6.529419,8.645235,9.011646,152.083730,6.612336,21.308457,1.937132,5.377103,5.377103,0.000000,173,9,1.752,50


In [None]:
y_Test = bitter_test['Bitter']

In [None]:
x_Test = bitter_test.drop(['Bitter', 'Taste','Name','Name.1'], axis=1)

In [None]:
x_Test

Unnamed: 0,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,nH,nB,nC,nN,nO,nS,nP,nF,nCl,nBr,nI,nX,ATS0m,ATS1m,ATS2m,ATS3m,ATS4m,ATS5m,ATS6m,ATS7m,ATS8m,ATS0v,ATS1v,ATS2v,ATS3v,ATS4v,ATS5v,ATS6v,ATS7v,ATS8v,ATS0e,...,VE1_D,VE2_D,VE3_D,VR1_D,VR2_D,VR3_D,TopoPSA,VABC,VAdjMat,MWC2,MWC3,MWC4,MWC5,MWC6,MWC7,MWC8,MWC9,MWC10,TWC,SRW2,SRW3,SRW4,SRW5,SRW6,SRW7,SRW8,SRW9,SRW10,TSRW,MW,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,0,-2.5636,6.572045,36.3608,23.373516,0,0,24,12,12,0,6,0,6,0,0,0,0,0,0,0,2413.585500,2231.853104,3579.783866,4575.372616,2920.173451,1269.486060,406.279953,37.334304,1.016064,4212.451005,5449.915491,8324.041459,9656.045420,7524.925029,3984.916942,1424.048102,319.445983,31.083744,205.974960,...,1.141617e-02,9.510000e-04,-5.367269,1.051832e+02,8.765268,5.586845,110.38,152.717268,4.584963,4.110874,4.962845,5.849325,6.722630,7.610358,8.487764,9.375092,10.253827,11.140687,92.513402,3.218876,0.0,4.574711,0.0,6.163315,0.0,7.845024,0.0,9.569063,9.768412,180.063388,7.502641,23.000999,1.916750,15.355991,15.355991,0.000000,178,20,-1.740,60
1,0,-2.5134,6.317180,35.9234,23.373516,0,0,24,12,12,0,6,0,6,0,0,0,0,0,0,0,2413.585500,2231.853104,3414.837893,4294.218727,3051.849376,1549.623885,439.550001,38.350368,1.016064,4212.451005,5449.915491,8186.978460,9336.155630,7466.877023,4273.722989,1619.159107,350.529728,31.083744,205.974960,...,8.965070e-02,7.470892e-03,-2.894201,6.635284e+01,5.529404,5.033984,110.38,152.717268,4.584963,4.077537,4.919981,5.783825,6.645091,7.511525,8.375860,9.242711,10.107774,10.974643,91.638947,3.218876,0.0,4.532599,0.0,6.070738,0.0,7.709308,0.0,9.396405,9.604610,180.063388,7.502641,23.078977,1.923248,15.422349,15.422349,0.000000,182,19,-1.697,58
2,0,-2.5134,6.317180,35.9234,23.373516,0,0,24,12,12,0,6,0,6,0,0,0,0,0,0,0,2413.585500,2231.853104,3414.837893,4294.218727,3051.849376,1549.623885,439.550001,38.350368,1.016064,4212.451005,5449.915491,8186.978460,9336.155630,7466.877023,4273.722989,1619.159107,350.529728,31.083744,205.974960,...,8.965070e-02,7.470892e-03,-2.894201,6.635284e+01,5.529404,5.033984,110.38,152.717268,4.584963,4.077537,4.919981,5.783825,6.645091,7.511525,8.375860,9.242711,10.107774,10.974643,91.638947,3.218876,0.0,4.532599,0.0,6.070738,0.0,7.709308,0.0,9.396405,9.604610,180.063388,7.502641,23.078977,1.923248,15.422349,15.422349,0.000000,182,19,-1.697,58
3,0,-2.0134,4.053780,34.5453,22.571516,0,0,23,11,12,0,6,0,5,0,0,0,0,0,0,0,2157.617499,2035.669211,3192.452048,3845.070673,2407.533493,803.194950,106.966944,4.064256,0.000000,3996.060228,5179.908975,7782.389185,8785.951606,6677.750227,3321.615544,959.977191,124.334978,0.000000,192.623244,...,6.660000e-16,6.060000e-17,-38.439545,5.000018e+06,454547.089700,16.967447,90.15,143.927041,4.459432,4.007333,4.844187,5.707110,6.561031,7.424762,8.281724,9.145375,10.003107,10.866605,88.841233,3.135494,0.0,4.465908,0.0,6.008813,0.0,7.647309,0.0,9.331052,9.539788,164.068474,7.133412,21.063169,1.914834,12.869416,12.869416,0.000000,140,17,-0.994,54
4,0,-2.5027,6.263507,31.5993,19.477930,0,0,20,10,10,0,5,0,5,0,0,0,0,0,0,0,2011.321250,1675.026925,2214.183411,2760.936836,2020.690432,1152.444156,402.215697,37.334304,1.016064,3510.375838,4224.189338,5623.475391,5956.412682,4776.904303,2972.370623,1299.713124,319.445983,31.083744,171.645800,...,7.770000e-16,7.770000e-17,-34.790890,3.000019e+06,300001.905800,14.914129,97.99,136.351057,4.321928,3.713572,4.465908,5.241747,6.018593,6.801283,7.582229,8.366603,9.148359,9.933095,80.271389,2.944439,0.0,4.143135,0.0,5.556828,0.0,7.048386,0.0,8.575273,8.824237,150.052823,7.502641,18.311102,1.831110,12.085422,12.085422,0.000000,125,12,-3.167,40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166,0,-2.1746,4.728885,44.9435,50.098274,12,12,44,26,18,0,18,0,8,0,0,0,0,0,0,0,4662.787338,4800.301136,7096.305837,9163.016189,8877.665461,7580.809254,5934.187877,5017.691637,4327.168078,9913.937847,12464.179000,18395.581890,22023.626840,21823.542360,20381.351190,17904.887890,14497.475580,11597.455620,363.475368,...,2.205120e-01,8.481230e-03,-3.930689,3.114271e+02,11.977964,14.927029,125.68,312.881409,5.700440,4.934474,5.820083,6.727432,7.630947,8.542666,9.452737,10.367253,11.281107,12.197904,130.954603,4.043051,0.0,5.398163,0.0,6.970730,0.0,8.646993,0.0,10.378385,10.576815,362.100167,8.229549,52.405345,2.015590,21.484941,21.484941,0.000000,1633,49,0.095,138
167,0,-1.6516,2.727783,25.1911,40.469516,12,12,33,21,12,0,16,1,4,0,0,0,0,0,0,0,3540.486757,3564.674974,4965.081493,4956.331084,4501.090560,4045.778518,3626.570811,2668.023835,2803.413024,8258.152642,9945.555168,13608.290670,14383.262570,12569.931610,10889.073800,9296.243432,8111.190020,8090.352892,264.878324,...,1.432766e-01,6.822697e-03,-4.080254,1.814326e+02,8.639649,10.921857,89.46,259.890605,5.392317,4.634729,5.433722,6.270988,7.089243,7.926964,8.751000,9.588571,10.414903,11.252664,114.362784,3.806662,0.0,5.081404,0.0,6.541030,0.0,8.083637,0.0,9.668714,9.902087,282.076633,8.547777,41.983572,1.999218,12.963296,9.907891,3.055405,1064,28,2.975,102
168,0,-1.2949,1.676766,37.2581,47.458688,12,12,40,24,16,0,18,1,5,0,0,0,0,0,0,0,4089.047256,4185.575389,5704.474785,6196.881659,5980.952680,5103.439199,4313.124866,3505.346535,3580.969888,9445.912200,11345.413130,15395.157880,17097.569440,16611.045790,15095.723460,12497.739570,10473.421460,10220.370870,320.184928,...,4.516750e-02,1.881979e-03,-7.433706,4.729745e+02,19.707271,14.781700,87.69,303.272801,5.584963,4.762174,5.579730,6.428105,7.263330,8.113427,8.952735,9.802617,10.643208,11.492753,122.038078,3.931826,0.0,5.209486,0.0,6.685861,0.0,8.252967,0.0,9.866357,10.092661,326.102848,8.152571,47.879283,1.994970,16.037180,12.980349,3.056831,1535,35,2.839,116
169,0,-1.6516,2.727783,25.1911,40.469516,12,12,33,21,12,0,16,1,4,0,0,0,0,0,0,0,3540.486757,3564.674974,4965.081493,4956.331084,4666.036533,4087.091398,3628.602939,2922.675952,2565.883803,8258.152642,9945.555168,13608.290670,14383.262570,12706.994610,11074.586450,9358.410921,8149.136152,8075.566204,264.878324,...,1.149516e-01,5.473885e-03,-4.542813,1.973474e+02,9.397497,11.098428,89.46,259.890605,5.392317,4.634729,5.433722,6.274762,7.090910,7.933438,8.754792,9.597030,10.420822,11.262732,114.402937,3.806662,0.0,5.081404,0.0,6.541030,0.0,8.086103,0.0,9.674389,9.906981,282.076633,8.547777,41.983151,1.999198,12.967464,9.912138,3.055326,1050,28,2.975,102


In [None]:
dfCombinedX = pd.concat([x_Test, x_Train])

In [None]:
dfCombinedX

Unnamed: 0,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,nH,nB,nC,nN,nO,nS,nP,nF,nCl,nBr,nI,nX,ATS0m,ATS1m,ATS2m,ATS3m,ATS4m,ATS5m,ATS6m,ATS7m,ATS8m,ATS0v,ATS1v,ATS2v,ATS3v,ATS4v,ATS5v,ATS6v,ATS7v,ATS8v,ATS0e,...,VE1_D,VE2_D,VE3_D,VR1_D,VR2_D,VR3_D,TopoPSA,VABC,VAdjMat,MWC2,MWC3,MWC4,MWC5,MWC6,MWC7,MWC8,MWC9,MWC10,TWC,SRW2,SRW3,SRW4,SRW5,SRW6,SRW7,SRW8,SRW9,SRW10,TSRW,MW,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,0,-2.5636,6.572045,36.3608,23.373516,0,0,24,12,12,0,6,0,6,0,0,0,0,0,0,0,2413.585500,2231.853104,3579.783866,4575.372616,2920.173451,1269.486060,406.279953,37.334304,1.016064,4212.451005,5449.915491,8324.041459,9656.045420,7524.925029,3984.916942,1424.048102,319.445983,31.083744,205.974960,...,1.141617e-02,9.510000e-04,-5.367269,1.051832e+02,8.765268,5.586845,110.38,152.717268,4.584963,4.110874,4.962845,5.849325,6.722630,7.610358,8.487764,9.375092,10.253827,11.140687,92.513402,3.218876,0.0,4.574711,0.000000,6.163315,0.00000,7.845024,0.000000,9.569063,9.768412,180.063388,7.502641,23.000999,1.916750,15.355991,15.355991,0.000000,178,20,-1.740,60
1,0,-2.5134,6.317180,35.9234,23.373516,0,0,24,12,12,0,6,0,6,0,0,0,0,0,0,0,2413.585500,2231.853104,3414.837893,4294.218727,3051.849376,1549.623885,439.550001,38.350368,1.016064,4212.451005,5449.915491,8186.978460,9336.155630,7466.877023,4273.722989,1619.159107,350.529728,31.083744,205.974960,...,8.965070e-02,7.470892e-03,-2.894201,6.635284e+01,5.529404,5.033984,110.38,152.717268,4.584963,4.077537,4.919981,5.783825,6.645091,7.511525,8.375860,9.242711,10.107774,10.974643,91.638947,3.218876,0.0,4.532599,0.000000,6.070738,0.00000,7.709308,0.000000,9.396405,9.604610,180.063388,7.502641,23.078977,1.923248,15.422349,15.422349,0.000000,182,19,-1.697,58
2,0,-2.5134,6.317180,35.9234,23.373516,0,0,24,12,12,0,6,0,6,0,0,0,0,0,0,0,2413.585500,2231.853104,3414.837893,4294.218727,3051.849376,1549.623885,439.550001,38.350368,1.016064,4212.451005,5449.915491,8186.978460,9336.155630,7466.877023,4273.722989,1619.159107,350.529728,31.083744,205.974960,...,8.965070e-02,7.470892e-03,-2.894201,6.635284e+01,5.529404,5.033984,110.38,152.717268,4.584963,4.077537,4.919981,5.783825,6.645091,7.511525,8.375860,9.242711,10.107774,10.974643,91.638947,3.218876,0.0,4.532599,0.000000,6.070738,0.00000,7.709308,0.000000,9.396405,9.604610,180.063388,7.502641,23.078977,1.923248,15.422349,15.422349,0.000000,182,19,-1.697,58
3,0,-2.0134,4.053780,34.5453,22.571516,0,0,23,11,12,0,6,0,5,0,0,0,0,0,0,0,2157.617499,2035.669211,3192.452048,3845.070673,2407.533493,803.194950,106.966944,4.064256,0.000000,3996.060228,5179.908975,7782.389185,8785.951606,6677.750227,3321.615544,959.977191,124.334978,0.000000,192.623244,...,6.660000e-16,6.060000e-17,-38.439545,5.000018e+06,454547.089700,16.967447,90.15,143.927041,4.459432,4.007333,4.844187,5.707110,6.561031,7.424762,8.281724,9.145375,10.003107,10.866605,88.841233,3.135494,0.0,4.465908,0.000000,6.008813,0.00000,7.647309,0.000000,9.331052,9.539788,164.068474,7.133412,21.063169,1.914834,12.869416,12.869416,0.000000,140,17,-0.994,54
4,0,-2.5027,6.263507,31.5993,19.477930,0,0,20,10,10,0,5,0,5,0,0,0,0,0,0,0,2011.321250,1675.026925,2214.183411,2760.936836,2020.690432,1152.444156,402.215697,37.334304,1.016064,3510.375838,4224.189338,5623.475391,5956.412682,4776.904303,2972.370623,1299.713124,319.445983,31.083744,171.645800,...,7.770000e-16,7.770000e-17,-34.790890,3.000019e+06,300001.905800,14.914129,97.99,136.351057,4.321928,3.713572,4.465908,5.241747,6.018593,6.801283,7.582229,8.366603,9.148359,9.933095,80.271389,2.944439,0.0,4.143135,0.000000,5.556828,0.00000,7.048386,0.000000,8.575273,8.824237,150.052823,7.502641,18.311102,1.831110,12.085422,12.085422,0.000000,125,12,-3.167,40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2247,0,-0.0092,0.000085,13.0093,20.223137,6,6,18,9,9,0,7,1,1,0,0,0,0,0,0,0,1471.157473,1507.208565,1912.304319,1698.267489,1049.188149,576.050586,100.914912,4.064256,0.000000,3704.076962,4485.758236,5793.074073,5504.236825,4039.507402,2480.287731,930.532688,124.334978,0.000000,136.803140,...,7.999138e-02,8.887931e-03,-2.273253,4.743521e+01,5.270579,3.473428,33.12,118.249503,4.169925,3.713572,4.465908,5.262690,6.033086,6.834109,7.608374,8.409831,9.185125,9.986495,79.499191,2.944439,0.0,4.143135,0.000000,5.556828,0.00000,7.062192,0.000000,8.606851,8.851091,123.068414,6.837134,17.549446,1.949938,5.429766,2.454019,2.975746,88,9,1.262,40
2248,0,-0.9090,0.826281,9.8414,19.258344,6,6,17,9,8,0,7,0,2,0,0,0,0,0,0,0,1529.913361,1499.073337,1909.160039,1732.115297,962.891800,487.278042,314.468289,34.286112,1.016064,3646.069822,4422.524276,5868.274122,5567.387234,3514.742515,1654.006792,672.058760,226.194750,31.083744,133.234756,...,5.682378e-02,6.313753e-03,-2.581020,5.878058e+01,6.531176,3.666430,40.46,116.042970,4.169925,3.713572,4.465908,5.252273,6.028279,6.816736,7.598399,8.386401,9.169831,9.957265,79.388664,2.944439,0.0,4.143135,0.000000,5.556828,0.00000,7.055313,0.000000,8.590258,8.836955,124.052429,7.297202,17.549594,1.949955,4.961358,4.961358,0.000000,90,9,0.813,40
2249,0,-0.2638,0.069590,5.3460,29.023137,12,12,23,14,9,0,12,1,1,0,0,0,0,0,0,0,2192.478078,2368.773387,3106.717467,3088.803862,2511.695970,2108.841516,1428.072624,806.656076,251.000925,5821.661471,7059.582633,9609.951504,9692.815256,7808.659973,6608.815445,5339.993259,3288.496267,1140.391663,174.505720,...,7.220000e-16,5.150000e-17,-48.810998,3.000048e+06,214289.114600,20.879794,29.96,170.927133,4.807355,4.234107,5.043425,5.866468,6.693324,7.524561,8.356790,9.191056,10.025307,10.860786,96.795823,3.433987,0.0,4.672829,0.000000,6.137727,0.00000,7.690286,0.000000,9.282754,9.514068,183.068414,7.959496,28.405439,2.028960,5.566157,2.571148,2.995009,307,18,3.648,68
2250,0,0.3219,0.103620,27.1230,25.445516,5,5,23,11,12,0,9,0,2,0,0,0,0,0,0,0,1822.505859,1875.889991,2533.152998,1810.708633,1423.475015,1400.060527,1010.827701,482.770558,124.071696,4617.438603,5673.163117,7518.441277,6774.918701,5687.751552,4526.318695,3600.100351,2235.379898,1027.711823,175.189644,...,1.206707e-01,1.097006e-02,-2.326159,5.602929e+01,5.093572,4.428462,30.21,150.634939,4.459432,3.931826,4.672829,5.484797,6.255750,7.069023,7.854381,8.666130,9.459619,10.269900,85.664256,3.135494,0.0,4.369448,2.397895,5.739793,4.59512,7.172425,6.529419,8.645235,9.011646,152.083730,6.612336,21.308457,1.937132,5.377103,5.377103,0.000000,173,9,1.752,50


In [None]:
y_Test_df = y_Test.to_frame()

In [None]:
y_Train_df = y_Train.to_frame()

In [None]:
dfCombinedY = pd.concat([y_Test_df, y_Train_df])

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
pip install Boruta

Collecting Boruta
  Downloading Boruta-0.3-py3-none-any.whl (56 kB)
[?25l[K     |█████▉                          | 10 kB 14.9 MB/s eta 0:00:01[K     |███████████▋                    | 20 kB 17.7 MB/s eta 0:00:01[K     |█████████████████▍              | 30 kB 15.7 MB/s eta 0:00:01[K     |███████████████████████▏        | 40 kB 11.8 MB/s eta 0:00:01[K     |█████████████████████████████   | 51 kB 11.2 MB/s eta 0:00:01[K     |████████████████████████████████| 56 kB 3.2 MB/s 
Installing collected packages: Boruta
Successfully installed Boruta-0.3


In [None]:
from boruta import BorutaPy

In [None]:
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=1, max_iter=10)

In [None]:
x = dfCombinedX.iloc[:,:].values
y = dfCombinedY.iloc[:,:].values

In [None]:
y = y.ravel()

In [None]:
feat_selector.fit(x, y)

Iteration: 	1 / 10
Confirmed: 	0
Tentative: 	1442
Rejected: 	0
Iteration: 	2 / 10
Confirmed: 	0
Tentative: 	1442
Rejected: 	0
Iteration: 	3 / 10
Confirmed: 	0
Tentative: 	1442
Rejected: 	0
Iteration: 	4 / 10
Confirmed: 	0
Tentative: 	1442
Rejected: 	0
Iteration: 	5 / 10
Confirmed: 	0
Tentative: 	1442
Rejected: 	0
Iteration: 	6 / 10
Confirmed: 	0
Tentative: 	1442
Rejected: 	0
Iteration: 	7 / 10
Confirmed: 	0
Tentative: 	1442
Rejected: 	0
Iteration: 	8 / 10
Confirmed: 	611
Tentative: 	355
Rejected: 	476
Iteration: 	9 / 10
Confirmed: 	611
Tentative: 	355
Rejected: 	476


BorutaPy finished running.

Iteration: 	10 / 10
Confirmed: 	611
Tentative: 	240
Rejected: 	476


BorutaPy(estimator=RandomForestClassifier(class_weight='balanced', max_depth=5,
                                          n_estimators=879, n_jobs=-1,
                                          random_state=RandomState(MT19937) at 0x7FD3BA737050),
         max_iter=10, n_estimators='auto',
         random_state=RandomState(MT19937) at 0x7FD3BA737050, verbose=2)

In [None]:
feat_selector.support_

array([False, False, False, ...,  True,  True,  True])

In [None]:
feat_selector.ranking_

array([154,   2,   2, ...,   1,   1,   1])

In [None]:
X_filtered = feat_selector.transform(x)

In [None]:
X_filtered

array([[36.3608  , 23.373516, 24.      , ..., 20.      , -1.74    ,
        60.      ],
       [35.9234  , 23.373516, 24.      , ..., 19.      , -1.697   ,
        58.      ],
       [35.9234  , 23.373516, 24.      , ..., 19.      , -1.697   ,
        58.      ],
       ...,
       [ 5.346   , 29.023137, 23.      , ..., 18.      ,  3.648   ,
        68.      ],
       [27.123   , 25.445516, 23.      , ...,  9.      ,  1.752   ,
        50.      ],
       [ 0.      ,  1.468793,  2.      , ...,  0.      , -0.467   ,
         0.      ]])

In [None]:
len(X_filtered[0])

611

In [None]:
x_test_filtered = x[0:171]
x_train_filtered = x[171:]

In [None]:
y_test = y[0:171]
y_train = y[171:]

In [None]:
clf = RandomForestClassifier(max_depth=5, random_state=0)
clf.fit(x_train_filtered, y_train)
predYTrain = clf.predict(x_train_filtered)

In [None]:
predYTest = clf.predict(x_test_filtered)

In [None]:
from sklearn.metrics import classification_report, precision_score

In [None]:
print(classification_report(y_train, predYTrain))

              precision    recall  f1-score   support

       False       0.88      0.95      0.92      1443
        True       0.91      0.77      0.83       809

    accuracy                           0.89      2252
   macro avg       0.89      0.86      0.88      2252
weighted avg       0.89      0.89      0.89      2252



In [None]:
print(classification_report(y_test, predYTest))

              precision    recall  f1-score   support

       False       0.60      0.88      0.71        66
        True       0.89      0.63      0.74       105

    accuracy                           0.73       171
   macro avg       0.74      0.75      0.72       171
weighted avg       0.78      0.73      0.73       171



In [None]:
precision_score(y_test, predYTest)

0.8918918918918919

In [None]:
sweet_train = pd.read_csv("/content/drive/MyDrive/Capstone/Features Extracted DataSets/Mordred/sweet_Train_Latest.csv")

## **Cleaning Mordred Data**

---



In [None]:
bitter_test = pd.read_csv("/content/bitter_Test_New.csv")

In [None]:
sweet_test = pd.read_csv("/content/sweet_Test_New.csv")

In [None]:
bitter_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 171 entries, 0 to 170
Columns: 1616 entries, ABC to Bitter
dtypes: bool(3), float64(897), int64(324), object(392)
memory usage: 2.1+ MB


In [None]:
sweet_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156 entries, 0 to 155
Columns: 1616 entries, ABC to Sweet
dtypes: bool(2), float64(627), int64(322), object(665)
memory usage: 1.9+ MB


In [None]:
bitter_test.iloc[:, 1615].isna().sum()

0

In [None]:
sweet_test.iloc[:, 1615].isna().sum()

3

In [None]:
bitter_test = bitter_test[bitter_test['Bitter'].notna()]


In [None]:
sweet_train.iloc[:, 1615].isna().sum()

0

In [None]:
bitter_test.iloc[:, 1615].isna().sum()

0

In [None]:
df = bitter_test.apply(pd.to_numeric, errors='coerce')

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 171 entries, 0 to 170
Columns: 1616 entries, ABC to Bitter
dtypes: bool(3), float64(1289), int64(324)
memory usage: 2.1 MB


In [None]:
df = df.fillna(0)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 171 entries, 0 to 170
Columns: 1616 entries, ABC to Bitter
dtypes: bool(3), float64(1289), int64(324)
memory usage: 2.1 MB


In [None]:
df.iloc[:, 1615].unique()

array([False,  True])

In [None]:
df

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,VE1_A,VE2_A,VE3_A,VR1_A,VR2_A,VR3_A,nAromAtom,nAromBond,nAtom,nHeavyAtom,nSpiro,nBridgehead,nHetero,nH,nB,nC,nN,nO,nS,nP,nF,nCl,nBr,nI,nX,ATS0dv,ATS1dv,ATS2dv,ATS3dv,ATS4dv,...,JGI10,JGT10,Diameter,Radius,TopoShapeIndex,PetitjeanIndex,Vabc,VAdjMat,MWC01,MWC02,MWC03,MWC04,MWC05,MWC06,MWC07,MWC08,MWC09,MWC10,TMWC10,SRW02,SRW03,SRW04,SRW05,SRW06,SRW07,SRW08,SRW09,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2,Name,Taste,Bitter
0,8.829880,8.715591,0,0,14.640475,2.418289,4.836579,14.640475,1.220040,3.392014,3.170872,0.264239,1.336328,42.564698,3.547058,3.933347,0,0,24,12,0,0,6,12,0,6,0,6,0,0,0,0,0,0,0,212.0,155.0,254.0,346.0,191.0,...,0.000000,0.664926,6,3,1.000000,0.500000,152.717273,4.584963,12.0,4.110874,4.962845,5.849325,6.722630,7.610358,8.487764,9.375092,10.253827,11.140687,92.513402,3.218876,0.0,4.574711,0.0,6.163315,0.0,7.845024,0.0,9.569063,43.370989,180.063388,7.502641,178,20,60.0,71.0,6.145833,2.722222,0.0,0.0,False
1,8.761080,8.651650,0,0,14.708146,2.377683,4.755367,14.708146,1.225679,3.384121,3.178505,0.264875,1.338733,43.594643,3.632887,3.957256,0,0,24,12,0,0,6,12,0,6,0,6,0,0,0,0,0,0,0,210.0,148.0,240.0,328.0,206.0,...,0.000000,0.549705,6,4,0.500000,0.333333,152.717273,4.584963,12.0,4.077537,4.919981,5.783825,6.645091,7.511525,8.375860,9.242711,10.107774,10.974643,91.638947,3.218876,0.0,4.532599,0.0,6.070738,0.0,7.709308,0.0,9.396405,42.927926,180.063388,7.502641,182,19,58.0,68.0,6.055556,2.777778,0.0,0.0,False
2,8.761080,8.651650,0,0,14.708146,2.377683,4.755367,14.708146,1.225679,3.384121,3.178505,0.264875,1.338733,43.594643,3.632887,3.957256,0,0,24,12,0,0,6,12,0,6,0,6,0,0,0,0,0,0,0,210.0,148.0,240.0,328.0,206.0,...,0.000000,0.549705,6,4,0.500000,0.333333,152.717273,4.584963,12.0,4.077537,4.919981,5.783825,6.645091,7.511525,8.375860,9.242711,10.107774,10.974643,91.638947,3.218876,0.0,4.532599,0.0,6.070738,0.0,7.709308,0.0,9.396405,42.927926,180.063388,7.502641,182,19,58.0,68.0,6.055556,2.777778,0.0,0.0,False
3,8.163363,8.029752,0,0,13.137460,2.364871,4.729742,13.137460,1.194315,3.303305,3.092493,0.281136,1.224288,35.727761,3.247978,3.671238,0,0,23,11,0,0,5,12,0,6,0,5,0,0,0,0,0,0,0,182.0,135.0,216.0,272.0,138.0,...,0.000000,0.560975,5,3,0.666667,0.400000,143.927046,4.459432,11.0,4.007333,4.844187,5.707110,6.561031,7.424762,8.281724,9.145375,10.003107,10.866605,88.841233,3.135494,0.0,4.465908,0.0,6.008813,0.0,7.647309,0.0,9.331052,41.588577,164.068473,7.133412,140,17,54.0,63.0,5.805556,2.444444,0.0,0.0,False
4,6.611250,7.282959,0,0,11.763639,2.188901,4.377802,11.763639,1.176364,3.134263,2.855225,0.285523,1.049151,30.472997,3.047300,3.416841,0,0,20,10,0,0,5,10,0,5,0,5,0,0,0,0,0,0,0,178.0,109.0,149.0,212.0,141.0,...,0.000000,0.545783,6,3,1.000000,0.500000,136.351061,4.169925,9.0,3.713572,4.465908,5.241747,6.018593,6.801283,7.582229,8.366603,9.148359,9.933095,80.271389,2.944439,0.0,4.143135,0.0,5.556828,0.0,7.048386,0.0,8.575273,38.268062,150.052823,7.502641,125,12,40.0,43.0,5.833333,2.555556,0.0,0.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166,19.981978,16.397806,0,0,33.251113,2.526538,5.053076,33.251113,1.278889,4.186542,4.120980,0.158499,2.371602,312.886434,12.034094,6.701352,12,12,44,26,0,0,8,18,0,18,0,8,0,0,0,0,0,0,0,450.0,417.0,627.0,806.0,708.0,...,0.009632,0.516379,13,7,0.857143,0.461538,312.881419,5.807355,28.0,4.934474,5.820083,6.727432,7.630947,8.542666,9.452737,10.367253,11.281107,12.197904,130.954603,4.043051,0.0,5.398163,0.0,6.970730,0.0,8.646993,0.0,10.378385,61.437322,362.100168,8.229549,1633,49,138.0,168.0,10.222222,5.833333,0.0,0.0,True
167,15.913028,13.077290,1,0,26.247606,2.316776,4.633552,26.247606,1.249886,3.945703,3.825724,0.182177,2.083685,176.194516,8.390215,5.913526,12,12,33,21,0,0,5,12,0,16,1,4,0,0,0,0,0,0,0,348.0,310.0,424.0,398.0,383.0,...,0.008617,0.447955,12,6,1.000000,0.500000,259.890613,5.459432,22.0,4.634729,5.433722,6.270988,7.089243,7.926964,8.751000,9.588571,10.414903,11.252664,114.362784,3.806662,0.0,5.081404,0.0,6.541030,0.0,8.083637,0.0,9.668714,54.181448,282.077181,8.547793,1064,28,102.0,114.0,7.416667,4.722222,0.0,0.0,True
168,17.884519,14.823609,1,0,30.336232,2.328747,4.657494,30.336232,1.264010,4.074059,4.563638,0.190152,2.393589,132.574159,5.523923,5.762611,12,12,40,24,0,0,6,16,0,18,1,5,0,0,0,0,0,0,0,404.0,357.0,493.0,502.0,451.0,...,0.008499,0.452254,13,7,0.857143,0.461538,303.272810,5.643856,25.0,4.762174,5.579730,6.428105,7.263330,8.113427,8.952735,9.802617,10.643208,11.492753,122.038079,3.931826,0.0,5.209486,0.0,6.685861,0.0,8.252967,0.0,9.866357,57.946497,326.103396,8.152585,1535,35,116.0,132.0,8.777778,5.583333,0.0,0.0,True
169,15.913028,13.206444,1,0,26.180249,2.317144,4.634288,26.180249,1.246679,3.945707,3.854448,0.183545,2.091165,167.692428,7.985354,5.864069,12,12,33,21,0,0,5,12,0,16,1,4,0,0,0,0,0,0,0,348.0,310.0,425.0,405.0,393.0,...,0.008605,0.443369,11,6,0.833333,0.454545,259.890613,5.459432,22.0,4.634729,5.433722,6.274762,7.090910,7.933438,8.754792,9.597030,10.420822,11.262732,114.402937,3.806662,0.0,5.081404,0.0,6.541030,0.0,8.086103,0.0,9.674389,54.189588,282.077181,8.547793,1050,28,102.0,114.0,7.416667,4.722222,0.0,0.0,True


In [None]:
sweet_train

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,VE1_A,VE2_A,VE3_A,VR1_A,VR2_A,VR3_A,nAromAtom,nAromBond,nAtom,nHeavyAtom,nSpiro,nBridgehead,nHetero,nH,nB,nC,nN,nO,nS,nP,nF,nCl,nBr,nI,nX,ATS0dv,ATS1dv,ATS2dv,ATS3dv,ATS4dv,...,JGI10,JGT10,Diameter,Radius,TopoShapeIndex,PetitjeanIndex,Vabc,VAdjMat,MWC01,MWC02,MWC03,MWC04,MWC05,MWC06,MWC07,MWC08,MWC09,MWC10,TMWC10,SRW02,SRW03,SRW04,SRW05,SRW06,SRW07,SRW08,SRW09,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2,Name,Taste,Sweet
0,17.213262,16.059815,0,0,28.766735,2.522420,4.958230,28.766735,1.250728,4.052767,4.072546,0.177067,2.237177,157.468009,6.846435,5.892131,0,0,45,23,0,0,11,22,0,12,0,11,0,0,0,0,0,0,0,408.000000,311.000000,532.000000,760.000000,588.000000,...,0.000000,0.628961,10,5,1.0,0.5,288.08786020494125,5.584963,24.0,4.795791,5.686975,6.602588,7.513709,8.434464,9.350189,10.272738,11.190528,12.113908,122.960890,3.891820,0.0,5.262690,2.397895,6.849066,4.94876,8.525360,7.195187,10.247042,72.317821,342.116212,7.602582,1110,43,120.0,147.0,10.451389,5.291667,Sucrose,Sweet,True
1,17.213262,16.059815,0,0,28.766735,2.522420,4.958230,28.766735,1.250728,4.052767,4.072546,0.177067,2.237177,157.468009,6.846435,5.892131,0,0,42,23,0,0,11,19,0,12,0,8,0,0,0,3,0,0,3,334.814815,281.444444,477.111111,591.111111,423.333333,...,0.000000,0.628961,10,5,1.0,0.5,307.3503834401883,5.584963,24.0,4.795791,5.686975,6.602588,7.513709,8.434464,9.350189,10.272738,11.190528,12.113908,122.960890,3.891820,0.0,5.262690,2.397895,6.849066,4.94876,8.525360,7.195187,10.247042,72.317821,396.014551,9.428918,1110,43,120.0,147.0,10.451389,5.291667,"Sucralose / 4,1',6'-Trichloro-galactosucrose",Sweet,True
2,15.315311,13.970681,1,1,25.367667,2.287533,4.575067,25.367667,1.207984,3.915999,4.057263,0.193203,2.142446,121.014646,5.762602,5.537849,6,6,39,21,0,0,7,18,0,14,2,5,0,0,0,0,0,0,0,330.000000,271.000000,348.000000,335.000000,376.000000,...,0.007448,0.489651,11,6,0.833333,0.454545,277.56969540044366,5.392317,21.0,4.574711,5.351858,6.175867,6.978214,7.805882,8.618485,9.447229,10.264966,11.093995,112.311208,3.761200,0.0,5.017280,0.000000,6.447306,0.00000,7.951207,0.000000,9.493487,53.670480,294.121572,7.541579,1000,27,96.0,105.0,8.916667,4.888889,Aspartame/Aspartyl-phenylalanine methylester,Sweet,True
3,8.829880,8.715591,0,0,14.640475,2.418289,4.836579,14.640475,1.220040,3.392014,3.170872,0.264239,1.336328,42.564698,3.547058,3.933347,0,0,24,12,0,0,6,12,0,6,0,6,0,0,0,0,0,0,0,212.000000,155.000000,254.000000,346.000000,191.000000,...,0.000000,0.664926,6,3,1.0,0.5,152.71727305208674,4.584963,12.0,4.110874,4.962845,5.849325,6.722630,7.610358,8.487764,9.375092,10.253827,11.140687,92.513402,3.218876,0.0,4.574711,0.000000,6.163315,0.00000,7.845024,0.000000,9.569063,43.370989,180.063388,7.502641,178,20,60.0,71.0,6.145833,2.722222,Tagatose,Sweet,True
4,17.372180,15.638061,0,0,28.650077,2.478404,4.876167,28.650077,1.245656,4.052400,3.877075,0.168568,2.187990,200.516278,8.718099,6.133805,0,0,45,23,0,0,11,22,0,12,0,11,0,0,0,0,0,0,0,408.000000,309.000000,518.000000,711.000000,478.000000,...,0.005045,0.636724,11,6,0.833333,0.454545,288.08786020494125,5.584963,24.0,4.795791,5.673323,6.575076,7.467942,8.371242,9.267193,10.170725,11.068184,11.971710,122.361186,3.891820,0.0,5.262690,2.397895,6.836259,4.94876,8.493105,7.181592,10.192007,72.204129,342.116212,7.602582,1220,41,120.0,145.0,10.451389,5.166667,Isomaltulose/Palatinose,Sweet,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1975,9.928005,8.626445,0,0,16.890993,2.380585,4.761170,16.890993,1.299307,3.498356,3.361404,0.258570,1.474723,55.950594,4.303892,4.28683332334275,10,11,25,13,0,0,2,12,0,11,2,0,0,0,0,0,0,0,0,156.000000,177.000000,239.000000,212.000000,109.000000,...,0.000000,0.448109,7,4,0.75,0.42857142857142855,161.11060497141938,4.807354922057604,14.0,4.204693,5.043425,5.905362,6.765039,7.631917,8.496786,9.364520,10.230811,11.098637,95.741189,3.367296,0.0,4.653960,0.000000,6.171701,0.00000,7.793587,0.000000,9.464905,44.451449,172.100048,6.884002,232,19,66.0,77.0,4.194444444444445,2.972222,2-Benzoylpyridine,Bitter,False
1976,6.473351,6.191587,0,0,11.142106,2.200122,4.400244,11.142106,1.238012,3.089776,2.850317,0.316702,0.942070,28.079758,3.119973,3.2296884584603025,6,6,18,9,0,0,2,9,0,7,1,1,0,0,0,0,0,0,0,114.000000,104.000000,123.000000,100.000000,46.000000,...,0.000000,0.438078,5,3,0.6666666666666666,0.4,118.24950727298612,4.169925001442312,9.0,3.713572,4.465908,5.262690,6.033086,6.834109,7.608374,8.409831,9.185125,9.986495,79.499191,2.944439,0.0,4.143135,0.000000,5.556828,0.00000,7.062192,0.000000,8.606851,37.313444,123.068414,6.837134,88,9,40.0,43.0,3.4722222222222223,2.166667,2-(2-Ethoxyethyl)pyridine,Bitter,False
1977,6.473351,6.127583,0,0,11.189957,2.193993,4.387987,11.189957,1.243329,3.089765,2.856388,0.317376,0.944197,28.049653,3.116628,3.228615755024593,6,6,17,9,0,0,2,8,0,7,0,2,0,0,0,0,0,0,0,122.000000,104.000000,128.000000,106.000000,58.000000,...,0.000000,0.475391,6,3,1.0,0.5,116.04297362675679,4.169925001442312,9.0,3.713572,4.465908,5.252273,6.028279,6.816736,7.598399,8.386401,9.169831,9.957265,79.388664,2.944439,0.0,4.143135,0.000000,5.556828,0.00000,7.055313,0.000000,8.590258,37.289972,124.052429,7.297202,90,9,40.0,43.0,3.4722222222222223,2.166667,4-BENZYLPYRIDINE,Bitter,False
1978,10.635111,9.197041,0,0,18.814625,2.307250,4.614501,18.814625,1.343902,3.558646,3.487210,0.249086,1.585574,63.607620,4.543401,4.489205511255389,12,12,23,14,0,0,2,9,0,12,1,1,0,0,0,0,0,0,0,190.000000,188.000000,244.000000,236.000000,188.000000,...,0.000000,0.279611,8,4,1.0,0.5,170.9271386535638,4.906890595608519,15.0,4.234107,5.043425,5.866468,6.693324,7.524561,8.356790,9.191056,10.025307,10.860786,96.795823,3.433987,0.0,4.672829,0.000000,6.137727,0.00000,7.690286,0.000000,9.282754,45.217583,183.068414,7.959496,307,18,68.0,77.0,3.8333333333333335,3.222222,TRIDECANAL,Bitter,False


In [None]:
st = df

In [None]:
st

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,VE1_A,VE2_A,VE3_A,VR1_A,VR2_A,VR3_A,nAromAtom,nAromBond,nAtom,nHeavyAtom,nSpiro,nBridgehead,nHetero,nH,nB,nC,nN,nO,nS,nP,nF,nCl,nBr,nI,nX,ATS0dv,ATS1dv,ATS2dv,ATS3dv,ATS4dv,...,JGI10,JGT10,Diameter,Radius,TopoShapeIndex,PetitjeanIndex,Vabc,VAdjMat,MWC01,MWC02,MWC03,MWC04,MWC05,MWC06,MWC07,MWC08,MWC09,MWC10,TMWC10,SRW02,SRW03,SRW04,SRW05,SRW06,SRW07,SRW08,SRW09,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2,Name,Taste,Bitter
0,8.829880,8.715591,0,0,14.640475,2.418289,4.836579,14.640475,1.220040,3.392014,3.170872,0.264239,1.336328,42.564698,3.547058,3.933347,0,0,24,12,0,0,6,12,0,6,0,6,0,0,0,0,0,0,0,212.0,155.0,254.0,346.0,191.0,...,0.000000,0.664926,6,3,1.000000,0.500000,152.717273,4.584963,12.0,4.110874,4.962845,5.849325,6.722630,7.610358,8.487764,9.375092,10.253827,11.140687,92.513402,3.218876,0.0,4.574711,0.0,6.163315,0.0,7.845024,0.0,9.569063,43.370989,180.063388,7.502641,178,20,60.0,71.0,6.145833,2.722222,0.0,0.0,False
1,8.761080,8.651650,0,0,14.708146,2.377683,4.755367,14.708146,1.225679,3.384121,3.178505,0.264875,1.338733,43.594643,3.632887,3.957256,0,0,24,12,0,0,6,12,0,6,0,6,0,0,0,0,0,0,0,210.0,148.0,240.0,328.0,206.0,...,0.000000,0.549705,6,4,0.500000,0.333333,152.717273,4.584963,12.0,4.077537,4.919981,5.783825,6.645091,7.511525,8.375860,9.242711,10.107774,10.974643,91.638947,3.218876,0.0,4.532599,0.0,6.070738,0.0,7.709308,0.0,9.396405,42.927926,180.063388,7.502641,182,19,58.0,68.0,6.055556,2.777778,0.0,0.0,False
2,8.761080,8.651650,0,0,14.708146,2.377683,4.755367,14.708146,1.225679,3.384121,3.178505,0.264875,1.338733,43.594643,3.632887,3.957256,0,0,24,12,0,0,6,12,0,6,0,6,0,0,0,0,0,0,0,210.0,148.0,240.0,328.0,206.0,...,0.000000,0.549705,6,4,0.500000,0.333333,152.717273,4.584963,12.0,4.077537,4.919981,5.783825,6.645091,7.511525,8.375860,9.242711,10.107774,10.974643,91.638947,3.218876,0.0,4.532599,0.0,6.070738,0.0,7.709308,0.0,9.396405,42.927926,180.063388,7.502641,182,19,58.0,68.0,6.055556,2.777778,0.0,0.0,False
3,8.163363,8.029752,0,0,13.137460,2.364871,4.729742,13.137460,1.194315,3.303305,3.092493,0.281136,1.224288,35.727761,3.247978,3.671238,0,0,23,11,0,0,5,12,0,6,0,5,0,0,0,0,0,0,0,182.0,135.0,216.0,272.0,138.0,...,0.000000,0.560975,5,3,0.666667,0.400000,143.927046,4.459432,11.0,4.007333,4.844187,5.707110,6.561031,7.424762,8.281724,9.145375,10.003107,10.866605,88.841233,3.135494,0.0,4.465908,0.0,6.008813,0.0,7.647309,0.0,9.331052,41.588577,164.068473,7.133412,140,17,54.0,63.0,5.805556,2.444444,0.0,0.0,False
4,6.611250,7.282959,0,0,11.763639,2.188901,4.377802,11.763639,1.176364,3.134263,2.855225,0.285523,1.049151,30.472997,3.047300,3.416841,0,0,20,10,0,0,5,10,0,5,0,5,0,0,0,0,0,0,0,178.0,109.0,149.0,212.0,141.0,...,0.000000,0.545783,6,3,1.000000,0.500000,136.351061,4.169925,9.0,3.713572,4.465908,5.241747,6.018593,6.801283,7.582229,8.366603,9.148359,9.933095,80.271389,2.944439,0.0,4.143135,0.0,5.556828,0.0,7.048386,0.0,8.575273,38.268062,150.052823,7.502641,125,12,40.0,43.0,5.833333,2.555556,0.0,0.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166,19.981978,16.397806,0,0,33.251113,2.526538,5.053076,33.251113,1.278889,4.186542,4.120980,0.158499,2.371602,312.886434,12.034094,6.701352,12,12,44,26,0,0,8,18,0,18,0,8,0,0,0,0,0,0,0,450.0,417.0,627.0,806.0,708.0,...,0.009632,0.516379,13,7,0.857143,0.461538,312.881419,5.807355,28.0,4.934474,5.820083,6.727432,7.630947,8.542666,9.452737,10.367253,11.281107,12.197904,130.954603,4.043051,0.0,5.398163,0.0,6.970730,0.0,8.646993,0.0,10.378385,61.437322,362.100168,8.229549,1633,49,138.0,168.0,10.222222,5.833333,0.0,0.0,True
167,15.913028,13.077290,1,0,26.247606,2.316776,4.633552,26.247606,1.249886,3.945703,3.825724,0.182177,2.083685,176.194516,8.390215,5.913526,12,12,33,21,0,0,5,12,0,16,1,4,0,0,0,0,0,0,0,348.0,310.0,424.0,398.0,383.0,...,0.008617,0.447955,12,6,1.000000,0.500000,259.890613,5.459432,22.0,4.634729,5.433722,6.270988,7.089243,7.926964,8.751000,9.588571,10.414903,11.252664,114.362784,3.806662,0.0,5.081404,0.0,6.541030,0.0,8.083637,0.0,9.668714,54.181448,282.077181,8.547793,1064,28,102.0,114.0,7.416667,4.722222,0.0,0.0,True
168,17.884519,14.823609,1,0,30.336232,2.328747,4.657494,30.336232,1.264010,4.074059,4.563638,0.190152,2.393589,132.574159,5.523923,5.762611,12,12,40,24,0,0,6,16,0,18,1,5,0,0,0,0,0,0,0,404.0,357.0,493.0,502.0,451.0,...,0.008499,0.452254,13,7,0.857143,0.461538,303.272810,5.643856,25.0,4.762174,5.579730,6.428105,7.263330,8.113427,8.952735,9.802617,10.643208,11.492753,122.038079,3.931826,0.0,5.209486,0.0,6.685861,0.0,8.252967,0.0,9.866357,57.946497,326.103396,8.152585,1535,35,116.0,132.0,8.777778,5.583333,0.0,0.0,True
169,15.913028,13.206444,1,0,26.180249,2.317144,4.634288,26.180249,1.246679,3.945707,3.854448,0.183545,2.091165,167.692428,7.985354,5.864069,12,12,33,21,0,0,5,12,0,16,1,4,0,0,0,0,0,0,0,348.0,310.0,425.0,405.0,393.0,...,0.008605,0.443369,11,6,0.833333,0.454545,259.890613,5.459432,22.0,4.634729,5.433722,6.274762,7.090910,7.933438,8.754792,9.597030,10.420822,11.262732,114.402937,3.806662,0.0,5.081404,0.0,6.541030,0.0,8.086103,0.0,9.674389,54.189588,282.077181,8.547793,1050,28,102.0,114.0,7.416667,4.722222,0.0,0.0,True


In [None]:
st = st.drop(['Name','Taste','Bitter'], axis = 1)

In [None]:
st['Name'] = bitter_test['Name']
st['Taste'] = bitter_test['Taste']
st['Bitter'] = bitter_test['Bitter']


In [None]:
st

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,VE1_A,VE2_A,VE3_A,VR1_A,VR2_A,VR3_A,nAromAtom,nAromBond,nAtom,nHeavyAtom,nSpiro,nBridgehead,nHetero,nH,nB,nC,nN,nO,nS,nP,nF,nCl,nBr,nI,nX,ATS0dv,ATS1dv,ATS2dv,ATS3dv,ATS4dv,...,JGI10,JGT10,Diameter,Radius,TopoShapeIndex,PetitjeanIndex,Vabc,VAdjMat,MWC01,MWC02,MWC03,MWC04,MWC05,MWC06,MWC07,MWC08,MWC09,MWC10,TMWC10,SRW02,SRW03,SRW04,SRW05,SRW06,SRW07,SRW08,SRW09,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2,Name,Taste,Bitter
0,8.829880,8.715591,0,0,14.640475,2.418289,4.836579,14.640475,1.220040,3.392014,3.170872,0.264239,1.336328,42.564698,3.547058,3.933347,0,0,24,12,0,0,6,12,0,6,0,6,0,0,0,0,0,0,0,212.0,155.0,254.0,346.0,191.0,...,0.000000,0.664926,6,3,1.000000,0.500000,152.717273,4.584963,12.0,4.110874,4.962845,5.849325,6.722630,7.610358,8.487764,9.375092,10.253827,11.140687,92.513402,3.218876,0.0,4.574711,0.0,6.163315,0.0,7.845024,0.0,9.569063,43.370989,180.063388,7.502641,178,20,60.0,71.0,6.145833,2.722222,D-Fructose,Non-bitter,False
1,8.761080,8.651650,0,0,14.708146,2.377683,4.755367,14.708146,1.225679,3.384121,3.178505,0.264875,1.338733,43.594643,3.632887,3.957256,0,0,24,12,0,0,6,12,0,6,0,6,0,0,0,0,0,0,0,210.0,148.0,240.0,328.0,206.0,...,0.000000,0.549705,6,4,0.500000,0.333333,152.717273,4.584963,12.0,4.077537,4.919981,5.783825,6.645091,7.511525,8.375860,9.242711,10.107774,10.974643,91.638947,3.218876,0.0,4.532599,0.0,6.070738,0.0,7.709308,0.0,9.396405,42.927926,180.063388,7.502641,182,19,58.0,68.0,6.055556,2.777778,D-Glucose,Non-bitter,False
2,8.761080,8.651650,0,0,14.708146,2.377683,4.755367,14.708146,1.225679,3.384121,3.178505,0.264875,1.338733,43.594643,3.632887,3.957256,0,0,24,12,0,0,6,12,0,6,0,6,0,0,0,0,0,0,0,210.0,148.0,240.0,328.0,206.0,...,0.000000,0.549705,6,4,0.500000,0.333333,152.717273,4.584963,12.0,4.077537,4.919981,5.783825,6.645091,7.511525,8.375860,9.242711,10.107774,10.974643,91.638947,3.218876,0.0,4.532599,0.0,6.070738,0.0,7.709308,0.0,9.396405,42.927926,180.063388,7.502641,182,19,58.0,68.0,6.055556,2.777778,D-Mannose,Non-bitter,False
3,8.163363,8.029752,0,0,13.137460,2.364871,4.729742,13.137460,1.194315,3.303305,3.092493,0.281136,1.224288,35.727761,3.247978,3.671238,0,0,23,11,0,0,5,12,0,6,0,5,0,0,0,0,0,0,0,182.0,135.0,216.0,272.0,138.0,...,0.000000,0.560975,5,3,0.666667,0.400000,143.927046,4.459432,11.0,4.007333,4.844187,5.707110,6.561031,7.424762,8.281724,9.145375,10.003107,10.866605,88.841233,3.135494,0.0,4.465908,0.0,6.008813,0.0,7.647309,0.0,9.331052,41.588577,164.068473,7.133412,140,17,54.0,63.0,5.805556,2.444444,L-Rhamnose,Non-bitter,False
4,6.611250,7.282959,0,0,11.763639,2.188901,4.377802,11.763639,1.176364,3.134263,2.855225,0.285523,1.049151,30.472997,3.047300,3.416841,0,0,20,10,0,0,5,10,0,5,0,5,0,0,0,0,0,0,0,178.0,109.0,149.0,212.0,141.0,...,0.000000,0.545783,6,3,1.000000,0.500000,136.351061,4.169925,9.0,3.713572,4.465908,5.241747,6.018593,6.801283,7.582229,8.366603,9.148359,9.933095,80.271389,2.944439,0.0,4.143135,0.0,5.556828,0.0,7.048386,0.0,8.575273,38.268062,150.052823,7.502641,125,12,40.0,43.0,5.833333,2.555556,D-Ribulose,Non-bitter,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166,19.981978,16.397806,0,0,33.251113,2.526538,5.053076,33.251113,1.278889,4.186542,4.120980,0.158499,2.371602,312.886434,12.034094,6.701352,12,12,44,26,0,0,8,18,0,18,0,8,0,0,0,0,0,0,0,450.0,417.0,627.0,806.0,708.0,...,0.009632,0.516379,13,7,0.857143,0.461538,312.881419,5.807355,28.0,4.934474,5.820083,6.727432,7.630947,8.542666,9.452737,10.367253,11.281107,12.197904,130.954603,4.043051,0.0,5.398163,0.0,6.970730,0.0,8.646993,0.0,10.378385,61.437322,362.100168,8.229549,1633,49,138.0,168.0,10.222222,5.833333,,Bitter,True
167,15.913028,13.077290,1,0,26.247606,2.316776,4.633552,26.247606,1.249886,3.945703,3.825724,0.182177,2.083685,176.194516,8.390215,5.913526,12,12,33,21,0,0,5,12,0,16,1,4,0,0,0,0,0,0,0,348.0,310.0,424.0,398.0,383.0,...,0.008617,0.447955,12,6,1.000000,0.500000,259.890613,5.459432,22.0,4.634729,5.433722,6.270988,7.089243,7.926964,8.751000,9.588571,10.414903,11.252664,114.362784,3.806662,0.0,5.081404,0.0,6.541030,0.0,8.083637,0.0,9.668714,54.181448,282.077181,8.547793,1064,28,102.0,114.0,7.416667,4.722222,,Bitter,True
168,17.884519,14.823609,1,0,30.336232,2.328747,4.657494,30.336232,1.264010,4.074059,4.563638,0.190152,2.393589,132.574159,5.523923,5.762611,12,12,40,24,0,0,6,16,0,18,1,5,0,0,0,0,0,0,0,404.0,357.0,493.0,502.0,451.0,...,0.008499,0.452254,13,7,0.857143,0.461538,303.272810,5.643856,25.0,4.762174,5.579730,6.428105,7.263330,8.113427,8.952735,9.802617,10.643208,11.492753,122.038079,3.931826,0.0,5.209486,0.0,6.685861,0.0,8.252967,0.0,9.866357,57.946497,326.103396,8.152585,1535,35,116.0,132.0,8.777778,5.583333,,Bitter,True
169,15.913028,13.206444,1,0,26.180249,2.317144,4.634288,26.180249,1.246679,3.945707,3.854448,0.183545,2.091165,167.692428,7.985354,5.864069,12,12,33,21,0,0,5,12,0,16,1,4,0,0,0,0,0,0,0,348.0,310.0,425.0,405.0,393.0,...,0.008605,0.443369,11,6,0.833333,0.454545,259.890613,5.459432,22.0,4.634729,5.433722,6.274762,7.090910,7.933438,8.754792,9.597030,10.420822,11.262732,114.402937,3.806662,0.0,5.081404,0.0,6.541030,0.0,8.086103,0.0,9.674389,54.189588,282.077181,8.547793,1050,28,102.0,114.0,7.416667,4.722222,,Bitter,True


In [None]:
st.iloc[:, 1615].isna().sum()

0

In [None]:
st.to_csv('bitter_test_v5.csv', index = False)

In [None]:
listc = [15,53,54,55,56,57,58,59,60,61,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,135,136,137,138,144,145,146,147,152,153,154,155,156,162,163,164,165,171,172,173,174,180,181,182,183,188,189,190,191,192,197,198,199,200,201,206,207,208,209,210,216,217,218,219,225,226,227,228,233,234,235,236,237,238,239,240,241,260,261,262,263,264,265,266,267,268,296,297,298,299,300,301,302,303,304,305,306,307,308,309,310,311,312,313,314,315,316,317,318,319,320,321,322,341,342,343,344,345,351,352,353,354,360,361,362,363,368,369,370,371,372,378,379,380,381,387,388,389,390,396,397,398,399,404,405,406,407,408,413,414,415,416,417,422,423,424,425,426,432,433,434,435,441,442,443,444,449,450,451,452,457,458,459,460,465,466,467,468,473,474,475,476,481,482,483,484,489,490,491,492,497,498,499,500,505,506,507,508,513,514,515,516,521,522,523,524,529,530,531,532,537,538,539,540,545,546,547,548,553,554,555,556,561,562,563,564,569,570,571,572,577,578,579,580,585,586,587,588,593,594,595,596,601,602,603,604,609,610,611,612,617,618,619,620,625,626,627,628,633,634,635,636,641,642,647,648,655,656,657,658,659,660,678,691,704,705,706,707,708,709,710,711,712,713,714,715,716,717,718,719,720,721,722,723,724,725,726,727,728,729,730,731,732,733,734,735,736,737,738,739,740,741,742,743,756,769,780,781,791,817,825,826,827,828,833,841,842,843,844,852,853,854,860,861,862,890,1208,1209,1210,1211,1212,1213,1214,1215,1216,1221,1222,1223,1224,1225,1226,1227,1228,1229,1230,1231,1232,1233,1234,1235,1236,1237,1238,1239,1240,1241,1242,1243,1244,1245,1246,1247,1248,1249,1250,1251,1252,1269,1270,1271,1272,1273,1274,1275,1276,1277,1278,1279,1280,1299,1300,1301,1552,1580,1581,1582,1583,1611]

In [None]:
len(listc)

415

In [None]:
sweet_train = pd.read_csv("/content/drive/MyDrive/Capstone/DataSets/sweet-train.tsv", sep='\t',encoding='utf-8', header=0)

In [None]:
sweet_train['Sweet'].unique()

array([ True, False])



---



---



---



---



---



### **ChemTaste DB**

---



### Features Generation

In [None]:
import pandas as pd
import numpy as np

In [None]:
dfs = pd.read_excel("/content/ChemTastesDB_database.xlsx")

In [None]:
dfs

Unnamed: 0,ID,Name,PubChem CID,CAS number,canonical SMILES,Taste,Class taste,Reference_(cod)/[pp]
0,0001,(-)-Haematoxylin,320930,517-28-2,Oc1cc2c(cc1O)C1c3ccc(c(c3OCC1(O)C2)O)O,Sweet,Sweetness,Arnoldi1995_((-)-1); Bassoli2001_(39)
1,0002,(+)-4β-hydroxyhernandulcin,126862,145385-64-4,CC(C)=CCCC(C)(O)C1CC(O)C(=CC1=O)C,Sweet,Sweetness,Kinghorn1998_(2); Kinghorn2002_(7); Kinghorn20...
2,0003,(+)-Dihydroquercetin 3-acetate,442540,78834-97-6,CC(=O)OC1C(Oc2cc(cc(c2C1=O)O)O)c1ccc(c(c1)O)O,Sweet,Sweetness,Bouysset2020_(175); Kinghorn2002_(26); Shallen...
3,0004,(+)-Haematoxylin,442514,517-28-2,Oc1cc2c(cc1O)C1c3ccc(c(c3OCC1(O)C2)O)O,Sweet,Sweetness,Arnoldi1995_((+)-1); Arnoldi1996_(12); Bassoli...
4,0005,(±)-chiro-inositol,892,643-12-9,OC1C(O)C(O)C(O)C(O)C1O,Sweet,Sweetness,Shallenberger1993_[149]
...,...,...,...,...,...,...,...,...
2942,2943,WS23,3016599,51115-71-0,CCNC(=O)C(C)(C(C)C)C(C)C,Cooling,Miscellaneous,Ley2012_[28]
2943,2944,WS3,62907,39711-79-0,CCNC(=O)C1CC(C)CCC1C(C)C,Cooling,Miscellaneous,Ley2012_[28]
2944,,,,,,,,
2945,,"^ Chemical compound information (name, PubChem...",,,,,,


In [None]:
dfs = dfs.iloc[0:2944 , :]

In [None]:
dfs

Unnamed: 0,ID,Name,PubChem CID,CAS number,canonical SMILES,Taste,Class taste,Reference_(cod)/[pp]
0,0001,(-)-Haematoxylin,320930,517-28-2,Oc1cc2c(cc1O)C1c3ccc(c(c3OCC1(O)C2)O)O,Sweet,Sweetness,Arnoldi1995_((-)-1); Bassoli2001_(39)
1,0002,(+)-4β-hydroxyhernandulcin,126862,145385-64-4,CC(C)=CCCC(C)(O)C1CC(O)C(=CC1=O)C,Sweet,Sweetness,Kinghorn1998_(2); Kinghorn2002_(7); Kinghorn20...
2,0003,(+)-Dihydroquercetin 3-acetate,442540,78834-97-6,CC(=O)OC1C(Oc2cc(cc(c2C1=O)O)O)c1ccc(c(c1)O)O,Sweet,Sweetness,Bouysset2020_(175); Kinghorn2002_(26); Shallen...
3,0004,(+)-Haematoxylin,442514,517-28-2,Oc1cc2c(cc1O)C1c3ccc(c(c3OCC1(O)C2)O)O,Sweet,Sweetness,Arnoldi1995_((+)-1); Arnoldi1996_(12); Bassoli...
4,0005,(±)-chiro-inositol,892,643-12-9,OC1C(O)C(O)C(O)C(O)C1O,Sweet,Sweetness,Shallenberger1993_[149]
...,...,...,...,...,...,...,...,...
2939,2940,Spilanthol,5353001,25394-57-4,CC=CC=CCCC=CC(=O)NCC(C)C,"Heating, pungent, and tingling",Miscellaneous,Ley2012_[26]
2940,2941,Tannic acid,16129778,1401-55-4,Oc1cc(cc(c1O)O)C(=O)Oc1cc(cc(c1O)O)C(=O)OCC1OC...,Astringent,Miscellaneous,Ley2012_[29]
2941,2942,trans-Pellitorine,5318516,18836-52-7,CCCCCC=CC=CC(=O)NCC(C)C,"Heating, pungent, and tingling",Miscellaneous,Ley2012_[26]
2942,2943,WS23,3016599,51115-71-0,CCNC(=O)C(C)(C(C)C)C(C)C,Cooling,Miscellaneous,Ley2012_[28]


In [None]:
!conda --version

/bin/bash: conda: command not found


In [None]:
!pip install -q condacolab
import condacolab
condacolab.install()

✨🍰✨ Everything looks OK!


In [None]:
!conda --version

conda 4.9.2


In [None]:
!conda install -c rdkit -c mordred-descriptor mordred

Collecting package metadata (current_repodata.json): - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / done
Solving environment: \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | done

## Package Plan ##

  environment location: /usr/local

  added / updated specs:
    - mordred


The following packages will be downloaded:

    package                    |            build
    ---------------------------|------------

In [None]:
from rdkit import Chem
from mordred import Calculator, descriptors
calcAll = Calculator(descriptors)
calc2d = Calculator(descriptors, ignore_3D=True)

In [None]:
len(calcAll.descriptors)

1826

In [None]:
len(calc2d.descriptors)

1613

In [None]:
mols = [Chem.MolFromSmiles(smi) for smi in dfs['canonical SMILES']]

In [None]:
type(mols)

list

In [None]:
mols

[<rdkit.Chem.rdchem.Mol at 0x7f3877658a30>,
 <rdkit.Chem.rdchem.Mol at 0x7f387768ef80>,
 <rdkit.Chem.rdchem.Mol at 0x7f387768e710>,
 <rdkit.Chem.rdchem.Mol at 0x7f387768eb20>,
 <rdkit.Chem.rdchem.Mol at 0x7f387768eb70>,
 <rdkit.Chem.rdchem.Mol at 0x7f387768e800>,
 <rdkit.Chem.rdchem.Mol at 0x7f387768edf0>,
 <rdkit.Chem.rdchem.Mol at 0x7f387768ee40>,
 <rdkit.Chem.rdchem.Mol at 0x7f387768ea30>,
 <rdkit.Chem.rdchem.Mol at 0x7f387768e4e0>,
 <rdkit.Chem.rdchem.Mol at 0x7f387768e5d0>,
 <rdkit.Chem.rdchem.Mol at 0x7f387768e9e0>,
 <rdkit.Chem.rdchem.Mol at 0x7f387768ed50>,
 <rdkit.Chem.rdchem.Mol at 0x7f387768ee90>,
 <rdkit.Chem.rdchem.Mol at 0x7f387768ebc0>,
 <rdkit.Chem.rdchem.Mol at 0x7f387768e8f0>,
 <rdkit.Chem.rdchem.Mol at 0x7f387768e620>,
 <rdkit.Chem.rdchem.Mol at 0x7f387768e490>,
 <rdkit.Chem.rdchem.Mol at 0x7f387768e3f0>,
 <rdkit.Chem.rdchem.Mol at 0x7f387768e350>,
 <rdkit.Chem.rdchem.Mol at 0x7f387768e2b0>,
 <rdkit.Chem.rdchem.Mol at 0x7f387768e210>,
 <rdkit.Chem.rdchem.Mol at 0x7f3

In [None]:
noneIndices = []
withoutNoneIndices = []
molesWithoutNone = []
for i in range(len(mols)):
  if mols[i]==None:
    noneIndices.append(i)
  else:
    molesWithoutNone.append(mols[i])
    withoutNoneIndices.append(i)

In [None]:
noneIndices

[]

In [None]:
df_chemTaste = calc2d.pandas(molesWithoutNone)

 22%|██▏       | 654/2944 [04:26<1:09:33,  1.82s/it]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 24%|██▍       | 702/2944 [04:45<43:48,  1.17s/it]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 2944/2944 [22:30<00:00,  2.18it/s]


In [None]:
df_chemTaste

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,17.929457,14.412750,0,0,28.336926,2.612833,5.116646,28.336926,1.288042,4.069199,...,10.469965,72.021791,302.079038,8.391084,912,44,130.0,163.0,7.8125,4.527778
1,13.450219,12.310626,0,0,20.608616,2.416443,4.832887,20.608616,1.144923,3.779851,...,9.736665,50.837448,252.172545,6.004108,633,26,88.0,99.0,8.868056,3.930556
2,19.574531,16.089858,0,0,30.966408,2.511665,5.023331,30.966408,1.238656,4.150646,...,10.304443,60.198976,346.068867,8.873561,1370,44,134.0,160.0,9.972222,5.361111
3,17.929457,14.412750,0,0,28.336926,2.612833,5.116646,28.336926,1.288042,4.069199,...,10.469965,72.021791,302.079038,8.391084,912,44,130.0,163.0,7.8125,4.527778
4,8.898979,8.883053,0,0,14.601126,2.414214,4.828427,14.601126,1.216761,3.391683,...,9.542876,43.309911,180.063388,7.502641,174,21,60.0,72.0,6.666667,2.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2939,10.934771,9.752499,0,0,18.859991,2.07098,4.141959,18.859991,1.178749,3.594751,...,8.451267,45.350312,221.177964,5.671230,622,14,62.0,61.0,6.722222,4.000000
2940,95.791774,67.490769,0,0,153.178425,2.499577,4.999154,153.178425,1.255561,5.725,...,11.830906,165.035792,1700.172974,9.771109,97399,228,658.0,787.0,49.111111,26.027778
2941,10.934771,9.752499,0,0,18.859991,2.07098,4.141959,18.859991,1.178749,3.594751,...,8.451267,45.350312,223.193614,5.443747,622,14,62.0,61.0,6.722222,4.000000
2942,9.006320,9.750091,0,0,14.248626,2.393403,4.786805,14.248626,1.096048,3.417811,...,9.452109,44.032654,185.177964,5.143832,236,21,58.0,67.0,7.895833,3.083333


In [None]:
temp_df = dfs.iloc[withoutNoneIndices]

In [None]:
temp_df

Unnamed: 0,ID,Name,PubChem CID,CAS number,canonical SMILES,Taste,Class taste,Reference_(cod)/[pp]
0,0001,(-)-Haematoxylin,320930,517-28-2,Oc1cc2c(cc1O)C1c3ccc(c(c3OCC1(O)C2)O)O,Sweet,Sweetness,Arnoldi1995_((-)-1); Bassoli2001_(39)
1,0002,(+)-4β-hydroxyhernandulcin,126862,145385-64-4,CC(C)=CCCC(C)(O)C1CC(O)C(=CC1=O)C,Sweet,Sweetness,Kinghorn1998_(2); Kinghorn2002_(7); Kinghorn20...
2,0003,(+)-Dihydroquercetin 3-acetate,442540,78834-97-6,CC(=O)OC1C(Oc2cc(cc(c2C1=O)O)O)c1ccc(c(c1)O)O,Sweet,Sweetness,Bouysset2020_(175); Kinghorn2002_(26); Shallen...
3,0004,(+)-Haematoxylin,442514,517-28-2,Oc1cc2c(cc1O)C1c3ccc(c(c3OCC1(O)C2)O)O,Sweet,Sweetness,Arnoldi1995_((+)-1); Arnoldi1996_(12); Bassoli...
4,0005,(±)-chiro-inositol,892,643-12-9,OC1C(O)C(O)C(O)C(O)C1O,Sweet,Sweetness,Shallenberger1993_[149]
...,...,...,...,...,...,...,...,...
2939,2940,Spilanthol,5353001,25394-57-4,CC=CC=CCCC=CC(=O)NCC(C)C,"Heating, pungent, and tingling",Miscellaneous,Ley2012_[26]
2940,2941,Tannic acid,16129778,1401-55-4,Oc1cc(cc(c1O)O)C(=O)Oc1cc(cc(c1O)O)C(=O)OCC1OC...,Astringent,Miscellaneous,Ley2012_[29]
2941,2942,trans-Pellitorine,5318516,18836-52-7,CCCCCC=CC=CC(=O)NCC(C)C,"Heating, pungent, and tingling",Miscellaneous,Ley2012_[26]
2942,2943,WS23,3016599,51115-71-0,CCNC(=O)C(C)(C(C)C)C(C)C,Cooling,Miscellaneous,Ley2012_[28]


In [None]:
df_chemTaste['Name'] = temp_df['Name']
df_chemTaste['Taste'] = temp_df['Taste']
df_chemTaste['Class taste'] = temp_df['Class taste']

In [None]:
df_chemTaste

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2,Name,Taste,Class taste
0,17.929457,14.412750,0,0,28.336926,2.612833,5.116646,28.336926,1.288042,4.069199,...,8.391084,912,44,130.0,163.0,7.8125,4.527778,(-)-Haematoxylin,Sweet,Sweetness
1,13.450219,12.310626,0,0,20.608616,2.416443,4.832887,20.608616,1.144923,3.779851,...,6.004108,633,26,88.0,99.0,8.868056,3.930556,(+)-4β-hydroxyhernandulcin,Sweet,Sweetness
2,19.574531,16.089858,0,0,30.966408,2.511665,5.023331,30.966408,1.238656,4.150646,...,8.873561,1370,44,134.0,160.0,9.972222,5.361111,(+)-Dihydroquercetin 3-acetate,Sweet,Sweetness
3,17.929457,14.412750,0,0,28.336926,2.612833,5.116646,28.336926,1.288042,4.069199,...,8.391084,912,44,130.0,163.0,7.8125,4.527778,(+)-Haematoxylin,Sweet,Sweetness
4,8.898979,8.883053,0,0,14.601126,2.414214,4.828427,14.601126,1.216761,3.391683,...,7.502641,174,21,60.0,72.0,6.666667,2.666667,(±)-chiro-inositol,Sweet,Sweetness
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2939,10.934771,9.752499,0,0,18.859991,2.07098,4.141959,18.859991,1.178749,3.594751,...,5.671230,622,14,62.0,61.0,6.722222,4.000000,Spilanthol,"Heating, pungent, and tingling",Miscellaneous
2940,95.791774,67.490769,0,0,153.178425,2.499577,4.999154,153.178425,1.255561,5.725,...,9.771109,97399,228,658.0,787.0,49.111111,26.027778,Tannic acid,Astringent,Miscellaneous
2941,10.934771,9.752499,0,0,18.859991,2.07098,4.141959,18.859991,1.178749,3.594751,...,5.443747,622,14,62.0,61.0,6.722222,4.000000,trans-Pellitorine,"Heating, pungent, and tingling",Miscellaneous
2942,9.006320,9.750091,0,0,14.248626,2.393403,4.786805,14.248626,1.096048,3.417811,...,5.143832,236,21,58.0,67.0,7.895833,3.083333,WS23,Cooling,Miscellaneous


In [None]:
df_chemTaste.to_csv('chemTasteDB.csv',index=False)

### Data Cleaning

In [None]:
chemDB = pd.read_csv("/content/drive/MyDrive/Capstone/chemTasteDB/Padel/ChemTasteDBFeatures.csv", encoding='unicode_escape')

In [None]:
chemDB.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2927 entries, 0 to 2926
Columns: 1448 entries, ID to Zagreb
dtypes: float64(1045), int64(399), object(4)
memory usage: 32.3+ MB


In [None]:
chemDB['Class taste'].unique()

array(['Sweetness', 'Bitterness', 'Umaminess', 'Sourness', 'Saltiness',
       'Multitaste', 'Tastelessness', 'Non-sweetness', 'Miscellaneous'],
      dtype=object)

In [None]:
chemDB.iloc[:, 1614].isna().sum()

0

In [None]:
chemDB = chemDB[chemDB['Class Taste'].notna()]


In [None]:
df = chemDB.apply(pd.to_numeric, errors='coerce')

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2927 entries, 0 to 2926
Columns: 1448 entries, ID to Zagreb
dtypes: float64(1049), int64(399)
memory usage: 32.4 MB


In [None]:
df = df.fillna(0)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2927 entries, 0 to 2926
Columns: 1448 entries, ID to Zagreb
dtypes: float64(1049), int64(399)
memory usage: 32.4 MB


In [None]:
df.iloc[:, 1615].unique()

array([0.])

In [None]:
df

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2,Name,Taste,Class taste
0,17.929457,14.412750,0,0,28.336926,2.612833,5.116646,28.336926,1.288042,4.069199,...,8.391084,912,44,130.0,163.0,7.812500,4.527778,0.0,0.0,0.0
1,13.450219,12.310626,0,0,20.608616,2.416443,4.832887,20.608616,1.144923,3.779851,...,6.004108,633,26,88.0,99.0,8.868056,3.930556,0.0,0.0,0.0
2,19.574531,16.089858,0,0,30.966408,2.511665,5.023331,30.966408,1.238656,4.150646,...,8.873561,1370,44,134.0,160.0,9.972222,5.361111,0.0,0.0,0.0
3,17.929457,14.412750,0,0,28.336926,2.612833,5.116646,28.336926,1.288042,4.069199,...,8.391084,912,44,130.0,163.0,7.812500,4.527778,0.0,0.0,0.0
4,8.898979,8.883053,0,0,14.601126,2.414214,4.828427,14.601126,1.216761,3.391683,...,7.502641,174,21,60.0,72.0,6.666667,2.666667,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2939,10.934771,9.752499,0,0,18.859991,2.070980,4.141959,18.859991,1.178749,3.594751,...,5.671230,622,14,62.0,61.0,6.722222,4.000000,0.0,0.0,0.0
2940,95.791774,67.490769,0,0,153.178425,2.499577,4.999154,153.178425,1.255561,5.725000,...,9.771109,97399,228,658.0,787.0,49.111111,26.027778,0.0,0.0,0.0
2941,10.934771,9.752499,0,0,18.859991,2.070980,4.141959,18.859991,1.178749,3.594751,...,5.443747,622,14,62.0,61.0,6.722222,4.000000,0.0,0.0,0.0
2942,9.006320,9.750091,0,0,14.248626,2.393403,4.786805,14.248626,1.096048,3.417811,...,5.143832,236,21,58.0,67.0,7.895833,3.083333,0.0,0.0,0.0


In [None]:
chemDB

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2,Name,Taste,Class taste
0,17.929457,14.412750,0,0,28.336926034610812,2.6128329480551598,5.116646258260108,28.336926034610812,1.2880420924823097,4.069198584297368,...,8.391084,912,44,130.0,163.0,7.8125,4.527778,(-)-Haematoxylin,Sweet,Sweetness
1,13.450219,12.310626,0,0,20.6086162993548,2.4164433869911117,4.832886773982225,20.6086162993548,1.1449231277419334,3.7798512729025218,...,6.004108,633,26,88.0,99.0,8.868055555555555,3.930556,(+)-4β-hydroxyhernandulcin,Sweet,Sweetness
2,19.574531,16.089858,0,0,30.966407756672158,2.5116652595529247,5.023330519105851,30.966407756672158,1.2386563102668864,4.150646434875785,...,8.873561,1370,44,134.0,160.0,9.972222222222221,5.361111,(+)-Dihydroquercetin 3-acetate,Sweet,Sweetness
3,17.929457,14.412750,0,0,28.336926034610812,2.6128329480551598,5.116646258260108,28.336926034610812,1.2880420924823097,4.069198584297368,...,8.391084,912,44,130.0,163.0,7.8125,4.527778,(+)-Haematoxylin,Sweet,Sweetness
4,8.898979,8.883053,0,0,14.601126159491539,2.414213562373095,4.828427124746189,14.601126159491539,1.2167605132909616,3.39168290006577,...,7.502641,174,21,60.0,72.0,6.666666666666666,2.666667,(±)-chiro-inositol,Sweet,Sweetness
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2939,10.934771,9.752499,0,0,18.85999056309914,2.0709795949105736,4.141959189821147,18.85999056309914,1.1787494101936962,3.5947510767936492,...,5.671230,622,14,62.0,61.0,6.722222222222222,4.000000,Spilanthol,"Heating, pungent, and tingling",Miscellaneous
2940,95.791774,67.490769,0,0,153.1784253931834,2.4995768908221407,4.999153781644281,153.17842539318343,1.2555608638785527,5.724999953113247,...,9.771109,97399,228,658.0,787.0,49.11111111111111,26.027778,Tannic acid,Astringent,Miscellaneous
2941,10.934771,9.752499,0,0,18.85999056309914,2.0709795949105736,4.141959189821147,18.85999056309914,1.1787494101936962,3.5947510767936492,...,5.443747,622,14,62.0,61.0,6.722222222222222,4.000000,trans-Pellitorine,"Heating, pungent, and tingling",Miscellaneous
2942,9.006320,9.750091,0,0,14.248626303340256,2.393402504202843,4.786805008405684,14.248626303340256,1.0960481771800197,3.4178110205031547,...,5.143832,236,21,58.0,67.0,7.895833333333333,3.083333,WS23,Cooling,Miscellaneous


In [None]:
temp2 = df

In [None]:
temp2

Unnamed: 0,ID,Name,Taste,Class Taste,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,1.0,0.0,0.0,0.0,0,-2.4092,5.804245,29.9362,42.307102,12,...,8.391084,45.461903,2.066450,15.901910,15.901910,0.000000,912,44,0.503,130
1,2.0,0.0,0.0,0.0,0,1.9056,3.631311,73.5138,44.809032,0,...,6.004108,34.664420,1.925801,7.514035,7.514035,0.000000,633,26,1.865,88
2,3.0,0.0,0.0,0.0,0,-2.5478,6.491285,35.6603,45.671102,12,...,8.873561,50.408763,2.016351,21.332061,21.332061,0.000000,1370,44,0.339,134
3,4.0,0.0,0.0,0.0,0,-2.4092,5.804245,29.9362,42.307102,12,...,8.391084,45.461903,2.066450,15.901910,15.901910,0.000000,912,44,0.503,130
4,5.0,0.0,0.0,0.0,0,-3.0642,9.389322,35.7750,23.373516,0,...,7.502641,22.897487,1.908124,14.905717,14.905717,0.000000,174,21,-1.458,60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2922,0.0,0.0,0.0,0.0,0,-0.2512,0.063101,27.5342,31.969965,11,...,15.177071,33.856692,1.991570,25.069339,7.098079,9.104502,549,20,1.567,88
2923,2940.0,0.0,0.0,0.0,0,2.1971,4.827248,71.9383,41.878239,0,...,5.671230,30.482814,1.905176,5.422166,2.505437,2.916729,622,14,4.058,62
2924,2942.0,0.0,0.0,0.0,0,0.2215,0.049062,66.0689,43.211825,0,...,5.443747,30.482814,1.905176,5.422166,2.505437,2.916729,622,14,4.469,62
2925,2943.0,0.0,0.0,0.0,0,1.1359,1.290269,54.0902,36.598239,0,...,5.143832,23.909473,1.839190,5.292463,2.471573,2.820889,236,21,3.034,58


In [None]:
temp2 = temp2.drop(['Name','Taste','Class Taste'], axis = 1)

In [None]:
temp2['Name'] = chemDB['Name']
temp2['Taste'] = chemDB['Taste']
temp2['Class Taste'] = chemDB['Class Taste']


In [None]:
temp2

Unnamed: 0,ID,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,...,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb,Name,Taste,Class Taste
0,1.0,0,-2.4092,5.804245,29.9362,42.307102,12,12,36,22,...,15.901910,15.901910,0.000000,912,44,0.503,130,(-)-Haematoxylin,Sweet,Sweetness
1,2.0,0,1.9056,3.631311,73.5138,44.809032,0,0,42,18,...,7.514035,7.514035,0.000000,633,26,1.865,88,(+)-4?-hydroxyhernandulcin,Sweet,Sweetness
2,3.0,0,-2.5478,6.491285,35.6603,45.671102,12,12,39,25,...,21.332061,21.332061,0.000000,1370,44,0.339,134,(+)-Dihydroquercetin 3-acetate,Sweet,Sweetness
3,4.0,0,-2.4092,5.804245,29.9362,42.307102,12,12,36,22,...,15.901910,15.901910,0.000000,912,44,0.503,130,(+)-Haematoxylin,Sweet,Sweetness
4,5.0,0,-3.0642,9.389322,35.7750,23.373516,0,0,24,12,...,14.905717,14.905717,0.000000,174,21,-1.458,60,(±)-chiro-inositol,Sweet,Sweetness
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2922,0.0,0,-0.2512,0.063101,27.5342,31.969965,11,12,22,17,...,25.069339,7.098079,9.104502,549,20,1.567,88,"Sodium N-[5-(3-bromophenyl)-1,3,4-thiadiazol-2...",Non-sweet/Sweet,Miscellaneous
2923,2940.0,0,2.1971,4.827248,71.9383,41.878239,0,0,39,16,...,5.422166,2.505437,2.916729,622,14,4.058,62,Spilanthol,"Heating, pungent, and tingling",Miscellaneous
2924,2942.0,0,0.2215,0.049062,66.0689,43.211825,0,0,41,16,...,5.422166,2.505437,2.916729,622,14,4.469,62,trans-Pellitorine,"Heating, pungent, and tingling",Miscellaneous
2925,2943.0,0,1.1359,1.290269,54.0902,36.598239,0,0,36,13,...,5.292463,2.471573,2.820889,236,21,3.034,58,WS23,Cooling,Miscellaneous


In [None]:
temp2.iloc[:, 1615].isna().sum()

0

In [None]:
temp2.to_csv('chemTasteDBCleaned.csv', index = False)

In [None]:
listc = [135,136,137,138,139,144,145,146,147,148,162,163,164,165,166,171,172,173,174,175,180,181,182,183,184,189,190,191,192,193,198,199,200,201,202,207,208,209,210,211,216,217,218,219,220,225,226,227,228,229,233,234,235,236,237,238,239,240,241,341,342,343,344,345,346,351,352,353,354,355,360,361,362,363,364,378,379,380,381,382,387,388,389,390,391,396,397,398,399,400,405,406,407,408,409,414,415,416,417,418,423,424,425,426,427,432,433,434,435,436,441,442,443,444,445,449,450,451,452,453,457,458,459,460,461,465,466,467,468,469,481,482,483,484,485,489,490,491,492,493,497,498,499,500,501,505,506,507,508,509,513,514,515,516,517,521,522,523,524,525,529,530,531,532,533,537,538,539,540,541,545,546,547,548,549,553,554,555,556,557,561,562,563,564,565,577,578,579,580,581,585,586,587,588,589,593,594,595,596,597,601,602,603,604,605,609,610,611,612,613,617,618,619,620,621,625,626,627,628,629,633,634,635,636,637,780,781,791,826,827,828,842,843,844,1275,1276,1277,1278,1279,1280,1299,1300,1301,1552,1583]

In [None]:
len(listc)

245

In [None]:
sweet_train = pd.read_csv("/content/drive/MyDrive/Capstone/DataSets/sweet-train.tsv", sep='\t',encoding='utf-8', header=0)

In [None]:
sweet_train['Sweet'].unique()

array([ True, False])

In [None]:
def findPredictions(x_Train, x_Test, y_Train, y_Test):

  #Preparing data for PCA
  pca2 = PCA(n_components=2)
  x_pca2_data_train = pca2.fit_transform(x_Train)
  x_pca2_data_test = pca2.fit_transform(x_Test)

  pca10 = PCA(n_components=10)
  x_pca10_data_train = pca10.fit_transform(x_Train)
  x_pca10_data_test = pca10.fit_transform(x_Test)

  
  
  #Preparing data for Correlation matrix on features
  train_data = pd.concat([x_Train,y_Train],axis=1)
  test_data = pd.concat([x_Test,y_Test],axis=1)
  train_corr , test_corr = correlation_check(train_data, test_data,0.7)

  
  x_Train_corr = train_corr.drop(['Class taste'],axis=1)
  x_Test_corr = test_corr.drop(['Class taste'],axis=1)

  
  
  #Preparing data for Correlation matrix on features and Labels

  a = x_Train_corr.corr(method ='pearson').abs()
  df_temp = a.head(1)
  df_temp.fillna(0,inplace = True)

  colList = []
  for i in df_temp.columns:
    if(df_temp[i].item() < 0.1):
      colList.append(i)

  for i in colList:
    new_x_train_corr = x_Train_corr.drop(i,axis = 1)

  for i in colList:
    new_x_test_corr = x_Test_corr.drop(i,axis = 1)


  

  #Preparing data for selectkBest
  from sklearn.feature_selection import SelectKBest, f_classif
  
  X_new = SelectKBest(f_classif, k=200).fit(x_Train, y_Train)
  cols = X_new.get_support(indices=True)
  features_df_new_train = x_Train.iloc[:,cols]
  features_df_new_test = x_Test.iloc[:,cols]

  



  #Applying Models : 
  #---------------------

  #1. Logistic Regression :
  from sklearn.linear_model import LogisticRegression

  clfLR1 = LogisticRegression(random_state=0)
  
  print("Logistic Regression : ")
  print("------------------------")
  print()

  
  clfLR_Basic = clfLR1.fit(x_Train, y_Train)
  y_pred_Basic = clfLR_Basic.predict(x_Test)
  print("Basic ==>                          Precision: ",precision_score(y_Test, y_pred_Basic,average='weighted')," Recall: ",recall_score(y_Test, y_pred_Basic,average='weighted'),"  F1: ",f1_score(y_Test, y_pred_Basic, average='weighted')," Accuracy: ",accuracy_score(y_Test, y_pred_Basic))
  print()

  clfLR2 = LogisticRegression(random_state=0)

  clfLRpca2 = clfLR2.fit(x_pca2_data_train, y_Train)
  y_predPca2 = clfLRpca2.predict(x_pca2_data_test)
  print("PCA with n = 2 ==>                 Precision: ",precision_score(y_Test, y_predPca2, average='weighted')," Recall: ",recall_score(y_Test, y_predPca2, average='weighted'),"  F1: ",f1_score(y_Test, y_predPca2, average='weighted')," Accuracy: ",accuracy_score(y_Test, y_predPca2))
  print()

  clfLR3 = LogisticRegression(random_state=0)

  clfLRpca10 = clfLR3.fit(x_pca10_data_train, y_Train)
  y_predPca10 = clfLRpca10.predict(x_pca10_data_test)
  print("PCA with n = 10 ==>                Precision: ",precision_score(y_Test, y_predPca10, average='weighted')," Recall: ",recall_score(y_Test, y_predPca10, average='weighted'),"  F1: ",f1_score(y_Test, y_predPca10, average='weighted')," Accuracy: ",accuracy_score(y_Test, y_predPca10))
  print()

  clfLR4 = LogisticRegression(random_state=0)

  clfLR_CMF = clfLR4.fit(x_Train_corr, y_Train)
  y_pred_CMF = clfLR_CMF.predict(x_Test_corr)
  print("Correlation Matrix on Features ==> Precision: ",precision_score(y_Test, y_pred_CMF, average='weighted')," Recall: ",recall_score(y_Test, y_pred_CMF, average='weighted'),"  F1: ",f1_score(y_Test, y_pred_CMF, average='weighted')," Accuracy: ",accuracy_score(y_Test, y_pred_CMF))
  print()

  clfLR5 = LogisticRegression(random_state=0)

  clfLR_CMFL = clfLR5.fit(new_x_train_corr, y_Train)
  y_pred_CMFL = clfLR_CMFL.predict(new_x_test_corr)
  print("CorMatrix on Features & Labels ==> Precision: ",precision_score(y_Test, y_pred_CMFL, average='weighted')," Recall: ",recall_score(y_Test, y_pred_CMFL, average='weighted'),"  F1: ",f1_score(y_Test, y_pred_CMFL, average='weighted')," Accuracy: ",accuracy_score(y_Test, y_pred_CMFL))
  print()

  clfLR6 = LogisticRegression(random_state=0)

  clfLR_selectK = clfLR6.fit(features_df_new_train, y_Train)
  y_pred_selectK = clfLR_selectK.predict(features_df_new_test)
  print("SelectK best with k = 200 ==>      Precision: ",precision_score(y_Test, y_pred_selectK, average='weighted')," Recall: ",recall_score(y_Test, y_pred_selectK, average='weighted'),"  F1: ",f1_score(y_Test, y_pred_selectK, average='weighted')," Accuracy: ",accuracy_score(y_Test, y_pred_selectK))
  print()

  #=====================================================================================================================================================

  #2. Random forgest :
  from sklearn.ensemble import RandomForestClassifier

  clfRF1 = RandomForestClassifier(max_features='auto', n_estimators= 200, max_depth=10, criterion='entropy', random_state=30)
  
  print("Random Forest : ")
  print("----------------")
  print()

  clfRF_Basic = clfRF1.fit(x_Train, y_Train)
  y_pred_Basic = clfRF_Basic.predict(x_Test)
  print("Basic ==>                          Precision: ",precision_score(y_Test, y_pred_Basic,average='weighted')," Recall: ",recall_score(y_Test, y_pred_Basic,average='weighted'),"  F1: ",f1_score(y_Test, y_pred_Basic, average='weighted')," Accuracy: ",accuracy_score(y_Test, y_pred_Basic))
  print()

  clfRF2 = RandomForestClassifier(max_features='auto', n_estimators= 200, max_depth=10, criterion='entropy', random_state=30)


  clfRFpca2 = clfRF2.fit(x_pca2_data_train, y_Train)
  y_predPca2 = clfRFpca2.predict(x_pca2_data_test)
  print("PCA with n = 2 ==>                 Precision: ",precision_score(y_Test, y_predPca2, average='weighted')," Recall: ",recall_score(y_Test, y_predPca2, average='weighted'),"  F1: ",f1_score(y_Test, y_predPca2, average='weighted')," Accuracy: ",accuracy_score(y_Test, y_predPca2))
  print()

  clfRF3 = RandomForestClassifier(max_features='auto', n_estimators= 200, max_depth=10, criterion='entropy', random_state=30)


  clfRFpca10 = clfRF3.fit(x_pca10_data_train, y_Train)
  y_predPca10 = clfRFpca10.predict(x_pca10_data_test)
  print("PCA with n = 10 ==>                Precision: ",precision_score(y_Test, y_predPca10, average='weighted')," Recall: ",recall_score(y_Test, y_predPca10, average='weighted'),"  F1: ",f1_score(y_Test, y_predPca10, average='weighted')," Accuracy: ",accuracy_score(y_Test, y_predPca10))
  print()


  clfRF4 = RandomForestClassifier(max_features='auto', n_estimators= 200, max_depth=10, criterion='entropy', random_state=30)

  clfRF_CMF = clfRF4.fit(x_Train_corr, y_Train)
  y_pred_CMF = clfRF_CMF.predict(x_Test_corr)
  print("Correlation Matrix on Features ==> Precision: ",precision_score(y_Test, y_pred_CMF, average='weighted')," Recall: ",recall_score(y_Test, y_pred_CMF, average='weighted'),"  F1: ",f1_score(y_Test, y_pred_CMF, average='weighted')," Accuracy: ",accuracy_score(y_Test, y_pred_CMF))
  print()

  clfRF5 = RandomForestClassifier(max_features='auto', n_estimators= 200, max_depth=10, criterion='entropy', random_state=30)

  clfLR_CMFL = clfRF5.fit(new_x_train_corr, y_Train)
  y_pred_CMFL = clfLR_CMFL.predict(new_x_test_corr)
  print("CorMatrix on Features & Labels ==> Precision: ",precision_score(y_Test, y_pred_CMFL, average='weighted')," Recall: ",recall_score(y_Test, y_pred_CMFL, average='weighted'),"  F1: ",f1_score(y_Test, y_pred_CMFL, average='weighted')," Accuracy: ",accuracy_score(y_Test, y_pred_CMFL))
  print()


  clfRF6 = RandomForestClassifier(max_features='auto', n_estimators= 200, max_depth=10, criterion='entropy', random_state=30)

  clfRF_selectK = clfRF6.fit(features_df_new_train, y_Train)
  y_pred_selectK = clfRF_selectK.predict(features_df_new_test)
  print("SelectK best with k = 200 ==>      Precision: ",precision_score(y_Test, y_pred_selectK, average='weighted')," Recall: ",recall_score(y_Test, y_pred_selectK, average='weighted'),"  F1: ",f1_score(y_Test, y_pred_selectK, average='weighted')," Accuracy: ",accuracy_score(y_Test, y_pred_selectK))
  print()



  


  #3. Adaboost
  from sklearn.ensemble import AdaBoostClassifier

  clfAB1 = AdaBoostClassifier(n_estimators=100, random_state=0)

  print("AdaBoost : ")
  print("----------------")
  print()

  clfAB_Basic = clfAB1.fit(x_Train, y_Train)
  y_pred_Basic = clfAB_Basic.predict(x_Test)
  print("Basic ==>                          Precision: ",precision_score(y_Test, y_pred_Basic,average='weighted')," Recall: ",recall_score(y_Test, y_pred_Basic,average='weighted'),"  F1: ",f1_score(y_Test, y_pred_Basic, average='weighted')," Accuracy: ",accuracy_score(y_Test, y_pred_Basic))
  print()

  clfAB2 = AdaBoostClassifier(n_estimators=100, random_state=0)

  clfABpca2 = clfAB2.fit(x_pca2_data_train, y_Train)
  y_predPca2 = clfABpca2.predict(x_pca2_data_test)
  print("PCA with n = 2 ==>                 Precision: ",precision_score(y_Test, y_predPca2, average='weighted')," Recall: ",recall_score(y_Test, y_predPca2, average='weighted'),"  F1: ",f1_score(y_Test, y_predPca2, average='weighted')," Accuracy: ",accuracy_score(y_Test, y_predPca2))
  print()

  clfAB3 = AdaBoostClassifier(n_estimators=100, random_state=0)

  clfABpca10 = clfAB3.fit(x_pca10_data_train, y_Train)
  y_predPca10 = clfABpca10.predict(x_pca10_data_test)
  print("PCA with n = 10 ==>                Precision: ",precision_score(y_Test, y_predPca10, average='weighted')," Recall: ",recall_score(y_Test, y_predPca10, average='weighted'),"  F1: ",f1_score(y_Test, y_predPca10, average='weighted')," Accuracy: ",accuracy_score(y_Test, y_predPca10))
  print()


  clfAB4 = AdaBoostClassifier(n_estimators=100, random_state=0)

  clfAB_CMF = clfAB4.fit(x_Train_corr, y_Train)
  y_pred_CMF = clfAB_CMF.predict(x_Test_corr)
  print("Correlation Matrix on Features ==> Precision: ",precision_score(y_Test, y_pred_CMF, average='weighted')," Recall: ",recall_score(y_Test, y_pred_CMF, average='weighted'),"  F1: ",f1_score(y_Test, y_pred_CMF, average='weighted')," Accuracy: ",accuracy_score(y_Test, y_pred_CMF))
  print()

  clfAB5 = AdaBoostClassifier(n_estimators=100, random_state=0)

  clfAB_CMFL = clfAB5.fit(new_x_train_corr, y_Train)
  y_pred_CMFL = clfAB_CMFL.predict(new_x_test_corr)
  print("CorMatrix on Features & Labels ==> Precision: ",precision_score(y_Test, y_pred_CMFL, average='weighted')," Recall: ",recall_score(y_Test, y_pred_CMFL, average='weighted'),"  F1: ",f1_score(y_Test, y_pred_CMFL, average='weighted')," Accuracy: ",accuracy_score(y_Test, y_pred_CMFL))
  print()

  clfAB6 = AdaBoostClassifier(n_estimators=100, random_state=0)


  clfAB_selectK = clfAB6.fit(features_df_new_train, y_Train)
  y_pred_selectK = clfAB_selectK.predict(features_df_new_test)
  print("SelectK best with k = 200 ==>      Precision: ",precision_score(y_Test, y_pred_selectK, average='weighted')," Recall: ",recall_score(y_Test, y_pred_selectK, average='weighted'),"  F1: ",f1_score(y_Test, y_pred_selectK, average='weighted')," Accuracy: ",accuracy_score(y_Test, y_pred_selectK))
  print()



  
  #4. XGBoostClassifier
  import xgboost as xg

  xgb_r1 = xg.XGBClassifier(max_depth=10)

  print("XGBoostClassifier : ")
  print("----------------")
  print()

  clfXGB_Basic = xgb_r1.fit(x_Train, y_Train)
  y_pred_Basic = clfXGB_Basic.predict(x_Test)
  print("Basic ==>                          Precision: ",precision_score(y_Test, y_pred_Basic,average='weighted')," Recall: ",recall_score(y_Test, y_pred_Basic,average='weighted'),"  F1: ",f1_score(y_Test, y_pred_Basic, average='weighted')," Accuracy: ",accuracy_score(y_Test, y_pred_Basic))
  print()

  xgb_r2 = xg.XGBClassifier(max_depth=10)

  clfXGBpca2 = xgb_r2.fit(x_pca2_data_train, y_Train)
  y_predPca2 = clfXGBpca2.predict(x_pca2_data_test)
  print("PCA with n = 2 ==>                 Precision: ",precision_score(y_Test, y_predPca2, average='weighted')," Recall: ",recall_score(y_Test, y_predPca2, average='weighted'),"  F1: ",f1_score(y_Test, y_predPca2, average='weighted')," Accuracy: ",accuracy_score(y_Test, y_predPca2))
  print()

  xgb_r3 = xg.XGBClassifier(max_depth=10)


  clfXGBpca10 = xgb_r3.fit(x_pca10_data_train, y_Train)
  y_predPca10 = clfXGBpca10.predict(x_pca10_data_test)
  print("PCA with n = 10 ==>                Precision: ",precision_score(y_Test, y_predPca10, average='weighted')," Recall: ",recall_score(y_Test, y_predPca10, average='weighted'),"  F1: ",f1_score(y_Test, y_predPca10, average='macro')," Accuracy: ",accuracy_score(y_Test, y_predPca10))
  print()


  xgb_r4 = xg.XGBClassifier(max_depth=10)

  clfXGB_CMF = xgb_r4.fit(x_Train_corr, y_Train)
  y_pred_CMF = clfXGB_CMF.predict(x_Test_corr)
  print("Correlation Matrix on Features ==> Precision: ",precision_score(y_Test, y_pred_CMF, average='weighted')," Recall: ",recall_score(y_Test, y_pred_CMF, average='weighted'),"  F1: ",f1_score(y_Test, y_pred_CMF, average='weighted')," Accuracy: ",accuracy_score(y_Test, y_pred_CMF))
  print()

  xgb_r5 = xg.XGBClassifier(max_depth=10)

  clfXGB_CMFL = xgb_r5.fit(new_x_train_corr, y_Train)
  y_pred_CMFL = clfXGB_CMFL.predict(new_x_test_corr)
  print("CorMatrix on Features & Labels ==> Precision: ",precision_score(y_Test, y_pred_CMFL, average='weighted')," Recall: ",recall_score(y_Test, y_pred_CMFL, average='weighted'),"  F1: ",f1_score(y_Test, y_pred_CMFL, average='weighted')," Accuracy: ",accuracy_score(y_Test, y_pred_CMFL))
  print()

  xgb_r6 = xg.XGBClassifier(max_depth=10)

  clfXGB_selectK = xgb_r6.fit(features_df_new_train, y_Train)
  y_pred_selectK = clfXGB_selectK.predict(features_df_new_test)
  print("SelectK best with k = 200 ==>      Precision: ",precision_score(y_Test, y_pred_selectK, average='weighted')," Recall: ",recall_score(y_Test, y_pred_selectK, average='weighted'),"  F1: ",f1_score(y_Test, y_pred_selectK, average='weighted')," Accuracy: ",accuracy_score(y_Test, y_pred_selectK))
  print()


  
  

In [None]:
def correlation_check(traindata,testdata,thresh): # drop columns above certain threshold    
        corr_matrix = traindata.corr()
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
        to_drop = [column for column in upper.columns if any(upper[column] >thresh)]
        trainset=traindata.drop(traindata[to_drop], axis=1)
        testset=testdata.drop(testdata[to_drop],axis=1)
        return trainset,testset

In [None]:
def gridSearchOnLogReg(x, y):
  grid={
        "C":np.logspace(-3,3,7,1,0),
        "penalty":["l1","l2","elasticnet","none"],
        "max_iter":[50,80,100],
        "solver":["newton-cg", "lbfgs", "liblinear", "sag", "saga"]
        }
  logreg=LogisticRegression()
  logreg_cv=GridSearchCV(logreg,grid)
  logreg_cv.fit(x,y)
  return logreg_cv

In [None]:
y = temp2['Class Taste']
x = temp2.drop(['Class Taste','Name','Taste', 'ID'],axis=1)
y=y.to_frame()
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

Week 9

In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, precision_score, accuracy_score, f1_score, recall_score
import warnings
warnings.filterwarnings('ignore')

In [None]:
chemDBModred = pd.read_csv("/content/drive/MyDrive/Capstone/chemTasteDB/Mordred Cleaned/chemTasteDBCleaned.csv")

In [None]:
chemDBModred

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2,Name,Taste,Class taste
0,17.929457,14.412750,0,0,28.336926,2.612833,5.116646,28.336926,1.288042,4.069199,...,8.391084,912,44,130.0,163.0,7.812500,4.527778,(-)-Haematoxylin,Sweet,Sweetness
1,13.450219,12.310626,0,0,20.608616,2.416443,4.832887,20.608616,1.144923,3.779851,...,6.004108,633,26,88.0,99.0,8.868056,3.930556,(+)-4β-hydroxyhernandulcin,Sweet,Sweetness
2,19.574531,16.089858,0,0,30.966408,2.511665,5.023331,30.966408,1.238656,4.150646,...,8.873561,1370,44,134.0,160.0,9.972222,5.361111,(+)-Dihydroquercetin 3-acetate,Sweet,Sweetness
3,17.929457,14.412750,0,0,28.336926,2.612833,5.116646,28.336926,1.288042,4.069199,...,8.391084,912,44,130.0,163.0,7.812500,4.527778,(+)-Haematoxylin,Sweet,Sweetness
4,8.898979,8.883053,0,0,14.601126,2.414214,4.828427,14.601126,1.216761,3.391683,...,7.502641,174,21,60.0,72.0,6.666667,2.666667,(±)-chiro-inositol,Sweet,Sweetness
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2939,10.934771,9.752499,0,0,18.859991,2.070980,4.141959,18.859991,1.178749,3.594751,...,5.671230,622,14,62.0,61.0,6.722222,4.000000,Spilanthol,"Heating, pungent, and tingling",Miscellaneous
2940,95.791774,67.490769,0,0,153.178425,2.499577,4.999154,153.178425,1.255561,5.725000,...,9.771109,97399,228,658.0,787.0,49.111111,26.027778,Tannic acid,Astringent,Miscellaneous
2941,10.934771,9.752499,0,0,18.859991,2.070980,4.141959,18.859991,1.178749,3.594751,...,5.443747,622,14,62.0,61.0,6.722222,4.000000,trans-Pellitorine,"Heating, pungent, and tingling",Miscellaneous
2942,9.006320,9.750091,0,0,14.248626,2.393403,4.786805,14.248626,1.096048,3.417811,...,5.143832,236,21,58.0,67.0,7.895833,3.083333,WS23,Cooling,Miscellaneous


In [None]:
chemDBModred = chemDBModred.drop(['Name', 'Taste'], axis=1)

In [None]:
chemDBModred['Class taste'].unique()

array(['Sweetness', 'Bitterness', 'Umaminess', 'Sourness', 'Saltiness',
       'Multitaste', 'Tastelessness', 'Non-sweetness', 'Miscellaneous'],
      dtype=object)

In [None]:
sweetTrainM = pd.read_csv("/content/drive/MyDrive/Capstone/Features Extracted DataSets/Mordred/Cleaned Data/sweet_train_v5.csv")
sweetTestM = pd.read_csv("/content/drive/MyDrive/Capstone/Features Extracted DataSets/Mordred/Cleaned Data/sweet_test_v5.csv")
bitterTrainM = pd.read_csv("/content/drive/MyDrive/Capstone/Features Extracted DataSets/Mordred/Cleaned Data/bitter_train_v5.csv")
bitterTestM = pd.read_csv("/content/drive/MyDrive/Capstone/Features Extracted DataSets/Mordred/Cleaned Data/bitter_test_v5.csv")

In [None]:
sweetTrainM = sweetTrainM.drop(['Name', 'Sweet'], axis=1)

In [None]:
sweetTestM = sweetTestM.drop(['Name', 'Sweet'], axis=1)

In [None]:
bitterTrainM = bitterTrainM.drop(['Name', 'Bitter'], axis=1)

In [None]:
bitterTestM = bitterTestM.drop(['Name', 'Bitter'], axis=1)

In [None]:
sweetTrainM['Taste'].unique()

array(['Sweet', 'Bitter', 'Tasteless'], dtype=object)

In [None]:
sweetTestM['Taste'].unique()

array(['Sweet', 'Bitter', 'Tasteless'], dtype=object)

In [None]:
bitterTrainM['Taste'].unique()

array(['Sweet', 'Bitter', 'Tasteless'], dtype=object)

In [None]:
bitterTestM['Taste'].unique()

array(['Non-bitter', 'Bitter'], dtype=object)

In [None]:
sweetTrainM

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2,Taste
0,17.213262,16.059815,0,0,28.766735,2.522420,4.958230,28.766735,1.250728,4.052767,...,72.317821,342.116212,7.602582,1110,43,120.0,147.0,10.451389,5.291667,Sweet
1,17.213262,16.059815,0,0,28.766735,2.522420,4.958230,28.766735,1.250728,4.052767,...,72.317821,396.014551,9.428918,1110,43,120.0,147.0,10.451389,5.291667,Sweet
2,15.315311,13.970681,1,1,25.367667,2.287533,4.575067,25.367667,1.207984,3.915999,...,53.670480,294.121572,7.541579,1000,27,96.0,105.0,8.916667,4.888889,Sweet
3,8.829880,8.715591,0,0,14.640475,2.418289,4.836579,14.640475,1.220040,3.392014,...,43.370989,180.063388,7.502641,178,20,60.0,71.0,6.145833,2.722222,Sweet
4,17.372180,15.638061,0,0,28.650077,2.478404,4.876167,28.650077,1.245656,4.052400,...,72.204129,342.116212,7.602582,1220,41,120.0,145.0,10.451389,5.166667,Sweet
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2162,6.473351,6.191587,0,0,11.142106,2.200122,4.400244,11.142106,1.238012,3.089776,...,37.313444,123.068414,6.837134,88,9,40.0,43.0,3.472222,2.166667,Bitter
2163,6.473351,6.127583,0,0,11.189957,2.193993,4.387987,11.189957,1.243329,3.089765,...,37.289972,124.052429,7.297202,90,9,40.0,43.0,3.472222,2.166667,Bitter
2164,10.635111,9.197041,0,0,18.814625,2.307250,4.614501,18.814625,1.343902,3.558646,...,45.217583,183.068414,7.959496,307,18,68.0,77.0,3.833333,3.222222,Bitter
2165,8.106344,7.838945,0,0,13.165395,2.242743,4.296119,13.165395,1.196854,3.289120,...,53.584828,152.083730,6.612336,173,9,50.0,53.0,4.583333,2.500000,Bitter


In [None]:
sweetTrainM.rename(columns={'Taste': 'Class taste'}, inplace=True)

In [None]:
sweetTrainM

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2,Class taste
0,17.213262,16.059815,0,0,28.766735,2.522420,4.958230,28.766735,1.250728,4.052767,...,72.317821,342.116212,7.602582,1110,43,120.0,147.0,10.451389,5.291667,Sweet
1,17.213262,16.059815,0,0,28.766735,2.522420,4.958230,28.766735,1.250728,4.052767,...,72.317821,396.014551,9.428918,1110,43,120.0,147.0,10.451389,5.291667,Sweet
2,15.315311,13.970681,1,1,25.367667,2.287533,4.575067,25.367667,1.207984,3.915999,...,53.670480,294.121572,7.541579,1000,27,96.0,105.0,8.916667,4.888889,Sweet
3,8.829880,8.715591,0,0,14.640475,2.418289,4.836579,14.640475,1.220040,3.392014,...,43.370989,180.063388,7.502641,178,20,60.0,71.0,6.145833,2.722222,Sweet
4,17.372180,15.638061,0,0,28.650077,2.478404,4.876167,28.650077,1.245656,4.052400,...,72.204129,342.116212,7.602582,1220,41,120.0,145.0,10.451389,5.166667,Sweet
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2162,6.473351,6.191587,0,0,11.142106,2.200122,4.400244,11.142106,1.238012,3.089776,...,37.313444,123.068414,6.837134,88,9,40.0,43.0,3.472222,2.166667,Bitter
2163,6.473351,6.127583,0,0,11.189957,2.193993,4.387987,11.189957,1.243329,3.089765,...,37.289972,124.052429,7.297202,90,9,40.0,43.0,3.472222,2.166667,Bitter
2164,10.635111,9.197041,0,0,18.814625,2.307250,4.614501,18.814625,1.343902,3.558646,...,45.217583,183.068414,7.959496,307,18,68.0,77.0,3.833333,3.222222,Bitter
2165,8.106344,7.838945,0,0,13.165395,2.242743,4.296119,13.165395,1.196854,3.289120,...,53.584828,152.083730,6.612336,173,9,50.0,53.0,4.583333,2.500000,Bitter


In [None]:
sweetTestM.rename(columns={'Taste': 'Class taste'}, inplace=True)

In [None]:
bitterTrainM.rename(columns={'Taste': 'Class taste'}, inplace=True)

In [None]:
bitterTrainM

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2,Class taste
0,17.213262,16.059815,0,0,28.766735,2.522420,4.958230,28.766735,1.250728,4.052767,...,72.317821,342.116212,7.602582,1110,43,120.0,147.0,10.451389,5.291667,Sweet
1,17.213262,16.059815,0,0,28.766735,2.522420,4.958230,28.766735,1.250728,4.052767,...,72.317821,396.014551,9.428918,1110,43,120.0,147.0,10.451389,5.291667,Sweet
2,17.920577,15.130873,1,3,27.113140,2.544120,5.088241,27.113140,1.129714,4.067565,...,58.750589,363.219178,6.372266,1807,27,116.0,128.0,11.069444,5.250000,Sweet
3,15.315311,13.970681,1,1,25.367667,2.287533,4.575067,25.367667,1.207984,3.915999,...,53.670480,294.121572,7.541579,1000,27,96.0,105.0,8.916667,4.888889,Sweet
4,8.829880,8.715591,0,0,14.640475,2.418289,4.836579,14.640475,1.220040,3.392014,...,43.370989,180.063388,7.502641,178,20,60.0,71.0,6.145833,2.722222,Sweet
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2204,6.473351,6.191587,0,0,11.142106,2.200122,4.400244,11.142106,1.238012,3.089776,...,37.313444,123.068414,6.837134,88,9,40.0,43.0,3.472222,2.166667,Bitter
2205,6.473351,6.127583,0,0,11.189957,2.193993,4.387987,11.189957,1.243329,3.089765,...,37.289972,124.052429,7.297202,90,9,40.0,43.0,3.472222,2.166667,Bitter
2206,10.635111,9.197041,0,0,18.814625,2.307250,4.614501,18.814625,1.343902,3.558646,...,45.217583,183.068414,7.959496,307,18,68.0,77.0,3.833333,3.222222,Bitter
2207,8.106344,7.838945,0,0,13.165395,2.242743,4.296119,13.165395,1.196854,3.289120,...,53.584828,152.083730,6.612336,173,9,50.0,53.0,4.583333,2.500000,Bitter


In [None]:
bitterTestM.rename(columns={'Taste': 'Class taste'}, inplace=True)

In [None]:
bitterTestM

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2,Class taste
0,8.829880,8.715591,0,0,14.640475,2.418289,4.836579,14.640475,1.220040,3.392014,...,43.370989,180.063388,7.502641,178,20,60.0,71.0,6.145833,2.722222,Non-bitter
1,8.761080,8.651650,0,0,14.708146,2.377683,4.755367,14.708146,1.225679,3.384121,...,42.927926,180.063388,7.502641,182,19,58.0,68.0,6.055556,2.777778,Non-bitter
2,8.761080,8.651650,0,0,14.708146,2.377683,4.755367,14.708146,1.225679,3.384121,...,42.927926,180.063388,7.502641,182,19,58.0,68.0,6.055556,2.777778,Non-bitter
3,8.163363,8.029752,0,0,13.137460,2.364871,4.729742,13.137460,1.194315,3.303305,...,41.588577,164.068473,7.133412,140,17,54.0,63.0,5.805556,2.444444,Non-bitter
4,6.611250,7.282959,0,0,11.763639,2.188901,4.377802,11.763639,1.176364,3.134263,...,38.268062,150.052823,7.502641,125,12,40.0,43.0,5.833333,2.555556,Non-bitter
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166,19.981978,16.397806,0,0,33.251113,2.526538,5.053076,33.251113,1.278889,4.186542,...,61.437322,362.100168,8.229549,1633,49,138.0,168.0,10.222222,5.833333,Bitter
167,15.913028,13.077290,1,0,26.247606,2.316776,4.633552,26.247606,1.249886,3.945703,...,54.181448,282.077181,8.547793,1064,28,102.0,114.0,7.416667,4.722222,Bitter
168,17.884519,14.823609,1,0,30.336232,2.328747,4.657494,30.336232,1.264010,4.074059,...,57.946497,326.103396,8.152585,1535,35,116.0,132.0,8.777778,5.583333,Bitter
169,15.913028,13.206444,1,0,26.180249,2.317144,4.634288,26.180249,1.246679,3.945707,...,54.189588,282.077181,8.547793,1050,28,102.0,114.0,7.416667,4.722222,Bitter


In [None]:
chemDBModred

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2,Class taste
0,17.929457,14.412750,0,0,28.336926,2.612833,5.116646,28.336926,1.288042,4.069199,...,72.021791,302.079038,8.391084,912,44,130.0,163.0,7.812500,4.527778,Sweetness
1,13.450219,12.310626,0,0,20.608616,2.416443,4.832887,20.608616,1.144923,3.779851,...,50.837448,252.172545,6.004108,633,26,88.0,99.0,8.868056,3.930556,Sweetness
2,19.574531,16.089858,0,0,30.966408,2.511665,5.023331,30.966408,1.238656,4.150646,...,60.198976,346.068867,8.873561,1370,44,134.0,160.0,9.972222,5.361111,Sweetness
3,17.929457,14.412750,0,0,28.336926,2.612833,5.116646,28.336926,1.288042,4.069199,...,72.021791,302.079038,8.391084,912,44,130.0,163.0,7.812500,4.527778,Sweetness
4,8.898979,8.883053,0,0,14.601126,2.414214,4.828427,14.601126,1.216761,3.391683,...,43.309911,180.063388,7.502641,174,21,60.0,72.0,6.666667,2.666667,Sweetness
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2939,10.934771,9.752499,0,0,18.859991,2.070980,4.141959,18.859991,1.178749,3.594751,...,45.350312,221.177964,5.671230,622,14,62.0,61.0,6.722222,4.000000,Miscellaneous
2940,95.791774,67.490769,0,0,153.178425,2.499577,4.999154,153.178425,1.255561,5.725000,...,165.035792,1700.172974,9.771109,97399,228,658.0,787.0,49.111111,26.027778,Miscellaneous
2941,10.934771,9.752499,0,0,18.859991,2.070980,4.141959,18.859991,1.178749,3.594751,...,45.350312,223.193614,5.443747,622,14,62.0,61.0,6.722222,4.000000,Miscellaneous
2942,9.006320,9.750091,0,0,14.248626,2.393403,4.786805,14.248626,1.096048,3.417811,...,44.032654,185.177964,5.143832,236,21,58.0,67.0,7.895833,3.083333,Miscellaneous


In [None]:
bitterTestM = bitterTestM.loc[bitterTestM['Class taste'] == 'Bitter']

In [None]:
chemDBModredExtended = chemDBModred.append(bitterTestM)

In [None]:
chemDBModredExtended

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2,Class taste
0,17.929457,14.412750,0,0,28.336926,2.612833,5.116646,28.336926,1.288042,4.069199,...,72.021791,302.079038,8.391084,912,44,130.0,163.0,7.812500,4.527778,Sweetness
1,13.450219,12.310626,0,0,20.608616,2.416443,4.832887,20.608616,1.144923,3.779851,...,50.837448,252.172545,6.004108,633,26,88.0,99.0,8.868056,3.930556,Sweetness
2,19.574531,16.089858,0,0,30.966408,2.511665,5.023331,30.966408,1.238656,4.150646,...,60.198976,346.068867,8.873561,1370,44,134.0,160.0,9.972222,5.361111,Sweetness
3,17.929457,14.412750,0,0,28.336926,2.612833,5.116646,28.336926,1.288042,4.069199,...,72.021791,302.079038,8.391084,912,44,130.0,163.0,7.812500,4.527778,Sweetness
4,8.898979,8.883053,0,0,14.601126,2.414214,4.828427,14.601126,1.216761,3.391683,...,43.309911,180.063388,7.502641,174,21,60.0,72.0,6.666667,2.666667,Sweetness
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166,19.981978,16.397806,0,0,33.251113,2.526538,5.053076,33.251113,1.278889,4.186542,...,61.437322,362.100168,8.229549,1633,49,138.0,168.0,10.222222,5.833333,Bitter
167,15.913028,13.077290,1,0,26.247606,2.316776,4.633552,26.247606,1.249886,3.945703,...,54.181448,282.077181,8.547793,1064,28,102.0,114.0,7.416667,4.722222,Bitter
168,17.884519,14.823609,1,0,30.336232,2.328747,4.657494,30.336232,1.264010,4.074059,...,57.946497,326.103396,8.152585,1535,35,116.0,132.0,8.777778,5.583333,Bitter
169,15.913028,13.206444,1,0,26.180249,2.317144,4.634288,26.180249,1.246679,3.945707,...,54.189588,282.077181,8.547793,1050,28,102.0,114.0,7.416667,4.722222,Bitter


In [None]:
chemDBModredExtended = chemDBModredExtended.append(bitterTrainM)

In [None]:
chemDBModredExtended

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2,Class taste
0,17.929457,14.412750,0,0,28.336926,2.612833,5.116646,28.336926,1.288042,4.069199,...,72.021791,302.079038,8.391084,912,44,130.0,163.0,7.812500,4.527778,Sweetness
1,13.450219,12.310626,0,0,20.608616,2.416443,4.832887,20.608616,1.144923,3.779851,...,50.837448,252.172545,6.004108,633,26,88.0,99.0,8.868056,3.930556,Sweetness
2,19.574531,16.089858,0,0,30.966408,2.511665,5.023331,30.966408,1.238656,4.150646,...,60.198976,346.068867,8.873561,1370,44,134.0,160.0,9.972222,5.361111,Sweetness
3,17.929457,14.412750,0,0,28.336926,2.612833,5.116646,28.336926,1.288042,4.069199,...,72.021791,302.079038,8.391084,912,44,130.0,163.0,7.812500,4.527778,Sweetness
4,8.898979,8.883053,0,0,14.601126,2.414214,4.828427,14.601126,1.216761,3.391683,...,43.309911,180.063388,7.502641,174,21,60.0,72.0,6.666667,2.666667,Sweetness
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2204,6.473351,6.191587,0,0,11.142106,2.200122,4.400244,11.142106,1.238012,3.089776,...,37.313444,123.068414,6.837134,88,9,40.0,43.0,3.472222,2.166667,Bitter
2205,6.473351,6.127583,0,0,11.189957,2.193993,4.387987,11.189957,1.243329,3.089765,...,37.289972,124.052429,7.297202,90,9,40.0,43.0,3.472222,2.166667,Bitter
2206,10.635111,9.197041,0,0,18.814625,2.307250,4.614501,18.814625,1.343902,3.558646,...,45.217583,183.068414,7.959496,307,18,68.0,77.0,3.833333,3.222222,Bitter
2207,8.106344,7.838945,0,0,13.165395,2.242743,4.296119,13.165395,1.196854,3.289120,...,53.584828,152.083730,6.612336,173,9,50.0,53.0,4.583333,2.500000,Bitter


In [None]:
chemDBModredExtended = chemDBModredExtended.append(sweetTrainM)

In [None]:
chemDBModredExtended = chemDBModredExtended.append(sweetTestM)

In [None]:
chemDBModredExtended['Class taste'].unique()

array(['Sweetness', 'Bitterness', 'Umaminess', 'Sourness', 'Saltiness',
       'Multitaste', 'Tastelessness', 'Non-sweetness', 'Miscellaneous',
       'Bitter', 'Sweet', 'Tasteless'], dtype=object)

In [None]:
chemDBModredExtended['Class taste'] = chemDBModredExtended['Class taste'].str.replace('Bitter','Bitterness')

In [None]:
chemDBModredExtended['Class taste'] = chemDBModredExtended['Class taste'].str.replace('Bitternessness','Bitterness')

In [None]:
chemDBModredExtended['Class taste'].unique()

array(['Sweetness', 'Bitterness', 'Umaminess', 'Sourness', 'Saltiness',
       'Multitaste', 'Tastelessness', 'Non-sweetness', 'Miscellaneous',
       'Sweet', 'Tasteless'], dtype=object)

In [None]:
chemDBModredExtended['Class taste'] = chemDBModredExtended['Class taste'].str.replace('Sweet','Sweetness')

In [None]:
chemDBModredExtended['Class taste'].unique()

array(['Sweetnessness', 'Bitterness', 'Umaminess', 'Sourness',
       'Saltiness', 'Multitaste', 'Tastelessness', 'Non-sweetness',
       'Miscellaneous', 'Sweetness', 'Tasteless'], dtype=object)

In [None]:
chemDBModredExtended['Class taste'] = chemDBModredExtended['Class taste'].str.replace('Sweetnessness','Sweetness')

In [None]:
chemDBModredExtended['Class taste'].unique()

array(['Sweetness', 'Bitterness', 'Umaminess', 'Sourness', 'Saltiness',
       'Multitaste', 'Tastelessness', 'Non-sweetness', 'Miscellaneous',
       'Tasteless'], dtype=object)

In [None]:
chemDBModredExtended['Class taste'] = chemDBModredExtended['Class taste'].str.replace('Tasteless','Tastelessness')

In [None]:
chemDBModredExtended['Class taste'].unique()

array(['Sweetness', 'Bitterness', 'Umaminess', 'Sourness', 'Saltiness',
       'Multitaste', 'Tastelessnessness', 'Non-sweetness',
       'Miscellaneous', 'Tastelessness'], dtype=object)

In [None]:
chemDBModredExtended['Class taste'] = chemDBModredExtended['Class taste'].str.replace('Tastelessnessness','Tastelessness')

In [None]:
chemDBModredExtended['Class taste'].unique()

array(['Sweetness', 'Bitterness', 'Umaminess', 'Sourness', 'Saltiness',
       'Multitaste', 'Tastelessness', 'Non-sweetness', 'Miscellaneous'],
      dtype=object)

In [None]:
chemDBModredExtended

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2,Class taste
0,17.929457,14.412750,0,0,28.336926,2.612833,5.116646,28.336926,1.288042,4.069199,...,72.021791,302.079038,8.391084,912,44,130.0,163.0,7.812500,4.527778,Sweetness
1,13.450219,12.310626,0,0,20.608616,2.416443,4.832887,20.608616,1.144923,3.779851,...,50.837448,252.172545,6.004108,633,26,88.0,99.0,8.868056,3.930556,Sweetness
2,19.574531,16.089858,0,0,30.966408,2.511665,5.023331,30.966408,1.238656,4.150646,...,60.198976,346.068867,8.873561,1370,44,134.0,160.0,9.972222,5.361111,Sweetness
3,17.929457,14.412750,0,0,28.336926,2.612833,5.116646,28.336926,1.288042,4.069199,...,72.021791,302.079038,8.391084,912,44,130.0,163.0,7.812500,4.527778,Sweetness
4,8.898979,8.883053,0,0,14.601126,2.414214,4.828427,14.601126,1.216761,3.391683,...,43.309911,180.063388,7.502641,174,21,60.0,72.0,6.666667,2.666667,Sweetness
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148,17.929457,14.412750,0,0,28.336926,2.612833,5.116646,28.336926,1.288042,4.069199,...,72.021791,304.058303,8.942891,912,44,130.0,163.0,7.812500,4.527778,Sweetness
149,31.761755,24.298459,0,0,51.640551,2.468276,4.936552,51.640551,1.259526,4.632783,...,78.459339,582.194856,7.762598,6188,73,216.0,257.0,16.138889,8.972222,Sweetness
150,49.555912,34.562157,0,0,79.858260,2.636734,5.234851,79.858260,1.288036,5.076763,...,116.639812,884.476951,6.600574,19191,130,354.0,441.0,23.513889,13.083333,Sweetness
151,9.477125,8.806333,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,59.310845,222.970546,11.735292,1200000170,18,66.0,79.0,0.000000,2.513889,Sweetness


In [None]:
chemDBModredExtended = chemDBModredExtended.drop_duplicates()

In [None]:
chemDBModredExtended

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2,Class taste
0,17.929457,14.412750,0,0,28.336926,2.612833,5.116646,28.336926,1.288042,4.069199,...,72.021791,302.079038,8.391084,912,44,130.0,163.0,7.812500,4.527778,Sweetness
1,13.450219,12.310626,0,0,20.608616,2.416443,4.832887,20.608616,1.144923,3.779851,...,50.837448,252.172545,6.004108,633,26,88.0,99.0,8.868056,3.930556,Sweetness
2,19.574531,16.089858,0,0,30.966408,2.511665,5.023331,30.966408,1.238656,4.150646,...,60.198976,346.068867,8.873561,1370,44,134.0,160.0,9.972222,5.361111,Sweetness
4,8.898979,8.883053,0,0,14.601126,2.414214,4.828427,14.601126,1.216761,3.391683,...,43.309911,180.063388,7.502641,174,21,60.0,72.0,6.666667,2.666667,Sweetness
5,9.968445,9.018963,0,0,17.245360,2.272498,4.544996,17.245360,1.231811,3.514624,...,44.517416,196.121178,6.537373,342,17,62.0,68.0,5.333333,3.444444,Sweetness
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2144,9.899495,8.289893,0,0,17.300563,2.236068,4.472136,17.300563,1.330813,3.484392,...,43.464355,169.089149,7.045381,264,14,62.0,68.0,2.972222,3.000000,Bitterness
2145,9.192388,8.164909,0,0,17.133544,1.956295,3.912590,17.133544,1.223825,3.447813,...,41.681626,198.198365,4.954959,455,11,50.0,48.0,5.000000,3.750000,Bitterness
2146,8.773011,8.182021,1,1,14.565903,2.211610,4.423220,14.565903,1.213825,3.368107,...,41.714016,165.078979,7.177347,212,13,54.0,58.0,4.833333,2.777778,Bitterness
2147,6.473351,6.620103,0,0,11.429915,2.023683,4.047367,11.429915,1.142991,3.114674,...,36.589574,146.130680,5.218953,150,8,36.0,35.0,4.611111,2.666667,Bitterness


In [None]:
chemDBModredExtended.reset_index(inplace = True)

In [None]:
chemDBModredExtended

Unnamed: 0,index,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,...,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2,Class taste
0,0,17.929457,14.412750,0,0,28.336926,2.612833,5.116646,28.336926,1.288042,...,72.021791,302.079038,8.391084,912,44,130.0,163.0,7.812500,4.527778,Sweetness
1,1,13.450219,12.310626,0,0,20.608616,2.416443,4.832887,20.608616,1.144923,...,50.837448,252.172545,6.004108,633,26,88.0,99.0,8.868056,3.930556,Sweetness
2,2,19.574531,16.089858,0,0,30.966408,2.511665,5.023331,30.966408,1.238656,...,60.198976,346.068867,8.873561,1370,44,134.0,160.0,9.972222,5.361111,Sweetness
3,4,8.898979,8.883053,0,0,14.601126,2.414214,4.828427,14.601126,1.216761,...,43.309911,180.063388,7.502641,174,21,60.0,72.0,6.666667,2.666667,Sweetness
4,5,9.968445,9.018963,0,0,17.245360,2.272498,4.544996,17.245360,1.231811,...,44.517416,196.121178,6.537373,342,17,62.0,68.0,5.333333,3.444444,Sweetness
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5170,2144,9.899495,8.289893,0,0,17.300563,2.236068,4.472136,17.300563,1.330813,...,43.464355,169.089149,7.045381,264,14,62.0,68.0,2.972222,3.000000,Bitterness
5171,2145,9.192388,8.164909,0,0,17.133544,1.956295,3.912590,17.133544,1.223825,...,41.681626,198.198365,4.954959,455,11,50.0,48.0,5.000000,3.750000,Bitterness
5172,2146,8.773011,8.182021,1,1,14.565903,2.211610,4.423220,14.565903,1.213825,...,41.714016,165.078979,7.177347,212,13,54.0,58.0,4.833333,2.777778,Bitterness
5173,2147,6.473351,6.620103,0,0,11.429915,2.023683,4.047367,11.429915,1.142991,...,36.589574,146.130680,5.218953,150,8,36.0,35.0,4.611111,2.666667,Bitterness


In [None]:
chemDBModredExtended = chemDBModredExtended.drop(['index'], axis=1)

In [None]:
chemDBModredExtended = chemDBModredExtended.sample(frac=1).reset_index(drop=True)

In [None]:
chemDBModredExtended

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2,Class taste
0,7.468187,7.789937,0,0,12.009226,2.143864,4.287729,12.009226,1.091748,3.228098,...,39.347403,154.135765,5.315026,174,11,44.0,45.0,6.083333,2.694444,Sweetness
1,7.289847,7.529137,0,0,12.184668,2.101003,4.202006,12.184668,1.107697,3.219626,...,38.788335,160.073559,6.959720,184,10,42.0,42.0,5.472222,2.833333,Sweetness
2,10.146784,9.293694,0,0,16.935280,2.314215,4.628431,16.935280,1.209663,3.520966,...,44.880046,196.084792,7.541723,327,18,64.0,71.0,5.944444,3.305556,Sweetness
3,4.352030,4.798324,0,0,7.878462,1.969616,3.939231,7.878462,1.125495,2.752840,...,31.656710,104.047344,6.936490,50,5,24.0,23.0,3.861111,1.916667,Sweetness
4,16.551185,13.671803,0,0,28.127789,2.296049,4.592099,28.127789,1.222947,3.999044,...,55.807775,322.214409,6.079517,1558,28,102.0,110.0,8.805556,5.527778,Miscellaneous
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5170,17.205921,13.856828,0,0,27.587949,2.468806,4.937613,27.587949,1.253998,4.028888,...,56.535011,302.079038,8.391084,1025,38,118.0,141.0,8.000000,4.805556,Bitterness
5171,9.261338,8.702921,0,0,16.240218,2.276858,4.553717,16.240218,1.249248,3.444375,...,43.264832,183.125929,6.104198,260,16,58.0,64.0,5.083333,3.194444,Bitterness
5172,8.065904,7.893227,0,0,13.812926,2.092177,4.184353,13.812926,1.151077,3.306818,...,40.156271,170.130680,5.671023,256,11,46.0,46.0,5.722222,3.027778,Sweetness
5173,28.088322,23.220988,0,0,45.211883,2.578162,5.155749,45.211883,1.221943,4.522479,...,87.514988,557.125019,8.705078,4697,56,188.0,221.0,14.805556,8.333333,Bitterness


In [None]:
chemDBModredExtended.to_csv('chemTasteDBModredMerged.csv', index = False)

paddle

In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, precision_score, accuracy_score, f1_score, recall_score
import warnings
warnings.filterwarnings('ignore')

In [None]:
chemDBPadel = pd.read_csv("/content/drive/MyDrive/Capstone/ChemTestDB + Padel/ChemTasteDBFeatures.csv", encoding='unicode_escape')

In [None]:
chemDBPadel

Unnamed: 0,ID,Name,Taste,Class Taste,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,1,(-)-Haematoxylin,Sweet,Sweetness,0,-2.4092,5.804245,29.9362,42.307102,12,...,8.391084,45.461903,2.066450,15.901910,15.901910,0.000000,912,44,0.503,130
1,2,(+)-4?-hydroxyhernandulcin,Sweet,Sweetness,0,1.9056,3.631311,73.5138,44.809032,0,...,6.004108,34.664420,1.925801,7.514035,7.514035,0.000000,633,26,1.865,88
2,3,(+)-Dihydroquercetin 3-acetate,Sweet,Sweetness,0,-2.5478,6.491285,35.6603,45.671102,12,...,8.873561,50.408763,2.016351,21.332061,21.332061,0.000000,1370,44,0.339,134
3,4,(+)-Haematoxylin,Sweet,Sweetness,0,-2.4092,5.804245,29.9362,42.307102,12,...,8.391084,45.461903,2.066450,15.901910,15.901910,0.000000,912,44,0.503,130
4,5,(±)-chiro-inositol,Sweet,Sweetness,0,-3.0642,9.389322,35.7750,23.373516,0,...,7.502641,22.897487,1.908124,14.905717,14.905717,0.000000,174,21,-1.458,60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2922,2939*,"Sodium N-[5-(3-bromophenyl)-1,3,4-thiadiazol-2...",Non-sweet/Sweet,Miscellaneous,0,-0.2512,0.063101,27.5342,31.969965,11,...,15.177071,33.856692,1.991570,25.069339,7.098079,9.104502,549,20,1.567,88
2923,2940,Spilanthol,"Heating, pungent, and tingling",Miscellaneous,0,2.1971,4.827248,71.9383,41.878239,0,...,5.671230,30.482814,1.905176,5.422166,2.505437,2.916729,622,14,4.058,62
2924,2942,trans-Pellitorine,"Heating, pungent, and tingling",Miscellaneous,0,0.2215,0.049062,66.0689,43.211825,0,...,5.443747,30.482814,1.905176,5.422166,2.505437,2.916729,622,14,4.469,62
2925,2943,WS23,Cooling,Miscellaneous,0,1.1359,1.290269,54.0902,36.598239,0,...,5.143832,23.909473,1.839190,5.292463,2.471573,2.820889,236,21,3.034,58


In [None]:
chemDBPadel = chemDBPadel.drop(['Name', 'Taste','ID'], axis=1)

In [None]:
chemDBPadel

Unnamed: 0,Class Taste,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,Sweetness,0,-2.4092,5.804245,29.9362,42.307102,12,12,36,22,...,8.391084,45.461903,2.066450,15.901910,15.901910,0.000000,912,44,0.503,130
1,Sweetness,0,1.9056,3.631311,73.5138,44.809032,0,0,42,18,...,6.004108,34.664420,1.925801,7.514035,7.514035,0.000000,633,26,1.865,88
2,Sweetness,0,-2.5478,6.491285,35.6603,45.671102,12,12,39,25,...,8.873561,50.408763,2.016351,21.332061,21.332061,0.000000,1370,44,0.339,134
3,Sweetness,0,-2.4092,5.804245,29.9362,42.307102,12,12,36,22,...,8.391084,45.461903,2.066450,15.901910,15.901910,0.000000,912,44,0.503,130
4,Sweetness,0,-3.0642,9.389322,35.7750,23.373516,0,0,24,12,...,7.502641,22.897487,1.908124,14.905717,14.905717,0.000000,174,21,-1.458,60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2922,Miscellaneous,0,-0.2512,0.063101,27.5342,31.969965,11,12,22,17,...,15.177071,33.856692,1.991570,25.069339,7.098079,9.104502,549,20,1.567,88
2923,Miscellaneous,0,2.1971,4.827248,71.9383,41.878239,0,0,39,16,...,5.671230,30.482814,1.905176,5.422166,2.505437,2.916729,622,14,4.058,62
2924,Miscellaneous,0,0.2215,0.049062,66.0689,43.211825,0,0,41,16,...,5.443747,30.482814,1.905176,5.422166,2.505437,2.916729,622,14,4.469,62
2925,Miscellaneous,0,1.1359,1.290269,54.0902,36.598239,0,0,36,13,...,5.143832,23.909473,1.839190,5.292463,2.471573,2.820889,236,21,3.034,58


In [None]:
chemDBPadel['Class Taste'].unique()

array(['Sweetness', 'Bitterness', 'Umaminess', 'Sourness', 'Saltiness',
       'Multitaste', 'Tastelessness', 'Non-sweetness', 'Miscellaneous'],
      dtype=object)

In [None]:
bitter_train_P = pd.read_csv("/content/drive/MyDrive/Capstone/Features Extracted DataSets/Padel/Cleaned Dataset/bitterTrainCleaned.csv")
bitter_test_P = pd.read_csv("/content/drive/MyDrive/Capstone/Features Extracted DataSets/Padel/Cleaned Dataset/bitterTestCleaned.csv")
sweet_train_P = pd.read_csv("/content/drive/MyDrive/Capstone/Features Extracted DataSets/Padel/Cleaned Dataset/sweet_train_clean.csv")
sweet_test_P = pd.read_csv("/content/drive/MyDrive/Capstone/Features Extracted DataSets/Padel/Cleaned Dataset/sweet_test_clean.csv")

In [None]:
bitter_train_P

Unnamed: 0,Name,Taste,Bitter,Name.1,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,Sucrose,Sweet,False,AUTOGEN_molecule_1,0,-4.3105,18.580410,68.7741,44.611446,0,...,7.602582,45.298746,1.969511,29.320458,29.320458,0.000000,1110,43,-3.277,120
1,"Sucralose / 4,1',6'-Trichloro-galactosucrose",Sweet,False,AUTOGEN_molecule_2,0,-0.6967,0.485391,77.9271,46.745067,0,...,9.428918,45.298746,1.969511,29.320458,21.851311,0.000000,1110,43,-0.734,120
2,Alitame,Sweet,False,AUTOGEN_molecule_3,1,-0.3908,0.152725,95.1429,59.572169,0,...,6.372266,46.302152,1.929256,22.105073,10.719972,8.482492,1807,27,0.775,116
3,Aspartame/Aspartyl-phenylalanine methylester,Sweet,False,AUTOGEN_molecule_4,1,-1.8303,3.349998,46.9315,42.852274,6,...,7.541579,40.865946,1.945997,17.972641,12.463725,5.508916,1000,27,1.743,96
4,Tagatose,Sweet,False,AUTOGEN_molecule_5,0,-2.5636,6.572045,36.3608,23.373516,0,...,7.502641,23.000999,1.916750,15.355991,15.355991,0.000000,178,20,-1.740,60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2247,6-Methyl-2-pyridinemethanol,Bitter,True,AUTOGEN_molecule_617,0,-0.0092,0.000085,13.0093,20.223137,6,...,6.837134,17.549446,1.949938,5.429766,2.454019,2.975746,88,9,1.262,40
2248,4-hydroxybenzyl alcohol,Bitter,True,AUTOGEN_molecule_618,0,-0.9090,0.826281,9.8414,19.258344,6,...,7.297202,17.549594,1.949955,4.961358,4.961358,0.000000,90,9,0.813,40
2249,4-Benzoylpyridine,Bitter,True,AUTOGEN_molecule_619,0,-0.2638,0.069590,5.3460,29.023137,12,...,7.959496,28.405439,2.028960,5.566157,2.571148,2.995009,307,18,3.648,68
2250,4-(5-Methyl-2-furyl)-2-butanone,Bitter,True,AUTOGEN_molecule_620,0,0.3219,0.103620,27.1230,25.445516,5,...,6.612336,21.308457,1.937132,5.377103,5.377103,0.000000,173,9,1.752,50


In [None]:
bitter_train_P=bitter_train_P.drop(['Name', 'Bitter','Name.1'], axis=1)

In [None]:
bitter_train_P

Unnamed: 0,Taste,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,Sweet,0,-4.3105,18.580410,68.7741,44.611446,0,0,45,23,...,7.602582,45.298746,1.969511,29.320458,29.320458,0.000000,1110,43,-3.277,120
1,Sweet,0,-0.6967,0.485391,77.9271,46.745067,0,0,42,23,...,9.428918,45.298746,1.969511,29.320458,21.851311,0.000000,1110,43,-0.734,120
2,Sweet,1,-0.3908,0.152725,95.1429,59.572169,0,0,57,24,...,6.372266,46.302152,1.929256,22.105073,10.719972,8.482492,1807,27,0.775,116
3,Sweet,1,-1.8303,3.349998,46.9315,42.852274,6,6,39,21,...,7.541579,40.865946,1.945997,17.972641,12.463725,5.508916,1000,27,1.743,96
4,Sweet,0,-2.5636,6.572045,36.3608,23.373516,0,0,24,12,...,7.502641,23.000999,1.916750,15.355991,15.355991,0.000000,178,20,-1.740,60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2247,Bitter,0,-0.0092,0.000085,13.0093,20.223137,6,6,18,9,...,6.837134,17.549446,1.949938,5.429766,2.454019,2.975746,88,9,1.262,40
2248,Bitter,0,-0.9090,0.826281,9.8414,19.258344,6,6,17,9,...,7.297202,17.549594,1.949955,4.961358,4.961358,0.000000,90,9,0.813,40
2249,Bitter,0,-0.2638,0.069590,5.3460,29.023137,12,12,23,14,...,7.959496,28.405439,2.028960,5.566157,2.571148,2.995009,307,18,3.648,68
2250,Bitter,0,0.3219,0.103620,27.1230,25.445516,5,5,23,11,...,6.612336,21.308457,1.937132,5.377103,5.377103,0.000000,173,9,1.752,50


In [None]:
bitter_test_P=bitter_test_P.drop(['Name', 'Bitter','Name.1'], axis=1)

In [None]:
sweet_train_P=sweet_train_P.drop(['Name', 'Sweet','Name.1'], axis=1)

In [None]:
sweet_test_P=sweet_test_P.drop(['Name', 'Sweet','Name.1'], axis=1)

In [None]:
sweet_test_P=sweet_test_P.drop(['Unnamed: 0'], axis=1)
sweet_train_P=sweet_train_P.drop(['Unnamed: 0'], axis=1)

In [None]:
sweet_train_P

Unnamed: 0,Taste,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,Sweet,0,-4.3105,18.580410,68.7741,44.611446,0,0,45,23,...,7.602582,45.298746,1.969511,29.320458,29.320458,0.000000,1110,43,-3.277,120
1,Sweet,0,-0.6967,0.485391,77.9271,46.745067,0,0,42,23,...,9.428918,45.298746,1.969511,29.320458,21.851311,0.000000,1110,43,-0.734,120
2,Sweet,1,-1.8303,3.349998,46.9315,42.852274,6,6,39,21,...,7.541579,40.865946,1.945997,17.972641,12.463725,5.508916,1000,27,1.743,96
3,Sweet,0,-2.5636,6.572045,36.3608,23.373516,0,0,24,12,...,7.502641,23.000999,1.916750,15.355991,15.355991,0.000000,178,20,-1.740,60
4,Sweet,0,-4.3105,18.580410,68.7741,44.611446,0,0,45,23,...,7.602582,45.281674,1.968768,29.221647,29.221647,0.000000,1220,41,-3.277,120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2195,Bitter,0,-0.0092,0.000085,13.0093,20.223137,6,6,18,9,...,6.837134,17.549446,1.949938,5.429766,2.454019,2.975746,88,9,1.262,40
2196,Bitter,0,-0.9090,0.826281,9.8414,19.258344,6,6,17,9,...,7.297202,17.549594,1.949955,4.961358,4.961358,0.000000,90,9,0.813,40
2197,Bitter,0,-0.2638,0.069590,5.3460,29.023137,12,12,23,14,...,7.959496,28.405439,2.028960,5.566157,2.571148,2.995009,307,18,3.648,68
2198,Bitter,0,0.3219,0.103620,27.1230,25.445516,5,5,23,11,...,6.612336,21.308457,1.937132,5.377103,5.377103,0.000000,173,9,1.752,50


In [None]:
bitter_train_P['Taste'].unique()

array(['Sweet', 'Bitter', 'Tasteless'], dtype=object)

In [None]:
bitter_test_P['Taste'].unique()

array(['Non-bitter', 'Bitter'], dtype=object)

In [None]:
sweet_train_P['Taste'].unique()

array(['Sweet', 'Bitter', 'Tasteless'], dtype=object)

In [None]:
sweet_test_P['Taste'].unique()

array(['Sweet', 'Bitter', 'Tasteless'], dtype=object)

In [None]:
sweet_test_P.rename(columns={'Taste': 'Class Taste'}, inplace=True)

In [None]:
sweet_train_P.rename(columns={'Taste': 'Class Taste'}, inplace=True)

In [None]:
bitter_test_P.rename(columns={'Taste': 'Class Taste'}, inplace=True)

In [None]:
bitter_train_P.rename(columns={'Taste': 'Class Taste'}, inplace=True)

In [None]:
bitter_test_P = bitter_test_P.loc[bitter_test_P['Class Taste'] == 'Bitter']

In [None]:
bitter_test_P

Unnamed: 0,Class Taste,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
33,Bitter,0,-0.0535,0.002862,101.3003,72.179790,5,5,64,34,...,7.346782,69.517332,2.044627,22.357647,22.357647,0.000000,2698,85,1.449,214
34,Bitter,0,-0.0988,0.009761,104.4448,68.764962,0,0,64,30,...,6.597351,60.673241,2.022441,18.939523,18.939523,0.000000,1881,70,2.114,174
35,Bitter,0,0.7128,0.508084,99.7896,71.377790,5,5,63,33,...,7.209510,67.692875,2.051299,19.783147,19.783147,0.000000,2542,82,2.690,208
36,Bitter,0,-1.2999,1.689740,103.4129,67.004962,0,0,63,29,...,6.511594,58.566460,2.019533,18.846745,18.846745,0.000000,1710,72,0.693,172
37,Bitter,0,-0.3399,0.115532,102.7754,63.535790,0,0,58,28,...,6.727659,56.806840,2.028816,16.364716,16.364716,0.000000,1575,68,1.377,164
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166,Bitter,0,-2.1746,4.728885,44.9435,50.098274,12,12,44,26,...,8.229549,52.405345,2.015590,21.484941,21.484941,0.000000,1633,49,0.095,138
167,Bitter,0,-1.6516,2.727783,25.1911,40.469516,12,12,33,21,...,8.547777,41.983572,1.999218,12.963296,9.907891,3.055405,1064,28,2.975,102
168,Bitter,0,-1.2949,1.676766,37.2581,47.458688,12,12,40,24,...,8.152571,47.879283,1.994970,16.037180,12.980349,3.056831,1535,35,2.839,116
169,Bitter,0,-1.6516,2.727783,25.1911,40.469516,12,12,33,21,...,8.547777,41.983151,1.999198,12.967464,9.912138,3.055326,1050,28,2.975,102


In [None]:
chemDBModredExtended = chemDBPadel.append(bitter_test_P)

In [None]:
chemDBModredExtended

Unnamed: 0,Class Taste,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,Sweetness,0,-2.4092,5.804245,29.9362,42.307102,12,12,36,22,...,8.391084,45.461903,2.066450,15.901910,15.901910,0.000000,912,44,0.503,130
1,Sweetness,0,1.9056,3.631311,73.5138,44.809032,0,0,42,18,...,6.004108,34.664420,1.925801,7.514035,7.514035,0.000000,633,26,1.865,88
2,Sweetness,0,-2.5478,6.491285,35.6603,45.671102,12,12,39,25,...,8.873561,50.408763,2.016351,21.332061,21.332061,0.000000,1370,44,0.339,134
3,Sweetness,0,-2.4092,5.804245,29.9362,42.307102,12,12,36,22,...,8.391084,45.461903,2.066450,15.901910,15.901910,0.000000,912,44,0.503,130
4,Sweetness,0,-3.0642,9.389322,35.7750,23.373516,0,0,24,12,...,7.502641,22.897487,1.908124,14.905717,14.905717,0.000000,174,21,-1.458,60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166,Bitter,0,-2.1746,4.728885,44.9435,50.098274,12,12,44,26,...,8.229549,52.405345,2.015590,21.484941,21.484941,0.000000,1633,49,0.095,138
167,Bitter,0,-1.6516,2.727783,25.1911,40.469516,12,12,33,21,...,8.547777,41.983572,1.999218,12.963296,9.907891,3.055405,1064,28,2.975,102
168,Bitter,0,-1.2949,1.676766,37.2581,47.458688,12,12,40,24,...,8.152571,47.879283,1.994970,16.037180,12.980349,3.056831,1535,35,2.839,116
169,Bitter,0,-1.6516,2.727783,25.1911,40.469516,12,12,33,21,...,8.547777,41.983151,1.999198,12.967464,9.912138,3.055326,1050,28,2.975,102


In [None]:
chemDBModredExtended = chemDBModredExtended.append(bitter_train_P)

In [None]:
chemDBModredExtended

Unnamed: 0,Class Taste,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,Sweetness,0,-2.4092,5.804245,29.9362,42.307102,12,12,36,22,...,8.391084,45.461903,2.066450,15.901910,15.901910,0.000000,912,44,0.503,130
1,Sweetness,0,1.9056,3.631311,73.5138,44.809032,0,0,42,18,...,6.004108,34.664420,1.925801,7.514035,7.514035,0.000000,633,26,1.865,88
2,Sweetness,0,-2.5478,6.491285,35.6603,45.671102,12,12,39,25,...,8.873561,50.408763,2.016351,21.332061,21.332061,0.000000,1370,44,0.339,134
3,Sweetness,0,-2.4092,5.804245,29.9362,42.307102,12,12,36,22,...,8.391084,45.461903,2.066450,15.901910,15.901910,0.000000,912,44,0.503,130
4,Sweetness,0,-3.0642,9.389322,35.7750,23.373516,0,0,24,12,...,7.502641,22.897487,1.908124,14.905717,14.905717,0.000000,174,21,-1.458,60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2247,Bitter,0,-0.0092,0.000085,13.0093,20.223137,6,6,18,9,...,6.837134,17.549446,1.949938,5.429766,2.454019,2.975746,88,9,1.262,40
2248,Bitter,0,-0.9090,0.826281,9.8414,19.258344,6,6,17,9,...,7.297202,17.549594,1.949955,4.961358,4.961358,0.000000,90,9,0.813,40
2249,Bitter,0,-0.2638,0.069590,5.3460,29.023137,12,12,23,14,...,7.959496,28.405439,2.028960,5.566157,2.571148,2.995009,307,18,3.648,68
2250,Bitter,0,0.3219,0.103620,27.1230,25.445516,5,5,23,11,...,6.612336,21.308457,1.937132,5.377103,5.377103,0.000000,173,9,1.752,50


In [None]:
chemDBModredExtended = chemDBModredExtended.append(sweet_test_P)

In [None]:
chemDBModredExtended = chemDBModredExtended.append(sweet_train_P)

In [None]:
chemDBModredExtended

Unnamed: 0,Class Taste,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,Sweetness,0,-2.4092,5.804245,29.9362,42.307102,12,12,36,22,...,8.391084,45.461903,2.066450,15.901910,15.901910,0.000000,912,44,0.503,130
1,Sweetness,0,1.9056,3.631311,73.5138,44.809032,0,0,42,18,...,6.004108,34.664420,1.925801,7.514035,7.514035,0.000000,633,26,1.865,88
2,Sweetness,0,-2.5478,6.491285,35.6603,45.671102,12,12,39,25,...,8.873561,50.408763,2.016351,21.332061,21.332061,0.000000,1370,44,0.339,134
3,Sweetness,0,-2.4092,5.804245,29.9362,42.307102,12,12,36,22,...,8.391084,45.461903,2.066450,15.901910,15.901910,0.000000,912,44,0.503,130
4,Sweetness,0,-3.0642,9.389322,35.7750,23.373516,0,0,24,12,...,7.502641,22.897487,1.908124,14.905717,14.905717,0.000000,174,21,-1.458,60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2195,Bitter,0,-0.0092,0.000085,13.0093,20.223137,6,6,18,9,...,6.837134,17.549446,1.949938,5.429766,2.454019,2.975746,88,9,1.262,40
2196,Bitter,0,-0.9090,0.826281,9.8414,19.258344,6,6,17,9,...,7.297202,17.549594,1.949955,4.961358,4.961358,0.000000,90,9,0.813,40
2197,Bitter,0,-0.2638,0.069590,5.3460,29.023137,12,12,23,14,...,7.959496,28.405439,2.028960,5.566157,2.571148,2.995009,307,18,3.648,68
2198,Bitter,0,0.3219,0.103620,27.1230,25.445516,5,5,23,11,...,6.612336,21.308457,1.937132,5.377103,5.377103,0.000000,173,9,1.752,50


In [None]:
chemDBModredExtended['Class Taste'].unique()

array(['Sweetness', 'Bitterness', 'Umaminess', 'Sourness', 'Saltiness',
       'Multitaste', 'Tastelessness', 'Non-sweetness', 'Miscellaneous',
       'Bitter', 'Sweet', 'Tasteless'], dtype=object)

In [None]:
chemDBModredExtended['Class Taste'] = chemDBModredExtended['Class Taste'].str.replace('Bitter','Bitterness')

In [None]:
chemDBModredExtended['Class Taste'].unique()

array(['Sweetness', 'Bitternessness', 'Umaminess', 'Sourness',
       'Saltiness', 'Multitaste', 'Tastelessness', 'Non-sweetness',
       'Miscellaneous', 'Bitterness', 'Sweet', 'Tasteless'], dtype=object)

In [None]:
chemDBModredExtended['Class Taste'] = chemDBModredExtended['Class Taste'].str.replace('Bitternessness','Bitterness')

In [None]:
chemDBModredExtended['Class Taste'].unique()

array(['Sweetness', 'Bitterness', 'Umaminess', 'Sourness', 'Saltiness',
       'Multitaste', 'Tastelessness', 'Non-sweetness', 'Miscellaneous',
       'Sweet', 'Tasteless'], dtype=object)

In [None]:
chemDBModredExtended['Class Taste'] = chemDBModredExtended['Class Taste'].str.replace('Sweet','Sweetness')

In [None]:
chemDBModredExtended['Class Taste'].unique()

array(['Sweetnessness', 'Bitterness', 'Umaminess', 'Sourness',
       'Saltiness', 'Multitaste', 'Tastelessness', 'Non-sweetness',
       'Miscellaneous', 'Sweetness', 'Tasteless'], dtype=object)

In [None]:
chemDBModredExtended['Class Taste'] = chemDBModredExtended['Class Taste'].str.replace('Sweetnessness','Sweetness')

In [None]:
chemDBModredExtended['Class Taste'].unique()

array(['Sweetness', 'Bitterness', 'Umaminess', 'Sourness', 'Saltiness',
       'Multitaste', 'Tastelessness', 'Non-sweetness', 'Miscellaneous',
       'Tasteless'], dtype=object)

In [None]:
chemDBModredExtended['Class Taste'] = chemDBModredExtended['Class Taste'].str.replace('Tasteless','Tastelessness')

In [None]:
chemDBModredExtended['Class Taste'].unique()

array(['Sweetness', 'Bitterness', 'Umaminess', 'Sourness', 'Saltiness',
       'Multitaste', 'Tastelessnessness', 'Non-sweetness',
       'Miscellaneous', 'Tastelessness'], dtype=object)

In [None]:
chemDBModredExtended['Class Taste'] = chemDBModredExtended['Class Taste'].str.replace('Tastelessnessness','Tastelessness')

In [None]:
chemDBModredExtended['Class Taste'].unique()

array(['Sweetness', 'Bitterness', 'Umaminess', 'Sourness', 'Saltiness',
       'Multitaste', 'Tastelessness', 'Non-sweetness', 'Miscellaneous'],
      dtype=object)

In [None]:
chemDBModredExtended

Unnamed: 0,Class Taste,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,Sweetness,0,-2.4092,5.804245,29.9362,42.307102,12,12,36,22,...,8.391084,45.461903,2.066450,15.901910,15.901910,0.000000,912,44,0.503,130
1,Sweetness,0,1.9056,3.631311,73.5138,44.809032,0,0,42,18,...,6.004108,34.664420,1.925801,7.514035,7.514035,0.000000,633,26,1.865,88
2,Sweetness,0,-2.5478,6.491285,35.6603,45.671102,12,12,39,25,...,8.873561,50.408763,2.016351,21.332061,21.332061,0.000000,1370,44,0.339,134
3,Sweetness,0,-2.4092,5.804245,29.9362,42.307102,12,12,36,22,...,8.391084,45.461903,2.066450,15.901910,15.901910,0.000000,912,44,0.503,130
4,Sweetness,0,-3.0642,9.389322,35.7750,23.373516,0,0,24,12,...,7.502641,22.897487,1.908124,14.905717,14.905717,0.000000,174,21,-1.458,60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2195,Bitterness,0,-0.0092,0.000085,13.0093,20.223137,6,6,18,9,...,6.837134,17.549446,1.949938,5.429766,2.454019,2.975746,88,9,1.262,40
2196,Bitterness,0,-0.9090,0.826281,9.8414,19.258344,6,6,17,9,...,7.297202,17.549594,1.949955,4.961358,4.961358,0.000000,90,9,0.813,40
2197,Bitterness,0,-0.2638,0.069590,5.3460,29.023137,12,12,23,14,...,7.959496,28.405439,2.028960,5.566157,2.571148,2.995009,307,18,3.648,68
2198,Bitterness,0,0.3219,0.103620,27.1230,25.445516,5,5,23,11,...,6.612336,21.308457,1.937132,5.377103,5.377103,0.000000,173,9,1.752,50


In [None]:
chemDBModredExtended = chemDBModredExtended.drop_duplicates()

In [None]:
chemDBModredExtended

Unnamed: 0,Class Taste,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,Sweetness,0,-2.4092,5.804245,29.9362,42.307102,12,12,36,22,...,8.391084,45.461903,2.066450,15.901910,15.901910,0.000000,912,44,0.503,130
1,Sweetness,0,1.9056,3.631311,73.5138,44.809032,0,0,42,18,...,6.004108,34.664420,1.925801,7.514035,7.514035,0.000000,633,26,1.865,88
2,Sweetness,0,-2.5478,6.491285,35.6603,45.671102,12,12,39,25,...,8.873561,50.408763,2.016351,21.332061,21.332061,0.000000,1370,44,0.339,134
4,Sweetness,0,-3.0642,9.389322,35.7750,23.373516,0,0,24,12,...,7.502641,22.897487,1.908124,14.905717,14.905717,0.000000,174,21,-1.458,60
5,Sweetness,0,0.2099,0.044058,56.0748,32.072688,0,0,30,14,...,6.537373,27.415257,1.958233,10.743140,5.146601,5.596539,342,17,1.351,62
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2195,Bitterness,0,-0.0092,0.000085,13.0093,20.223137,6,6,18,9,...,6.837134,17.549446,1.949938,5.429766,2.454019,2.975746,88,9,1.262,40
2196,Bitterness,0,-0.9090,0.826281,9.8414,19.258344,6,6,17,9,...,7.297202,17.549594,1.949955,4.961358,4.961358,0.000000,90,9,0.813,40
2197,Bitterness,0,-0.2638,0.069590,5.3460,29.023137,12,12,23,14,...,7.959496,28.405439,2.028960,5.566157,2.571148,2.995009,307,18,3.648,68
2198,Bitterness,0,0.3219,0.103620,27.1230,25.445516,5,5,23,11,...,6.612336,21.308457,1.937132,5.377103,5.377103,0.000000,173,9,1.752,50


In [None]:
chemDBModredExtended.reset_index(inplace = True)

In [None]:
chemDBModredExtended

Unnamed: 0,index,Class Taste,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,0,Sweetness,0,-2.4092,5.804245,29.9362,42.307102,12,12,36,...,8.391084,45.461903,2.066450,15.901910,15.901910,0.000000,912,44,0.503,130
1,1,Sweetness,0,1.9056,3.631311,73.5138,44.809032,0,0,42,...,6.004108,34.664420,1.925801,7.514035,7.514035,0.000000,633,26,1.865,88
2,2,Sweetness,0,-2.5478,6.491285,35.6603,45.671102,12,12,39,...,8.873561,50.408763,2.016351,21.332061,21.332061,0.000000,1370,44,0.339,134
3,4,Sweetness,0,-3.0642,9.389322,35.7750,23.373516,0,0,24,...,7.502641,22.897487,1.908124,14.905717,14.905717,0.000000,174,21,-1.458,60
4,5,Sweetness,0,0.2099,0.044058,56.0748,32.072688,0,0,30,...,6.537373,27.415257,1.958233,10.743140,5.146601,5.596539,342,17,1.351,62
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5723,2195,Bitterness,0,-0.0092,0.000085,13.0093,20.223137,6,6,18,...,6.837134,17.549446,1.949938,5.429766,2.454019,2.975746,88,9,1.262,40
5724,2196,Bitterness,0,-0.9090,0.826281,9.8414,19.258344,6,6,17,...,7.297202,17.549594,1.949955,4.961358,4.961358,0.000000,90,9,0.813,40
5725,2197,Bitterness,0,-0.2638,0.069590,5.3460,29.023137,12,12,23,...,7.959496,28.405439,2.028960,5.566157,2.571148,2.995009,307,18,3.648,68
5726,2198,Bitterness,0,0.3219,0.103620,27.1230,25.445516,5,5,23,...,6.612336,21.308457,1.937132,5.377103,5.377103,0.000000,173,9,1.752,50


In [None]:
chemDBModredExtended = chemDBModredExtended.drop(['index'], axis=1)

In [None]:
chemDBModredExtended = chemDBModredExtended.sample(frac=1).reset_index(drop=True)

In [None]:
chemDBModredExtended

Unnamed: 0,Class Taste,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,Umaminess,0,-2.0037,4.014814,83.0163,58.019067,5,5,50,31,...,9.561119,62.084617,2.002730,45.413635,26.152723,12.963360,2953,47,-2.802,162
1,Sweetness,0,-5.8784,34.555587,206.2451,135.572338,0,0,130,64,...,7.095420,130.804216,2.043816,60.348463,60.348463,0.000000,18629,136,-1.452,364
2,Bitterness,0,-0.1482,0.021963,27.9430,14.742758,0,0,12,6,...,8.501161,10.675851,1.779308,5.353285,2.382088,0.000000,31,4,-0.002,20
3,Sweetness,0,2.9586,8.753314,63.4004,35.684274,0,0,32,14,...,5.941743,27.002281,1.928734,2.506981,2.506981,0.000000,285,21,3.407,68
4,Bitterness,0,-3.1892,10.170997,79.8478,50.849446,0,0,48,26,...,7.794194,52.329255,2.012664,27.572613,27.572613,0.000000,1572,51,-1.400,140
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5723,Sweetness,1,1.9958,3.983218,49.3101,31.206274,0,0,30,12,...,5.671023,22.318519,1.859877,4.751104,4.751104,0.000000,244,10,3.177,48
5724,Sweetness,0,-4.1851,17.515062,211.4574,140.843096,0,0,134,62,...,6.600574,126.785623,2.044929,47.064358,47.064358,0.000000,19191,130,2.898,354
5725,Sweetness,0,1.6783,2.816691,43.0371,23.316723,0,0,21,10,...,6.527813,19.557977,1.955798,5.170178,2.434451,2.735727,127,10,1.753,44
5726,Bitterness,0,-0.2795,0.078120,47.7009,59.298997,12,12,53,24,...,6.136377,47.718541,1.988273,9.209354,2.540659,6.668695,1392,36,5.565,118


In [None]:
chemDBModredExtended.to_csv('chemTasteDBPadelMerged.csv', index = False)