Reading and writing files

In [30]:
molecules = ['Amigdalin', 'Fenfuram', 'Estradiol', '2-Methylbutanol']

In [31]:
with open('molecules.txt', 'w') as file:
    file.write('\n'.join(molecules))
#this code creates a text file named 'molecules.txt' and writes each molecule from the list on a new line.

In [32]:
print('\n'.join(molecules))

Amigdalin
Fenfuram
Estradiol
2-Methylbutanol


In [33]:
with open('molecules.txt', 'r') as file:
    molecules = [molecule for molecule in file.readlines()]
print(molecules)

['Amigdalin\n', 'Fenfuram\n', 'Estradiol\n', '2-Methylbutanol']


In [34]:
with open('molecules.txt', 'r') as file:
    molecules = [molecule.strip() for molecule in file.readlines()]
print(molecules)

['Amigdalin', 'Fenfuram', 'Estradiol', '2-Methylbutanol']


Pandas

In [35]:
import pandas as pd

In [36]:
import requests

url = "https://raw.githubusercontent.com/schwallergroup/ai4chem_course/main/notebooks/01%20-%20Basics/data/delaney-processed.csv"
content = requests.get(url).content

with open("delaney-processed.csv", "wb") as f:
    f.write(content)

In [37]:
!head data/delaney-processed.csv

'head' is not recognized as an internal or external command,
operable program or batch file.


In [38]:
df = pd.read_csv('data\\delaney-processed.csv')

In [39]:
df.head()

Unnamed: 0,Compound ID,ESOL predicted log solubility in mols per litre,Minimum Degree,Molecular Weight,Number of H-Bond Donors,Number of Rings,Number of Rotatable Bonds,Polar Surface Area,measured log solubility in mols per litre,smiles
0,Amigdalin,-0.974,1,457.432,7,3,7,202.32,-0.77,OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)...
1,Fenfuram,-2.885,1,201.225,1,2,2,42.24,-3.3,Cc1occc1C(=O)Nc2ccccc2
2,citral,-2.579,1,152.237,0,0,4,17.07,-2.06,CC(C)=CCCC(C)=CC(=O)
3,Picene,-6.618,2,278.354,0,5,0,0.0,-7.87,c1ccc2c(c1)ccc3c2ccc4c5ccccc5ccc43
4,Thiophene,-2.232,2,84.143,0,1,0,0.0,-1.33,c1ccsc1


In [40]:
solubility = df["measured log solubility in mols per litre"]
print(solubility)

0      -0.770
1      -3.300
2      -2.060
3      -7.870
4      -1.330
        ...  
1123   -1.710
1124    0.106
1125   -3.091
1126   -3.180
1127   -4.522
Name: measured log solubility in mols per litre, Length: 1128, dtype: float64


In [41]:
mean_solubility = solubility.mean()
print("Mean Solubility:", mean_solubility)

Mean Solubility: -3.05010195035461


In [42]:
high_solubility = df[df["measured log solubility in mols per litre"] > mean_solubility]
print(high_solubility)

           Compound ID  ESOL predicted log solubility in mols per litre  \
0            Amigdalin                                           -0.974   
2               citral                                           -2.579   
4            Thiophene                                           -2.232   
5        benzothiazole                                           -2.733   
10       2-pyrrolidone                                            0.243   
...                ...                                              ...   
1118  Isobutyl acetate                                           -1.463   
1121    Phthalonitrile                                           -1.717   
1122    m-Nitrotoluene                                           -2.640   
1123         halothane                                           -2.608   
1124            Oxamyl                                           -0.908   

      Minimum Degree  Molecular Weight  Number of H-Bond Donors  \
0                  1           4

In [43]:
df["Solubility Class"] = "Low"
df.head()

Unnamed: 0,Compound ID,ESOL predicted log solubility in mols per litre,Minimum Degree,Molecular Weight,Number of H-Bond Donors,Number of Rings,Number of Rotatable Bonds,Polar Surface Area,measured log solubility in mols per litre,smiles,Solubility Class
0,Amigdalin,-0.974,1,457.432,7,3,7,202.32,-0.77,OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)...,Low
1,Fenfuram,-2.885,1,201.225,1,2,2,42.24,-3.3,Cc1occc1C(=O)Nc2ccccc2,Low
2,citral,-2.579,1,152.237,0,0,4,17.07,-2.06,CC(C)=CCCC(C)=CC(=O),Low
3,Picene,-6.618,2,278.354,0,5,0,0.0,-7.87,c1ccc2c(c1)ccc3c2ccc4c5ccccc5ccc43,Low
4,Thiophene,-2.232,2,84.143,0,1,0,0.0,-1.33,c1ccsc1,Low


In [44]:
df.loc[df["measured log solubility in mols per litre"] > mean_solubility, "Solubility Class"] = "High"
df.head()

Unnamed: 0,Compound ID,ESOL predicted log solubility in mols per litre,Minimum Degree,Molecular Weight,Number of H-Bond Donors,Number of Rings,Number of Rotatable Bonds,Polar Surface Area,measured log solubility in mols per litre,smiles,Solubility Class
0,Amigdalin,-0.974,1,457.432,7,3,7,202.32,-0.77,OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)...,High
1,Fenfuram,-2.885,1,201.225,1,2,2,42.24,-3.3,Cc1occc1C(=O)Nc2ccccc2,Low
2,citral,-2.579,1,152.237,0,0,4,17.07,-2.06,CC(C)=CCCC(C)=CC(=O),High
3,Picene,-6.618,2,278.354,0,5,0,0.0,-7.87,c1ccc2c(c1)ccc3c2ccc4c5ccccc5ccc43,Low
4,Thiophene,-2.232,2,84.143,0,1,0,0.0,-1.33,c1ccsc1,High


In [45]:
grouped = df.groupby("Solubility Class")
grouped.head()

Unnamed: 0,Compound ID,ESOL predicted log solubility in mols per litre,Minimum Degree,Molecular Weight,Number of H-Bond Donors,Number of Rings,Number of Rotatable Bonds,Polar Surface Area,measured log solubility in mols per litre,smiles,Solubility Class
0,Amigdalin,-0.974,1,457.432,7,3,7,202.32,-0.77,OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)...,High
1,Fenfuram,-2.885,1,201.225,1,2,2,42.24,-3.3,Cc1occc1C(=O)Nc2ccccc2,Low
2,citral,-2.579,1,152.237,0,0,4,17.07,-2.06,CC(C)=CCCC(C)=CC(=O),High
3,Picene,-6.618,2,278.354,0,5,0,0.0,-7.87,c1ccc2c(c1)ccc3c2ccc4c5ccccc5ccc43,Low
4,Thiophene,-2.232,2,84.143,0,1,0,0.0,-1.33,c1ccsc1,High
5,benzothiazole,-2.733,2,135.191,0,2,0,12.89,-1.5,c2ccc1scnc1c2,High
6,"2,2,4,6,6'-PCB",-6.545,1,326.437,0,2,1,0.0,-7.32,Clc1cc(Cl)c(c(Cl)c1)c2c(Cl)cccc2Cl,Low
7,Estradiol,-4.138,1,272.388,2,4,0,40.46,-5.03,CC12CCC3C(CCc4cc(O)ccc34)C2CCC1O,Low
8,Dieldrin,-4.533,1,380.913,0,5,0,12.53,-6.29,ClC4=C(Cl)C5(Cl)C3C1CC(C2OC12)C3C4(Cl)C5(Cl)Cl,Low
10,2-pyrrolidone,0.243,1,85.106,1,1,0,29.1,1.07,O=C1CCCN1,High


In [49]:
grouped_means = df.groupby("Solubility Class")["measured log solubility in mols per litre"].mean()
print(grouped_means)

Solubility Class
High   -1.490285
Low    -4.822621
Name: measured log solubility in mols per litre, dtype: float64


In [50]:
df.sort_values("measured log solubility in mols per litre", ascending=False, inplace=True)
df.head()

Unnamed: 0,Compound ID,ESOL predicted log solubility in mols per litre,Minimum Degree,Molecular Weight,Number of H-Bond Donors,Number of Rings,Number of Rotatable Bonds,Polar Surface Area,measured log solubility in mols per litre,smiles,Solubility Class
605,Acetamide,0.494,1,59.068,1,0,0,43.09,1.58,CC(=O)N,High
146,Methanol,0.441,1,32.042,1,0,0,20.23,1.57,CO,High
201,Methyl hydrazine,0.543,1,46.073,2,0,0,38.05,1.34,CNN,High
1064,vamidothion,-1.446,1,287.343,1,0,8,64.63,1.144,CNC(=O)C(C)SCCSP(=O)(OC)(OC),High
679,Glycerol,0.688,1,92.094,3,0,2,60.69,1.12,OCC(O)CO,High


In [51]:
df1 = df[["Compound ID", "measured log solubility in mols per litre"]]
df2 = df[["Compound ID", "smiles"]]

merged_df = pd.merge(df1, df2, on="Compound ID")
merged_df.head()

Unnamed: 0,Compound ID,measured log solubility in mols per litre,smiles
0,Acetamide,1.58,CC(=O)N
1,Methanol,1.57,CO
2,Methyl hydrazine,1.34,CNN
3,vamidothion,1.144,CNC(=O)C(C)SCCSP(=O)(OC)(OC)
4,Glycerol,1.12,OCC(O)CO


In [54]:
df3 = df[["Compound ID", "measured log solubility in mols per litre"]].head(10)
df4 = df[["Compound ID", "smiles"]].tail(10)
concatenated = pd.concat([df3, df4])
print(concatenated)
concatenated.head()

                       Compound ID  measured log solubility in mols per litre  \
605                      Acetamide                                      1.580   
146                       Methanol                                      1.570   
201               Methyl hydrazine                                      1.340   
1064                   vamidothion                                      1.144   
679                       Glycerol                                      1.120   
687          N,N-Dimethylacetamide                                      1.110   
365                     Pyrimidine                                      1.100   
983                        Ethanol                                      1.100   
186                     Pyridazine                                      1.100   
276                       Sorbitol                                      1.090   
181                     Etofenprox                                        NaN   
637                 Benzo(a)

Unnamed: 0,Compound ID,measured log solubility in mols per litre,smiles
605,Acetamide,1.58,
146,Methanol,1.57,
201,Methyl hydrazine,1.34,
1064,vamidothion,1.144,
679,Glycerol,1.12,
