In [1]:
import pandas as pd
import sys
import os

# plot

%matplotlib inline
from rdkit import Chem

src_path = os.path.join("..")
if src_path not in sys.path:
    sys.path.append(src_path)

src_path = os.path.join("..", "src")
if src_path not in sys.path:
    sys.path.append(src_path)

src_path = os.path.join("..", "external", "HOSE_code_generator")
if src_path not in sys.path:
    sys.path.append(src_path)

from src import (
    common,
    atomic_features_2D,
)  # Some common methods I defined

pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)

pd.options.mode.chained_assignment = None  # Suppress the SettingWithCopyWarning

# Data import

In [2]:
datapath = os.path.join("..", "dataset", "All PFAS in Lab_w_smiles.xlsx")

PFAS_in_lab = pd.read_excel(datapath, index_col=0)
print(f"Number of PFAS in our lab {PFAS_in_lab.shape[0]}")
PFAS_in_lab["SMILES"] = common.canonical_smiles(PFAS_in_lab["SMILES"])
PFAS_in_lab.head(5)

Number of PFAS in our lab 222


Unnamed: 0,Group,Compound name,CAS,Alternative name,SMILES
0,PFCA,Sodium trifluoroacetate,2923-18-4,C2 COOH,O=C([O-])C(F)(F)F.[Na+]
1,PFCA,Perfluoropropionic acid,422-64-0,C3 COOH,O=C(O)C(F)(F)C(F)(F)F
2,PFCA,Perfluorobutyric acid,375-22-4,C4 COOH,O=C(O)C(F)(F)C(F)(F)C(F)(F)F
3,PFCA,Perfluoropentanoic acid,2706-90-3,C5 COOH,O=C(O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F
4,PFCA,Perfluorohexanoic acid,307-24-4,C6 COOH,O=C(O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F


In [3]:
data_path = os.path.join("..", "dataset", "Raw_PFAS 19F NMR spectra data.csv")

all_fluorinated_compounds = pd.read_csv(data_path, index_col=0)

print(
    f"Number of the fluorinated compounds in our dataset for modeling {all_fluorinated_compounds.shape[0]}"
)
all_fluorinated_compounds.head()

Number of the fluorinated compounds in our dataset for modeling 647


Unnamed: 0,Group,IsPFAS(haveCF2),CAS,Compound name,Code,Solvent_used_for_NMR,Source,Note,SMILES,Internal Standard,FluorineInConjSystem,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70
0,Per- and polyfluorinated carboxylic acids,True,,perfluoropropionic acid,COOH_1,CDCl3,100+PFAS,,O=C(O)C(F)(F)C(F)(F)F,,0.0,,,,,-122.45,-122.45,,-83.4,-83.4,-83.4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,Per- and polyfluorinated carboxylic acids,True,,perfluorobutanoic acid,COOH_2,CDCl3,100+PFAS,,O=C(O)C(F)(F)C(F)(F)C(F)(F)F,,0.0,,,,,-119.78,-119.78,,-127.39,-127.39,,-81.14,-81.14,-81.14,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,Per- and polyfluorinated carboxylic acids,True,,perfluoropentanoic acid,COOH_3,CDCl3,100+PFAS,,O=C(O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F,,0.0,,,,,-119.41,-119.41,,-123.99,-123.99,,-126.26,-126.26,,-81.19,-81.19,-81.19,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,Per- and polyfluorinated carboxylic acids,True,,perfluorohexanoic acid,COOH_4,CDCl3,100+PFAS,,O=C(O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F,,0.0,,,,,-119.23,-119.23,,-122.87,-122.87,,-123.25,-123.25,,-126.44,-126.44,,-81.06,-81.06,-81.06,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,Per- and polyfluorinated carboxylic acids,True,,perfluoroheptanoic acid,COOH_5,CDCl3,100+PFAS,,O=C(O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F,,0.0,,,,,-119.14,-119.14,,-122.01,-122.01,,-122.99,-122.99,,-122.99,-122.99,,-126.29,-126.29,,-80.96,-80.96,-80.96,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


# PFAS not reported in the modeling dataset

In [4]:
PFAS_not_reported = PFAS_in_lab[
    ~PFAS_in_lab["SMILES"].isin(all_fluorinated_compounds["SMILES"])
]
print(f"Number of PFAS not reported but in our lab: {PFAS_not_reported.shape[0]}")

Number of PFAS not reported but in our lab: 162


In [5]:
PFAS_not_reported

Unnamed: 0,Group,Compound name,CAS,Alternative name,SMILES
0,PFCA,Sodium trifluoroacetate,2923-18-4,C2 COOH,O=C([O-])C(F)(F)F.[Na+]
13,PFdiCOOH,Difluoromalonic acid,1514-85-8,C3(COOH)2,O=C(O)C(F)(F)C(=O)O
14,PFdiCOOH,"2,2-difluorosuccinic acid",665-31-6,C4(COOH)2 telo,O=C(O)CC(F)(F)C(=O)O
15,PFdiCOOH,Tetrafluorosuccinic acid,377-38-8,C4(COOH)2,O=C(O)C(F)(F)C(F)(F)C(=O)O
16,PFdiCOOH,Hexafluoroglutaric acid,376-73-8,C5(COOH)2,O=C(O)C(F)(F)C(F)(F)C(F)(F)C(=O)O
...,...,...,...,...,...
227,Ring Structures,4-(Pentafluorothio)phenol,774-94-7,,Oc1ccc(S(F)(F)(F)(F)F)cc1
229,Acyl chlroides,Perfluoro(2-methyl-3-oxahexanoyl) chloride,72848-57-8,,O=C(Cl)C(F)(OC(F)(F)C(F)(F)C(F)(F)F)C(F)(F)F
230,Ester,Methyl heptafluoroisobutyrate,680-05-7,,COC(=O)C(F)(C(F)(F)F)C(F)(F)F
232,Ketone,"1,1,1-Trifluoropentane-2,4-dione",367-57-7,,CC(=O)CC(=O)C(F)(F)F


In [6]:
PFAS_not_reported.to_csv(
    os.path.join("..", "artifacts", "temp", "PFAS_not_reported.csv")
)

## Check 19F NMR value range for specific structures

In [7]:
class track2DNeighborAtoms:
    def findNeighborsNextSphere(
        self, last_sphere_atoms, current_sphere_atoms, next_sphere_level
    ):
        """Get a list of atom indexs in the next spheres

        Parameters
        ----------
        last_sphere_atoms: list
            A list of atoms in the last sphere.

        current_sphere_atoms: list
            A list of atoms in the current sphere

        next_sphere_level: int
             The number of next sphere. For target F atom, the C atom next to it belongs to sphere 0.

        Output
        ----------
        A list of indexs of atoms in the next sphere. If the sphere is No.4. Then the output list will be a list of length 12.
        """
        neighbor_list = []

        # Convert last_sphere_atoms to a set of indices for faster lookups
        if last_sphere_atoms is not None:
            last_sphere_atoms_set = set(
                atom.GetIdx() for atom in last_sphere_atoms if atom is not None
            )
        else:
            last_sphere_atoms_set = set()

        for atom in current_sphere_atoms:
            if atom is None:
                neighbors = [
                    None
                ] * 3  # Adjusted to 3 neighbors. In our case, each atom have at most 4 neighbor atoms.
            else:
                neighbors = (
                    atom.GetNeighbors()
                )  # This function will get all neighbor atoms
                # Filter out atoms that are in last_sphere_atoms
                neighbors = [
                    a for a in neighbors if a.GetIdx() not in last_sphere_atoms_set
                ]

            neighbor_list.extend(neighbors)

        # Final length should be 3^sphere_level
        return neighbor_list

    # Get neighbors of each F atom
    def getNeighborsOfFAtom(self, F_atom):
        """For the target F atom, get neighbor atoms in the 6 neighbor spheres
        Parameters
        ----------
        F_atom: atom object

        Output
        ----------
        sphere0[:1], sphere1, sphere2, sphere3, sphere4, sphere5: list, list, list, list, list, list
            index of atoms in sphere 0, index of atoms in sphere 1, index of atoms in sphere 2, index of atoms in sphere 3,
            index of atoms in sphere 4, and index of atoms in sphere 5
        """
        sphere0_atoms = self.findNeighborsNextSphere(None, [F_atom], 0)
        sphere1_atoms = self.findNeighborsNextSphere(
            [F_atom], sphere0_atoms[:1], 1
        )  # We know sphere0 can only have one valid atom, C
        sphere2_atoms = self.findNeighborsNextSphere(
            sphere0_atoms[:1], sphere1_atoms, 2
        )
        sphere3_atoms = self.findNeighborsNextSphere(sphere1_atoms, sphere2_atoms, 3)
        sphere4_atoms = self.findNeighborsNextSphere(sphere2_atoms, sphere3_atoms, 4)
        sphere5_atoms = self.findNeighborsNextSphere(sphere3_atoms, sphere4_atoms, 5)

        sphere0 = [
            atom.GetSymbol() if atom is not None else None for atom in sphere0_atoms
        ]
        sphere1 = [
            atom.GetSymbol() if atom is not None else None for atom in sphere1_atoms
        ]
        sphere2 = [
            atom.GetSymbol() if atom is not None else None for atom in sphere2_atoms
        ]
        sphere3 = [
            atom.GetSymbol() if atom is not None else None for atom in sphere3_atoms
        ]
        sphere4 = [
            atom.GetSymbol() if atom is not None else None for atom in sphere4_atoms
        ]
        sphere5 = [
            atom.GetSymbol() if atom is not None else None for atom in sphere5_atoms
        ]

        return (
            sphere0[:1],
            sphere1,
            sphere2,
            sphere3,
            sphere4,
            sphere5,
        )  # Only keep the only one valid atom, C

    def getNeighborsInDiffSpheres(self, smiles):
        """
        Parameter
        ----------
        smiles: string
            SMILES of the target compound

        Output
        ----------
        df： Dataframe
            Each line in the df shows the index of neighbor atoms for one F atom in the molecule.
            column name of the df are the number of spheres: 0, 1, 2, 3, 4, 5.
            index of the df are index of F atoms
        """
        # Create an RDKit molecule object from the SMILES string
        mol = Chem.MolFromSmiles(smiles)
        mol = Chem.AddHs(mol)
        neighbors = {}
        for atom in mol.GetAtoms():
            atom_symbol = (
                atom.GetSymbol()
            )  # Get the atomic symbol (e.g., 'C', 'O', 'F')
            if atom_symbol == "F":
                atom_index = atom.GetIdx()  # Get the index of the atom
                sphere0, sphere1, sphere2, sphere3, sphere4, sphere5 = (
                    self.getNeighborsOfFAtom(atom)
                )
                neighbors[atom_index] = [
                    sphere0,
                    sphere1,
                    sphere2,
                    sphere3,
                    sphere4,
                    sphere5,
                ]

        df = pd.DataFrame.from_dict(neighbors).T
        df = df.applymap(lambda x: tuple(x) if isinstance(x, list) else x)
        df = df.drop_duplicates()
        return df

    def getNieghborsFromDataset(self, dataset):
        """Get a dataframe with each row being the atomic features of neighbor atoms of a target F atom in a fluorinated compound"""
        # Step 1. Transform the column names of the DataFrame to integers where possible and keep them as strings otherwise
        dataset.columns = [common.convert_column_name(name) for name in dataset.columns]
        fluorinated_compounds_content = pd.DataFrame()
        for i, row in dataset.iterrows():
            smiles = row["SMILES"]
            fluorinated_compounds = row["Code"]
            content = self.getNeighborsInDiffSpheres(smiles)
            index_list = content.index
            try:
                content["NMR_Peaks"] = row[index_list]
            except (KeyError, IndexError):
                pass

            content = content.rename(lambda x: f"{x}_{fluorinated_compounds}")
            fluorinated_compounds_content = pd.concat(
                [fluorinated_compounds_content, content], axis=0
            )
        return fluorinated_compounds_content

In [8]:
get2DNeighbors_class = track2DNeighborAtoms()
df = get2DNeighbors_class.getNieghborsFromDataset(all_fluorinated_compounds)
df

  df = df.applymap(lambda x: tuple(x) if isinstance(x, list) else x)
  df = df.applymap(lambda x: tuple(x) if isinstance(x, list) else x)
  df = df.applymap(lambda x: tuple(x) if isinstance(x, list) else x)
  df = df.applymap(lambda x: tuple(x) if isinstance(x, list) else x)
  df = df.applymap(lambda x: tuple(x) if isinstance(x, list) else x)
  df = df.applymap(lambda x: tuple(x) if isinstance(x, list) else x)
  df = df.applymap(lambda x: tuple(x) if isinstance(x, list) else x)
  df = df.applymap(lambda x: tuple(x) if isinstance(x, list) else x)
  df = df.applymap(lambda x: tuple(x) if isinstance(x, list) else x)
  df = df.applymap(lambda x: tuple(x) if isinstance(x, list) else x)
  df = df.applymap(lambda x: tuple(x) if isinstance(x, list) else x)
  df = df.applymap(lambda x: tuple(x) if isinstance(x, list) else x)
  df = df.applymap(lambda x: tuple(x) if isinstance(x, list) else x)
  df = df.applymap(lambda x: tuple(x) if isinstance(x, list) else x)
  df = df.applymap(lambda x: tuple

Unnamed: 0,0,1,2,3,4,5,NMR_Peaks
4_COOH_1,"(C,)","(C, F, C)","(O, O, F, F, F)","(H,)",(),(),-122.45
7_COOH_1,"(C,)","(C, F, F)","(C, F, F)","(O, O)","(H,)",(),-83.4
4_COOH_2,"(C,)","(C, F, C)","(O, O, F, F, C)","(H, F, F, F)",(),(),-119.78
7_COOH_2,"(C,)","(C, F, C)","(C, F, F, F, F, F)","(O, O)","(H,)",(),-127.39
10_COOH_2,"(C,)","(C, F, F)","(C, F, F)","(C, F, F)","(O, O)","(H,)",-81.14
...,...,...,...,...,...,...,...
7_From_Review_484,"(C,)","(C, N)","(C, F, C)","(N, H, F, N)",(),(),-72.0
0_From_Review_485,"(C,)","(N, N)","(C, C)","(F, C, C, F)","(F, F)",(),-43.0
4_From_Review_485,"(C,)","(N, C)","(C, F, C)","(F, N, F, N)",(),(),-69.0
6_From_Review_485,"(C,)","(C, C)","(N, F, F, N)","(C, C)","(F, F)",(),-171.0


In [9]:
df_dropna = df.dropna(subset=["NMR_Peaks"])
df_dropna["NMR_Peaks"] = df_dropna["NMR_Peaks"].apply(pd.to_numeric)
df_dropna

Unnamed: 0,0,1,2,3,4,5,NMR_Peaks
4_COOH_1,"(C,)","(C, F, C)","(O, O, F, F, F)","(H,)",(),(),-122.45
7_COOH_1,"(C,)","(C, F, F)","(C, F, F)","(O, O)","(H,)",(),-83.40
4_COOH_2,"(C,)","(C, F, C)","(O, O, F, F, C)","(H, F, F, F)",(),(),-119.78
7_COOH_2,"(C,)","(C, F, C)","(C, F, F, F, F, F)","(O, O)","(H,)",(),-127.39
10_COOH_2,"(C,)","(C, F, F)","(C, F, F)","(C, F, F)","(O, O)","(H,)",-81.14
...,...,...,...,...,...,...,...
7_From_Review_484,"(C,)","(C, N)","(C, F, C)","(N, H, F, N)",(),(),-72.00
0_From_Review_485,"(C,)","(N, N)","(C, C)","(F, C, C, F)","(F, F)",(),-43.00
4_From_Review_485,"(C,)","(N, C)","(C, F, C)","(F, N, F, N)",(),(),-69.00
6_From_Review_485,"(C,)","(C, C)","(N, F, F, N)","(C, C)","(F, F)",(),-171.00


In [10]:
get_2d_descriptors = atomic_features_2D.getAtomicDescriptorsFrom2DNeighbors()
descriptor_2d_content = get_2d_descriptors.getDescriptorsFromDataset(
    all_fluorinated_compounds, 3
)
descriptor_2d_content.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,NMR_Peaks
4_COOH_1,12.011,3.0,0.0,4.0,4.0,4.0,0.0,12.011,2.0,0.0,3.0,4.0,4.0,0.0,18.998,3.0,0.0,1.0,1.0,1.0,0.0,12.011,3.0,0.0,4.0,4.0,4.0,0.0,15.999,2.0,0.0,1.0,2.0,2.0,0.0,15.999,2.0,0.0,1.0,2.0,1.0,0.0,18.998,3.0,0.0,1.0,1.0,1.0,0.0,18.998,3.0,0.0,1.0,1.0,1.0,0.0,18.998,3.0,0.0,1.0,1.0,1.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-122.45
5_COOH_1,12.011,3.0,0.0,4.0,4.0,4.0,0.0,12.011,2.0,0.0,3.0,4.0,4.0,0.0,18.998,3.0,0.0,1.0,1.0,1.0,0.0,12.011,3.0,0.0,4.0,4.0,4.0,0.0,15.999,2.0,0.0,1.0,2.0,2.0,0.0,15.999,2.0,0.0,1.0,2.0,1.0,0.0,18.998,3.0,0.0,1.0,1.0,1.0,0.0,18.998,3.0,0.0,1.0,1.0,1.0,0.0,18.998,3.0,0.0,1.0,1.0,1.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-122.45
7_COOH_1,12.011,3.0,0.0,4.0,4.0,4.0,0.0,12.011,3.0,0.0,4.0,4.0,4.0,0.0,18.998,3.0,0.0,1.0,1.0,1.0,0.0,18.998,3.0,0.0,1.0,1.0,1.0,0.0,12.011,2.0,0.0,3.0,4.0,4.0,0.0,18.998,3.0,0.0,1.0,1.0,1.0,0.0,18.998,3.0,0.0,1.0,1.0,1.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-83.4


## Get F shifts in different structures

In [11]:
# Find F in certain structures:
# CF3-C: 0 [C]; 1 [C, F, F];

results = {}


df_list = df_dropna.applymap(lambda x: list(x) if isinstance(x, tuple) else x)
filtered_df = df_list[
    (df_list[0].apply(lambda x: x == ["C"]))
    & (
        df_list[1].apply(
            lambda x: x.count("F") == 2 and x.count("C") == 1 and len(x) == 3
        )
    )
]

max_value = filtered_df["NMR_Peaks"].max()
mean_value = filtered_df["NMR_Peaks"].mean()
min_value = filtered_df["NMR_Peaks"].min()
num_points = len(filtered_df)
results["CF3-C"] = [min_value, max_value, mean_value, num_points]

# Print the results
print("Max:", max_value)
print("Mean:", mean_value)
print("Min:", min_value)

print(f"Number of datapoints in our dataset: {num_points}")
filtered_df.head(3)

Max: -49.4
Mean: -72.9153869047619
Min: -88.2
Number of datapoints in our dataset: 336


  df_list = df_dropna.applymap(lambda x: list(x) if isinstance(x, tuple) else x)


Unnamed: 0,0,1,2,3,4,5,NMR_Peaks
7_COOH_1,[C],"[C, F, F]","[C, F, F]","[O, O]",[H],[],-83.4
10_COOH_2,[C],"[C, F, F]","[C, F, F]","[C, F, F]","[O, O]",[H],-81.14
13_COOH_3,[C],"[C, F, F]","[C, F, F]","[C, F, F]","[C, F, F]","[O, O]",-81.19


In [12]:
# C-CF2-C: 0 [C]; 1 [C, F, C];
df_list = df_dropna.applymap(lambda x: list(x) if isinstance(x, tuple) else x)
filtered_df = df_list[
    (df_list[0].apply(lambda x: x == ["C"]))
    & (
        df_list[1].apply(
            lambda x: x.count("F") == 1 and x.count("C") == 2 and len(x) == 3
        )
    )
]

max_value = filtered_df["NMR_Peaks"].max()
mean_value = filtered_df["NMR_Peaks"].mean()
min_value = filtered_df["NMR_Peaks"].min()
num_points = len(filtered_df)
results["C-CF2-C"] = [min_value, max_value, mean_value, num_points]

# Print the results
print("Max:", max_value)
print("Mean:", mean_value)
print("Min:", min_value)
print(f"Number of datapoints in our dataset: {len(filtered_df)}")
filtered_df.head(3)

Max: -84.5
Mean: -121.7869065934066
Min: -159.0
Number of datapoints in our dataset: 728


  df_list = df_dropna.applymap(lambda x: list(x) if isinstance(x, tuple) else x)


Unnamed: 0,0,1,2,3,4,5,NMR_Peaks
4_COOH_1,[C],"[C, F, C]","[O, O, F, F, F]",[H],[],[],-122.45
4_COOH_2,[C],"[C, F, C]","[O, O, F, F, C]","[H, F, F, F]",[],[],-119.78
7_COOH_2,[C],"[C, F, C]","[C, F, F, F, F, F]","[O, O]",[H],[],-127.39


In [13]:
# CF3-C6H5 (CF3-aryl): 0 [C]; 1 [C, F, F]; 2 [C, C]; 3 [H, H, C, C]
df_list = df_dropna.applymap(lambda x: list(x) if isinstance(x, tuple) else x)
filtered_df = df_list[
    (df_list[0].apply(lambda x: x == ["C"]))
    & (
        df_list[1].apply(
            lambda x: x.count("F") == 2 and x.count("C") == 1 and len(x) == 3
        )
    )
    & (df_list[2].apply(lambda x: x.count("C") == 2 and len(x) == 2))
]

max_value = filtered_df["NMR_Peaks"].max()
mean_value = filtered_df["NMR_Peaks"].mean()
min_value = filtered_df["NMR_Peaks"].min()
num_points = len(filtered_df)
results["CF3-C6H5"] = [min_value, max_value, mean_value, num_points]

# Print the results
print("Max:", max_value)
print("Mean:", mean_value)
print("Min:", min_value)
print(f"Number of datapoints in our dataset: {len(filtered_df)}")
filtered_df

Max: -49.4
Mean: -58.976666666666674
Min: -70.0
Number of datapoints in our dataset: 69


  df_list = df_dropna.applymap(lambda x: list(x) if isinstance(x, tuple) else x)


Unnamed: 0,0,1,2,3,4,5,NMR_Peaks
10_aromatics_1,[C],"[C, F, F]","[C, C]","[C, H, N, C]","[C, H, O, O, C, H]","[H, H]",-60.02
5_aromatics_2,[C],"[C, F, F]","[C, C]","[C, H, C, H]","[N, C, C, C]","[H, H, H, F, F, F, H]",-63.52
11_aromatics_2,[C],"[C, F, F]","[C, C]","[C, H, C, H]","[C, C, N, C]","[H, F, F, F, H, H, H]",-63.52
6_aromatics_3,[C],"[C, F, F]","[C, C]","[C, H, C, H]","[N, C, C, C]","[N, H, H, F, F, F, H]",-63.42
12_aromatics_3,[C],"[C, F, F]","[C, C]","[C, H, C, H]","[C, C, N, C]","[H, F, F, F, N, H, H]",-63.42
7_aromatics_4,[C],"[C, F, F]","[C, C]","[C, H, C, H]","[C, H, N, C]","[H, H, H, H]",-62.87
6_aromatics_5,[C],"[C, F, F]","[C, C]","[C, H, C, H]","[C, H, C, H]","[N, N]",-61.33
0_aromatics_7,[C],"[F, F, C]","[C, C]","[C, H, C, H]","[C, H, C, H]","[Cl, Cl]",-62.58
7_aromatics_9,[C],"[C, F, F]","[C, C]","[C, H, C, H]","[C, H, O, C]","[H, H, H]",-63.84
10_aromatics_10,[C],"[C, F, F]","[C, C]","[C, H, C, C]","[C, H, O, O, C, H]","[H, H, H]",-59.38


Some rows are CF3 on C=C, we need to distinguish these structures mannually.
For CF3 on benzene ring: Min = 63.9, max = -49.4. Avg = -58.55. Number = 65  
For CF3 on C=C: Min = -70, max = -64, mean = -65.92, number = 4

In [14]:
base_file_path = os.path.join("..", "artifacts", "results")
filtered_df.to_csv(os.path.join(base_file_path, "CF3- on C=C or benzene ring.csv"))

In [15]:
results["CF3-C6H5"] = [-63.90, -49.4, -58.55, 65]
results["CF3-C(C)=c"] = [-70, -64, -65.92, 4]

In [16]:
# CF3-CH=C (CF3-vinylic): 0 [C]; 1 [C, F, F]; 2 [C, ?]; 3 []
df_list = df_dropna.applymap(lambda x: list(x) if isinstance(x, tuple) else x)
filtered_df = df_list[
    (df_list[0].apply(lambda x: x == ["C"]))
    & (
        df_list[1].apply(
            lambda x: x.count("F") == 2 and x.count("C") == 1 and len(x) == 3
        )
    )
    & (
        df_list[2].apply(
            lambda x: x.count("C") == 1 and x.count("H") == 1 and len(x) == 2
        )
    )
]

max_value = filtered_df["NMR_Peaks"].max()
mean_value = filtered_df["NMR_Peaks"].mean()
min_value = filtered_df["NMR_Peaks"].min()
num_points = len(filtered_df)
results["CF3-CH=C"] = [min_value, max_value, mean_value, num_points]

# Print the results
print("Max:", max_value)
print("Mean:", mean_value)
print("Min:", min_value)
print(f"Number of datapoints in our dataset: {len(filtered_df)}")
filtered_df

Max: -58.0
Mean: -63.611111111111114
Min: -67.0
Number of datapoints in our dataset: 18


  df_list = df_dropna.applymap(lambda x: list(x) if isinstance(x, tuple) else x)


Unnamed: 0,0,1,2,3,4,5,NMR_Peaks
3_From_Review_341,[C],"[C, F, F]","[C, H]","[H, H]",[],[],-67.0
7_From_Review_342,[C],"[C, F, F]","[C, H]","[C, H]","[C, H, H]","[C, H, H]",-65.0
7_From_Review_343,[C],"[C, F, F]","[C, H]","[C, H]","[C, H, H]","[C, H, H]",-59.0
9_From_Review_344,[C],"[C, F, F]","[C, H]","[C, H]","[C, H, H]","[C, H, H]",-65.0
8_From_Review_345,[C],"[C, F, F]","[C, H]","[C, H]","[C, C]","[C, H, C, H]",-64.0
0_From_Review_346,[C],"[F, F, C]","[C, H]","[C, H]","[C, C]","[C, H, C, H]",-63.0
0_From_Review_347,[C],"[F, F, C]","[C, H]","[C, H]","[C, C]","[C, H, C, H]",-58.0
10_From_Review_348,[C],"[C, F, F]","[C, H]","[C, H]","[C, H]","[C, H]",-64.0
0_From_Review_349,[C],"[F, F, C]","[C, H]","[C, H]","[C, H]","[C, H]",-63.0
0_From_Review_354,[C],"[F, F, C]","[C, H]","[C, H]","[F, F, F]",[],-66.4


In [17]:
# CF2=C: 0 [C]; 1 [C, F];
df_list = df_dropna.applymap(lambda x: list(x) if isinstance(x, tuple) else x)
filtered_df = df_list[
    (df_list[0].apply(lambda x: x == ["C"]))
    & (
        df_list[1].apply(
            lambda x: x.count("F") == 1 and x.count("C") == 1 and len(x) == 2
        )
    )
]

max_value = filtered_df["NMR_Peaks"].max()
mean_value = filtered_df["NMR_Peaks"].mean()
min_value = filtered_df["NMR_Peaks"].min()
num_points = len(filtered_df)
results["CF2=C"] = [min_value, max_value, mean_value, num_points]

# Print the results
print("Max:", max_value)
print("Mean:", mean_value)
print("Min:", min_value)
print(f"Number of datapoints in our dataset: {len(filtered_df)}")
filtered_df

Max: -61.2
Mean: -100.96666666666665
Min: -134.0
Number of datapoints in our dataset: 24


  df_list = df_dropna.applymap(lambda x: list(x) if isinstance(x, tuple) else x)


Unnamed: 0,0,1,2,3,4,5,NMR_Peaks
0_From_Review_258,[C],"[F, C]","[Cl, H]",[],[],[],-95.0
4_From_Review_259,[C],"[C, F]","[C, H]","[O, H, H]",[H],[],-88.0
0_From_Review_260,[C],"[F, C]","[C, H]","[I, H, H]",[],[],-84.0
2_From_Review_261,[C],"[C, F]","[H, H]",[],[],[],-81.0
3_From_Review_262,[C],"[C, F]","[F, H]",[],[],[],-126.0
0_From_Review_263,[C],"[F, C]","[F, F]",[],[],[],-134.0
4_From_Review_263,[C],"[C, F]","[F, F]",[],[],[],-134.0
7_From_Review_266,[C],"[C, F]","[C, F]","[C, H, H]","[C, H, H]","[C, H, H]",-106.7
7_From_Review_267,[C],"[C, F]","[C, H]","[C, O]","[C, C, H]","[H, H, H, H, H, H]",-61.2
8_From_Review_268,[C],"[C, F]","[C, H]","[O, O]",[C],"[C, C, H]",-63.5


In [18]:
# -CF=C: 0 [C]; 1 [C, F];
df_list = df_dropna.applymap(lambda x: list(x) if isinstance(x, tuple) else x)
filtered_df = df_list[
    (df_list[0].apply(lambda x: x == ["C"]))
    & (
        df_list[1].apply(
            lambda x: x.count("F") == 0 and x.count("C") >= 1 and len(x) == 2
        )
    )
]

max_value = filtered_df["NMR_Peaks"].max()
mean_value = filtered_df["NMR_Peaks"].mean()
min_value = filtered_df["NMR_Peaks"].min()
num_points = len(filtered_df)
results["-CF=C"] = [min_value, max_value, mean_value, num_points]

# Print the results
print("Max:", max_value)
print("Mean:", mean_value)
print("Min:", min_value)
print(f"Number of datapoints in our dataset: {len(filtered_df)}")
filtered_df

Max: -63.0
Mean: -133.5397894736842
Min: -205.0
Number of datapoints in our dataset: 285


  df_list = df_dropna.applymap(lambda x: list(x) if isinstance(x, tuple) else x)


Unnamed: 0,0,1,2,3,4,5,NMR_Peaks
5_COOH_16,[C],"[C, C]","[C, H, F, F, C]","[O, O, F, F, C]","[H, F, F, F]",[],-109.19
5_COOH_17,[C],"[C, C]","[C, H, F, F, C]","[O, O, F, F, C]","[H, F, F, C]","[F, F, C]",-109.08
5_COOH_18,[C],"[C, C]","[C, H, F, F, C]","[O, O, F, F, C]","[H, F, F, C]","[F, F, C]",-109.02
5_COOH_19,[C],"[C, C]","[C, H, F, F, C]","[O, O, F, F, C]","[H, F, F, C]","[F, F, C]",-108.98
0_aromatics_12,[C],"[C, C]","[C, H, C, H]","[C, H, C, H]","[F, F]",[],-119.85
...,...,...,...,...,...,...,...
5_From_Review_484,[C],"[C, C]","[N, H, F, N]","[C, C]","[F, F]",[],-155.00
7_From_Review_484,[C],"[C, N]","[C, F, C]","[N, H, F, N]",[],[],-72.00
4_From_Review_485,[C],"[N, C]","[C, F, C]","[F, N, F, N]",[],[],-69.00
6_From_Review_485,[C],"[C, C]","[N, F, F, N]","[C, C]","[F, F]",[],-171.00


In [19]:
# C-CF2H: 0 [C]; 1 [C, F];
df_list = df_dropna.applymap(lambda x: list(x) if isinstance(x, tuple) else x)
filtered_df = df_list[
    (df_list[0].apply(lambda x: x == ["C"]))
    & (
        df_list[1].apply(
            lambda x: x.count("F") == 1
            and x.count("C") == 1
            and x.count("H") == 1
            and len(x) == 3
        )
    )
]

max_value = filtered_df["NMR_Peaks"].max()
mean_value = filtered_df["NMR_Peaks"].mean()
min_value = filtered_df["NMR_Peaks"].min()
num_points = len(filtered_df)
results["C-CF2H"] = [min_value, max_value, mean_value, num_points]

# Print the results
print("Max:", max_value)
print("Mean:", mean_value)
print("Min:", min_value)
print(f"Number of datapoints in our dataset: {len(filtered_df)}")
filtered_df.head(3)

Max: -110.0
Mean: -131.08
Min: -140.01
Number of datapoints in our dataset: 34


  df_list = df_dropna.applymap(lambda x: list(x) if isinstance(x, tuple) else x)


Unnamed: 0,0,1,2,3,4,5,NMR_Peaks
19_COOH_20,[C],"[C, F, H]","[C, F, F]","[C, F, F]","[C, F, F]","[C, F, F]",-137.39
22_COOH_21,[C],"[C, F, H]","[C, F, F]","[C, F, F]","[C, F, F]","[C, F, F]",-137.31
25_COOH_22,[C],"[C, F, H]","[C, F, F]","[C, F, F]","[C, F, F]","[C, F, F]",-137.3


In [20]:
# C-CH2F: 0 [C]; 1 [C, H, H];
df_list = df_dropna.applymap(lambda x: list(x) if isinstance(x, tuple) else x)
filtered_df = df_list[
    (df_list[0].apply(lambda x: x == ["C"]))
    & (
        df_list[1].apply(
            lambda x: x.count("H") == 2 and x.count("C") == 1 and len(x) == 3
        )
    )
]

max_value = filtered_df["NMR_Peaks"].max()
mean_value = filtered_df["NMR_Peaks"].mean()
min_value = filtered_df["NMR_Peaks"].min()
num_points = len(filtered_df)
results["C-CH2F"] = [min_value, max_value, mean_value, num_points]

# Print the results
print("Max:", max_value)
print("Mean:", mean_value)
print("Min:", min_value)
print(f"Number of datapoints in our dataset: {len(filtered_df)}")
filtered_df.head(3)

Max: -198.0
Mean: -217.6451612903226
Min: -232.0
Number of datapoints in our dataset: 31


  df_list = df_dropna.applymap(lambda x: list(x) if isinstance(x, tuple) else x)


Unnamed: 0,0,1,2,3,4,5,NMR_Peaks
0_From_Review_1,[C],"[C, H, H]","[Cl, H, H]",[],[],[],-220.0
2_From_Review_2,[C],"[C, H, H]","[H, H, H]",[],[],[],-212.0
4_From_Review_3,[C],"[C, H, H]","[C, H, H]","[C, H, H]","[H, H, H]",[],-219.0


In [21]:
# C-CF2-COO-
df_list = df_dropna.applymap(lambda x: list(x) if isinstance(x, tuple) else x)
filtered_df = df_list[
    (df_list[0].apply(lambda x: x == ["C"]))
    & (
        df_list[1].apply(
            lambda x: x.count("C") == 2 and x.count("F") == 1 and len(x) == 3
        )
    )
    & (df_list[2].apply(lambda x: x.count("O") == 2))
]
file_path = os.path.join("..", "artifacts", "results", "C-CF2-COO-.csv")
filtered_df.to_csv(file_path)
# C_CF2_COOH = filtered_df[filtered_df['NMR_Peaks'] < -115]

# max_value = C_CF2_COOH['NMR_Peaks'].max()
# mean_value = C_CF2_COOH['NMR_Peaks'].mean()
# min_value = C_CF2_COOH['NMR_Peaks'].min()
# num_points = len(C_CF2_COOH)
# results['C-CF2-COOH'] = [min_value, max_value, mean_value, num_points]

# # Print the results
# print("Max:", max_value)
# print("Mean:", mean_value)
# print("Min:", min_value)
# print(f'Number of datapoints in our dataset: {len(C_CF2_COOH)}')
# C_CF2_COOH

  df_list = df_dropna.applymap(lambda x: list(x) if isinstance(x, tuple) else x)


In [22]:
results["C-CF2-COOH"] = [-124, -118, -120.05, 34]
results["C-CF2-COO-C"] = [-121, -95, -108.89, 9]

In [23]:
# # C-CF2-COO-R
# C_CF2_COO_R = filtered_df[filtered_df['NMR_Peaks'] > -115]
# max_value = C_CF2_COO_R['NMR_Peaks'].max()
# mean_value = C_CF2_COO_R['NMR_Peaks'].mean()
# min_value = C_CF2_COO_R['NMR_Peaks'].min()
# num_points = len(C_CF2_COO_R)
# results['C-CF2-COO-R'] = [min_value, max_value, mean_value, num_points]

# # Print the results
# print("Max:", max_value)
# print("Mean:", mean_value)
# print("Min:", min_value)
# print(f'Number of datapoints in our dataset: {len(C_CF2_COO_R)}')
# C_CF2_COO_R

In [24]:
# C-CF2-SO3H
df_list = df_dropna.applymap(lambda x: list(x) if isinstance(x, tuple) else x)
filtered_df = df_list[
    (df_list[0].apply(lambda x: x == ["C"]))
    & (
        df_list[1].apply(
            lambda x: x.count("C") == 1
            and x.count("F") == 1
            and x.count("S") == 1
            and len(x) == 3
        )
    )
    & (df_list[2].apply(lambda x: x.count("O") == 3))
]


max_value = filtered_df["NMR_Peaks"].max()
mean_value = filtered_df["NMR_Peaks"].mean()
min_value = filtered_df["NMR_Peaks"].min()
num_points = len(filtered_df)
results["C-CF2-SO3H"] = [min_value, max_value, mean_value, num_points]

# Print the results
print("Max:", max_value)
print("Mean:", mean_value)
print("Min:", min_value)
print(f"Number of datapoints in our dataset: {len(filtered_df)}")
filtered_df

  df_list = df_dropna.applymap(lambda x: list(x) if isinstance(x, tuple) else x)


Max: -114.18
Mean: -115.46076923076924
Min: -118.91
Number of datapoints in our dataset: 13


Unnamed: 0,0,1,2,3,4,5,NMR_Peaks
5_SO3H_1,[C],"[S, F, C]","[O, O, O, F, F, F]",[H],[],[],-118.91
5_SO3H_2,[C],"[S, F, C]","[O, O, O, F, F, C]","[H, F, F, F]",[],[],-115.25
5_SO3H_3,[C],"[S, F, C]","[O, O, O, F, F, C]","[H, F, F, C]","[F, F, F]",[],-114.79
5_SO3H_4,[C],"[S, F, C]","[O, O, O, F, F, C]","[H, F, F, C]","[F, F, C]","[F, F, F]",-114.65
5_SO3H_5,[C],"[S, F, C]","[O, O, O, F, F, C]","[H, F, F, C]","[F, F, C]","[F, F, C]",-114.19
5_SO3H_6,[C],"[S, F, C]","[O, O, O, F, F, C]","[H, F, F, C]","[F, F, C]","[F, F, C]",-114.18
5_SO3H_7,[C],"[S, F, C]","[O, O, O, F, F, C]","[H, F, F, C]","[F, F, C]","[F, F, C]",-114.49
5_SO3H_18,[C],"[S, F, C]","[O, O, O, F, F, C]","[H, F, F, C]","[F, F, F]",[],-116.03
5_SO3H_19,[C],"[S, F, C]","[O, O, O, F, F, C]","[F, F, C]","[F, F, C]","[F, F, F]",-115.74
5_SO3H_20,[C],"[S, F, C]","[O, O, O, F, F, C]","[H, F, F, C]","[F, F, C]","[F, F, C]",-115.71


In [25]:
# -CF2-CH2-CH2-R
df_list = df_dropna.applymap(lambda x: list(x) if isinstance(x, tuple) else x)
filtered_df = df_list[
    (df_list[0].apply(lambda x: x == ["C"]))
    & (
        df_list[1].apply(
            lambda x: x.count("C") == 2 and x.count("F") == 1 and len(x) == 3
        )
    )
    & (df_list[2].apply(lambda x: x.count("H") == 2))
    & (df_list[3].apply(lambda x: x.count("H") == 2))
]


max_value = filtered_df["NMR_Peaks"].max()
mean_value = filtered_df["NMR_Peaks"].mean()
min_value = filtered_df["NMR_Peaks"].min()
num_points = len(filtered_df)
results["-CF2-CH2-CH2-R"] = [min_value, max_value, mean_value, num_points]

# Print the results
print("Max:", max_value)
print("Mean:", mean_value)
print("Min:", min_value)
print(f"Number of datapoints in our dataset: {len(filtered_df)}")
filtered_df

Max: -96.0
Mean: -114.49324999999999
Min: -123.57
Number of datapoints in our dataset: 40


  df_list = df_dropna.applymap(lambda x: list(x) if isinstance(x, tuple) else x)


Unnamed: 0,0,1,2,3,4,5,NMR_Peaks
9_acrylates_1,[C],"[C, F, C]","[C, H, H, F, F, C]","[O, H, H, F, F, C]","[C, F, F, F]","[C, O]",-113.89
8_acrylates_2,[C],"[C, F, C]","[C, H, H, F, F, C]","[O, H, H, F, F, C]","[C, F, F, C]","[C, O, F, F, C]",-113.67
8_acrylates_3,[C],"[C, F, C]","[C, H, H, F, F, C]","[O, H, H, F, F, C]","[C, F, F, C]","[C, O, F, F, C]",-113.71
8_acrylates_4,[C],"[C, F, C]","[C, H, H, F, F, C]","[O, H, H, F, F, C]","[C, F, F, C]","[C, O, F, F, C]",-113.71
9_acrylates_5,[C],"[C, F, C]","[C, H, H, F, F, C]","[O, H, H, F, F, C]","[C, F, F, F]","[C, O]",-113.87
9_acrylates_6,[C],"[C, F, C]","[C, H, H, F, F, C]","[O, H, H, F, F, C]","[C, F, F, C]","[C, O, F, F, C]",-113.66
4_alcohols_1,[C],"[C, F, C]","[C, H, H, F, F, F]","[O, H, H]",[H],[],-123.57
4_alcohols_2,[C],"[C, F, C]","[C, H, H, F, F, C]","[O, H, H, F, F, C]","[H, F, F, F]",[],-114.02
4_alcohols_3,[C],"[C, F, C]","[C, H, H, F, F, C]","[O, H, H, F, F, C]","[H, F, F, C]","[F, F, C]",-113.78
4_alcohols_4,[C],"[C, F, C]","[C, H, H, F, F, C]","[O, H, H, F, F, C]","[H, F, F, C]","[F, F, C]",-113.73


From_Review_256 don't fill requirements

In [26]:
results["-CF2-CH2-CH2-R"] = [-114.49, -105.00, -114.49, 39]

In [27]:
# -CF2-CH2-(non-CH2 group)
df_list = df_dropna.applymap(lambda x: list(x) if isinstance(x, tuple) else x)
filtered_df = df_list[
    (df_list[0].apply(lambda x: x == ["C"]))
    & (
        df_list[1].apply(
            lambda x: x.count("C") == 2 and x.count("F") == 1 and len(x) == 3
        )
    )
    & (df_list[2].apply(lambda x: x.count("H") == 2))
    & (df_list[3].apply(lambda x: x.count("H") < 2))
]


max_value = filtered_df["NMR_Peaks"].max()
mean_value = filtered_df["NMR_Peaks"].mean()
min_value = filtered_df["NMR_Peaks"].min()
num_points = len(filtered_df)
results["-CF2-CH2-(non-CH2 group)"] = [min_value, max_value, mean_value, num_points]

# Print the results
print("Max:", max_value)
print("Mean:", mean_value)
print("Min:", min_value)
print(f"Number of datapoints in our dataset: {len(filtered_df)}")
filtered_df

Max: -111.1
Mean: -117.14307692307692
Min: -126.69
Number of datapoints in our dataset: 13


  df_list = df_dropna.applymap(lambda x: list(x) if isinstance(x, tuple) else x)


Unnamed: 0,0,1,2,3,4,5,NMR_Peaks
5_COOH_13,[C],"[C, F, C]","[C, H, H, F, F, C]","[O, O, F, F, C]","[H, F, F, F]",[],-112.44
5_COOH_14,[C],"[C, F, C]","[C, H, H, F, F, C]","[O, O, F, F, C]","[H, F, F, C]","[F, F, C]",-112.18
5_COOH_15,[C],"[C, F, C]","[C, H, H, F, F, C]","[O, O, F, F, C]","[H, F, F, C]","[F, F, C]",-112.16
11_acrylates_8,[C],"[C, F, C]","[C, H, H, F, F, C]","[C, O, H, F, F, C]","[O, H, H, H, F, F, C]","[C, F, C, C]",-113.22
3_alcohols_6,[C],"[C, F, C]","[O, H, H, F, F, F]",[H],[],[],-126.69
3_alcohols_7,[C],"[C, F, C]","[O, H, H, F, F, C]","[H, F, F, C]","[F, F, C]","[F, F, C]",-122.22
3_alcohols_8,[C],"[C, F, C]","[O, H, H, F, F, C]","[H, F, F, C]","[F, F, C]","[F, F, C]",-121.92
3_alcohols_9,[C],"[C, F, C]","[O, H, H, F, F, C]","[H, F, F, C]","[F, F, H]",[],-122.62
17_SO3H_9,[C],"[C, F, C]","[C, F, F, S, H, H]","[C, F, F, H]","[C, F, F]","[C, F, F]",-114.47
9_ethers_3,[C],"[C, F, C]","[O, H, H, F, F, C]","[C, F, F, C]","[C, F, F, F, F, H]","[F, F, H]",-120.34


In [28]:
results

{'CF3-C': [-88.2, -49.4, -72.9153869047619, 336],
 'C-CF2-C': [-159.0, -84.5, -121.7869065934066, 728],
 'CF3-C6H5': [-63.9, -49.4, -58.55, 65],
 'CF3-C(C)=c': [-70, -64, -65.92, 4],
 'CF3-CH=C': [-67.0, -58.0, -63.611111111111114, 18],
 'CF2=C': [-134.0, -61.2, -100.96666666666665, 24],
 '-CF=C': [-205.0, -63.0, -133.5397894736842, 285],
 'C-CF2H': [-140.01, -110.0, -131.08, 34],
 'C-CH2F': [-232.0, -198.0, -217.6451612903226, 31],
 'C-CF2-COOH': [-124, -118, -120.05, 34],
 'C-CF2-COO-C': [-121, -95, -108.89, 9],
 'C-CF2-SO3H': [-118.91, -114.18, -115.46076923076924, 13],
 '-CF2-CH2-CH2-R': [-114.49, -105.0, -114.49, 39],
 '-CF2-CH2-(non-CH2 group)': [-126.69, -111.1, -117.14307692307692, 13]}

In [29]:
descriptor_2d_content["NMR_Peaks"] = descriptor_2d_content["NMR_Peaks"].apply(
    pd.to_numeric
)

In [30]:
# C-CF2-O-
df_list = df_dropna.applymap(lambda x: list(x) if isinstance(x, tuple) else x)
filtered_df = df_list[
    (df_list[0].apply(lambda x: x == ["C"]))
    & (
        df_list[1].apply(
            lambda x: x.count("C") == 1
            and x.count("F") == 1
            and x.count("O") == 1
            and len(x) == 3
        )
    )
]


max_value = filtered_df["NMR_Peaks"].max()
mean_value = filtered_df["NMR_Peaks"].mean()
min_value = filtered_df["NMR_Peaks"].min()
num_points = len(filtered_df)
results["C-CF2-O-"] = [min_value, max_value, mean_value, num_points]

# Print the results
print("Max:", max_value)
print("Mean:", mean_value)
print("Min:", min_value)
print(f"Number of datapoints in our dataset: {len(filtered_df)}")
filtered_df

  df_list = df_dropna.applymap(lambda x: list(x) if isinstance(x, tuple) else x)


Max: -79.6
Mean: -90.06833333333333
Min: -125.54
Number of datapoints in our dataset: 42


Unnamed: 0,0,1,2,3,4,5,NMR_Peaks
3_ethers_0,[C],"[O, F, C]","[C, F, F, C]","[H, H, H, F, F, C]","[F, F, F]",[],-88.39
4_ethers_2,[C],"[O, F, C]","[C, F, F, H]","[C, H, H]","[H, H, H]",[],-91.64
4_ethers_3,[C],"[C, F, O]","[F, F, H, C]","[C, H, H]","[F, F, C]","[F, F, C]",-92.5
5_ethers_5,[C],"[O, F, C]","[C, F, F, H]","[F, F, H]",[],[],-125.13
4_ethers_7,[C],"[O, F, C]","[C, F, F, H]","[C, H, H]","[H, H, H]",[],-91.59
3_ethers_8,[C],"[O, F, C]","[C, F, F, C]","[H, H, H, F, F, F]",[],[],-89.59
3_ethers_9,[C],"[O, F, C]","[C, F, F, H]","[H, H, H]",[],[],-125.54
6_ethers_10,[C],"[O, F, C]","[C, F, F, H]","[C, H, H]","[C, H, H]","[C, H, H]",-91.37
7_ether_11,[C],"[C, F, O]","[C, F, F, C]","[O, O, F, F, F]",[H],[],-90.0
10_ether_12,[C],"[C, F, O]","[C, F, F, C]","[C, F, F, F, F, F]","[O, O]",[H],-87.7


In [31]:
# -C-CFH-C-
df_list = df_dropna.applymap(lambda x: list(x) if isinstance(x, tuple) else x)
filtered_df = df_list[
    (df_list[0].apply(lambda x: x == ["C"]))
    & (
        df_list[1].apply(
            lambda x: x.count("H") == 1 and x.count("C") == 2 and len(x) == 3
        )
    )
]


max_value = filtered_df["NMR_Peaks"].max()
mean_value = filtered_df["NMR_Peaks"].mean()
min_value = filtered_df["NMR_Peaks"].min()
num_points = len(filtered_df)
results["-C-CFH-C-"] = [min_value, max_value, mean_value, num_points]

# Print the results
print("Max:", max_value)
print("Mean:", mean_value)
print("Min:", min_value)
print(f"Number of datapoints in our dataset: {len(filtered_df)}")
filtered_df

Max: -160.0
Mean: -183.3
Min: -213.0
Number of datapoints in our dataset: 40


  df_list = df_dropna.applymap(lambda x: list(x) if isinstance(x, tuple) else x)


Unnamed: 0,0,1,2,3,4,5,NMR_Peaks
3_From_Review_25,[C],"[C, C, H]","[C, H, H, C, C, H]","[H, H, H, H, H, H, H, H, H]",[],[],-189.0
3_From_Review_26,[C],"[C, C, H]","[H, H, H, H, H, H]",[],[],[],-165.0
0_From_Review_27,[C],"[C, C, H]","[C, H, H, C, H, H]","[C, H, H, C, H, H]","[H, H, H, H]",[],-175.0
0_From_Review_28,[C],"[C, C, H]","[C, H, H, C, H, H]","[C, H, H, C, H, H]","[C, H, H, C, H, H]","[C, H, H, C, H, H]",-171.0
0_From_Review_29,[C],"[C, C, H]","[C, H, H, C, H, H]","[H, H, H, H]",[],[],-160.0
0_From_Review_30,[C],"[C, C, H]","[C, H, H, C, H, H]","[C, H, H, C, H, H]","[F, H, F, H]",[],-213.0
7_From_Review_36,[C],"[C, C, H]","[C, H, H, H, H, H]","[C, H, H]","[C, H, H]","[C, H, H]",-172.0
10_From_Review_37,[C],"[C, C, H]","[C, H, H, Cl, H, H]","[C, H, H]","[C, H, H]","[C, H, H]",-182.0
11_From_Review_38,[C],"[C, C, H]","[C, H, H, Br, H, H]","[C, H, H]","[C, H, H]","[C, H, H]",-178.0
11_From_Review_39,[C],"[C, C, H]","[C, H, H, I, H, H]","[C, H, H]","[C, H, H]","[C, H, H]",-171.0


In [32]:
# -C-CFH2
df_list = df_dropna.applymap(lambda x: list(x) if isinstance(x, tuple) else x)
filtered_df = df_list[
    (df_list[0].apply(lambda x: x == ["C"]))
    & (
        df_list[1].apply(
            lambda x: x.count("C") == 1 and x.count("H") == 2 and len(x) == 3
        )
    )
]


max_value = filtered_df["NMR_Peaks"].max()
mean_value = filtered_df["NMR_Peaks"].mean()
min_value = filtered_df["NMR_Peaks"].min()
num_points = len(filtered_df)
results["-C-CFH2"] = [min_value, max_value, mean_value, num_points]

# Print the results
print("Max:", max_value)
print("Mean:", mean_value)
print("Min:", min_value)
print(f"Number of datapoints in our dataset: {len(filtered_df)}")
filtered_df

  df_list = df_dropna.applymap(lambda x: list(x) if isinstance(x, tuple) else x)


Max: -198.0
Mean: -217.6451612903226
Min: -232.0
Number of datapoints in our dataset: 31


Unnamed: 0,0,1,2,3,4,5,NMR_Peaks
0_From_Review_1,[C],"[C, H, H]","[Cl, H, H]",[],[],[],-220.0
2_From_Review_2,[C],"[C, H, H]","[H, H, H]",[],[],[],-212.0
4_From_Review_3,[C],"[C, H, H]","[C, H, H]","[C, H, H]","[H, H, H]",[],-219.0
0_From_Review_6,[C],"[C, H, H]","[C, C]","[C, H, C, H]","[C, H, C, H]","[H, H]",-206.0
6_From_Review_22,[C],"[C, H, H]","[C, C, H]","[C, H, H, C, H, H]","[H, H, H, H, H, H]",[],-226.0
5_From_Review_23,[C],"[C, H, H]","[C, C, C]","[H, H, H, H, H, H, H, H, H]",[],[],-223.0
0_From_Review_24,[C],"[C, H, H]","[C, C, H]","[C, H, H, C, H, H]","[C, H, H, C, H, H]","[C, H, C, H]",-208.0
0_From_Review_33,[C],"[C, H, H]","[Br, H, H]",[],[],[],-212.0
0_From_Review_34,[C],"[C, H, H]","[F, H, H]",[],[],[],-226.0
4_From_Review_35,[C],"[C, H, H]","[C, Br, H]","[H, H, H]",[],[],-210.0


In [33]:
# C=C(F)-O-
df_list = df_dropna.applymap(lambda x: list(x) if isinstance(x, tuple) else x)
filtered_df = df_list[
    (df_list[0].apply(lambda x: x == ["C"]))
    & (
        df_list[1].apply(
            lambda x: x.count("O") == 1 and x.count("C") == 1 and len(x) == 2
        )
    )
]


max_value = filtered_df["NMR_Peaks"].max()
mean_value = filtered_df["NMR_Peaks"].mean()
min_value = filtered_df["NMR_Peaks"].min()
num_points = len(filtered_df)
results["C=C(F)-O-"] = [min_value, max_value, mean_value, num_points]

# Print the results
print("Max:", max_value)
print("Mean:", mean_value)
print("Min:", min_value)
print(f"Number of datapoints in our dataset: {len(filtered_df)}")
filtered_df

Max: -88.0
Mean: -125.83333333333333
Min: -141.0
Number of datapoints in our dataset: 12


  df_list = df_dropna.applymap(lambda x: list(x) if isinstance(x, tuple) else x)


Unnamed: 0,0,1,2,3,4,5,NMR_Peaks
3_From_Review_152,[C],"[C, O]","[F, H, C]","[F, F, F]",[],[],-132.0
3_From_Review_153,[C],"[C, O]","[F, H, C]","[F, F, F]",[],[],-105.0
3_From_Review_154,[C],"[O, C]","[C, C, H]","[H, H, H, C, C]","[C, H, C, H]","[C, H, C, H]",-88.0
3_From_Review_444,[C],"[O, C]","[C, F, F]","[H, H, H]",[],[],-138.0
4_From_Review_445,[C],"[C, O]","[F, F, C]","[F, F, F]",[],[],-141.0
4_From_Review_447,[C],"[C, O]","[F, F, C]","[C, C]","[C, H, C, H]","[C, H, C, H]",-134.0
6_From_Review_477,[C],"[O, C]","[C, F, C]","[C, H, C, H]","[C, H, O, H]","[F, C, C]",-130.0
0_From_Review_478,[C],"[C, O]","[C, H, C]","[C, H, C, F]","[F, O, C, H]","[C, C, H]",-123.0
5_From_Review_478,[C],"[C, O]","[C, H, C]","[C, H, F, C]","[F, O, C, H]","[C, C, H]",-123.0
5_From_Review_480,[C],"[O, C]","[C, F, C]","[C, H, F, C]","[F, C, O, H]","[C, F, C]",-122.0


In [34]:
results

{'CF3-C': [-88.2, -49.4, -72.9153869047619, 336],
 'C-CF2-C': [-159.0, -84.5, -121.7869065934066, 728],
 'CF3-C6H5': [-63.9, -49.4, -58.55, 65],
 'CF3-C(C)=c': [-70, -64, -65.92, 4],
 'CF3-CH=C': [-67.0, -58.0, -63.611111111111114, 18],
 'CF2=C': [-134.0, -61.2, -100.96666666666665, 24],
 '-CF=C': [-205.0, -63.0, -133.5397894736842, 285],
 'C-CF2H': [-140.01, -110.0, -131.08, 34],
 'C-CH2F': [-232.0, -198.0, -217.6451612903226, 31],
 'C-CF2-COOH': [-124, -118, -120.05, 34],
 'C-CF2-COO-C': [-121, -95, -108.89, 9],
 'C-CF2-SO3H': [-118.91, -114.18, -115.46076923076924, 13],
 '-CF2-CH2-CH2-R': [-114.49, -105.0, -114.49, 39],
 '-CF2-CH2-(non-CH2 group)': [-126.69, -111.1, -117.14307692307692, 13],
 'C-CF2-O-': [-125.54, -79.6, -90.06833333333333, 42],
 '-C-CFH-C-': [-213.0, -160.0, -183.3, 40],
 '-C-CFH2': [-232.0, -198.0, -217.6451612903226, 31],
 'C=C(F)-O-': [-141.0, -88.0, -125.83333333333333, 12]}

In [35]:
# -CF2-CF2-CF2- the F in middle
df_list = df_dropna.applymap(lambda x: list(x) if isinstance(x, tuple) else x)
filtered_df = df_list[
    (df_list[0].apply(lambda x: x == ["C"]))
    & (
        df_list[1].apply(
            lambda x: x.count("C") == 2 and x.count("F") == 1 and len(x) == 3
        )
    )
    & (df_list[2].apply(lambda x: x.count("F") >= 4))
]


max_value = filtered_df["NMR_Peaks"].max()
mean_value = filtered_df["NMR_Peaks"].mean()
min_value = filtered_df["NMR_Peaks"].min()
num_points = len(filtered_df)
# results['-CF2-CF2-CF2- middle F no Ring'] = [min_value, max_value, mean_value, num_points]

# Print the results
print("Max:", max_value)
print("Mean:", mean_value)
print("Min:", min_value)
print(f"Number of datapoints in our dataset: {len(filtered_df)}")
filtered_df

  df_list = df_dropna.applymap(lambda x: list(x) if isinstance(x, tuple) else x)


Max: -108.7
Mean: -123.62935972461273
Min: -159.0
Number of datapoints in our dataset: 581


Unnamed: 0,0,1,2,3,4,5,NMR_Peaks
7_COOH_2,[C],"[C, F, C]","[C, F, F, F, F, F]","[O, O]",[H],[],-127.39
7_COOH_3,[C],"[C, F, C]","[C, F, F, F, F, C]","[O, O, F, F, F]",[H],[],-123.99
10_COOH_3,[C],"[C, F, C]","[C, F, F, F, F, F]","[C, F, F]","[O, O]",[H],-126.26
7_COOH_4,[C],"[C, F, C]","[C, F, F, F, F, C]","[O, O, F, F, C]","[H, F, F, F]",[],-122.87
10_COOH_4,[C],"[C, F, C]","[C, F, F, F, F, C]","[C, F, F, F, F, F]","[O, O]",[H],-123.25
...,...,...,...,...,...,...,...
19_From_Review_437,[C],"[C, F, C]","[C, F, F, F, F, F]","[C, F, F]","[C, F, F]","[C, F, F]",-126.20
9_From_Review_448,[C],"[C, F, C]","[C, F, F, F, F, C]","[C, F, F, F, F]","[F, F]",[],-127.00
12_From_Review_448,[C],"[C, F, C]","[C, F, F, F, F, F]","[C, F, F]","[C, F]","[F, F]",-129.00
9_From_Review_449,[C],"[C, F, C]","[C, F, F, F, F, C]","[C, F, F, F, H]","[F, F]",[],-125.50


In [36]:
file_path = os.path.join("..", "artifacts", "results", "-CF2-CF2-CF2-.csv")
filtered_df.to_csv(file_path)

In [37]:
results["-CF2-CF2-CF2- middle F linear"] = [-132, -112.81, -123.68, 574]
results["-CF2-CF2-CF2- middle F in ring"] = [-159, -135, -147.67, 6]

In [38]:
results

{'CF3-C': [-88.2, -49.4, -72.9153869047619, 336],
 'C-CF2-C': [-159.0, -84.5, -121.7869065934066, 728],
 'CF3-C6H5': [-63.9, -49.4, -58.55, 65],
 'CF3-C(C)=c': [-70, -64, -65.92, 4],
 'CF3-CH=C': [-67.0, -58.0, -63.611111111111114, 18],
 'CF2=C': [-134.0, -61.2, -100.96666666666665, 24],
 '-CF=C': [-205.0, -63.0, -133.5397894736842, 285],
 'C-CF2H': [-140.01, -110.0, -131.08, 34],
 'C-CH2F': [-232.0, -198.0, -217.6451612903226, 31],
 'C-CF2-COOH': [-124, -118, -120.05, 34],
 'C-CF2-COO-C': [-121, -95, -108.89, 9],
 'C-CF2-SO3H': [-118.91, -114.18, -115.46076923076924, 13],
 '-CF2-CH2-CH2-R': [-114.49, -105.0, -114.49, 39],
 '-CF2-CH2-(non-CH2 group)': [-126.69, -111.1, -117.14307692307692, 13],
 'C-CF2-O-': [-125.54, -79.6, -90.06833333333333, 42],
 '-C-CFH-C-': [-213.0, -160.0, -183.3, 40],
 '-C-CFH2': [-232.0, -198.0, -217.6451612903226, 31],
 'C=C(F)-O-': [-141.0, -88.0, -125.83333333333333, 12],
 '-CF2-CF2-CF2- middle F linear': [-132, -112.81, -123.68, 574],
 '-CF2-CF2-CF2- middle

In [39]:
results_df = pd.DataFrame(results)
file_path = os.path.join("..", "artifacts", "results", "structure-chemical shifts.csv")
results_df.to_csv(file_path)

In [40]:
results_df

Unnamed: 0,CF3-C,C-CF2-C,CF3-C6H5,CF3-C(C)=c,CF3-CH=C,CF2=C,-CF=C,C-CF2H,C-CH2F,C-CF2-COOH,C-CF2-COO-C,C-CF2-SO3H,-CF2-CH2-CH2-R,-CF2-CH2-(non-CH2 group),C-CF2-O-,-C-CFH-C-,-C-CFH2,C=C(F)-O-,-CF2-CF2-CF2- middle F linear,-CF2-CF2-CF2- middle F in ring
0,-88.2,-159.0,-63.9,-70.0,-67.0,-134.0,-205.0,-140.01,-232.0,-124.0,-121.0,-118.91,-114.49,-126.69,-125.54,-213.0,-232.0,-141.0,-132.0,-159.0
1,-49.4,-84.5,-49.4,-64.0,-58.0,-61.2,-63.0,-110.0,-198.0,-118.0,-95.0,-114.18,-105.0,-111.1,-79.6,-160.0,-198.0,-88.0,-112.81,-135.0
2,-72.915387,-121.786907,-58.55,-65.92,-63.611111,-100.966667,-133.539789,-131.08,-217.645161,-120.05,-108.89,-115.460769,-114.49,-117.143077,-90.068333,-183.3,-217.645161,-125.833333,-123.68,-147.67
3,336.0,728.0,65.0,4.0,18.0,24.0,285.0,34.0,31.0,34.0,9.0,13.0,39.0,13.0,42.0,40.0,31.0,12.0,574.0,6.0
