In [None]:
import os

import pandas as pd
import math
import cirpy
import matplotlib.pyplot as plt
from rdkit.Chem import rdMolDescriptors

from tqdm import tqdm

from Code.Utils.util_methods import NNUtils


## Provide the data

In [None]:
PATH = "PROVIDE YOUR PATH HERE" # Path to the dataset. An example dataset is provided in the Dataset folder
BASE = ".." # Prefix for every output path; files are written under BASE + "/Dataset/..."
# Maximum mass weight to consider. Read from the MAX_MASS environment variable; when the
# variable is not set, fall back to 600 instead of crashing on int(None).
MAX_MW = int(os.getenv("MAX_MASS", "600"))
INFER_SMILES = False # Must stay false. Only set to True if you want to run the inference of the SMILES from the name and CAS number. It will take a long time.
DELETE_AROMATICS = False # Only set to True if you want to delete the aromatic molecules.

## Extract the data

In [None]:
# You can ignore this cell.
# f_original_dataset is True only when PATH points at the original NIST pickle; later cells
# use it to decide whether the pre-computed name/CAS SMILES file is loaded.
f_original_dataset = (PATH == "../Dataset/nist_08_Common_all_smiles.pkl")

In [None]:
# Pick the reader from the file extension (text after the last '.'); only csv and pkl are accepted.
# NOTE(review): pd.read_pickle can execute arbitrary code - only load pickles you trust.
readers = {'csv': NNUtils.read_big_csv, 'pkl': pd.read_pickle}
extension = PATH.split('.')[-1]
if extension not in readers:
    raise Exception("The file must be a csv or pkl file")
df = readers[extension](PATH)
PATH

In [None]:
# Preview the first rows of the loaded dataset
df.head()

In [None]:
# (number of rows, number of columns) of the loaded dataset
df.shape

In [None]:
# Normalise the column name: a lower-case 'smiles' column is renamed to 'SMILES' in place.
# rename() leaves the frame untouched when there is no 'smiles' column.
df.rename(columns={'smiles': 'SMILES'}, inplace=True)
# Show how often each SMILES string occurs in the dataset
print(df["SMILES"].value_counts())

In [None]:
# Count the SMILES entries that are missing, i.e. either None or a float NaN.
c = sum(
    1
    for smile in df["SMILES"]
    if smile is None or (isinstance(smile, float) and math.isnan(smile))
)

print(f'{c} None or NaN values')
# Kept for the comparison against the inferred SMILES further down
nb_invalid_smiles = c

## Remove excess columns

In [None]:
# Number of molecules per molecular weight, ordered by weight
value_counts = df['Mw'].value_counts().sort_index()
largest_mw = value_counts.index.max()

# Bar chart of the counts, drawn through the explicit Axes interface
fig, ax = plt.subplots(figsize=(12, 9))
ax.bar(value_counts.index, value_counts.values)
ax.set_title('Value Counts of masses')
ax.set_xlabel('Mw')
ax.set_ylabel('Counts')

# x-axis runs from 0 to the largest weight present, with a rotated tick every 50 units
ax.set_xlim(0, largest_mw)
ax.set_xticks(range(0, largest_mw + 1, 50))
ax.tick_params(axis='x', labelrotation=45)
plt.show()

In [None]:
# Line plot of all Mw values sorted ascending (x = rank after sorting, y = Mw)
df[["Mw"]].sort_values("Mw").reset_index(drop=True).plot()

-> The spectrum columns above mz600 are cut out in the next cell (from `mz{MAX_MW + 1}` up to and including `mz1000`).

In [None]:
# Remove the contiguous run of columns from 'mz<MAX_MW+1>' up to and including 'mz1000'.
# Any failure (for example when one of the two columns is absent because the cell has
# already been run) is only printed, so the notebook keeps going.
try:
    start_column = f'mz{MAX_MW + 1}'
    end_column = 'mz1000'

    # Positional bounds of the run; the end bound is made exclusive by adding 1
    start_idx, end_idx = (
        df.columns.get_loc(start_column),
        df.columns.get_loc(end_column) + 1,
    )

    columns_to_delete = df.columns[start_idx:end_idx]
    df.drop(columns=columns_to_delete, inplace=True)
except Exception as e:
    print(e)

In [None]:
# Replace the index with a fresh 0..n-1 range (the old index is discarded, not kept as a column).
# Later cells address rows by these integer labels.
df.reset_index(drop=True, inplace=True)
df.head()

### Verify and delete the mz columns that have 0 sum

In [None]:
# Column-wise sum over the retained spectrum columns. The upper bound follows MAX_MW (the same
# cut-off used when the excess columns were dropped) instead of a hard-coded 'mz600', so the
# slice still resolves when MAX_MASS is set to another value.
sum_of_masses = df.loc[:, 'mz1':f'mz{MAX_MW}'].sum()

# Line plot of the per-column sums
plt.figure(figsize=(20, 16))
sum_of_masses.plot(kind='line')
plt.title('Sum of each mz column')
plt.xlabel('Mz')
plt.ylabel('Sum')

In [None]:
# Names of the mz columns whose total is zero (or negative)
zero_masses = [column for column, total in sum_of_masses.items() if total <= 0]
zero_masses

## Transform CAS number or name to smiles

### Transformation

In [None]:
# Resolve a SMILES string for every row from its 'Name' and from its 'Cas' value with
# cirpy.resolve, appending one CSV line per row to BASE/Dataset/smiles_name_cas.csv.
# When INFER_SMILES is False nothing is resolved and an empty placeholder frame is built instead.
#smiles_name_cas = pd.DataFrame(columns=['smiles_name','smiles_cas'])

if INFER_SMILES:

    overwrite = False # True: (re)create the output file with just the header line; False: append to the existing file
    
    if overwrite:
      with open(BASE+"/Dataset/smiles_name_cas.csv", 'w') as file:
          # Write the header line to the file
          file.write('no,smile_name,smile_cas\n')
    
    
    # First row to process (inclusive).
    # NOTE(review): hard-coded; presumably the row where an earlier run stopped - confirm before re-running.
    start = 205054 #included
    for row in range(start, df.shape[0]):
      # Progress message every 1000 rows
      if row%1000 == 0 and row != 0:
        print(row)
        #break
      name = df['Name'].iloc[row]
      # NOTE(review): this lookup is outside the try below, so an exception raised here stops the loop
      smile_name = cirpy.resolve(name, 'smiles')
      cas = df['Cas'].iloc[row]
      try:
        smile_cas = cirpy.resolve(cas, 'smiles')
        # No result for the CAS value: fall back to the name-derived SMILES
        if smile_cas is None:
          smile_cas = smile_name
      except:
        # Any error during the CAS lookup: fall back to the name-derived SMILES
        smile_cas = smile_name
        #print(cas)
      #smiles_name_cas.loc[len(smiles_name_cas)] = [smile_name, smile_cas]
    
      # The file is reopened in append mode for every row, so each result is written as soon as it is available
      with open(BASE+"/Dataset/smiles_name_cas.csv", 'a') as file:
        # Write the row number and both SMILES as one CSV line (a None value is written as the text 'None')
        file.write(f'{row},{smile_name},{smile_cas}\n')
    
      if row == start:
        print("First line decoded")
    
    #smiles_name_cas

else:
    # Empty frame with the same three columns as the CSV, so the following cells still run
    smiles_name_cas=(
    pd.DataFrame({'no':[], 'smile_name':[],'smile_cas':[]})
    )

# TODO: try to do it locally batchwise, and write directly to a csv

## Merge the newly computed SMILES into df

In [None]:
# Load the name/CAS-derived SMILES from disk when they were just inferred or when the original
# dataset is used; otherwise the empty placeholder frame from the previous cell is kept.
if INFER_SMILES or f_original_dataset:
    smiles_name_cas = pd.read_csv(BASE+"/Dataset/smiles_name_cas.csv")
smiles_name_cas.head()

In [None]:
# Put the inferred SMILES columns (everything except the 'no' row counter) next to the dataset,
# aligned on the index, and start with every row marked as kept.
inferred_smiles = smiles_name_cas.drop(columns='no', errors='ignore')
df_diff_smiles = pd.concat([df, inferred_smiles], axis=1).assign(keep=True)
df_diff_smiles

In [None]:
# Where the name-derived and the CAS-derived SMILES are both present but disagree, the
# CAS-derived one replaces the dataset's SMILES.
# Rows are selected with a boolean mask, so the assignment follows the frame's own index labels
# (a running position counter used as a .loc label only hits the right row while the index is
# exactly 0..n-1) and no per-row Python loop is needed.
smile_name_col = df_diff_smiles["smile_name"]
smile_cas_col = df_diff_smiles["smile_cas"]

both_inferred = smile_name_col.notna() & smile_cas_col.notna()
disagree = both_inferred & (smile_name_col != smile_cas_col)

df_diff_smiles.loc[disagree, 'SMILES'] = smile_cas_col[disagree]
c = int(disagree.sum())

print()
print(f'{c} rows with different SMILES inferred from name and CAS. The SMILES will be the SMILES inferred from the CAS number.')

## Look for mass weights higher than the max

In [None]:
# Mark every molecule heavier than MAX_MW for removal (keep=False).
# The comparison is done on the 'Mw' column as a whole; iterating with iterrows() would build
# a full Series for each (very wide) row just to read a single value.
too_heavy = df_diff_smiles["Mw"] > MAX_MW  # a missing Mw compares False, so such rows are not flagged here
df_diff_smiles.loc[too_heavy, 'keep'] = False
c = int(too_heavy.sum())

print(f'{c} molecules have larger molecular weight than {MAX_MW}. They are going to be removed.')

## Look for none values

In [None]:
# Count the rows for which neither lookup produced a SMILES: both values None, or both NaN.
# (A row with one None and one NaN is not counted.)
c = 0
for smile_name, smile_cas in tqdm(zip(df_diff_smiles["smile_name"], df_diff_smiles["smile_cas"])):
  both_none = smile_name is None and smile_cas is None
  name_is_nan = isinstance(smile_name, float) and math.isnan(smile_name)
  cas_is_nan = isinstance(smile_cas, float) and math.isnan(smile_cas)
  if both_none or (name_is_nan and cas_is_nan):
    c += 1

print()
print(f'{c} null or NaN values against the original dataframe {nb_invalid_smiles}')

## Remove aromatics (optional)

In [None]:
from rdkit import Chem


def is_aromatic_from_smiles(smiles):
    '''
    Tell whether the molecule described by a SMILES string has at least one aromatic bond.
    :param smiles: SMILES string
    :return: 1 if an aromatic bond is found, 0 if there is none, None if the SMILES cannot be
             parsed or any error occurs while processing it
    '''
    try:
        molecule = Chem.MolFromSmiles(smiles)
        if molecule is not None:
            # Stop at the first aromatic bond
            for bond in molecule.GetBonds():
                if bond.GetIsAromatic():
                    return 1
            return 0
    except Exception:
        # Errors are deliberately swallowed; the caller only sees None
        pass
    return None  # Invalid SMILES or processing error

In [None]:
# If a molecule contains aromatic bonds, it will be removed (keep=False).
# Only the 'SMILES' column is iterated, together with its index labels: this avoids building a
# full row Series per molecule and keeps the label used for the write identical to the one the
# SMILES was read from (the previous version read by position and wrote by label).
if DELETE_AROMATICS:

    c = 0
    aromatic_labels = []
    for index, smiles in tqdm(df_diff_smiles["SMILES"].items(), total=len(df_diff_smiles)):
        # is_aromatic_from_smiles returns 1 / 0 / None; only 1 is truthy
        if is_aromatic_from_smiles(smiles):
            c += 1
            aromatic_labels.append(index)

    if aromatic_labels:
        df_diff_smiles.loc[aromatic_labels, 'keep'] = False

    print(f'{c} aromatic molecules found. They are going to be removed.')
else:
    print("Aromatic molecules are not going to be removed.")


## Replace the NaN or None smiles

In [None]:
# Scratch example of the assign / groupby / agg method-chaining pattern on a three-row toy frame:
# adds c = a + b, then counts the rows per value of 'd'.
# NOTE(review): `test` is not referenced by any other cell visible in this notebook.
test=(
    pd.DataFrame({'a':[1,2,3], 'b':[4,5,6],'d':['a','a','c']})
    .assign(c=lambda x: x['a']+x['b'])
    .groupby('d').agg(N=('c','count'))
    # .loc[lambda qwe:qwe['c']>6]
    )
test

In [None]:
# Chained variant of the SMILES repair: fill a missing SMILES from smile_name first, then from
# smile_cas, keep only the rows still flagged keep=True, and drop the two helper columns.
# NOTE(review): the next cell reassigns `corrected_df` from `df_diff_smiles`, so the result
# computed here is overwritten when the notebook is run top to bottom.
corrected_df=(
    df_diff_smiles
    .assign(SMILES=lambda x: x["SMILES"].combine_first(x["smile_name"]).combine_first(x["smile_cas"]))
    .loc[lambda df:df['keep']==True]
    .drop(columns=['smile_name', 'smile_cas'])
)

In [None]:
# Fill in the SMILES that are missing (None, NaN or empty string) from the inferred columns.
# The CAS-derived SMILES is considered the most reliable, so it is used whenever it is
# available; the name-derived SMILES is only the fallback.
#
# The previous row-by-row version required `smile == smile_cas` before taking the CAS value,
# which can never hold for a missing `smile`, so the CAS branch was unreachable and rows that
# only had a CAS-derived SMILES stayed empty. It also read rows by position (.iloc) while
# writing them by label (.loc).
corrected_df = df_diff_smiles.copy(deep=True)

smiles_missing = corrected_df['SMILES'].isna() | corrected_df['SMILES'].eq("")
cas_available = corrected_df['smile_cas'].notna() & corrected_df['smile_cas'].ne("")
name_available = corrected_df['smile_name'].notna() & corrected_df['smile_name'].ne("")

use_cas = smiles_missing & cas_available
use_name = smiles_missing & ~cas_available & name_available

corrected_df.loc[use_cas, 'SMILES'] = corrected_df.loc[use_cas, 'smile_cas']
corrected_df.loc[use_name, 'SMILES'] = corrected_df.loc[use_name, 'smile_name']

# The helper columns are no longer needed once SMILES has been repaired
corrected_df.drop(columns=['smile_name', 'smile_cas'], inplace=True)

In [None]:
# Display the frame after the SMILES repair (still contains the 'keep' flag column)
corrected_df

In [None]:
# Persist the repaired frame, without the index, before the invalid rows are filtered out
corrected_df.to_csv(BASE+'/Dataset/Mass_spectra/corrected_df.csv', index=False)

## Delete the invalid smiles

In [None]:
# Flag every row whose SMILES is unusable and split the frame into cleaned_df / invalid_df.
# A row is rejected when its SMILES
#   c) is missing (None or NaN),
#   d) cannot be parsed by RDKit (or raises while being processed),
#   f) contains '.',
#   e) yields a molecular formula different from the 'Form' column.
# Rows already flagged keep=False by earlier cells stay rejected.
#
# Compared with the previous version: the `mol is None` check now runs before the formula is
# computed (it used to be unreachable and relied on CalcMolFormula(None) raising), the bare
# `except:` is narrowed to Exception so Ctrl-C still interrupts the loop, and only the two
# needed columns are iterated instead of building every full row with iterrows().
# 'Form' is read up front, so a missing 'Form' column now fails loudly instead of marking
# every row as invalid.
c=0
d=0
e=0
f=0

rejected_labels = []
for index, smile, form in tqdm(
    zip(corrected_df.index, corrected_df["SMILES"], corrected_df["Form"]),
    total=len(corrected_df),
):
  if smile is None or (isinstance(smile, float) and math.isnan(smile)):
    c+=1
    rejected_labels.append(index)
    continue

  try:
    mol = Chem.MolFromSmiles(smile)
    if mol is None:
      reason = 'invalid'
    else:
      molecular_formula = rdMolDescriptors.CalcMolFormula(mol)
      if "." in smile:
        reason = 'dot'
      elif form != molecular_formula:
        reason = 'formula'
      else:
        reason = None
  except Exception:
    reason = 'invalid'

  if reason == 'invalid':
    d+=1
  elif reason == 'dot':
    f+=1
  elif reason == 'formula':
    e+=1
  if reason is not None:
    rejected_labels.append(index)

if rejected_labels:
  corrected_df.loc[rejected_labels, 'keep'] = False

keep_rows = corrected_df['keep'].tolist()
cleaned_df = corrected_df[keep_rows].copy(deep=True)
cleaned_df.reset_index(drop=True, inplace=True)

invalid_rows = [not x for x in keep_rows]
invalid_df = corrected_df[invalid_rows].copy(deep=True)
invalid_df.reset_index(drop=True, inplace=True)

# The flag column has served its purpose in both halves
cleaned_df.drop(columns=['keep'], inplace=True)
invalid_df.drop(columns=['keep'], inplace=True)

print(f'{c} None or NaN values')
print(f'{d} invalid SMILES')
print(f'{e} SMILES with different molecular formula')
print(f'{f} SMILES with .')


In [None]:
# Rows that passed every check (keep=True), re-indexed from 0
cleaned_df

In [None]:
# Rows that were rejected by at least one check (keep=False), re-indexed from 0
invalid_df

In [None]:
# Write both halves of the split next to corrected_df.csv, without the index column
cleaned_df.to_csv(BASE+'/Dataset/Mass_spectra/cleaned_df.csv', index=False)
invalid_df.to_csv(BASE+'/Dataset/Mass_spectra/invalid_df.csv', index=False)

In [None]:
# Show how often each SMILES string occurs among the retained rows
print(cleaned_df["SMILES"].value_counts())

## See the covariance between the masses

# Calculate the covariance matrix
cov_matrix = cleaned_df.loc[:, 'mz1':'mz600'].cov()

# Plot the covariance matrix
plt.figure(figsize=(20, 16))
plt.imshow(cov_matrix, cmap='coolwarm', interpolation='none')
plt.colorbar(label='Covariance Value')

# Add labels
plt.xticks(range(len(cov_matrix.columns)), cov_matrix.columns)
plt.yticks(range(len(cov_matrix.columns)), cov_matrix.columns)


plt.title('Covariance Matrix Heatmap')
plt.show()

# Reduce the size