In [None]:
!pip install padelpy
import padelpy
import pandas as pd
import numpy as np
from padelpy import padeldescriptor, from_smiles

In [None]:
# Read the combined research-paper dataset; later cells only use its SMILES column.
df = pd.read_csv(filepath_or_buffer='CombinedResearchPaperData.csv')
# Preview the first ten rows.
df.head(n=10)

In [None]:
# Reduce the table to a one-column DataFrame holding only the SMILES strings.
# (Select the column as a Series, then promote it back to a frame named 'SMILES'.)
df = df['SMILES'].to_frame(name='SMILES')
df.head(n=10)

In [None]:
# Write the SMILES strings to CRPD.smi, one per line, with no index column and
# no header row, as the molecule input file passed to padeldescriptor.
df['SMILES'].to_csv('CRPD.smi', header=False, index=False)

In [30]:
# Generate 2D descriptors only (no 3D descriptors, no fingerprints) for the
# molecules in CRPD.smi and write them to 2d.csv.
# retainorder=True is passed because the next cell joins the result back to
# `df` purely by row position.
# Optional flags left at their defaults: convert3d, detectaromaticity,
# removesalt, retain3d, standardizenitro, standardizetautomers.
padeldescriptor(
    mol_dir='CRPD.smi',
    d_file='2d.csv',
    d_2d=True,
    d_3d=False,
    fingerprints=False,
    retainorder=True,
)

In [None]:
# Load the 2D descriptor table, discard its "Name" column and place the SMILES
# column in front of it. The join is positional (axis=1 on the default index).
df_2d = pd.concat(
    [df, pd.read_csv('2d.csv').drop(columns="Name")],
    axis=1,
)
df_2d.head(n=10)

In [None]:
# Generate 3D descriptors only (no 2D descriptors, no fingerprints) for the
# molecules in CRPD.smi and write them to 3d.csv. convert3d=True is enabled
# here; the other optional flags (detectaromaticity, removesalt, retain3d,
# standardizenitro, standardizetautomers) stay at their defaults.
padeldescriptor(
    mol_dir='CRPD.smi',
    d_file='3d.csv',
    d_3d=True,
    d_2d=False,
    fingerprints=False,
    convert3d=True,
    retainorder=True,
)

# Put the SMILES column in front of the descriptor columns (minus "Name"),
# then replace every missing value with 0.
df_3d = pd.concat(
    [df, pd.read_csv('3d.csv').drop(columns="Name")],
    axis=1,
).fillna(0)
df_3d.head(n=10)

In [None]:
# Fingerprint generation, step 1: find the descriptor-type XML files.
import os

# Bare file names (no directory prefix) of every .xml entry found in the
# descriptors_xml directory, in sorted order.
xml_files = sorted(
    entry for entry in os.listdir('descriptors_xml') if entry.endswith('.xml')
)
xml_files

In [None]:
# Reload the source table and keep only its SMILES strings for fingerprinting.
# `df` is the SMILES column as a Series (the fingerprint loop prepends it to
# every fingerprint table); `df_fingerprints` is the same data as a one-column
# DataFrame.
df = pd.read_csv('CombinedResearchPaperData.csv')['SMILES']
# to_frame() exists on Series only. Calling it on the full DataFrame, as this
# cell previously did, raises AttributeError.
df_fingerprints = df.to_frame(name='SMILES')
df_fingerprints.head(10)

In [None]:
# Write the SMILES strings to CRPD.smi again (one per line, no index column,
# no header row) so the fingerprint run reads the freshly loaded data.
df_fingerprints['SMILES'].to_csv('CRPD.smi', header=False, index=False)

In [None]:
import time
import os
import pandas as pd
from padelpy import padeldescriptor

# Compute one fingerprint table per descriptor-type XML file and gather them all
# in `combined_fp`.
#
# Inputs taken from earlier cells:
#   xml_files - names of the descriptor-type XML files (built with os.listdir on
#               the descriptors_xml directory, so they have no directory prefix)
#   df        - the SMILES column that is prepended to every fingerprint table
#
# Outputs:
#   <fingerprint name>.csv - written to the working directory, one per XML file
#   combined_fp            - all fingerprint tables side by side (each keeps its
#                            own SMILES column); an empty DataFrame when no file
#                            could be processed

# Directory the bare names in xml_files are relative to.
XML_DIR = 'descriptors_xml'

start_time = time.perf_counter()
fingerprint_frames = []
for xml_file in xml_files:
    # Use the entry as-is when it is already a valid path; otherwise look for
    # it inside XML_DIR. Checking only the bare name would test the working
    # directory and report every file as missing.
    xml_path = xml_file if os.path.isfile(xml_file) else os.path.join(XML_DIR, xml_file)
    if not os.path.isfile(xml_path):
        print(f"Error: Descriptor types file '{xml_file}' does not exist.")
        continue

    # Fingerprint name = XML file name without directory and extension.
    fp_name = os.path.splitext(os.path.basename(xml_path))[0]
    try:
        print(f"Processing: {xml_file}")
        padeldescriptor(
            mol_dir='CRPD.smi',
            d_file=f'{fp_name}.csv',
            descriptortypes=xml_path,
            retainorder=True,
            fingerprints=True,
            d_2d=False,
            d_3d=False
        )
        fp = pd.read_csv(f'{fp_name}.csv')
        # Swap the "Name" column for the SMILES strings; the join is positional.
        fp = pd.concat([df, fp.drop('Name', axis=1)], axis=1)
        fp.to_csv(f'{fp_name}.csv', index=False)
        fingerprint_frames.append(fp)
        print(f"{fp_name} done")
    except RuntimeError as e:
        print(f"RuntimeError for file '{xml_file}': {e}")
    except Exception as e:
        # Best effort: report the failure and continue with the next file.
        print(f"An error occurred for file '{xml_file}': {e}")

# Concatenate once at the end instead of growing a frame inside the loop.
# pd.concat rejects an empty list, so fall back to an empty frame.
combined_fp = pd.concat(fingerprint_frames, axis=1) if fingerprint_frames else pd.DataFrame()

end_time = time.perf_counter()
print(end_time - start_time, "seconds")


In [None]:
# Assemble the final feature table from the frames built in earlier cells:
# 2D descriptors (df_2d), 3D descriptors (df_3d) and fingerprints (combined_fp).

# Replace missing fingerprint values with 0.
combined_fp = combined_fp.fillna(0)

# df_2d supplies the single SMILES column of the result, so the SMILES columns
# of the other two frames are dropped before the positional join.
# errors="ignore" keeps this cell from raising KeyError when combined_fp is
# still an empty frame, i.e. when no fingerprint file was processed.
final_df = pd.concat(
    [
        df_2d,
        df_3d.drop("SMILES", axis=1),
        combined_fp.drop(columns="SMILES", errors="ignore"),
    ],
    axis=1,
)

# Save the final table without the index column.
final_df.to_csv('final.csv', index=False)

# Cell output: (rows, columns) of the final table.
final_df.shape