In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors, AllChem

# Function to calculate RDKit descriptors for a molecule
def calculate_rdkit_descriptors(mol):
    """Evaluate every descriptor listed in ``Descriptors.descList`` on ``mol``.

    Each entry of ``Descriptors.descList`` is indexed as ``(name, function)``.
    Descriptors are evaluated one at a time so that an exception raised by a
    single descriptor only blanks that descriptor instead of discarding every
    value computed for the molecule.

    Args:
        mol: Molecule object, passed unchanged to each descriptor function.

    Returns:
        list: One value per entry of ``Descriptors.descList``, in the same
        order; ``None`` in the positions whose descriptor raised.
    """
    print("Calculating RDKit descriptors...")
    descriptors = []
    for desc in Descriptors.descList:
        name, func = desc[0], desc[1]
        try:
            descriptors.append(func(mol))
        except Exception as e:
            # Keep the row aligned with the descriptor names: record a
            # placeholder for this descriptor and carry on with the rest.
            print(f"Error calculating RDKit descriptor {name}: {e}")
            descriptors.append(None)
    return descriptors

# Function to calculate Morgan fingerprints as a bit vector (Morgan radius 2, i.e. ECFP4-like; the "2" is a radius, not a dimensionality)
def calculate_morgan_fingerprint(mol, radius=2, n_bits=2048):
    """Compute a Morgan bit-vector fingerprint for ``mol`` as a plain list.

    The "(2D)" wording in the progress message is historical; the value 2 is
    the Morgan radius handed to RDKit, not a dimensionality.

    Args:
        mol: Molecule object passed to
            ``AllChem.GetMorganFingerprintAsBitVect``.
        radius: Morgan radius. Defaults to 2, the value previously hard-coded.
        n_bits: Length of the bit vector. Defaults to 2048, the length the
            fallback list previously assumed.

    Returns:
        list: ``list(fp)`` of the fingerprint (``n_bits`` entries), or a list
        of ``n_bits`` ``None`` values if the computation raised.
    """
    try:
        print("Calculating Morgan fingerprints (2D)...")
        # nBits is passed explicitly so the success and failure paths are
        # guaranteed to return lists of the same length.
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
        return list(fp)
    except Exception as e:
        print(f"Error calculating Morgan fingerprints: {e}")
        return [None] * n_bits

# Function to calculate ECFP (Morgan) fingerprints as a bit vector (Morgan radius 3, i.e. ECFP6-like; the "3" is a radius, not a dimensionality)
def calculate_ecfp_fingerprint(mol, radius=3, n_bits=2048):
    """Compute a radius-3 Morgan (ECFP-style) bit-vector fingerprint as a list.

    The "(3D)" wording in the progress message is historical; the value 3 is
    the Morgan radius handed to RDKit, not a dimensionality.

    Args:
        mol: Molecule object passed to
            ``AllChem.GetMorganFingerprintAsBitVect``.
        radius: Morgan radius. Defaults to 3, the value previously hard-coded.
        n_bits: Length of the bit vector. Defaults to 2048, the length the
            fallback list previously assumed.

    Returns:
        list: ``list(fp)`` of the fingerprint (``n_bits`` entries), or a list
        of ``n_bits`` ``None`` values if the computation raised.
    """
    try:
        print("Calculating ECFP fingerprints (3D)...")
        # nBits is passed explicitly so the success and failure paths are
        # guaranteed to return lists of the same length.
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
        return list(fp)
    except Exception as e:
        print(f"Error calculating ECFP fingerprints: {e}")
        return [None] * n_bits

# Read SMILES strings from file
try:
    print("Reading SMILES strings from file...")
    data = pd.read_csv("CombinedResearchPaperData.csv")
except Exception as e:
    print(f"Error reading CSV file: {e}")
    # Re-raise rather than calling exit(): in a notebook exit() shuts the
    # kernel down and hides the traceback, while nothing below can run
    # without `data` anyway.
    raise

# One entry per input row is appended to each list (placeholders included),
# so the lists stay aligned row-for-row with `data`.
rdkit_descriptors = []
morgan_fingerprints = []
ecfp_fingerprints = []

# Iterate over each SMILES string
print("Computing descriptors and fingerprints for each molecule...")
total_molecules = len(data)
for index, raw_smiles in enumerate(data['SMILES']):
    smiles = str(raw_smiles)  # Convert to string
    print(f"Processing molecule {index + 1}/{total_molecules}: {smiles}")
    # A missing cell would otherwise be stringified to "nan" and handed to
    # the SMILES parser; treat it directly as an unparseable molecule.
    mol = None if pd.isna(raw_smiles) else Chem.MolFromSmiles(smiles)
    if mol is not None:
        # Compute RDKit descriptors for the molecule
        rdkit_descriptors.append(calculate_rdkit_descriptors(mol))
        # Compute the radius-2 Morgan fingerprint for the molecule
        morgan_fingerprints.append(calculate_morgan_fingerprint(mol))
        # Compute the radius-3 Morgan (ECFP) fingerprint for the molecule
        ecfp_fingerprints.append(calculate_ecfp_fingerprint(mol))
    else:
        # No molecule could be built: append None placeholders so this row
        # still has an entry in every feature list.
        print("Failed to generate molecule. Skipping...")
        rdkit_descriptors.append([None] * len(Descriptors.descList))
        morgan_fingerprints.append([None] * 2048)
        ecfp_fingerprints.append([None] * 2048)




In [None]:
# RDKit descriptor table: column names are the first element of each
# Descriptors.descList entry.
print("Creating DataFrame for RDKit descriptors...")
rdkit_columns = [entry[0] for entry in Descriptors.descList]
rdkit_df = pd.DataFrame(data=rdkit_descriptors, columns=rdkit_columns)

# Morgan fingerprint table: one column per bit, named Morgan_0 .. Morgan_2047.
print("Creating DataFrame for Morgan fingerprints...")
morgan_columns = list(map("Morgan_{}".format, range(2048)))
morgan_df = pd.DataFrame(data=morgan_fingerprints, columns=morgan_columns)

# ECFP fingerprint table: one column per bit, named ECFP_0 .. ECFP_2047.
print("Creating DataFrame for ECFP fingerprints...")
ecfp_columns = list(map("ECFP_{}".format, range(2048)))
ecfp_df = pd.DataFrame(data=ecfp_fingerprints, columns=ecfp_columns)

In [None]:
# Join the three feature tables side by side (descriptors, then Morgan bits,
# then ECFP bits) into one wide DataFrame.
print("Concatenating descriptors and fingerprints...")
feature_tables = [rdkit_df, morgan_df, ecfp_df]
features_df = pd.concat(feature_tables, axis="columns")

In [None]:
# Put the SMILES, Toxicity and Source columns of the input data in front of
# the computed features.
print("Concatenating features with original data...")
result_df = pd.concat(
    [data[['SMILES', 'Toxicity', 'Source']], features_df],
    axis="columns",
)

# Write the combined table to disk; a failure is reported but not re-raised.
try:
    print("Saving result to CSV...")
    result_df.to_csv(path_or_buf="rdkit_features.csv", index=False)
    print("Process completed successfully.")
except Exception as save_error:
    print(f"Error saving CSV file: {save_error}")


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Columns of the saved file that are not computed features.
NON_FEATURE_COLUMNS = ['SMILES', 'Toxicity', 'Source']
# Above this many features the per-cell annotations and grid lines are
# switched off: they would be unreadable and very slow to draw.
MAX_ANNOTATED_FEATURES = 30

result_df = pd.read_csv("rdkit_features.csv")

# Count number of features before and after dropping
num_features_before = len(result_df.columns) - len(NON_FEATURE_COLUMNS)
print("Number of features before dropping:", num_features_before)

# From here on result_df holds the feature columns only.
result_df = result_df.drop(columns=NON_FEATURE_COLUMNS)
result_df = result_df.dropna(axis=1, how='all')  # Drop columns with all NaN values
result_df = result_df.dropna(axis=0, how='all')  # Drop rows with all NaN values
# Drop features with all values as 0. NaN compares unequal to 0, so missing
# values are filled with 0 for this check only; otherwise a column that is
# zero everywhere except for NaN entries would be kept.
result_df = result_df.loc[:, (result_df.fillna(0) != 0).any(axis=0)]
# The non-feature columns were already removed above, so every remaining
# column is a feature (no further subtraction).
num_features_after = len(result_df.columns)
print("Number of features after dropping:", num_features_after)

# Plotting heatmap for correlated features

# Compute the correlation matrix. result_df no longer contains the
# non-feature columns, so dropping them again here would raise KeyError.
corr = result_df.corr()

# Generate a mask for the upper triangle
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
plt.figure(figsize=(10, 8))

# Draw the heatmap with the mask and correct aspect ratio
show_cell_values = corr.shape[0] <= MAX_ANNOTATED_FEATURES
sns.heatmap(corr, mask=mask, cmap='coolwarm', vmax=1, vmin=-1, center=0,
            annot=show_cell_values, fmt=".2f", square=True,
            linewidths=.5 if show_cell_values else 0,
            cbar_kws={"shrink": .5})

plt.title('Correlation Heatmap of Features')
plt.show()

