In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import Descriptors
import numpy as np

input_csv = "C:/Users/20192891/Documents/Master/Q4/8CC00/Assignment 3/tested_molecules.csv"
output_csv = "C:/Users/20192891/Documents/Master/Q4/8CC00/Assignment 3/filtered_molecules222.csv"

# Filtering thresholds used by the steps below.
LOW_VARIANCE_THRESHOLD = 0.001
HIGH_CORRELATION_THRESHOLD = 0.95


def compute_descriptors(smiles, calculator, descriptor_names):
    """Calculate descriptors for every SMILES string that can be parsed.

    Args:
        smiles: pandas Series of SMILES strings; its index labels are reused
            as the index of the returned frame.
        calculator: object exposing ``CalcDescriptors(mol)``.
        descriptor_names: column names for the calculated values.

    Returns:
        A ``(descriptors_df, invalid_labels)`` tuple. ``descriptors_df`` has
        one row per parsable molecule; ``invalid_labels`` lists the index
        labels of the rows that were skipped.
    """
    rows = []
    valid_labels = []
    invalid_labels = []
    for label, smile in smiles.items():
        # Empty CSV cells are read as NaN (a float), not as a string, so they
        # are treated as unparsable instead of being handed to the parser.
        mol = Chem.MolFromSmiles(smile) if isinstance(smile, str) else None
        if mol is None:
            # Skipping the row (instead of adding an all-None row) keeps the
            # column dtypes intact and stops one bad molecule from putting a
            # missing value in every descriptor column.
            invalid_labels.append(label)
            continue
        rows.append(calculator.CalcDescriptors(mol))
        valid_labels.append(label)

    descriptors_df = pd.DataFrame(rows, index=valid_labels, columns=descriptor_names)
    return descriptors_df, invalid_labels


def drop_zero_or_missing(descriptor_data):
    """Drop columns with a missing value, or with a zero in a float64 column.

    Zeros in float64 columns are converted to NaN first, so any float64
    column containing a zero is removed together with columns that already
    had missing values. Columns of other dtypes keep their zeros.

    The input frame is not modified; a filtered copy is returned.
    """
    # Work on an explicit copy: the caller passes a column selection of
    # another frame, and assigning into such a selection is unreliable.
    cleaned = descriptor_data.copy()
    float_columns = cleaned.select_dtypes(include=['float64']).columns
    if len(float_columns):
        cleaned[float_columns] = cleaned[float_columns].replace(0, np.nan)
    return cleaned.dropna(axis=1)


def drop_low_variance(descriptor_data, threshold=LOW_VARIANCE_THRESHOLD):
    """Drop columns whose variance is less than or equal to ``threshold``.

    Returns:
        A ``(filtered, variances, low_variance_columns)`` tuple, where
        ``variances`` holds the variance of every input column (including
        the dropped ones).
    """
    variances = descriptor_data.var()
    low_variance_columns = variances[variances <= threshold].index
    filtered = descriptor_data.drop(columns=low_variance_columns)
    return filtered, variances, low_variance_columns


def drop_highly_correlated(descriptor_data, variances, threshold=HIGH_CORRELATION_THRESHOLD):
    """Drop columns that are highly correlated with a higher-variance column.

    Columns are visited in order. Each column is grouped with the earlier,
    still-kept columns whose absolute correlation with it is at least
    ``threshold``; the member of the group with the highest variance is
    kept and all other members are dropped.

    Args:
        descriptor_data: numeric DataFrame of descriptors.
        variances: mapping/Series from column name to variance.
        threshold: absolute correlation at or above which columns are
            considered redundant.

    Returns:
        A ``(filtered, reasons)`` tuple, where ``reasons`` maps each dropped
        column to a message naming the column that was kept instead.
    """
    corr_matrix = descriptor_data.corr().abs()
    # Keep only the strict upper triangle so each pair is inspected once and
    # a column is never compared with itself.
    upper_triangle = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

    columns_to_drop = set()
    reasons = {}
    for column in upper_triangle.columns:
        correlated = [
            other
            for other in upper_triangle.index[upper_triangle[column] >= threshold]
            if other not in columns_to_drop
        ]
        if not correlated:
            continue
        # The current column is part of the group: it may be the one to keep
        # (nothing to remove from `correlated` then) or one of those to drop.
        group = correlated + [column]
        keeper = max(group, key=lambda col: variances[col])
        for col in group:
            if col != keeper:
                columns_to_drop.add(col)
                reasons[col] = f"High correlation with {keeper}, {variances[keeper]}"

    filtered = descriptor_data.drop(columns=list(columns_to_drop))
    return filtered, reasons


if __name__ == "__main__":
    # Step 1: Read the input CSV file
    data = pd.read_csv(input_csv)

    # Step 2: Calculate RDKit descriptors (first CSV column holds the SMILES)
    descriptor_names = [desc[0] for desc in Descriptors.descList]
    calculator = MoleculeDescriptors.MolecularDescriptorCalculator(descriptor_names)
    descriptors_df, invalid_rows = compute_descriptors(data.iloc[:, 0], calculator, descriptor_names)
    if invalid_rows:
        print(f"Skipped {len(invalid_rows)} row(s) with unparsable SMILES: {invalid_rows}")

    # Combine descriptors with the original data (parsable rows only)
    combined_data = pd.concat([data.loc[descriptors_df.index], descriptors_df], axis=1)

    # Steps 3 and 4: treat zeros in floating-point columns as missing and
    # filter out descriptors with missing values
    descriptor_data = drop_zero_or_missing(combined_data[descriptor_names])

    # Step 5: Filter out descriptors with low variance (<= 0.001)
    descriptor_data, variances, low_variance_descriptors = drop_low_variance(descriptor_data)
    dropped_descriptors = {col: "Low variance" for col in low_variance_descriptors}

    # Step 6: Filter out descriptors with high pairwise correlation (>= 0.95)
    columns_before = set(descriptor_data.columns)
    descriptor_data, correlation_reasons = drop_highly_correlated(descriptor_data, variances)
    columns_to_drop = columns_before - set(descriptor_data.columns)
    dropped_descriptors.update(correlation_reasons)

    # Combine the filtered descriptor data with the first three original columns
    filtered_data = pd.concat([data.loc[descriptor_data.index].iloc[:, :3], descriptor_data], axis=1)

    # Save the resulting dataframe to a new CSV file
    filtered_data.to_csv(output_csv, index=False)

    print(f"Filtered descriptors saved to {output_csv}")
    print("Dropped descriptors:")
    for descriptor, reason in dropped_descriptors.items():
        print(f"{descriptor}: {reason}")