In [13]:
import os 
# Move the working directory up one level so the relative paths used below
# resolve from there.
# NOTE(review): not idempotent — every re-run of this cell climbs one more
# directory; presumably the target is the project root, confirm.
os.chdir('../')
import pandas as pd

In [22]:
# Read the whole CSV into a DataFrame; the path is relative to the current
# working directory.
dataframe = pd.read_csv('Work/chelombus/data/10M_ZINC_id_Sim_mqn.csv')

In [None]:
# Sample path used to try out file-extension extraction.
s = 'Work/chelombus/data/10M_ZINC_id_Sim_mqn.csv'

# Text after the final dot, i.e. the extension without the dot
# (the whole string when there is no dot at all).
s.rpartition('.')[2]

In [4]:
# Project-local helper; presumably it extracts the SMILES strings and the
# feature values from the loaded DataFrame (judging by the method name).
from src.data_handler import DataHandler
data_handler = DataHandler()
# The call returns two objects, unpacked here as `smiles_list` and `features`.
smiles_list, features = data_handler.extract_smiles_and_features(dataframe)

In [None]:
# Show the extracted features as the cell output.
features

In [6]:
# Project-local fingerprint helper.
from src.fingerprint_calculator import FingerprintCalculator
fp_calculator = FingerprintCalculator()

# Compute one fingerprint per SMILES string. Later cells read `.shape` on the
# result and index it, so it is presumably a 2-D NumPy array — confirm.
fingerprints = fp_calculator.calculate_fingerprints(smiles_list)

In [None]:
from rdkit.Chem import rdMolDescriptors    
from rdkit import Chem

# Parse one example SMILES string into an RDKit molecule.
m = Chem.MolFromSmiles('CO[C@@H]1[C@@H](OC(N)=O)[C@@H](O)[C@H](Oc2ccc3c(O)c(NC(=O)c4ccc(O)c(CC=C(C)C)c4)c(=O)oc3c2C)OC1(C)C')

# Compute the MQN descriptor values for that molecule.
ds = rdMolDescriptors.MQNs_(m)

# Number of descriptor values returned for a single molecule.
len(ds)

In [None]:
# Check the dimensions of the computed fingerprints.
fingerprints.shape

In [10]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn import preprocessing


def estimate_components(data_sample, threshold=0.95):
    """Estimate how many principal components retain a target share of variance.

    Fits a full PCA on ``data_sample``, prints the resulting component count
    and plots the cumulative explained-variance curve.

    Args:
        data_sample: Array-like of shape (n_samples, n_features) to fit on.
        threshold: Fraction of the total variance to retain (default 0.95).

    Returns:
        int: The smallest number of components whose cumulative explained
        variance ratio reaches ``threshold``. If the curve never reaches it
        (e.g. floating-point rounding with a threshold of 1.0), the total
        number of fitted components is returned.
    """
    # n_components=None keeps min(n_samples, n_features) components, so the
    # fit depends only on the argument and no longer on the notebook-level
    # `fingerprints` variable.
    pca = PCA(n_components=None)
    pca.fit(data_sample)

    # Running total of the per-component variance ratios (non-decreasing).
    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

    # First index whose cumulative variance is >= threshold. Unlike argmax on
    # a boolean mask, this does not collapse to index 0 when nothing matches;
    # it yields len(cumulative_variance), which is clamped below.
    first_reaching = np.searchsorted(cumulative_variance, threshold, side='left')
    n_components = int(min(first_reaching + 1, len(cumulative_variance)))
    print(f"Number of components to retain {threshold*100}% of variance: {n_components}")

    # Plot cumulative explained variance against the component count.
    import matplotlib.pyplot as plt
    plt.figure(figsize=(8, 5))
    plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker='o')
    plt.xlabel('Number of Principal Components')
    plt.ylabel('Cumulative Explained Variance')
    plt.title('Explained Variance vs. Number of Components')
    plt.grid(True)
    plt.show()

    return n_components


In [None]:
# Run the explained-variance analysis on the computed fingerprints.
estimate_components(fingerprints)

Not good. We should look into other fingerprints (FPs) that allow reducing the number of dimensions while still preserving the variance.

In [18]:
# Remove the listed columns and keep everything else.
# NOTE(review): rebinding `dataframe` makes this cell non-idempotent — a second
# run raises KeyError because the columns are already gone.
dataframe = dataframe.drop(columns= ['target_id', 'standard_type', 'target_name', 'standard_relation', 'standard_value', 'standard_units', 'target_organism'])

In [19]:
# Persist the reduced DataFrame; index=False leaves the row index out of the file.
dataframe.to_csv('data/cleaned_dataset.csv', index = False)

In [None]:
import csv
import itertools
from math import sqrt

def euclidean_distance(p1, p2):
    """Return the Euclidean distance between two coordinate sequences.

    Coordinates are paired with ``zip``, so any extra trailing coordinates on
    the longer sequence are ignored.
    """
    return sqrt(sum((a - b) ** 2 for a, b in zip(p1, p2)))

def find_most_similar_points(filename):
    """Find the three points in a CSV file that lie closest to one another.

    Closeness is the perimeter of the triangle formed by the three points
    (sum of the three pairwise Euclidean distances). Every triple is checked,
    so the cost grows cubically with the number of rows.

    Args:
        filename: Path to a CSV file with a header row containing the
            columns ``x``, ``y`` and ``z``.

    Returns:
        tuple | None: ``(p1, p2, p3)`` with each point an ``(x, y, z)`` tuple
        of floats, in file order; ``None`` when the file has fewer than three
        rows. Duplicate rows are treated as separate points.
    """
    with open(filename, 'r', newline='') as f:
        points = [(float(row['x']), float(row['y']), float(row['z']))
                  for row in csv.DictReader(f)]

    min_distance = float('inf')
    most_similar = None

    # Triples, not 10-tuples: asking for combinations of 10 and unpacking them
    # into three names raised ValueError on every input.
    for p1, p2, p3 in itertools.combinations(points, 3):
        dist = (euclidean_distance(p1, p2) +
                euclidean_distance(p2, p3) +
                euclidean_distance(p3, p1))
        if dist < min_distance:
            min_distance = dist
            most_similar = (p1, p2, p3)

    return most_similar

# Usage
# NOTE(review): absolute, machine-specific path — the cell only runs where
# this file exists.
filename = '/home/afloresep/work/chelombus/data/output/batch_data_0.csv'
result = find_most_similar_points(filename)
print("The 3 most similar points are:", result)

This proves that identical molecules are assigned the same coordinates.

In [None]:
import csv
import itertools
from math import sqrt
from collections import defaultdict

def euclidean_distance(p1, p2):
    """Compute the straight-line distance between ``p1`` and ``p2``.

    The two sequences are walked in lockstep; coordinates beyond the length
    of the shorter one are ignored.
    """
    squared_total = 0
    for coord_a, coord_b in zip(p1, p2):
        delta = coord_a - coord_b
        squared_total += delta ** 2
    return sqrt(squared_total)

def find_most_similar_points(filename):
    """Locate the three distinct points with the smallest triangle perimeter.

    Rows of the CSV file (columns ``x``, ``y``, ``z``) are de-duplicated by
    coordinate before every triple of distinct points is compared.

    Returns:
        ``(points, distance)`` where ``points`` is a 3-tuple of ``(x, y, z)``
        float tuples and ``distance`` the sum of their pairwise distances.
        When fewer than three distinct points exist, a plain message string
        is returned instead of a tuple.
    """
    # Map each coordinate triple to the row numbers where it occurs.
    row_ids_by_point = defaultdict(list)
    with open(filename, 'r') as handle:
        for row_number, record in enumerate(csv.DictReader(handle)):
            coords = tuple(float(record[axis]) for axis in ('x', 'y', 'z'))
            row_ids_by_point[coords].append(row_number)

    distinct = list(row_ids_by_point)

    if len(distinct) < 3:
        return "Not enough unique points to find 3 most similar."

    best_triple = None
    best_total = float('inf')
    for first, second, third in itertools.combinations(distinct, 3):
        perimeter = (euclidean_distance(first, second) +
                     euclidean_distance(second, third) +
                     euclidean_distance(third, first))
        # Strict comparison keeps the earliest triple on ties.
        if perimeter < best_total:
            best_total = perimeter
            best_triple = (first, second, third)

    return best_triple, best_total

# Usage
# NOTE(review): absolute, machine-specific path — the cell only runs where
# this file exists.
filename = '/home/afloresep/work/chelombus/data/output/batch_data_0.csv'
# NOTE(review): with fewer than 3 unique points the function returns a message
# string, and this two-name unpacking then fails with ValueError.
result, distance = find_most_similar_points(filename)
print("The 3 most similar non-duplicate points are:", result)
print("Total distance between these points:", distance)

In [None]:
import csv
import itertools
from math import sqrt
from collections import defaultdict
import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import AllChem

def euclidean_distance(p1, p2):
    """Euclidean distance between two points given as coordinate sequences.

    Only the coordinates both sequences share (pairwise, via ``zip``) count.
    """
    squared_gaps = [(u - v) ** 2 for u, v in zip(p1, p2)]
    return sqrt(sum(squared_gaps))

def find_most_similar_points(filename, n=10):
    """Return the group of ``n`` unique points with the smallest total distance.

    Points are read from the ``x``, ``y`` and ``z`` columns of a CSV file and
    de-duplicated by coordinate (first occurrence order is kept). Every
    ``n``-sized group of unique points is scored by the sum of its pairwise
    Euclidean distances.

    The number of groups is C(unique_points, n), which grows extremely fast,
    so this is only feasible for small files. Groups are scored one at a
    time and only the best is kept, so memory use stays constant.

    Args:
        filename: Path to a CSV file with a header containing ``x``, ``y``
            and ``z``. Other columns are ignored.
        n: Size of the group to search for (default 10).

    Returns:
        tuple | str: ``(total_distance, points)`` — note the order — where
        ``points`` is a list of ``(x, y, z)`` float tuples. When the file has
        fewer than ``n`` unique points, a message string is returned instead.
    """
    with open(filename, 'r') as f:
        # dict.fromkeys drops duplicate coordinates while keeping first-seen order.
        unique_points = list(dict.fromkeys(
            (float(row['x']), float(row['y']), float(row['z']))
            for row in csv.DictReader(f)
        ))

    if len(unique_points) < n:
        return f"Not enough unique points to find {n} most similar."

    best = None
    for group in itertools.combinations(unique_points, n):
        dist = sum(euclidean_distance(p1, p2)
                   for p1, p2 in itertools.combinations(group, 2))
        # Strict "<" keeps the earliest group on ties, matching what a stable
        # sort followed by taking the first element would give.
        if best is None or dist < best[0]:
            best = (dist, list(group))

    return best

def plot_smiles(smiles_list):
    """Draw the 2D structure of each SMILES string on a five-column grid.

    One panel per molecule, filled row by row; panels left over in the last
    row are blanked. The figure is shown with ``plt.show()`` and nothing is
    returned.
    """
    mols = list(map(Chem.MolFromSmiles, smiles_list))

    # Lay out 2D coordinates before rendering.
    for parsed in mols:
        AllChem.Compute2DCoords(parsed)

    n_cols = 5
    n_rows = -(-len(smiles_list) // n_cols)  # ceiling division: 5 molecules per row
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 4*n_rows))
    fig.suptitle("2D Structures of 10 Most Similar Molecules", fontsize=16)

    # Work with a flat sequence of panels regardless of the grid shape.
    axes = axes.flatten()

    for position, parsed in enumerate(mols):
        panel = axes[position]
        panel.imshow(Draw.MolToImage(parsed))
        panel.axis('off')
        panel.set_title(f"Structure {position+1}", fontsize=10)

    # Blank out the panels that received no molecule.
    for spare in axes[len(mols):]:
        spare.axis('off')

    plt.tight_layout()
    plt.show()

# Usage
# NOTE(review): absolute, machine-specific path — the cell only runs where
# this file exists.
filename = '/home/afloresep/work/chelombus/data/output/batch_data_19531.csv'
# find_most_similar_points(filename, n=10) returns (total_distance, points) in
# that order, so the names are unpacked to match; previously `result` received
# the distance and `distance` the point list.
# NOTE(review): if the file has fewer unique points than n, a message string
# is returned and this unpacking raises ValueError.
distance, result = find_most_similar_points(filename)


In [None]:
# Fresh handler created without arguments.
data_handler = DataHandler()

# Presumably detects the input format of the given file (judging by the
# method name) — confirm against src.data_handler.
data_handler.find_input_type(file_path='test.csv')



In [None]:
from tqdm import tqdm
import pickle


# Load all fingerprints with tqdm progress bar
# NOTE(review): the chunk count (75) is hard-coded; confirm it matches the
# number of fingerprints_chunk_*.pkl files on disk.
# These names are rebound to plain lists that grow chunk by chunk.
fingerprints, smiles_list, features = [], [], []
for idx in tqdm(range(75), desc="Loading Fingerprints"):
    with open(f'data/fingerprints_chunk_{idx}.pkl', 'rb') as f:
        # NOTE(security): pickle.load can execute arbitrary code — only load
        # chunk files produced by a trusted run.
        # Each pickle holds three items, unpacked in this order.
        fps_chunk, smiles_chunk, features_chunk = pickle.load(f)
        fingerprints.extend(fps_chunk)
        smiles_list.extend(smiles_chunk)
        features.extend(features_chunk)

In [None]:
import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import AllChem

def plot_smiles(smiles_list):
    """Draw the 2D structure of every SMILES string side by side in one row.

    The row gets one panel per molecule, so no input is dropped. (A fixed
    six-panel row paired with ``zip`` silently skipped anything past the
    sixth molecule.) The figure is shown with ``plt.show()`` and nothing is
    returned.

    Args:
        smiles_list: Sequence of SMILES strings to render.
    """
    mols = [Chem.MolFromSmiles(smile) for smile in smiles_list]

    # Generate 2D coordinates for each molecule
    for mol in mols:
        AllChem.Compute2DCoords(mol)

    # One column per molecule (at least one so an empty input still yields a
    # valid figure). Width scales at 2.5 in per panel, which gives the
    # previous 15 in for six molecules.
    n_panels = max(len(mols), 1)
    # squeeze=False always returns a 2-D axes array, even for a single panel.
    fig, axes = plt.subplots(1, n_panels, figsize=(2.5 * n_panels, 5), squeeze=False)
    axes = axes.ravel()
    fig.suptitle("2D Structures of SMILES", fontsize=16)

    for i, (mol, ax) in enumerate(zip(mols, axes)):
        img = Draw.MolToImage(mol)
        ax.imshow(img)
        ax.axis('off')
        ax.set_title(f"Structure {i+1}", fontsize=12)

    # Only reachable for an empty input: hide the single placeholder panel.
    for ax in axes[len(mols):]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()

# SMILES strings
# Seven example molecules to render.
smiles = ['Cc1cc(C(=O)N2CC[C@H]2CN(C)C(=O)C(C)(C)C(F)F)c(C)[nH]1','Cc1oc(C(C)C)cc1C(=O)N(C)C[C@@H]1CCN1C(=O)C(C)(F)F',
'CCCc1[nH]ccc1C(=O)N(C)C[C@@H]1CCN1C(=O)CC(F)(F)F',
'CCc1[nH]ccc1C(=O)N1CC[C@H]1CN(C)C(=O)C(C)(C)C(F)F',
'Cc1nc[nH]c1C(=O)N(C)C[C@H]1CCN1C(=O)C(C)(C)C(C)(F)F',
'Cc1[nH]nc(C(=O)N(C)C[C@@H]2CCN2C(=O)C(C)(C)C(F)F)c1C',
'Cc1nn(C(C)C)cc1C(=O)N(C)C[C@@H]1CCN1C(=O)C(C)(F)F']

# Draw the structures.
plot_smiles(smiles)



In [5]:
import pandas as pd

# Read only the first 1,000,000 data rows of the CSV.
# NOTE(review): absolute, machine-specific path.
df = pd.read_csv('/home/afloresep/work/chelombus/data/10M_ZINC_id_Sim_mqn.csv', nrows=1000000)

In [None]:
import rdkit
from rdkit import Chem

# NOTE(review): absolute, machine-specific path.
file_path = '/home/afloresep/work/chelombus/data/Enamine_REAL_HAC_29_38_1.3B_Part_2_CXSMILES.cxsmiles'

cxsmiles_batch = []  # never filled in this cell
# Walk the whole file once just to learn the index of its final line.
with open(file_path, 'r') as file:
    for i, line in enumerate(file):
        pass
# Prints the zero-based index of the last line, i.e. the line count minus one.
# NOTE(review): for an empty file this loop never binds `i`.
print(i)

In [52]:
def get_total_chunks(file_path, chunksize):
    """Calculate how many chunks of ``chunksize`` rows a file splits into.

    Intended for sizing a tqdm progress bar. The first line of the file is
    treated as a header and not counted.

    Args:
        file_path: Path of the text file to count lines in.
        chunksize: Number of data rows per chunk (positive integer).

    Returns:
        int: ``ceil(data_rows / chunksize)``; 0 when the file holds no data
        rows (including a completely empty file).
    """
    # The context manager closes the handle once counting is done.
    with open(file_path) as handle:
        total_lines = sum(1 for _ in handle) - 1  # Subtract 1 for header
    # An empty file has no header to subtract; never report a negative count.
    total_lines = max(total_lines, 0)
    total_chunks = (total_lines + chunksize - 1) // chunksize
    return total_chunks

In [None]:
# Number of 20,000-row chunks in the file referenced by `file_path`.
get_total_chunks(file_path, chunksize=20000)

In [None]:
import os
# os.chdir('../')
import time
import sys
import pickle
from config import DATA_FILE_PATH, CHUNKSIZE
from tqdm import tqdm 
from sklearn.decomposition import IncrementalPCA

from src.data_handler import DataHandler, get_total_chunks
from src.fingerprint_calculator import FingerprintCalculator
from src.output_generator import OutputGenerator

# Initialize classes
data_handler = DataHandler(DATA_FILE_PATH, CHUNKSIZE)
output_gen = OutputGenerator()
fp_calculator = FingerprintCalculator()
# Load data in chunks
start = time.time()  # timestamp taken before loading; not read again in this cell
data_chunks, total_chunks = data_handler.load_data()
# Process chunks with tqdm progress bar
# NOTE(review): the bar length is hard-coded to 18295 while `total_chunks`
# returned above goes unused — presumably that value belongs in `total=`; confirm.
num_chunks = 0
for idx, chunk in enumerate(tqdm(data_chunks, total=18295, desc="Processing Chunks")):
    num_chunks += 1  # manual count of the chunks consumed so far
    chunk  # bare expression: has no effect inside a loop body

In [None]:
# Read the file in batches of up to 100 lines.
with open('data/Enamine_REAL_HAC_29_38_1.3B_Part_2_CXSMILES.cxsmiles', 'r', encoding='utf-8') as f:
    while True:
        # Per-batch containers (not filled yet by this draft).
        smiles = []
        features = []
        reached_eof = False
        for _ in range(100):
            raw_line = f.readline()
            # readline() returns '' only at end of file. Without this check
            # the outer loop never terminated once the file was exhausted.
            if not raw_line:
                reached_eof = True
                break
            line = raw_line.split()
        if reached_eof:
            break

In [None]:
# Print the tab-separated fields (minus the first one) of the first 1000 lines.
with open('data/Enamine_REAL_HAC_29_38_1.3B_Part_2_CXSMILES.cxsmiles', 'r', encoding='utf-8') as f:
    # Take each line from the file itself; the loop used to reuse whatever
    # `line` was left in the kernel and never read from `f`. zip stops after
    # 1000 lines or at end of file, whichever comes first.
    for _, line in zip(range(1000), f):
        # [1:] discards the first field of every line.
        smiles_entry = line.strip().split('\t')[1:]
        print(smiles_entry)

In [6]:
import numpy as np
# NOTE(review): relies on hidden kernel state — `list` must already hold data
# from a cell that is not shown here; on a fresh kernel it is the builtin
# type. Rebinding the name also shadows the builtin `list` for the session.
list = np.array(list)

In [14]:
# First column of the 2-D array.
list[:,0]

array(['CCOC(=O)C1=C(C)N=C(CNC(C2=CC(C)=CC=C2)C2=NN=NN2C(C)(C)C)O1',
       'CCOC(=O)C1=C(C)N=C(CNC(C2=CC(C)=CC=C2)C2=NN=NN2C2COC2)O1',
       'CCOC(=O)C1=C(C)N=C(CNC(C2=CC(C)=CC=C2)C2=NN=NN2CC2CC2)O1', ...,
       'CCOC(=O)C1=C(C)N=C(CNC(=O)N2CCOC(CN(C)C3=CC=CN=N3)C2)O1',
       'CCOC(=O)C1=C(C)N=C(CNC(=O)N2CCC(N(C)CC3=NC=C(CC)O3)C2)O1',
       'CCOC(=O)C1=C(C)N=C(CNC(=O)C2=NOC(COC3=C(Cl)C=CC=C3Cl)=C2)O1'],
      dtype='<U72')

In [4]:
# Collect the first element of every entry into a plain Python list.
smiles = [item[0] for item in list]

In [15]:
# Count the data lines of the Enamine file. The context manager closes the
# handle as soon as counting finishes, instead of leaving it open.
# NOTE(review): absolute, machine-specific path.
with open('/home/afloresep/work/chelombus/data/Enamine_REAL_HAC_29_38_1.3B_Part_2_CXSMILES.cxsmiles') as enamine_file:
    total_lines = sum(1 for _ in enamine_file) - 1  # Subtract 1 for header


In [16]:
# Show the line count computed in the previous cell.
total_lines
664075400

664075400

In [19]:
import numpy as np

from sklearn.decomposition import IncrementalPCA

# Toy data: six 2-D points.
X = np.array([[-1, -1], [-2, -1], [-3, -2],
              [1, 1], [2, 1], [3, 2]])

# Fit incrementally in mini-batches of 3 samples, keeping both components.
ipca = IncrementalPCA(n_components=2, batch_size=3)
ipca.fit(X)
# (A pasted repr line that built and discarded a second, unfitted estimator
# was removed here.)

# Project the same points onto the fitted components.
coordinates = ipca.transform(X)

In [22]:
# Show the transformed coordinates.
coordinates

array([[-1.38340578, -0.2935787 ],
       [-2.22189802,  0.25133484],
       [-3.6053038 , -0.04224385],
       [ 1.38340578,  0.2935787 ],
       [ 2.22189802, -0.25133484],
       [ 3.6053038 ,  0.04224385]])

In [31]:
import pandas as pd
# Placeholder strings standing in for SMILES, one per row of the toy data.
smiles_list = ['sdfsd', 'sdfsd', 'asdfasd', 'dfwfasdf', 'asdfasd', 'asdfasda']
# Single-column frame holding those strings under 'smiles'.
batch_data = pd.DataFrame(smiles_list, columns=['smiles'])

In [32]:
# First column of the transformed coordinates.
coordinates[:, 0]

array([-1.38340578, -2.22189802, -3.6053038 ,  1.38340578,  2.22189802,
        3.6053038 ])

In [33]:
# Add one 'PCA_<k>' column per component (k is 1-based).
# shape[1] reads the column count directly instead of indexing the first row,
# so it does not fail when the array has zero rows.
for i in range(coordinates.shape[1]):
    batch_data[f'PCA_{i+1}'] = coordinates[:, i]

In [34]:
# Show the frame with the PCA columns added.
batch_data

Unnamed: 0,smiles,PCA_1,PCA_2
0,sdfsd,-1.383406,-0.293579
1,sdfsd,-2.221898,0.251335
2,asdfasd,-3.605304,-0.042244
3,dfwfasdf,1.383406,0.293579
4,asdfasd,2.221898,-0.251335
5,asdfasda,3.605304,0.042244
