In [None]:
import pandas as pd
# Paths of the batch CSV files batch_data_0.csv ... batch_data_1652.csv
file_paths = [
    f'/home/afloresep/work/chelombus/data/output/batch_data_{i}.csv'
    for i in range(1653)
]

# Columns kept for the analysis: the SMILES string plus the x, y, z coordinates
columns = ['smiles', 'x', 'y', 'z']

# Read every file first and concatenate ONCE. Calling pd.concat inside the
# loop re-copies all previously loaded rows on each iteration, which is
# quadratic in the number of files.
batch_frames = [pd.read_csv(file, usecols=columns)[columns] for file in file_paths]

# ignore_index=True yields the same 0..n-1 index as reset_index(drop=True)
combined_data = pd.concat(batch_frames, ignore_index=True)

# Display the shape of the combined data
combined_data.shape


In [None]:
# Only the last expression of a cell is rendered automatically, so the
# per-column minimum must be displayed explicitly; otherwise it is computed
# and silently thrown away.
from IPython.display import display

display(combined_data.min())
combined_data.head(10)

In [None]:
import matplotlib.pyplot as plt
import scipy.stats as stats

# One row per coordinate: histogram on the left, normal Q-Q plot on the right
components = ['x', 'y', 'z']
fig, axes = plt.subplots(3, 2, figsize=(12, 12))

for (hist_ax, qq_ax), comp in zip(axes, components):
    values = combined_data[comp]

    # Distribution of the raw values
    hist_ax.hist(values, bins=50, alpha=0.7, color='b')
    hist_ax.set_title(f'Histogram of {comp}')

    # Sample quantiles against a normal distribution
    stats.probplot(values, dist="norm", plot=qq_ax)
    qq_ax.set_title(f'Q-Q plot of {comp}')

plt.tight_layout()
plt.show()


In [23]:
import numpy as np

# Lower and upper percentile (in percent) used as cut-offs for every axis
percentiles_range = [0.01, 99.99]

# Pull out the three coordinate columns
x_values, y_values, z_values = (combined_data[axis] for axis in ('x', 'y', 'z'))

# Two-element array [lower cut-off, upper cut-off] for each coordinate
percentiles_x, percentiles_y, percentiles_z = (
    np.percentile(values, percentiles_range)
    for values in (x_values, y_values, z_values)
)

In [None]:
# One histogram per coordinate with its percentile cut-offs marked in red.
# Each entry: (values, percentile positions, axis name, bin count, title)
histogram_specs = [
    (x_values, percentiles_x, 'X', 30, 'Histogram for X values'),
    (y_values, percentiles_y, 'Y', 30, 'Histogram for Y values'),
    # More bins for a more detailed view of Z
    (z_values, percentiles_z, 'Z', 50, 'Histogram for Z values (More Granular)'),
]

fig, axs = plt.subplots(3, 1, figsize=(8, 12))

for ax, (values, cutoffs, axis_name, n_bins, title) in zip(axs, histogram_specs):
    ax.hist(values, bins=n_bins, edgecolor='black')
    ax.set_title(title)
    ax.set_xlabel(f'{axis_name} values')
    ax.set_ylabel('Frequency')

    # Label every line with the percentile it really marks. The percentile
    # arrays hold exactly len(percentiles_range) values, so indexing a third
    # element (and calling them 25th/50th/75th) raised IndexError and would
    # have mislabelled the lines.
    for q, position in zip(percentiles_range, cutoffs):
        ax.axvline(position, color='r', linestyle='dashed', linewidth=1,
                   label=f'{q}th percentile: {position:.2f}')
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Print the [lower, upper] percentile cut-offs computed for x, y and z
print(percentiles_x, percentiles_y, percentiles_z)

In [31]:
## Mapping

import numpy as np

# Lower/upper bound of each coordinate (0.01 and 99.99 percentiles)
x_percentiles = [-23.65769251, 30.68569253]
y_percentiles = [-16.20908626, 23.04514974]
z_percentiles = [-10.74670722, 12.98772289]

# Width of one grid step per axis: the percentile range divided into 100 steps
x_step_size, y_step_size, z_step_size = (
    (upper - lower) / 100
    for lower, upper in (x_percentiles, y_percentiles, z_percentiles)
)


def map_to_grid(coord, min_val, step_size, n_bins=100):
    """Map raw coordinates onto grid-cell indices in ``[0, n_bins - 1]``.

    Parameters:
    coord: Scalar or array-like of raw coordinate values.
    min_val: Coordinate value at which cell 0 starts.
    step_size: Width of one grid cell.
    n_bins: Number of cells along the axis (default 100, i.e. indices 0..99).

    Returns:
    The floored cell index for every input value, still float-valued because
    it comes from ``np.floor``. Values below ``min_val`` are clipped to 0 and
    values beyond the last cell are clipped to ``n_bins - 1``.
    """
    cell_index = np.floor((coord - min_val) / step_size)
    return np.clip(cell_index, 0, n_bins - 1)

# Convert each raw coordinate column into grid indices along its own axis,
# using that axis' lower percentile as origin and its own step size
mapped_x, mapped_y, mapped_z = (
    map_to_grid(combined_data[axis], bounds[0], step)
    for axis, bounds, step in (
        ('x', x_percentiles, x_step_size),
        ('y', y_percentiles, y_step_size),
        ('z', z_percentiles, z_step_size),
    )
)

In [38]:
# Column name -> values for the gridded data set: the original SMILES next to
# the grid indices computed for each axis
mapped_dataframe = dict(
    smiles=combined_data['smiles'],
    x=mapped_x,
    y=mapped_y,
    z=mapped_z,
)

d = pd.DataFrame(mapped_dataframe)

In [None]:
# Show the width of one grid step along x
x_step_size

In [None]:
# Narrow the gridded frame down step by step to the single pixel (50, 40, 47)
x_50 = d.loc[d['x'].eq(50)]
x_50_y_40 = x_50.loc[x_50['y'].eq(40)]
z_47 = x_50_y_40.loc[x_50_y_40['z'].eq(47)]
z_47

In [53]:
import pandas as pd

df = pd.DataFrame(data=mapped_dataframe)

# Maps each pixel key ('pixel_<x>_<y>_<z>') to the list of SMILES in that pixel
coordinate_dict = {}

# Walk the columns in parallel instead of using DataFrame.iterrows():
# iterrows builds a new Series object for every row, which is very slow on a
# large frame. Row order, and therefore the order inside each list, is kept.
pixel_columns = (df[axis].astype(int) for axis in ('x', 'y', 'z'))
for x, y, z, smiles in zip(*pixel_columns, df['smiles']):
    # setdefault creates the list the first time a pixel key is seen
    coordinate_dict.setdefault(f'pixel_{x}_{y}_{z}', []).append(smiles)



## VISUALIZATION OF SMILES IN CERTAIN PIXEL

In [None]:
from rdkit import Chem
from rdkit.Chem import Draw
from IPython.display import display

def visualize_smiles_for_key(coordinate_dict, key):
    """
    Draw every molecule stored under one pixel key as a grid image.

    Parameters:
    coordinate_dict (dict): Maps pixel keys to lists of SMILES strings.
    key (str): Pixel key to look up (e.g., 'pixel_50_40_47').

    Returns:
    None: The grid image is displayed; if the key is absent, a message is printed instead.
    """
    # Guard clause: nothing to draw for an unknown pixel
    if key not in coordinate_dict:
        print(f"No SMILES found for key: {key}")
        return

    # Parse each SMILES string into an RDKit molecule object
    molecules = [Chem.MolFromSmiles(entry) for entry in coordinate_dict[key]]

    # Lay the molecules out four per row as a single SVG and show it
    grid = Draw.MolsToGridImage(molecules, molsPerRow=4, subImgSize=(300, 300), useSVG=True)
    display(grid)

# Example usage: draw the molecules stored under one pixel key of coordinate_dict
visualize_smiles_for_key(coordinate_dict, 'pixel_50_40_47')


In [None]:
import pandas as pd

# Number of SMILES per pixel, keyed as 'pixel_<x>_<y>_<z>'
pixel_counts = {}

# Iterate over the three index columns in parallel rather than with
# DataFrame.iterrows(), which creates a Series per row and is very slow on a
# large frame. Keys are still inserted in row order.
for x, y, z in zip(df['x'].astype(int), df['y'].astype(int), df['z'].astype(int)):
    key = f'pixel_{x}_{y}_{z}'
    # Missing keys start from 0, so the first hit stores 1
    pixel_counts[key] = pixel_counts.get(key, 0) + 1

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np


# Number of SMILES per pixel, keyed as 'pixel_<x>_<y>_<z>'
pixel_counts = {}

# Integer pixel indices for every row. The columns are zipped directly
# because DataFrame.iterrows() builds a Series per row and is very slow on a
# large frame.
pixel_index_columns = [df[axis].astype(int) for axis in ('x', 'y', 'z')]

for x, y, z in zip(*pixel_index_columns):
    key = f'pixel_{x}_{y}_{z}'
    # Start at 0 for unseen pixels, then count this molecule
    pixel_counts[key] = pixel_counts.get(key, 0) + 1

# Step 1: Sort the pixels by number of molecules, most populated first
sorted_pixel_counts = sorted(pixel_counts.items(), key=lambda item: item[1], reverse=True)

# Display the most populated pixels. The heading is built from the same
# number as the slice so the two cannot disagree (it used to announce
# "Top 5" while 50 entries were printed).
top_n = 50
print(f"Top {top_n} pixels with the most molecules:")
for key, count in sorted_pixel_counts[:top_n]:
    print(f'{key}: {count} SMILES')

# Step 2: Prepare data for 3D plotting
# Every key looks like 'pixel_50_40_47': discard the prefix and convert the
# three remaining parts to integers.
pixel_indices = [
    (int(x), int(y), int(z))
    for _, x, y, z in (key.split('_') for key in pixel_counts)
]

# One numpy array per axis, plus the count belonging to each pixel
x_coords = np.array([index[0] for index in pixel_indices])
y_coords = np.array([index[1] for index in pixel_indices])
z_coords = np.array([index[2] for index in pixel_indices])
counts = np.array(list(pixel_counts.values()))

# Step 3: 3D scatter of the occupied pixels, coloured by how full they are
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

# Colour by log(count) so a few very dense pixels do not wash out the rest
log_counts = np.log(counts)
sc = ax.scatter(x_coords, y_coords, z_coords, c=log_counts, cmap='viridis', s=100)

# Colour bar explaining the intensity scale
cbar = plt.colorbar(sc)
cbar.set_label('Log(Number of SMILES)')

# Axis labels and title in one call
ax.set(
    xlabel='X Pixel',
    ylabel='Y Pixel',
    zlabel='Z Pixel',
    title='Distribution of SMILES Across Pixels (Color Coded by Count)',
)

plt.show()

In [None]:
# NOTE(review): this renders the complete sorted list of (pixel key, count)
# pairs; consider slicing it (e.g. sorted_pixel_counts[:20]) to keep the output readable.
sorted_pixel_counts

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np

# NOTE(review): this bare `df` is not the last expression of the cell, so it
# displays nothing and has no effect.
df

# Empty dictionary reserved for the counts of SMILES per binned (grouped) pixel
pixel_counts_grouped = {}

def bin_coordinate(value, bin_size):
    """Round ``value`` down to the start of its bin.

    Bins are ``bin_size`` wide (e.g. with a bin size of 10, the values 40..49
    all map to 40). The result is the integer bin number multiplied by
    ``bin_size``.
    """
    bin_number = int(value // bin_size)
    return bin_number * bin_size

# Choose a bin size to reduce the number of unique pixel coordinates
bin_size = 10

# Count SMILES per *binned* pixel in a dedicated dictionary. Accumulating
# into `pixel_counts` instead would mix the binned keys with the un-binned
# per-pixel counts left over from earlier cells, so every ranking and plot
# below would show a blend of both. Re-creating the dictionary here also
# keeps the cell idempotent when it is re-run.
pixel_counts_grouped = {}

# Zip the columns directly; DataFrame.iterrows() builds a Series per row and
# is very slow on a large frame.
for x, y, z in zip(df['x'], df['y'], df['z']):
    # Bin the x, y, z coordinates by the chosen bin size
    binned_x = bin_coordinate(x, bin_size)
    binned_y = bin_coordinate(y, bin_size)
    binned_z = bin_coordinate(z, bin_size)

    key = f'pixel_{binned_x}_{binned_y}_{binned_z}'
    pixel_counts_grouped[key] = pixel_counts_grouped.get(key, 0) + 1

# Step 1: Sort the binned pixels by number of molecules, most populated first
sorted_pixel_counts = sorted(pixel_counts_grouped.items(), key=lambda item: item[1], reverse=True)

# Display the top 50 binned pixels with the most molecules
print("Top 50 pixels with the most molecules:")
for key, count in sorted_pixel_counts[:50]:
    print(f'{key}: {count} SMILES')

# Step 2: Prepare data for 3D plotting
# Every key looks like 'pixel_50_40_40': discard the prefix and convert the
# three remaining parts to integers.
binned_indices = [
    (int(x), int(y), int(z))
    for _, x, y, z in (key.split('_') for key in pixel_counts_grouped)
]

# Convert to numpy arrays for plotting
x_coords = np.array([index[0] for index in binned_indices])
y_coords = np.array([index[1] for index in binned_indices])
z_coords = np.array([index[2] for index in binned_indices])
counts = np.array(list(pixel_counts_grouped.values()))

# Step 3: Plotting the pixel distribution with color intensity based on counts
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

# Create a scatter plot with color coding based on the counts (log scale for better color distribution)
sc = ax.scatter(x_coords, y_coords, z_coords, c=np.log1p(counts), cmap='viridis', s=50)

# Add color bar to indicate the count intensity
cbar = plt.colorbar(sc)
cbar.set_label('Log(Number of SMILES + 1)')

# Set labels and title
ax.set_xlabel('X Pixel (Binned)')
ax.set_ylabel('Y Pixel (Binned)')
ax.set_zlabel('Z Pixel (Binned)')
ax.set_title(f'Distribution of SMILES Across Binned Pixels (Bin Size = {bin_size})')

plt.show()


In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt


# Number of SMILES per pixel, keyed as 'pixel_<x>_<y>_<z>'
pixel_counts = {}

# Zip the integer index columns instead of calling DataFrame.iterrows(),
# which creates a Series for each row and is very slow on a large frame.
for x, y, z in zip(df['x'].astype(int), df['y'].astype(int), df['z'].astype(int)):
    key = f'pixel_{x}_{y}_{z}'
    # Unseen pixels start from 0, then this molecule is counted
    pixel_counts[key] = pixel_counts.get(key, 0) + 1

# Step 1: Sort the pixels by number of molecules, most populated first
sorted_pixel_counts = sorted(pixel_counts.items(), key=lambda item: item[1], reverse=True)


# Step 2: Prepare data for 3D plotting
# Split every key (e.g., 'pixel_50_40_47') into its prefix and three indices,
# keeping only the indices as integers.
parsed_keys = [
    (int(x), int(y), int(z))
    for _, x, y, z in map(lambda key: key.split('_'), pixel_counts)
]

# Per-axis numpy arrays, in the same order as the dictionary entries
x_coords = np.array([triple[0] for triple in parsed_keys])
y_coords = np.array([triple[1] for triple in parsed_keys])
z_coords = np.array([triple[2] for triple in parsed_keys])

# Number of SMILES belonging to each pixel
counts = np.array(list(pixel_counts.values()))

In [None]:

# Build the DBSCAN input from pixel_counts: one point per occupied pixel,
# weighted by the number of SMILES that pixel holds.
data = []
weights = []
for key, count in pixel_counts.items():
    x, y, z = map(int, key.split('_')[1:])
    data.append([x, y, z])
    # Use the pixel's molecule count as its weight. A constant weight (the
    # previous hard-coded 2) ignores `count` entirely and only rescales
    # min_samples, so dense and sparse pixels were treated alike.
    weights.append(count)

# Convert to NumPy arrays for clustering
data = np.array(data)
weights = np.array(weights)

# Apply DBSCAN to find clusters
# NOTE(review): eps=100 is larger than the 0..99 extent of any single grid
# axis, so nearly all pixels are neighbours of each other — confirm eps and
# min_samples are the intended values.
dbscan = DBSCAN(eps=100, min_samples=500)
clusters = dbscan.fit_predict(data, sample_weight=weights)

# Step 3: 3D scatter of the pixels, coloured by the cluster DBSCAN assigned
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

# Columns 0, 1, 2 of `data` are the x, y, z pixel indices
pixel_x, pixel_y, pixel_z = data[:, 0], data[:, 1], data[:, 2]
scatter = ax.scatter(pixel_x, pixel_y, pixel_z, c=clusters, cmap='viridis', s=50)

# Colour bar mapping colours back to cluster labels
cbar = plt.colorbar(scatter)
cbar.set_label('Cluster Label')

# Axis labels and title in one call
ax.set(
    xlabel='X Pixel',
    ylabel='Y Pixel',
    zlabel='Z Pixel',
    title='DBSCAN Clustering of Pixel Data',
)

plt.show()

In [1]:
import os 
# Move the working directory one level up before importing `config`.
# NOTE(review): presumably `config` lives in the parent directory — confirm.
# NOTE(review): chdir is not idempotent; every re-run of this cell moves the
# working directory up one more level.
os.chdir('../')
from config import OUTPUT_FILE_PATH

# NOTE(review): not the last expression of the cell, so this value is not displayed.
os.path.abspath(os.curdir)

# Absolute form of the configured output path (last expression of the cell)
os.path.abspath(OUTPUT_FILE_PATH)

'/home/afloresep/work/chelombus/data/10M/output'

In [6]:
import os 
from config import OUTPUT_FILE_PATH
import pandas as pd

def _round_to_step(coordinate:float, min_value:float, max_value:float, step_size:float):
        # Map the coordinate to its closest value in the steps
        
        if coordinate < min_value: 
                return min_value
        elif coordinate > max_value:
                return max_value
        else: # min_value + Number of steps x Step Size
                mapped_coordinate = (min_value) + (step_size)*round((coordinate - min_value)/step_size)
                return mapped_coordinate

def fit_coordinates(output: str, percentiles:list, n_steps: int = 10):
    """Load one output CSV and snap its PCA columns onto a regular grid.

    Parameters:
    output (str): File name of the CSV, resolved relative to OUTPUT_FILE_PATH.
    percentiles (list): Three (lower, upper) bounds, for PCA_1, PCA_2 and
        PCA_3 in that order.
    n_steps (int): Number of steps each (lower, upper) range is divided into
        (default 10).

    Returns:
    The loaded DataFrame with PCA_1, PCA_2 and PCA_3 replaced by the values
    produced by ``_round_to_step``.
    """
    df_output = pd.read_csv(os.path.join(OUTPUT_FILE_PATH, output))

    # Same treatment for every PCA column; only the bounds differ
    for position, column in enumerate(('PCA_1', 'PCA_2', 'PCA_3')):
        lower, upper = percentiles[position][0], percentiles[position][1]
        step = (upper - lower) / n_steps

        # `args` are handed to _round_to_step after each column value
        df_output[column] = df_output[column].apply(
            _round_to_step, args=(lower, upper, step)
        )

    return df_output

# (lower, upper) bounds for PCA_1, PCA_2 and PCA_3, in that order
percentiles = [(-23.69173900049878, 29.46851433296263), (-16.738348923448807, 23.54345794491185), (-10.819449416425204, 12.994996725530592)] 

# Keep the mapped frame of every file, keyed by file name. Assigning only to
# `mapped_coordinates` overwrote the result on each iteration, so all files
# but the last were processed and then discarded. `mapped_coordinates` still
# ends up bound to the last processed file. Sorting the listing makes "last"
# deterministic, since os.listdir returns entries in arbitrary order.
mapped_coordinates_by_file = {}
for output in sorted(os.listdir(OUTPUT_FILE_PATH)):
    mapped_coordinates = fit_coordinates(output, percentiles)
    mapped_coordinates_by_file[output] = mapped_coordinates
        

In [None]:
def _round_to_step(coordinate:float, min_value:float, max_value:float, step_size:float):
        # Map the coordinate to its closest value in the steps
        if coordinate < min_value: 
                print(min_value)
        elif coordinate > max_value:
                print(max_value)
        else: # min_value + Number of steps x Step Size
                mapped_coordinate = (min_value) + (step_size)*round((coordinate - min_value)/step_size)
                print(mapped_coordinate)
        

coordinates = [-12.3, -10.4, 5, 2, 13, 5.6, 5.55, 5.5, 8.12, -0.34, 11.3]
min_value = -11
max_value = 12
step_size = 1

for coordinate in coordinates:
        _round_to_step(coordinate, min_value, max_value, step_size)