In [None]:
import pandas as pd
import numpy as np

# Load the processed dataset once; the cells below read `df`, `exp_logp`,
# `labels` and `features` from here.
df = pd.read_parquet("./data/processed/deepchem_mol2vec.parquet")

# Target column, kept both as a Series (used for summary statistics) and as
# an (n, 1) column vector.
exp_logp = df["exp"]
labels = exp_logp.to_numpy().reshape(-1, 1)

# Drop the target and identifier columns, then stack what is left into one
# array. NOTE(review): the squeeze + stack presumably relies on a single
# remaining column that stores one vector per row -- confirm against the
# parquet schema.
features = np.stack(
    df.drop(columns=["exp", "smiles", "CMPD_CHEMBLID"]).to_numpy().squeeze()
)

In [None]:
# Extended dataset: per-row "mol2vec" vectors plus scalar descriptor columns.
extended_df = pd.read_parquet("./data/processed/deepchem_extended_mol2vec_300.parquet")

# Column holding one embedding vector per row.
mol2vec_col = extended_df["mol2vec"]

# Scalar descriptor columns appended to the embedding, in this order.
extended_descriptors = [
    "molwt",
    "clogp",
    "hba",
    "hbd",
    "tpsa",
    "num_rotatable_bonds",
    "num_rings",
    "num_aromatic_rings",
    "fraction_csp3",
    "num_heavy_atoms",
    "num_valence_electrons",
]

# Convert both parts to float64 arrays so they can be concatenated column-wise.
features_mol2vec = np.array(mol2vec_col.tolist(), dtype=np.float64)
features_descriptors = extended_df[extended_descriptors].to_numpy(dtype=np.float64)

# Final feature matrix: embedding columns first, descriptor columns after.
# (The earlier `extended_df.drop(...)` assignment to this name was removed:
# its result was overwritten here without ever being read.)
extended_features = np.hstack((features_mol2vec, features_descriptors))

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Flatten the targets to 1-D (idempotent, so re-running this cell is safe).
labels = labels.ravel()

# Magnitude of the smallest target. np.nanmin is vectorized and ignores NaN,
# whereas the builtin min() iterates in Python and gives an order-dependent
# answer when NaN is present.
abs_min = abs(np.nanmin(labels))

# Shift by |min| + 1 so every (non-NaN) value is >= 1 and the log is defined.
shift_min_0 = labels + abs_min + 1
y_log = np.log(shift_min_0)

def plot_distribution(
    labels,
    save=False,
    filename="data_distributions.png",
    title="Distribution of Values",
    xlabel="Values",
):
    """Plot a histogram with a KDE overlay for ``labels``.

    Parameters
    ----------
    labels
        Values to plot; passed to ``sns.histplot`` as ``data``.
    save : bool, default False
        When True, write the figure to ``filename`` before showing it.
    filename : str, default "data_distributions.png"
        Output path used when ``save`` is True. Pass a different name per
        call to avoid overwriting an earlier figure.
    title : str, default "Distribution of Values"
        Figure title.
    xlabel : str, default "Values"
        Label for the x axis.

    Returns
    -------
    None
        The figure is displayed (and optionally saved) as a side effect.
    """
    # Explicit figure/axes objects instead of the implicit pyplot state, so
    # the plot cannot end up on a figure left over from another cell.
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(data=labels, kde=True, ax=ax)
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Count")
    ax.set_title(title)
    ax.grid(True, alpha=0.3)
    if save:
        # Save before show(): the figure may be released once it is shown.
        fig.savefig(filename)
    plt.show()

# Show the raw targets first, then the shifted-log targets; neither figure
# is written to disk.
plot_distribution(labels)
plot_distribution(y_log, save=False)

def std_ddof1(x):
    """Return ``x.std(ddof=1)``, i.e. the standard deviation with one delta degree of freedom.

    The function name is what labels this statistic when the function is
    passed to an aggregation such as ``Series.agg``.
    """
    delta_dof = 1
    return x.std(ddof=delta_dof)



# Summary statistics of the target column; std_ddof1 appears in the output
# under its function name.
out = exp_logp.agg(["mean", "sum", std_ddof1, "min", "max"])

# Count negative targets with a vectorized sum over the boolean mask rather
# than the builtin sum(), which iterates over the Series element by element.
# int(...) keeps the result a plain Python int.
num_negative_vals = int((df["exp"] < 0).sum())

print(out)
print(f"Negative values: {num_negative_vals}")

In [None]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

# Fixed seed so the PCA / t-SNE embeddings (and the plots built from them)
# are reproducible under Restart Kernel -> Run All. t-SNE is stochastic, and
# PCA may select a randomized solver depending on the input size.
EMBEDDING_SEED = 42

# Re-read the dataset so this cell does not depend on variables from other cells.
tsne_df = pd.read_parquet("./data/processed/deepchem_mol2vec.parquet")
m2v = tsne_df.drop(["CMPD_CHEMBLID", "smiles", "exp"], axis=1)

# NOTE(review): presumably a single column of per-row vectors remains after
# the drop, which squeeze + stack turn into one 2-D array -- confirm.
m2v = np.stack(np.squeeze(m2v))

# Reduce to 25 principal components before t-SNE.
pca25 = PCA(n_components=25, random_state=EMBEDDING_SEED)
m2v25 = pca25.fit_transform(m2v)

print(f"Total Explained Variance: {sum(pca25.explained_variance_ratio_)}\nExplained Variances: {pca25.explained_variance_ratio_}")

# 3-D t-SNE embeddings at two perplexities, seeded identically so they differ
# only because of the perplexity setting.
tsne30 = TSNE(
    n_components=3, learning_rate="auto", perplexity=30, random_state=EMBEDDING_SEED
).fit_transform(m2v25)
tsne50 = TSNE(
    n_components=3, learning_rate="auto", perplexity=50, random_state=EMBEDDING_SEED
).fit_transform(m2v25)

# Target values used to colour the embedding plots.
logp_values = tsne_df["exp"].values


In [None]:
# 3-D scatter of the perplexity-30 t-SNE embedding, coloured by target value.
fig = plt.figure(figsize=(15, 12))
ax1 = fig.add_subplot(111, projection="3d")
scatter3 = ax1.scatter(
    tsne30[:, 0],
    tsne30[:, 1],
    tsne30[:, 2],
    c=logp_values,
    cmap="coolwarm",
    alpha=0.7,
    s=30,
)
ax1.set_xlabel("t-SNE dimension 1")
ax1.set_ylabel("t-SNE dimension 2")
# The plot is three-dimensional; the third axis was previously left unlabelled.
ax1.set_zlabel("t-SNE dimension 3")
plt.colorbar(scatter3, ax=ax1, label="Experimental LogP")
ax1.set_title("Perplexity 30")