In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
import seaborn as sns
import efgs
from rdkit import Chem
from IPython.display import display, Image

In [None]:
# SMILES of the molecule whose functional groups are rendered below.
target_smiles = 'Cc1cccc(-c2cccnc2)c1'

# Chem.MolFromSmiles returns None (instead of raising) when the SMILES cannot
# be parsed, so fail early with a clear message rather than passing None on.
target_mol = Chem.MolFromSmiles(target_smiles)
if target_mol is None:
    raise ValueError(f"Could not parse SMILES: {target_smiles!r}")

# Only the first of the four returned values (the image bytes) is used here.
img_text, _, _, _ = efgs.get_dec_fgs(target_mol)

display(Image(data=img_text))

# Save the image. SMILES may contain '/' and '\' (stereo bond markers), which
# would be interpreted as path separators, so they are replaced in the file
# name. For SMILES without those characters the file name is unchanged.
safe_name = target_smiles.replace('/', '_').replace('\\', '_')
with open(f"images/labeled_{safe_name}.png", "wb") as f:
    f.write(img_text)

In [None]:
# Load the ChEMBL table from CSV into a DataFrame.
# The 'fgs' column is stored as text in the CSV and is parsed in a later cell.
chembl_data = pd.read_csv('data/chembl_35_fg_scaf.csv')

In [None]:
# Convert the stringified 'fgs' column back into Python lists.
# - Missing values become empty lists.
# - Values that are already lists are passed through unchanged, so re-running
#   this cell on an already-converted frame no longer fails inside eval().
# NOTE(review): eval() executes whatever expression a CSV cell contains. Only
# run this on trusted files; if every cell is a plain list literal,
# ast.literal_eval would be the safer parser -- TODO confirm against the data.
def _parse_fgs_cell(cell):
    """Return the functional-group list encoded in one raw 'fgs' cell."""
    if isinstance(cell, list):
        return cell
    if pd.isnull(cell):
        return []
    return eval(cell)


chembl_data['fgs'] = chembl_data['fgs'].apply(_parse_fgs_cell)

In [None]:
# Preview the first rows of chembl_data (shown as the cell's output).
chembl_data.head()

In [None]:
# Load the curated table from a pickle file.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.
curated_data = pd.read_pickle('data/chembl_35_fg_scaf_curated.pkl')

In [None]:
# Preview the first rows of curated_data (shown as the cell's output).
curated_data.head()

In [None]:
# Only these functional-group strings are tallied; everything else is ignored.
curation_groups = ['[R][NH][R]', 'O=[C](O)[R]', 'C=C', '[NH2][Car]']

# Tally, for each of those groups, how many molecules contain it.
# De-duplicating through a set means a group counts at most once per molecule.
fg_counts = Counter()
for fgs in curated_data['fgs']:
    filtered_fgs = {fg for fg in fgs if fg in curation_groups}
    fg_counts.update(filtered_fgs)

# Report the raw tallies, then the number of distinct groups that were seen.
print(fg_counts)
num_unique_fgs = len(fg_counts)
print(f"Number of unique functional groups: {num_unique_fgs}")

# Keep at most the 50 most frequent groups for display, most frequent first.
top_fg = dict(fg_counts.most_common(50))

labels = list(top_fg)
values = np.array(list(top_fg.values()))
total_count = sum(fg_counts.values())

# Running share of ALL counted occurrences (not only of the displayed groups).
cum_percentage = np.cumsum(values) / total_count * 100

# Plot styling: seaborn "ticks" theme first, then font sizes on top of it
# (set_theme resets rcParams, so the order matters).
sns.set_theme(style="ticks")
font_settings = {
    "font.size": 12,
    "axes.titlesize": 14,
    "axes.labelsize": 12,
    "xtick.labelsize": 10,
    "ytick.labelsize": 10,
}
plt.rcParams.update(font_settings)

fig, ax1 = plt.subplots(figsize=(14, 6), dpi=300)

# Bars on the primary axis. The y-axis stays on matplotlib's default linear
# scale; no log scale is set in this cell.
x_positions = range(len(labels))
bar_color = sns.color_palette("Blues")[4]
bars = ax1.bar(x_positions, values, color=bar_color)
ax1.set(xlabel='Functional Groups', ylabel='Counts')
ax1.set_xticks(x_positions)
ax1.set_xticklabels(labels, rotation=75, ha='right')

# Faint dashed horizontal guides only; no vertical grid lines.
ax1.yaxis.grid(True, linestyle='--', alpha=0.3)
ax1.xaxis.grid(False)

# Cumulative-percentage line on a secondary y-axis, with a reference line at 80 %.
ax2 = ax1.twinx()
ax2.plot(x_positions, cum_percentage, color='red', marker='o', linewidth=2)
ax2.set_ylabel('Cumulative Percentage (%)')
ax2.set_ylim(0, 110)
ax2.axhline(80, color='gray', linestyle='--', alpha=0.5)

plt.title('Curated Functional Groups Distribution', pad=15)
fig.tight_layout()

fig.savefig("images/top50_pareto_cur.png", dpi=300, bbox_inches='tight')

plt.show()


# Split all counted occurrences into "top 50 groups" versus "everything else".
total_count = sum(fg_counts.values())
top_50_count = sum(count for _, count in fg_counts.most_common(50))
rest_count = total_count - top_50_count

# Shares of the total, in percent.
top_50_pct = top_50_count / total_count * 100
rest_pct = rest_count / total_count * 100

# One colour per slice, taken from seaborn palettes.
colors = [sns.color_palette("Blues")[4], sns.color_palette("pastel")[1]]

# High-resolution square figure for the pie chart.
fig, ax = plt.subplots(figsize=(6, 6), dpi=300)

slice_sizes = [top_50_count, rest_count]
slice_labels = [f"Top 50\n({top_50_pct:.1f}%)", f"Other\n({rest_pct:.1f}%)"]

# Without autopct, ax.pie returns just the wedges and the label texts.
wedges, texts = ax.pie(
    slice_sizes,
    labels=slice_labels,
    colors=colors,
    startangle=90,
    wedgeprops={'edgecolor': 'white'},
)

# Uniform font size for the slice labels.
plt.setp(texts, fontsize=12)

ax.set_title('Dataset Distribution: Top 50 Functional Groups', fontsize=14, pad=15)
fig.tight_layout()
# NOTE(review): unlike the Pareto figure ("..._cur.png"), this file name has no
# "_cur" suffix -- confirm it is not meant to be distinct from a non-curated plot.
fig.savefig("images/top50_pie.png", dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Rarity statistics for the counted functional groups.
# A group is "rare" when it was counted fewer than RARE_THRESHOLD times. The
# same strict threshold is used for every statistic below, so the count, the
# average and the SMILES lookup all refer to the same set of groups.
RARE_THRESHOLD = 100

# Derive the totals from fg_counts here so this cell does not depend on
# variables left over from the plotting cell.
total_count = sum(fg_counts.values())
num_unique_fgs = len(fg_counts)

rare_fgs = [fg for fg, count in fg_counts.items() if count < RARE_THRESHOLD]
num_fgs_less_100 = len(rare_fgs)
print(f"Count of functional groups appearing less than {RARE_THRESHOLD} times: {num_fgs_less_100}")

# "Length" here is the number of uppercase characters in the group string.
# NOTE(review): presumably a proxy for atom count; note that letters such as
# the 'R' placeholder are counted as well -- confirm this is intended.
avg_length_less_100 = (
    sum(sum(1 for c in fg if c.isupper()) for fg in rare_fgs) / num_fgs_less_100
    if num_fgs_less_100 > 0 else 0
)
print(f"Average length of functional groups appearing less than {RARE_THRESHOLD} times: {avg_length_less_100:.2f}")

# Print the first SMILES that contains a rare group. Membership is tested
# against the rare groups only: indexing a Counter with a key it never saw
# returns 0, which would wrongly treat every uncounted group as rare.
# NOTE(review): fg_counts is built from curated_data while the SMILES are
# looked up in chembl_data -- confirm that mixing the two tables is intended.
rare_fg_set = set(rare_fgs)
for smiles, fgs in zip(chembl_data['smiles'], chembl_data['fgs']):
    if any(fg in rare_fg_set for fg in fgs):
        print(f"First SMILES meeting criteria: {smiles}")
        break

# Share of distinct groups that are rare (0 when nothing was counted).
percent_fgs_less_100 = num_fgs_less_100 / num_unique_fgs * 100 if num_unique_fgs else 0.0

print(f"Percentage of functional groups appearing less than {RARE_THRESHOLD} times: {percent_fgs_less_100:.2f}%")

# Share of all counted occurrences held by the most common group
# (0 when nothing was counted, instead of raising on an empty Counter).
most_common_fg_count = max(fg_counts.values(), default=0)
print(f"Count of the most common functional group: {most_common_fg_count}")
percent_most_common_fg = most_common_fg_count / total_count * 100 if total_count else 0.0

print(f"Percentage of the most common functional group: {percent_most_common_fg:.2f}%")
