## Load Prediction & True Data Paths

In [17]:
import pandas as pd
import os

# Zeolite framework code under evaluation; it selects both the prediction
# sub-folder and the prediction file name.
zeo_code = "SSF"

# Candidate file produced by the model for this zeolite:
#   <model root>/<ZEO_CODE>/<zeo_code>1_candidates_threshold_comb.csv
pred_filename = f"{zeo_code.lower()}1_candidates_threshold_comb.csv"
pred_path = os.path.join(".../Zeolite_Type/", zeo_code, pred_filename)  # <- path to your model

# Reference ("true") data table.
# Alternative reference table: "/home/CelineGuo73/zeolites/data/zeosyn_only.csv"
true_path = "/home/CelineGuo73/zeolites/data/all_data_ori.csv"

pred_df = pd.read_csv(pred_path)
true_df = pd.read_csv(true_path)

# Normalise the headers (trim surrounding whitespace, lower-case) so both
# tables can be addressed with the same lower-case column names later on.
pred_df.columns = pred_df.columns.str.strip().str.lower()
true_df.columns = true_df.columns.str.strip().str.lower()


In [18]:
# Keep only the reference rows whose "class" value is non-zero
# (presumably class 0 marks a zero-yield / failed synthesis -- TODO confirm),
# then collect the distinct OSDAs recorded for the target zeolite.
zeo_code = "SSF"

is_nonzero_class = true_df["class"].ne(0.0)
true_df = true_df.loc[is_nonzero_class].copy()

# OSDA identifiers are compared as strings; the set drops repeated entries.
matches_zeolite = true_df["zeolite_code"].eq(zeo_code)
true_osdas = set(true_df.loc[matches_zeolite, "osda"].astype(str))
print(f"Number of true OSDAs for {zeo_code}: {len(true_osdas)}")

Number of true OSDAs for SSF: 1153


In [19]:
## HIT RATE
# Top-N hit rate (precision) and recovery (recall) of the predicted OSDAs for
# the zeolite named by `zeo_code`.
# Inputs from the cells above: `pred_df` (columns "rankinslate" and "osda"),
# `true_osdas` (set of OSDA strings) and `zeo_code`.
top_n = [1,5,10,20,50,100,200]
hit_rates = {}   # n -> hit rate (%)
metrics = {}     # n -> (hit rate (%), recovery (%))

for n in top_n:
    # The n rows with the smallest "rankinslate" value, reduced to a set of
    # OSDA strings. If pred_df has fewer than n rows, or an OSDA repeats among
    # them, the set holds fewer than n entries while the denominator below
    # stays n.
    top_pred_osdas = set(pred_df.nsmallest(n, "rankinslate")["osda"].astype(str))
    hits = len(true_osdas & top_pred_osdas)

    # Precision = hits / N  → labeled as Hit Rate
    hit_rate = hits / n * 100

    # Recall = hits / total true  → labeled as Recovery
    # (guarded so an empty reference set yields 0 instead of ZeroDivisionError)
    recovery = hits / len(true_osdas) * 100 if len(true_osdas) > 0 else 0

    hit_rates[n] = hit_rate
    metrics[n] = (hit_rate, recovery)

# Print results. The header is built from `zeo_code` so it always names the
# zeolite that was actually evaluated (it used to be hard-coded to "MFI").
print(f"\n=== {zeo_code} Zeolite Ranking Metrics ===")
for n, (hit_rate, recovery) in metrics.items():
    print(f"Top {n}:  Hit Rate = {hit_rate:.2f}%   Recovery = {recovery:.2f}%")


=== MFI Zeolite Ranking Metrics ===
Top 1:  Hit Rate = 100.00%   Recovery = 0.09%
Top 5:  Hit Rate = 100.00%   Recovery = 0.43%
Top 10:  Hit Rate = 100.00%   Recovery = 0.87%
Top 20:  Hit Rate = 90.00%   Recovery = 1.56%
Top 50:  Hit Rate = 96.00%   Recovery = 4.16%
Top 100:  Hit Rate = 82.00%   Recovery = 7.11%
Top 200:  Hit Rate = 81.00%   Recovery = 14.05%


## Plot hit rates per zeolite

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# -----------------------------
# Data: hand-entered Top-K hit rates (%) per zeolite, one list per K, plus the
# number of true records per zeolite. Position i in every list belongs to
# "Zeolite"[i].
# -----------------------------
data = {
    "Zeolite": ["LEV", "SOD", "AFI", "MFI", "BEA", "FER", "ERI", "AFS", "SOV", "DOH", "CHA", "SVR", "SVY", "TER", "THO", "VFI", "SSF"],
    "Top-1 (%)": [0, 100, 100, 100, 100, 100, 0, 100, 100, 0, 0, 0, 0, 0, 0, 100, 100],
    "Top-5 (%)": [0, 100, 100, 80, 60, 20, 0, 100, 60, 60, 0, 0, 0, 40, 40, 80, 100],
    "Top-10 (%)": [0, 60, 80, 90, 60, 30, 10, 100, 70, 50, 20, 0, 0, 60, 50, 60, 100],
    "Top-20 (%)": [0, 65, 90, 90, 55, 30, 25, 100, 80, 40, 35, 0, 0, 55, 70, 50, 90],
    "Top-50 (%)": [6, 66, 92, 85, 56, 24, 18, 90, 74, 32, 20, 0, 0, 48, 60, 62, 96],
    "Top-100 (%)": [5, 40, 93, 68, 57, 18, 13, 93, 82, 29, 24, 0, 0.5, 47, 53, 60, 82],
    "#True Data in Training": [429, 79, 901, 960, 963, 252, 751, 1102, 1043, 86, 788, 1, 13, 198, 66, 1070, 1153],
}

df = pd.DataFrame(data).set_index("Zeolite")

# Rows ordered from the zeolite with the most true records to the one with the
# fewest; only the "Top-..." columns are drawn in the heatmap.
count_col = "#True Data in Training"
df_sorted = df.sort_values(count_col, ascending=False)
topk_df = df_sorted.filter(like="Top")

plt.figure(figsize=(12, 8))
sns.set(style="whitegrid")

# fmt=".0f" annotates each cell with a whole number (so 0.5 is printed as "0");
# vmin/vmax pin the colour scale to the full 0-100 % range.
heatmap_options = dict(
    annot=True,
    fmt=".0f",
    cmap="YlGn",
    cbar_kws={"label": "Hit Rate (%)"},
    linewidths=0.5,
    vmin=0,
    vmax=100,
)
ax = sns.heatmap(topk_df, **heatmap_options)

ax.set_title("Zeolite Top-K Hit Rate (%) (Ordered by Training Data Volume)", fontsize=15, pad=16)
ax.set_xlabel("Top-K")
ax.set_ylabel("Zeolite")

# Append the record count to every row label, e.g. "SSF  (1153)".
row_labels = []
for tick in ax.get_yticklabels():
    code = tick.get_text()
    row_labels.append(f"{code}  ({df_sorted.loc[code, count_col]})")
ax.set_yticklabels(row_labels, rotation=0)

plt.tight_layout()
plt.show()


## Plot of mean hit rate (%) vs. number of true data points

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# -----------------------------
# Data: hand-entered Top-K hit rates (%) per zeolite, one list per K, plus the
# number of true records per zeolite. Position i in every list belongs to
# "Zeolite"[i].
# -----------------------------
data = {
    "Zeolite": ["LEV", "SOD", "AFI", "MFI", "BEA", "FER", "ERI", "AFS", "SOV", "DOH",
                "CHA", "SVR", "SVY", "TER", "THO", "VFI", "SSF"],

    "Top-1 (%)":      [0, 100, 100, 100, 100, 100, 0, 100, 100, 0, 0, 0, 0, 0, 0, 100, 100],
    "Top-5 (%)":      [0, 60, 100, 100, 80, 60, 0, 80, 100, 0, 0, 0, 0, 0, 0, 100, 100],
    "Top-10 (%)":     [0, 30, 100, 100, 80, 70, 10, 90, 100, 0, 10, 0, 0, 10, 10, 100, 100],
    "Top-20 (%)":     [5, 15, 90, 95, 70, 75, 40, 95, 95, 0, 40, 0, 0, 25, 35, 95, 95],
    "Top-50 (%)":     [6, 6, 90, 78, 56, 64, 76, 92, 98, 4, 76, 0, 0, 28, 46, 92, 98],
    "Top-100 (%)":    [12, 3, 94, 81, 52, 53, 78, 83, 94, 4, 78, 0, 0, 26, 49, 99, 99],

    "#True Data in Training": [
        429, 79, 901, 960, 963, 252, 751, 1102, 1043, 86,
        788, 1, 13, 198, 66, 1070, 1153
    ]
}


df = pd.DataFrame(data)

# -----------------------------
# Compute mean Top-K hit rate
# (unweighted row-wise mean over every column whose name contains "Top")
# -----------------------------
topk_cols = [col for col in df.columns if "Top" in col]
df["Mean Top-K (%)"] = df[topk_cols].mean(axis=1)

# -----------------------------
# Scatter + regression plot
# -----------------------------
plt.figure(figsize=(8, 6))
sns.set(style="whitegrid")

# NOTE(review): regplot fits y ~ x on the raw counts; the x-axis is switched to
# a log scale afterwards, so the fitted line is drawn as a curve. If a fit
# against log(x) is what is wanted, regplot's `logx=True` does that -- confirm
# the intended model before changing it.
ax = sns.regplot(
    data=df,
    x="#True Data in Training",
    y="Mean Top-K (%)",
    scatter_kws={"s": 80, "color": "#2b83ba"},
    line_kws={"color": "#d7191c", "lw": 2}
)

# Annotate points with zeolite codes.
# The label offset is given in points (screen space), not in data units: with a
# log x-axis a fixed "+10" data offset would push the label for x = 1 out to
# x = 11, a full decade away from its marker.
for n_true, mean_hit, code in zip(
    df["#True Data in Training"], df["Mean Top-K (%)"], df["Zeolite"]
):
    ax.annotate(
        code,
        xy=(n_true, mean_hit),
        xytext=(5, 5),
        textcoords="offset points",
        fontsize=9, color="black", alpha=0.7,
    )

ax.set_xscale("log")  # Log scale helps spread uneven data
ax.set_xlabel("Number of True Data in Training", fontsize=12)
ax.set_ylabel("Mean Top-K Hit Rate (%)", fontsize=12)
ax.set_title("Relationship between Training Data Volume and Top-K Performance", fontsize=14)
plt.tight_layout()
plt.show()
