In [1]:
import pickle
with open("overlap_clusters.pkl", "rb") as f:
    overlap_clusters = pickle.load(f)

In [2]:
overlap_clusters

[                                              abstracts  cluster_labels
 60    Perovskite absorbers have witnessed a remarkab...               0
 161   Comparing to other carbon materials, the gener...               0
 208   Iodine-doped graphite carbon nitride is utiliz...               0
 214   A p-type and highly conductive reduced graphen...               0
 216   Graphdiyne (GDY) is introduced into the FA0. 8...               0
 ...                                                 ...             ...
 3130  The hybrid perovskite/graphene oxide composite...               0
 3236  Graphene oxide sheets (GOSs) are introduced be...               0
 3280  A high open circuit voltage close to 1. 4 V is...               0
 3302  Solution-processed oxo-functionalized graphene...               0
 3356  Carbon‐based perovskite solar cells (PVSCs) wi...               0
 
 [64 rows x 2 columns],
                                               abstracts  cluster_labels
 48    Fully solution-pro

In [3]:
# for contrastive loss function 

def generate_pairs(overlap_clusters):
    positive_pairs = []
    negative_pairs = []
    
    for cluster_df in overlap_clusters:
        abstracts = cluster_df["abstracts"].tolist()
        positive_pairs.extend([(a, b) for i, a in enumerate(abstracts) for b in abstracts[i+1:]])

    for i in range(len(overlap_clusters) - 1):
        cluster_1 = overlap_clusters[i]["abstracts"].tolist()
        for j in range(i + 1, len(overlap_clusters)):
            cluster_2 = overlap_clusters[j]["abstracts"].tolist()
            negative_pairs.extend([(a, b) for a in cluster_1 for b in cluster_2])

    return positive_pairs, negative_pairs


In [4]:
generate_pairs(overlap_clusters)
positive_pairs, negative_pairs = generate_pairs(overlap_clusters)

In [6]:
# Get the embedding for lower diemension
import pickle

with open("embedding_map.pkl", "rb") as f:
    embedding_map = pickle.load(f)


def get_embedding(text):
    return embedding_map.get(text, None)

In [7]:
import random

#  triplets loss function (Anchor, Positive, Negative)
def generate_triplets_from_list_of_dfs(cluster_dfs, num_triplets=5000):

    clusters = {i: df["abstracts"].tolist() for i, df in enumerate(cluster_dfs)}

    # Track how many times each abstract is used as an anchor
    anchors_used = {abstract: 0 for df in cluster_dfs for abstract in df["abstracts"].tolist()}

    triplets = []

    while len(triplets) < num_triplets:
        # Prioritize underused 
        anchor = min(anchors_used, key=anchors_used.get)
        anchor_cluster_idx = None


        for cluster_idx, abstracts in clusters.items():
            if anchor in abstracts:
                anchor_cluster_idx = cluster_idx
                break

        if anchor_cluster_idx is None:
            continue  

        # Select a positive sample from the same cluster (excluding anchor)
        anchor_cluster = clusters[anchor_cluster_idx]
        if len(anchor_cluster) > 1:
            positive = random.choice([p for p in anchor_cluster if p != anchor])
        else:
            continue  

        # Select a negative sample from a different cluster
        negative_cluster_idx = random.choice([i for i in clusters.keys() if i != anchor_cluster_idx])
        negative = random.choice(clusters[negative_cluster_idx])

        
        triplets.append((anchor, positive, negative))
        anchors_used[anchor] += 1  # Increase anchor selection count

    print(f"Generated {len(triplets)} triplets, ensuring balanced anchor selection.")
    return triplets



In [8]:
from sklearn.model_selection import train_test_split
train_clusters, holdout_clusters = [], []

for cluster_df in overlap_clusters:
    train_df, holdout_df = train_test_split(cluster_df, test_size=0.2, random_state=42)
    train_clusters.append(train_df)
    holdout_clusters.append(holdout_df)

In [None]:
holdout_clusters

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
import pickle
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import hdbscan
import pandas as pd
import random

# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define a simple projection model to learnable transform embeddings
class ProjectionHead(nn.Module):
    def __init__(self, input_dim=768, output_dim=768):
        super(ProjectionHead, self).__init__()
        self.fc = nn.Linear(input_dim, output_dim)
        self.activation = nn.ReLU()
        self.fc2 = nn.Linear(output_dim, output_dim)
        self.layer_norm = nn.LayerNorm(output_dim)  # Use LayerNorm instead

    def forward(self, x):
        x = self.fc(x)
        x = self.activation(x)
        x = self.fc2(x)
        x = self.layer_norm(x)  
        return x


projection_model = ProjectionHead(input_dim=768, output_dim=768).to(device)
optimizer = optim.Adam(projection_model.parameters(), lr=1e-5)

# Define the dataset class for triplet loss
class TripletDataset(Dataset):
    def __init__(self, triplets, embedding_map, projection_model):
        self.triplets = triplets
        self.embedding_map = embedding_map
        self.projection_model = projection_model  # Pass projection model

    def __len__(self):
        return len(self.triplets)

    def __getitem__(self, idx):
        anchor_text, positive_text, negative_text = self.triplets[idx]

        # Convert embeddings to tensors & ensure requires_grad=True
        anchor = torch.tensor(self.embedding_map[anchor_text], dtype=torch.float32, requires_grad=True).to(device)
        positive = torch.tensor(self.embedding_map[positive_text], dtype=torch.float32, requires_grad=True).to(device)
        negative = torch.tensor(self.embedding_map[negative_text], dtype=torch.float32, requires_grad=True).to(device)

        # Pass through projection model to enable learning
        anchor = self.projection_model(anchor)
        positive = self.projection_model(positive)
        negative = self.projection_model(negative)

        return anchor, positive, negative

# Triplet loss function
class TripletLoss(nn.Module):
    def __init__(self, margin=1.0):
        super(TripletLoss, self).__init__()
        self.margin = margin

    def forward(self, anchor, positive, negative):
        pos_dist = torch.nn.functional.pairwise_distance(anchor, positive)
        neg_dist = torch.nn.functional.pairwise_distance(anchor, negative)
        loss = torch.clamp(pos_dist - neg_dist + self.margin, min=0.0).mean()
        return loss


# Generate triplets
triplets = generate_triplets_from_list_of_dfs(train_df, num_triplets=5000)

dataset = TripletDataset(triplets, embedding_map, projection_model)
data_loader = DataLoader(dataset, batch_size=32, shuffle=True)


# Initialize triplet loss function
criterion = TripletLoss(margin=1.0).to(device)

# Training loop
print("start training")
num_epochs = 10
for epoch in range(num_epochs):
    total_loss = 0.0
    for anchor, positive, negative in data_loader:
        optimizer.zero_grad()
        loss = criterion(anchor, positive, negative)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss:.4f}')

print('Training complete!')

# Save the trained model
torch.save(projection_model.state_dict(), 'projection_model.pth')
print('Model saved successfully!')


In [12]:
import numpy as np
new_embeddings = np.array([
    projection_model(torch.tensor(embedding_map[abstract], dtype=torch.float32).to(device)).detach().cpu().numpy()
    for abstract in embedding_map.keys()
])

new_embedding_map = {abstract: embedding for abstract, embedding in zip(embedding_map.keys(), new_embeddings)}


NameError: name 'projection_model' is not defined

In [None]:
new_embeddings_array = np.array(list(new_embedding_map.values()))

print("Shape of new embeddings:", new_embeddings_array.shape)

In [None]:
import pickle

embedding_path = 'embedding_map.pkl'

with open(embedding_path, 'rb') as f:
    embedding_map = pickle.load(f)

In [None]:
def generate_holdout_triplets(holdout_clusters, num_triplets=500):
    triplets = []
    for _ in range(num_triplets):
        # Choose one cluster
        cluster_pos = random.choice(holdout_clusters)
        if len(cluster_pos) < 2:
            continue
        anchor, positive = cluster_pos.sample(2)["abstracts"].values

        # Choose a different cluster for negative
        negative_cluster = random.choice([c for c in holdout_clusters if not c.equals(cluster_pos)])
        negative = negative_cluster.sample(1)["abstracts"].values[0]

        triplets.append((anchor, positive, negative))
    return triplets


In [None]:
def generate_holdout_triplets(holdout_clusters, num_triplets=500):
    triplets = []
    for _ in range(num_triplets):
        # Choose one cluster
        cluster_pos = random.choice(holdout_clusters)
        if len(cluster_pos) < 2:
            continue
        anchor, positive = cluster_pos.sample(2)["abstracts"].values

        # Choose a different cluster for negative
        negative_cluster = random.choice([c for c in holdout_clusters if not c.equals(cluster_pos)])
        negative = negative_cluster.sample(1)["abstracts"].values[0]

        triplets.append((anchor, positive, negative))
    return triplets


In [None]:
import torch
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Put your model in eval mode
projection_model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Step 1: Compute centroids from training clusters (overlap_clusters)
cluster_centroids = []
for cluster_df in train_clusters:
    vecs = []
    for abs_text in cluster_df["abstracts"]:
        if abs_text in embedding_map:
            emb = torch.tensor(embedding_map[abs_text], dtype=torch.float32).to(device)
            proj = projection_model(emb).detach().cpu().numpy()
            vecs.append(proj)
    if vecs:
        centroid = np.mean(vecs, axis=0)
        cluster_centroids.append(centroid)

# Step 2: Predict on holdout
# Step 2: Predict on holdout, with tracking of abstract names
true_labels = []
predicted_labels = []
abstract_names = []  # NEW: track abstract identifiers

for true_cluster_idx, cluster_df in enumerate(holdout_clusters):
    for abstract in cluster_df["abstracts"]:
        if abstract not in embedding_map:
            continue
        emb = torch.tensor(embedding_map[abstract], dtype=torch.float32).to(device)
        proj = projection_model(emb).detach().cpu().numpy()

        # Assign to nearest centroid
        distances = [np.linalg.norm(proj - c) for c in cluster_centroids]
        predicted_cluster = int(np.argmin(distances))

        true_labels.append(true_cluster_idx)
        predicted_labels.append(predicted_cluster)
        abstract_names.append(abstract)  # Save the abstract name


# Step 3: Accuracy
accuracy = accuracy_score(true_labels, predicted_labels)
print(f" Prediction Accuracy: {accuracy:.2%}")

# Step 4: Confusion Matrix
conf_matrix = confusion_matrix(true_labels, predicted_labels)

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues",
            xticklabels=range(len(cluster_centroids)),
            yticklabels=range(len(cluster_centroids)))
plt.xlabel("Predicted Cluster")
plt.ylabel("True Cluster")
plt.title(f"Confusion Matrix (Accuracy: {accuracy:.2%})")
plt.tight_layout()
plt.show()


In [None]:
closest_clusters = []        # Closest cluster indices
second_closest_clusters = [] # Second-closest cluster indices
second_closest_distances = [] # Distances to second closest

for true_cluster_idx, cluster_df in enumerate(holdout_clusters):
    for abstract in cluster_df["abstracts"]:
        if abstract not in embedding_map:
            continue
        emb = torch.tensor(embedding_map[abstract], dtype=torch.float32).to(device)
        proj = projection_model(emb).detach().cpu().numpy()

        distances = [np.linalg.norm(proj - c) for c in cluster_centroids]
        sorted_indices = np.argsort(distances)

        closest_idx = int(sorted_indices[0])
        second_idx = int(sorted_indices[1])

        true_labels.append(true_cluster_idx)
        predicted_labels.append(closest_idx)
        abstract_names.append(abstract)

        closest_clusters.append(closest_idx)
        second_closest_clusters.append(second_idx)
        second_closest_distances.append(distances[second_idx])

In [None]:
from collections import defaultdict

cluster_distance_stats = {}
distance_distributions = defaultdict(list)

for cluster_idx, cluster_df in enumerate(train_clusters):
    centroid = cluster_centroids[cluster_idx]
    for abstract in cluster_df["abstracts"]:
        if abstract in embedding_map:
            emb = torch.tensor(embedding_map[abstract], dtype=torch.float32).to(device)
            proj = projection_model(emb).detach().cpu().numpy()
            dist = np.linalg.norm(proj - centroid)
            distance_distributions[cluster_idx].append(dist)

# Compute mean and std per cluster
for idx, distances in distance_distributions.items():
    cluster_distance_stats[idx] = {
        'mean': np.mean(distances),
        'std': np.std(distances),
        'count': len(distances)
    }

In [None]:
for idx, dists in distance_distributions.items():
    plt.hist(dists, bins=20, alpha=0.6, label=f"Cluster {idx}")
plt.legend()
plt.xlabel("Distance to Cluster Centroid")
plt.ylabel("Frequency")
plt.title("Distance Distributions per Cluster")
plt.tight_layout()
plt.show()

In [None]:
!pip install tf-keras

In [8]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict


# Settings
TOP_N = 15  # Top 10 bigrams per cluster

# ➡️ Custom words to exclude (you can extend this list)
custom_exclude_words = {'solar', 'perovskite', 'conversion',"perovskites","transport","cells pscs","perovskite films"}

# Prepare output
dataset_cluster_bigrams = []

# Process each dataset
for idx, df in enumerate(overlap_clusters):
    cluster_top_bigrams = defaultdict(list)

    for cluster_id in df['cluster_labels'].unique():
        abstracts_in_cluster = df[df['cluster_labels'] == cluster_id]['abstracts']

        # Initialize CountVectorizer for bigrams
        vectorizer = CountVectorizer(ngram_range=(2, 2), stop_words='english')
        X = vectorizer.fit_transform(abstracts_in_cluster)

        # Sum bigram counts
        bigram_counts = X.sum(axis=0).A1
        bigrams = vectorizer.get_feature_names_out()

        # Sort bigrams by frequency
        sorted_bigrams = sorted(zip(bigrams, bigram_counts), key=lambda x: x[1], reverse=True)

        # Select top non-repetitive, non-excluded bigrams
        selected_bigrams = []
        selected_words = set()
        for bigram, count in sorted_bigrams:
            words = bigram.split()
            # Check if any word is in the exclusion list
            if any(word in custom_exclude_words for word in words):
                continue
            if not (words[0] in selected_words or words[1] in selected_words):
                selected_bigrams.append(bigram)
                selected_words.update(words)
            if len(selected_bigrams) == TOP_N:
                break

        cluster_top_bigrams[cluster_id] = selected_bigrams

    dataset_cluster_bigrams.append(cluster_top_bigrams)

# Display results
for i, clusters in enumerate(dataset_cluster_bigrams):
    print(f"\nDataset {i}:")
    for cluster_id, bigrams in clusters.items():
        print(f"  Cluster {cluster_id}: {bigrams}")



Dataset 0:
  Cluster 0: ['carbon based', 'cells pscs', 'graphene oxide', 'spiro ometad', 'efficient stable', 'photovoltaic performance', 'efficiency pce', 'formula text', 'low cost', 'hole extraction', 'charge recombination', 'long term', 'solution processed', 'counter electrode', 'htm free']

Dataset 1:
  Cluster 1: ['sheet resistance', 'efficiency pce', 'tin oxide', 'bending cycles', 'cells pscs', 'composite electrode', 'device performance', 'low cost', 'transparent conductive', 'wearable electronics', 'flexible pescs', 'metal substrate', 'power sources', 'radius mm', 'solution processed']

Dataset 2:
  Cluster 2: ['low temperature', 'cells pscs', 'zno based', 'solution processed', 'efficient stable', 'layer etl', 'charge recombination', 'efficiency pce', 'device stability', 'electron extraction', 'metal halide', 'surface modification', 'work function', 'active layers', 'aspect ratio']

Dataset 3:
  Cluster 3: ['cells pscs', 'tio2 layer', 'low temperature', 'efficiency pce', 'open c

In [4]:
!pip install scikit-learn


Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.5.0-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.6.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.2 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.2/13.2 MB[0m [31m77.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading joblib-1.5.0-py3-none-any.whl (307 kB)
Downloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn
Successfully installed joblib-1.5.0 scikit-learn-1.6.1 threadpoolctl-3.6.0


In [9]:
final_label_list = []

for dataset_bigrams in dataset_cluster_bigrams:
    for cluster_bigrams in dataset_bigrams.values():
        final_label_list.extend(cluster_bigrams)

# Optional: remove duplicates
final_label_list = list(set(final_label_list))

In [10]:
final_label_list

['zno based',
 'ticl4 treatment',
 'nio films',
 'niox film',
 'device stability',
 'sheet resistance',
 'device performance',
 'organic inorganic',
 'carbon based',
 'small molecule',
 'titanium dioxide',
 'cu cu2o',
 'fluorine doped',
 'tin oxide',
 'photovoltaic performance',
 '3d hybrid',
 'csxfa1 xpbi3',
 'low temperature',
 'ambient conditions',
 'black phase',
 'pedot pss',
 'material htm',
 'f4 tcnq',
 'layer etl',
 'metal substrate',
 'open circuit',
 'bending cycles',
 'highly efficient',
 'aspect ratio',
 'spacer cations',
 'layer htl',
 'designed synthesized',
 'ma 3pb4i13',
 'high performance',
 'cesium lead',
 'relative humidity',
 'hole transporting',
 'active layers',
 'spiro ometad',
 'metal halide',
 'formula text',
 'spray pyrolysis',
 'quasi 2d',
 'band alignment',
 'solution processed',
 'composite electrode',
 'dopant free',
 'nickel oxide',
 'binding energy',
 'efficient stable',
 'power sources',
 'materials htms',
 'counter electrode',
 'efficiency pce',
 'low 

In [13]:
raw_abstracts= pd.read_csv('../api_fetcher/cleaned_abstracts.csv')
raw_abstracts = raw_abstracts["abstracts"]

In [14]:
raw_abstracts[0]

'We modulated a solvent-mediated adduct for one-step crystallization of lead-free AgBi2I7 at a lower temperature (90 °C) and to obtain remnant BiI3 by controlling the nature of the substrate and precursor concentration. '

In [16]:
raw_abstracts[1]

'The device performance is enhanced by doping a small percentage of polyethylenimine (PEI) into the PCBM. '

In [17]:
raw_abstracts[2]

'Lead halide perovskites have emerged as successful optoelectronic materials with high photovoltaic power conversion efficiencies and low material cost. However, substantial challenges remain in the scalability, stability and fundamental understanding of the materials. Here we present the application of radiative thermal annealing, an easily scalable processing method for synthesizing formamidinium lead iodide (FAPbI3) perovskite solar absorbers. Devices fabricated from films formed via radiative thermal annealing have equivalent efficiencies to those annealed using a conventional hotplate. By coupling results from in situ X-ray diffraction using a radiative thermal annealing system with device performances, we mapped the processing phase space of FAPbI3 and corresponding device efficiencies. Our map of processing-structure-performance space suggests the commonly used FAPbI3 annealing time, 10 min at 170 °C, can be significantly reduced to 40 s at 170 °C without affecting the photovolt

In [18]:
raw_abstracts[3]

'Several applications of perovskite solar cells (PSCs) demand a semitransparent top electrode to afford top‐illumination or see‐through devices. Transparent conductive oxides, such as indium tin oxide (ITO), typically require postdeposition annealing at elevated temperatures, which would thermally decompose the perovskite. In contrast, silver nanowires (AgNWs) in dispersions of water would be a very attractive alternative that can be deposited at ambient conditions. Water is environmentally friendly without safety concerns associated with alcohols, such as flammability. Due to the notorious moisture sensitivity of lead‐halide perovskites, aqueous processing of functional layers, such as electrodes, on top of a perovskite device stack is elusive. Here, impermeable electron transport layers (ETLs) are shown to enable the deposition of semitransparent AgNW electrodes from green aqueous dispersions on top of the perovskite cell without damage. The polyvinylpyrrolidone (PVP) capping agent o

In [19]:
with open('final_label_list.pkl', 'wb') as f:
    pickle.dump(final_label_list, f)

In [20]:
final_label_list

['zno based',
 'ticl4 treatment',
 'nio films',
 'niox film',
 'device stability',
 'sheet resistance',
 'device performance',
 'organic inorganic',
 'carbon based',
 'small molecule',
 'titanium dioxide',
 'cu cu2o',
 'fluorine doped',
 'tin oxide',
 'photovoltaic performance',
 '3d hybrid',
 'csxfa1 xpbi3',
 'low temperature',
 'ambient conditions',
 'black phase',
 'pedot pss',
 'material htm',
 'f4 tcnq',
 'layer etl',
 'metal substrate',
 'open circuit',
 'bending cycles',
 'highly efficient',
 'aspect ratio',
 'spacer cations',
 'layer htl',
 'designed synthesized',
 'ma 3pb4i13',
 'high performance',
 'cesium lead',
 'relative humidity',
 'hole transporting',
 'active layers',
 'spiro ometad',
 'metal halide',
 'formula text',
 'spray pyrolysis',
 'quasi 2d',
 'band alignment',
 'solution processed',
 'composite electrode',
 'dopant free',
 'nickel oxide',
 'binding energy',
 'efficient stable',
 'power sources',
 'materials htms',
 'counter electrode',
 'efficiency pce',
 'low 