# V.2 Prototype Phase 2: Backend Data Pipeline

This iteration of Prototype Phase 2 includes the clustering algorithms `(DBSCAN, OPTICS)` utilised in V.1. The additional features are as follows:

1. Search query implementation
2. Automated keyword extraction from query
3. Filtering papers according to query keywords
4. Clustering only these filtered papers
5. Visualising both Similarity Score and Cluster label on a 2D and 3D scatter plot


---



## Dependencies

In [191]:

!pip install sentence-transformers umap-learn plotly scikit-learn keybert -q


## Libraries

In [192]:

import pandas as pd
import plotly.express as px
import plotly.io as pio
import umap
from sklearn.cluster import DBSCAN, OPTICS
from sentence_transformers import SentenceTransformer
import numpy as np
import os
from IPython.display import display
from sklearn.metrics.pairwise import cosine_similarity
from keybert import KeyBERT

# Directories
os.makedirs("outputs", exist_ok=True)

pio.renderers.default = "colab"


## Dataset

In [193]:

# Load dataset: CSV file (must have 2 columns only)
df = pd.read_csv('/content/outputs/full_renewable_energy_papers.csv')

# Combine title and abstract
corpus = df['title'] + ' ' + df['abstract']
titles = df['title'].tolist()


## Load Data into Models

In [None]:

# Loading Dataset into Transformer Model
model = SentenceTransformer('all-mpnet-base-v2')
corpus_embeddings = model.encode(corpus, show_progress_bar=True)
query_embedding = model.encode([query])[0]

# Cosine similarity between query and papers
semantic_scores = cosine_similarity([query_embedding], corpus_embeddings)[0]

# For automatic keyword extraction from query
kw_model = KeyBERT(model)


## Input Search Query Below:

In [174]:
# INPUT: search query
query = "renewable energy and its contribution to sustainable development"

## Keyword Extraction

In [175]:

# Automatic extraction of top 3 keywords/keyphrases
keywords = kw_model.extract_keywords(query, keyphrase_ngram_range=(1, 2), stop_words='english', top_n=4)
user_keywords = [kw[0] for kw in keywords]

# Check keywords extracted
print("Keywords:", user_keywords)


Keywords: ['renewable energy', 'renewable', 'sustainable development', 'energy contribution']


## Filtering Articles

In [176]:

# Filter for keyword match
def contains_any_keywords(text, keywords):
    text_lower = text.lower()
    return any(keyword.lower() in text_lower for keyword in keywords)

# Filter indices and data based on keywords
filtered_indices = []
filtered_titles = []
filtered_abstracts = []
filtered_embeddings = []
filtered_scores = []

for i, (title, abstract) in enumerate(zip(df['title'], df['abstract'])):
    full_text = f"{title} {abstract}"
    if contains_any_keywords(full_text, user_keywords):
        filtered_indices.append(i)
        filtered_titles.append(title)
        filtered_abstracts.append(abstract)
        filtered_embeddings.append(corpus_embeddings[i])
        filtered_scores.append(semantic_scores[i])


In [177]:

# Sort results by semantic similarity
results = sorted(results, key=lambda x: x["Score"], reverse=True)


## Clustering

In [None]:

# Convert to array
filtered_embeddings = np.array(filtered_embeddings)

# Clustering on filtered embeddings only
dbscan_labels = DBSCAN(eps=0.7, min_samples=2, metric='euclidean').fit_predict(filtered_embeddings)
optics_labels = OPTICS(min_samples=2, xi=0.05, min_cluster_size=0.05).fit_predict(filtered_embeddings)

# UMAP reduction
umap_2d = umap.UMAP(n_components=2, random_state=42).fit_transform(filtered_embeddings)
umap_3d = umap.UMAP(n_components=3, random_state=42).fit_transform(filtered_embeddings)


## Cluster Visualisation

### DBSCAN: 2D

In [184]:

# Scale Score for Visual Representation -- Clearer size differences
df_2d['Scaled Similarity Score'] = df_2d['Similarity Score'] ** 2

# Create and Show 2D Plot
fig_dbscan_2d = px.scatter(
    df_2d,
    x='UMAP_1',
    y='UMAP_2',
    color='DBSCAN Cluster',
    hover_name='Title',
    size='Scaled Similarity Score',
    size_max=40,
    hover_data={'Similarity Score': True},
    title='2D UMAP Clustering of Filtered Papers'
)
fig_dbscan_2d.show()


### DBSCAN: 3D

In [185]:

# Scale Score for Visual Representation -- Clearer size differences
df_3d['Scaled Similarity Score'] = df_3d['Similarity Score'] ** 2


# Create and Show 3D Plot
fig_dbscan_3d = px.scatter_3d(
    df_3d,
    x='UMAP_1',
    y='UMAP_2',
    z='UMAP_3',
    color='DBSCAN Cluster',
    hover_name='Title',
    size='Scaled Similarity Score',
    size_max= 75,
    hover_data={'Similarity Score': True},
    title='3D UMAP Clustering of Filtered Papers'
)
fig_dbscan_3d.show()


### OPTICS: 2D

In [186]:

# Scale Score for Visual Representation -- Clearer size differences
df_2d['Scaled Similarity Score'] = df_2d['Similarity Score'] ** 2

# Create and Show 2D Plot
fig_optics_2d = px.scatter(
    df_2d,
    x='UMAP_1',
    y='UMAP_2',
    color='OPTICS Cluster',
    hover_name='Title',
    size='Scaled Similarity Score',
    size_max=40,
    hover_data={'Similarity Score': True},
    title='2D UMAP Clustering of Filtered Papers'
)
fig_optics_2d.show()


### OPTICS: 3D

In [187]:

# Scale Score for Visual Representation -- Clearer size differences
df_3d['Scaled Similarity Score'] = df_3d['Similarity Score'] ** 2


# Create and Show 3D Plot
fig_optics_3d = px.scatter_3d(
    df_3d,
    x='UMAP_1',
    y='UMAP_2',
    z='UMAP_3',
    color='OPTICS Cluster',
    hover_name='Title',
    size='Scaled Similarity Score',
    size_max= 100,
    hover_data={'Similarity Score': True},
    title='3D UMAP Clustering of Filtered Papers'
)
fig_optics_3d.show()


## Summary Table of Similarity Score and Clusters

In [188]:

summary_df = pd.DataFrame({
    'Title': filtered_titles,             # list of filtered paper titles
    'Similarity Score': filtered_scores,  # relevance to query
    'HDBSCAN Cluster': hdbscan_labels,
    'DBSCAN Cluster': dbscan_labels,
    'OPTICS Cluster': optics_labels
})

# Sort table by Descending Similarity Score (most relevant first)
summary_df = summary_df.sort_values(by='Similarity Score', ascending=False)

display(summary_df)


Unnamed: 0,Title,Similarity Score,HDBSCAN Cluster,DBSCAN Cluster,OPTICS Cluster
21,Renewable energy and sustainable development: ...,0.849123,-1,1,0
15,"Renewable energy, non-renewable energy and sus...",0.817395,-1,1,0
14,Renewable energy for sustainable development,0.802587,-1,1,0
18,Advances in renewable energy for sustainable d...,0.750512,-1,1,0
19,Renewable Energy and Sustainable Development,0.737899,-1,1,1
13,Sustainable development using renewable energy...,0.726526,-1,1,0
5,The Role of Renewable Energy in Driving Econom...,0.709763,-1,-1,-1
20,Renewable energy for sustainable development i...,0.700749,-1,1,1
16,Renewable energy and sustainable development i...,0.67438,-1,-1,-1
24,Reflections—The Economics of Renewable Energy ...,0.674044,-1,-1,2
