# Prototype Phase 2: Backend Data Pipeline

## Dependencies

In [None]:

!pip install sentence-transformers umap-learn hdbscan plotly scikit-learn


## Libraries

In [13]:

import pandas as pd
import plotly.express as px
import plotly.io as pio
import umap
import hdbscan
from sklearn.cluster import DBSCAN, OPTICS
from sentence_transformers import SentenceTransformer
import numpy as np
import os
from IPython.display import display

# Plotly: select the "colab" renderer as the default for every figure shown below
pio.renderers.default = "colab"

# Directories: make sure the relative "outputs" folder is present
# (exist_ok=True makes this a no-op when the folder already exists)
os.makedirs("outputs", exist_ok=True)


## Dataset

In [14]:

# Load dataset: CSV file that must contain a 'title' and an 'abstract' column
df = pd.read_csv('/content/outputs/full_renewable_energy_papers.csv')

# Fail early with an explicit message when an expected column is absent
missing_columns = {'title', 'abstract'} - set(df.columns)
if missing_columns:
    raise KeyError(f"Dataset is missing required column(s): {sorted(missing_columns)}")

# Combine title and abstract into one text per paper.
# Missing values are replaced by empty strings first: string + NaN yields NaN,
# which would otherwise hand a float (not text) to the embedding model.
text_columns = df[['title', 'abstract']].fillna('').astype(str)
corpus = text_columns['title'] + ' ' + text_columns['abstract']

# Titles are kept as read from the file; they are used as hover labels and in the summary table
titles = df['title'].tolist()


## Loading Data into Model

In [None]:

# Loading Dataset into Transformer Model

# Load the pretrained 'all-mpnet-base-v2' sentence-embedding model
model = SentenceTransformer('all-mpnet-base-v2')

# Encode every document into an embedding vector.
# The texts are passed as a plain list of strings: a pandas Series is looked up
# by index label, so it only lines up with positions while the index is the
# default 0..n-1 (e.g. it would break after filtering rows).
X = model.encode(list(corpus))


## Clustering

In [None]:

# Clustering: configure the three density-based estimators first ...
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=2)
dbscan_model = DBSCAN(eps=0.7, min_samples=2, metric='euclidean')
optics_model = OPTICS(min_samples=2, xi=0.05, min_cluster_size=0.05)

# ... then fit each one on the embeddings X and keep its per-document labels
hdbscan_labels = hdbscan_model.fit_predict(X)
dbscan_labels = dbscan_model.fit_predict(X)
optics_labels = optics_model.fit_predict(X)

# Dimensionality reduction: project the embeddings to 2 and 3 UMAP components
# for plotting; random_state is fixed at 42 in both reducers
umap_reducer_2d = umap.UMAP(n_components=2, random_state=42)
umap_reducer_3d = umap.UMAP(n_components=3, random_state=42)
X_umap_2d = umap_reducer_2d.fit_transform(X)
X_umap_3d = umap_reducer_3d.fit_transform(X)


## Cluster Visualisation

In [19]:

# Interactive Visualisation Using Plotly


def _cluster_scatter(frame, cluster_column, title):
    """Build a Plotly scatter of UMAP coordinates coloured by one cluster column.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain 'UMAP_1', 'UMAP_2', 'Title' and ``cluster_column``.
        When a 'UMAP_3' column is present a 3D scatter is produced,
        otherwise a 2D scatter.
    cluster_column : str
        Name of the column holding the cluster labels used for colouring.
    title : str
        Figure title.

    Returns
    -------
    The figure created by ``px.scatter`` or ``px.scatter_3d``.
    """
    labels = frame[cluster_column]
    # Cluster ids are categories, not magnitudes. Numeric colour columns are
    # drawn on a continuous colour bar, so the labels are cast to strings to
    # get one discrete colour (and legend entry) per cluster. ``frame`` itself
    # is left untouched; only the copy used for plotting is changed.
    plot_frame = frame.assign(**{cluster_column: labels.astype(str)})
    # Keep legend entries in numeric label order instead of order of appearance
    legend_order = {cluster_column: [str(label) for label in sorted(labels.unique())]}

    shared_args = dict(
        x='UMAP_1',
        y='UMAP_2',
        color=cluster_column,
        hover_name='Title',
        title=title,
        category_orders=legend_order,
    )
    if 'UMAP_3' in frame.columns:
        return px.scatter_3d(plot_frame, z='UMAP_3', **shared_args)
    return px.scatter(plot_frame, **shared_args)


# Columns shared by the 2D and 3D plotting frames: paper title plus the label
# assigned by each clustering algorithm
cluster_assignments = {
    'Title': titles,
    'HDBSCAN Cluster': hdbscan_labels,
    'DBSCAN Cluster': dbscan_labels,
    'OPTICS Cluster': optics_labels,
}

# --- 2D projection ---
df_2d = pd.DataFrame({
    'UMAP_1': X_umap_2d[:, 0],
    'UMAP_2': X_umap_2d[:, 1],
    **cluster_assignments,
})

fig_hdbscan_2d = _cluster_scatter(df_2d, 'HDBSCAN Cluster', '2D UMAP - HDBSCAN')
fig_dbscan_2d = _cluster_scatter(df_2d, 'DBSCAN Cluster', '2D UMAP - DBSCAN')
fig_optics_2d = _cluster_scatter(df_2d, 'OPTICS Cluster', '2D UMAP - OPTICS')

fig_hdbscan_2d.show()
fig_dbscan_2d.show()
fig_optics_2d.show()

# --- 3D projection ---
df_3d = pd.DataFrame({
    'UMAP_1': X_umap_3d[:, 0],
    'UMAP_2': X_umap_3d[:, 1],
    'UMAP_3': X_umap_3d[:, 2],
    **cluster_assignments,
})

fig_hdbscan_3d = _cluster_scatter(df_3d, 'HDBSCAN Cluster', '3D UMAP - HDBSCAN')
fig_dbscan_3d = _cluster_scatter(df_3d, 'DBSCAN Cluster', '3D UMAP - DBSCAN')
fig_optics_3d = _cluster_scatter(df_3d, 'OPTICS Cluster', '3D UMAP - OPTICS')

fig_hdbscan_3d.show()
fig_dbscan_3d.show()
fig_optics_3d.show()



## Summary Table of Clusters

In [20]:

# One row per paper: its title followed by the label from each clustering algorithm
summary_sort_keys = ['HDBSCAN Cluster', 'DBSCAN Cluster', 'OPTICS Cluster']
summary_columns = {'Title': titles}
summary_columns.update(zip(summary_sort_keys, (hdbscan_labels, dbscan_labels, optics_labels)))
summary_df = pd.DataFrame(summary_columns)

# Displayed as the cell output: rows ordered by HDBSCAN, then DBSCAN, then OPTICS label
# (sort_values returns a new frame; summary_df itself keeps the original row order)
summary_df.sort_values(by=summary_sort_keys)



Unnamed: 0,Title,HDBSCAN Cluster,DBSCAN Cluster,OPTICS Cluster
0,The role of hybrid renewable energy systems in...,-1,-1,-1
6,Towards energy security: Could renewable energ...,-1,-1,-1
7,Energy metabolism in health and diseases,-1,-1,-1
10,Digital Economics,-1,-1,-1
11,Empowering smart grid: A comprehensive review ...,-1,-1,-1
22,How do ICT and renewable energy impact sustain...,-1,-1,-1
27,On the economics of renewable energy sources,-1,-1,2
12,Energy and Human Health,-1,-1,4
13,Energy-Technology Innovation,-1,-1,4
31,21st Century’s energy: Hydrogen energy system,-1,-1,4
