In [None]:
!pip install sentence-transformers scikit-learn numpy pandas matplotlib seaborn


In [None]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sentence_transformers import SentenceTransformer

In [None]:
# Location of the JSON file that holds the preprocessed papers
input_json: str = "processed_papers.json"

In [None]:
# Read the whole corpus into memory and parse it as JSON
with open(input_json, mode="r", encoding="utf-8") as f:
    papers = json.loads(f.read())

# Pull out the two per-paper fields used downstream; both lists stay
# aligned with `papers` by position
texts = [record["text"] for record in papers]
file_names = [record["file_name"] for record in papers]


In [None]:
# Load the pre-trained sentence-embedding model used to vectorise the papers
MODEL_NAME = "all-MiniLM-L6-v2"  # Efficient and accurate
model = SentenceTransformer(MODEL_NAME)


In [None]:
# Encode every paper's text into one embedding vector (progress bar enabled).
# NOTE(review): the model presumably truncates inputs beyond its maximum
# sequence length, so long papers may only be partially encoded - confirm.
embeddings = model.encode(
    texts,
    show_progress_bar=True,
)


In [None]:
# Pairwise comparison of every paper against every other paper:
# Euclidean distance (lower = closer) and cosine similarity (higher = closer)
euclidean_dist_matrix = euclidean_distances(embeddings)
cosine_sim_matrix = cosine_similarity(embeddings)


In [None]:
# Wrap both matrices in DataFrames labelled by file name on rows and columns,
# so an entry can be looked up by the pair of papers it compares
axis_labels = {"index": file_names, "columns": file_names}
cosine_df = pd.DataFrame(cosine_sim_matrix, **axis_labels)
euclidean_df = pd.DataFrame(euclidean_dist_matrix, **axis_labels)


In [None]:
# Clustering using K-Means
num_clusters = 5  # You can experiment with this

# K-Means cannot form more clusters than there are samples and raises a
# ValueError in that case, so cap the requested count at the corpus size.
effective_num_clusters = min(num_clusters, len(embeddings))

# Fixed random_state keeps the cluster assignment reproducible across runs.
kmeans = KMeans(n_clusters=effective_num_clusters, random_state=42, n_init=10)

# One integer cluster label per row of `embeddings`
clusters = kmeans.fit_predict(embeddings)



In [None]:
# Record each paper's cluster id on its record; cast to a plain int so the
# value can be serialised to JSON later
for paper, label in zip(papers, clusters):
    paper["cluster"] = int(label)


In [None]:
# Write the paper records, now carrying their cluster ids, to a new JSON file
output_json = "clustered_papers.json"
with open(output_json, mode="w", encoding="utf-8") as f:
    json.dump(obj=papers, fp=f, indent=4)



In [None]:
# Visualization: Plot clusters in 2D using PCA
from sklearn.decomposition import PCA

# Seed PCA as well: its automatic solver choice can use a randomized SVD on
# larger inputs, which would make the projection differ between runs.
pca = PCA(n_components=2, random_state=42)
reduced_embeddings = pca.fit_transform(embeddings)

# Explicit figure/axes interface so every call targets this plot only
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    x=reduced_embeddings[:, 0],
    y=reduced_embeddings[:, 1],
    hue=clusters,  # colour each point by its K-Means cluster id
    palette="Set2",
    ax=ax,
)
ax.set_title("Document Clustering (PCA Projection)")
ax.set_xlabel("PCA Component 1")
ax.set_ylabel("PCA Component 2")
ax.legend(title="Cluster")
plt.show()

print("Clustering complete! Results saved in 'clustered_papers.json'")