### Subfield Classification of Research Papers using Text Clustering

This script performs text clustering on a dataset of research papers. The main steps include:

1. Data Loading and Preprocessing: The script reads the peer-review paper data together with per-year lists of paper IDs (2017–2023) and uses them to attach a year to each paper. It then filters out unwanted entries and duplicates, drops papers without an abstract, combines each abstract with its keywords into a single text field, and cleans that text.

2. Clustering with TF-IDF and KMeans: Using the TF-IDF representation of the text data, the script applies KMeans clustering to group similar papers. It also generates descriptive cluster names based on the top keywords in each cluster.

3. Clustering with Sentence Embeddings and KMeans: The script uses a pre-trained SentenceTransformer model to generate embeddings for the text data and applies KMeans clustering to form clusters based on semantic similarity.

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer
import numpy as np

# Step 1: Data Loading and Preprocessing
# --------------------------------------
# Produces:
#   pr  - the peer review table with a "year" column, filtered and de-duplicated
#   ids - dict mapping paper_id -> year
#   df  - one row per paper with an abstract, plus a cleaned "text" column
#         (abstract + keywords) used by the clustering steps below

# Load the peer review data from a CSV file
pr = pd.read_csv("data/cleaned_peer_review_data_brief/cleaned_peer_review_data_brief.csv")

# Build the paper_id -> year lookup from the per-year ID files (one ID per line).
# Years are processed in ascending order, so an ID listed under several years
# ends up mapped to the latest one.
ids = {}
for year in range(2017, 2024):
    with open(f"data/id_data/{year}.txt") as f:
        for paper_id in f.read().splitlines():
            ids[paper_id] = year

# Attach the year to each row; IDs missing from the lookup get NaN
pr["year"] = pr["paper_id"].map(ids)

# Drop rows whose paper_name is the literal "paper_name" (presumably repeated
# header rows - TODO confirm) or "Withdraw", then drop duplicate papers
pr = pr[~pr["paper_name"].isin(["paper_name", "Withdraw"])]
pr = pr.drop_duplicates(subset=["paper_id", "year"])

# Keep only the relevant columns for papers that have an abstract.
# .copy() makes df independent of pr so the column assignments below do not
# raise SettingWithCopyWarning or depend on view/copy semantics.
df = pr.loc[
    pr["abstract"].notna(),
    ["paper_name", "abstract", "keywords", "paper_id"],
].copy()

# Fill missing keywords with empty strings and combine abstract and keywords into one text field
df["keywords"] = df["keywords"].fillna("")
df["text"] = df["abstract"] + " " + df["keywords"]

# Clean the text: strip double and single quotes, turn newlines into spaces.
# Every step must go through the .str accessor: a plain Series.replace only
# matches whole cell values and would leave these characters in the text.
df["text"] = (
    df["text"]
    .str.replace('"', "", regex=False)
    .str.replace("'", "", regex=False)
    .str.replace("\n", " ", regex=False)
)

# Reset the index so row positions line up with the cluster label arrays
df = df.reset_index(drop=True)

# Step 2: Clustering with TF-IDF vectors and KMeans
# -------------------------------------------------
# Produces:
#   df['cluster_tfidf'] - KMeans label (0..k_tfidf-1) for each paper
#   cluster_names       - dict mapping label -> name built from the cluster's
#                         top_n highest-scoring TF-IDF terms, joined by '_'

# Vectorize the text data using TF-IDF.
# English stop words are removed: otherwise function words ("the", "of",
# "and", ...) carry the largest summed TF-IDF weight in every cluster and the
# generated cluster names are not descriptive.
vectorizer = TfidfVectorizer(stop_words="english")
X_tfidf = vectorizer.fit_transform(df['text'])

# Specify the number of clusters
k_tfidf = 10

# Perform KMeans clustering (fixed seed for reproducible assignments)
kmeans_tfidf = KMeans(n_clusters=k_tfidf, random_state=42)
kmeans_tfidf.fit(X_tfidf)

# Assign cluster labels to the DataFrame
df['cluster_tfidf'] = kmeans_tfidf.labels_

# Generate cluster names based on top TF-IDF features in each cluster
feature_names = vectorizer.get_feature_names_out()
cluster_names = {}
top_n = 3  # Number of top keywords to consider for cluster names

for i in range(k_tfidf):
    # Row positions of the papers assigned to this cluster
    cluster_indices = np.where(kmeans_tfidf.labels_ == i)[0]

    # Sum TF-IDF scores per feature within the cluster. The sparse sum is a
    # 1 x n_features matrix, so flatten it to a plain 1-D array.
    cluster_sum = np.asarray(X_tfidf[cluster_indices].sum(axis=0)).ravel()

    # argsort is ascending: take the last top_n and reverse for best-first
    top_indices = np.argsort(cluster_sum)[-top_n:][::-1]

    # Name the cluster after its highest-scoring terms
    cluster_names[i] = '_'.join(feature_names[idx] for idx in top_indices)

# Print the cluster names (shown 1-based; labels in df are 0-based)
for i in range(k_tfidf):
    print(f"Cluster {i+1} ({cluster_names[i]}):\n")
    # Optionally, print paper names or other details
    # cluster_papers = df[df['cluster_tfidf'] == i]
    # print(cluster_papers['paper_name'].tolist())

# Step 3: Clustering with Sentence Embeddings and KMeans
# ------------------------------------------------------
# Produces df['cluster_embeddings']: the KMeans label (0..k_embeddings-1) of
# each paper, computed on sentence embeddings of the text column.

# Pre-trained sentence encoder used to embed each paper's text
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

# Encode every paper's text, in DataFrame row order
texts = df['text'].tolist()
embeddings = model.encode(texts)

# Number of clusters for the embedding-based grouping
k_embeddings = 3

# Fit KMeans on the embeddings (fit returns the fitted estimator itself)
kmeans_embeddings = KMeans(n_clusters=k_embeddings, random_state=42).fit(embeddings)

# Store each paper's cluster label alongside the paper
df['cluster_embeddings'] = kmeans_embeddings.labels_

# Print one header per cluster (shown 1-based; labels in df are 0-based)
for cluster_id in range(k_embeddings):
    print(f"Embedding Cluster {cluster_id + 1}:\n")
    # Optionally, list the papers in this cluster:
    # cluster_papers = df[df['cluster_embeddings'] == cluster_id]
    # print(cluster_papers['paper_name'].tolist())


### Manual Classification of Fields through Keywords

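Classify papers by matching field-specific keywords against each paper's combined abstract-and-keywords text (the `text` column). A paper that matches several fields has its weight split equally among them; a paper that matches none gets zero for every field.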

In [5]:
# Field -> keywords that mark a paper as belonging to that field.
# Keywords are lowercase and are matched as plain substrings of the
# lower-cased paper text.
# NOTE(review): substring matching means short keywords also fire inside
# longer words (e.g. 'gan' in 'organ', 'edge' in 'knowledge') - confirm that
# this is acceptable.
tran_et_al = {
    'theory': ['theorem', 'prove', 'proof'],
    'cv': ['vision', 'object detection', 'segmentation', 'pose estimation', 'optical character recognition', 'ocr', 'structure from motion', 'recognition', 'cnn', 'convolution', 'vision'],
    'nlp': ['language', 'nlp', 'named-entity', 'translation', 'translate', 'word embeddings', 'speech', 'bert', 'transformer', 'elmo', 'attention'],
    'robustness': ['adversarial', 'attack', 'poison', 'backdoor','robust'],
    'generative': ['generative', 'gan', 'vae', 'autoencoder', 'auto-encoder', 'diffusion', 'generation'],
    'optimization': ['optimization', 'convergence', 'convex', 'stationary point'],
    'graphs': ['graph', 'gnn', 'node', 'edge'],
    'bayesian': ['bayes', 'prior', 'posterior', 'gmm', 'gaussian mixture', 'mixture model', 'mcmc', 'monte carlo'],
}

# Created first so the column order stays: total, then one column per field
df["total"] = 0

# Lower-case once so the lowercase keywords also match 'BERT', 'GAN', 'OCR',
# sentence-initial 'Adversarial', etc.
text_lower = df["text"].str.lower()
field_cols = list(tran_et_al)

for col in field_cols:
    # 1 if any keyword of the field occurs in the text, else 0.
    # regex=False treats keywords literally; na=False counts missing text as
    # "no match" instead of raising on the int conversion.
    hits = [
        text_lower.str.contains(word, regex=False, na=False)
        for word in tran_et_al[col]
    ]
    df[col] = pd.concat(hits, axis=1).any(axis=1).astype(int)

# Number of fields each paper matched
df["total"] = df[field_cols].sum(axis=1)

# Split each paper's weight equally across its matched fields. Papers with no
# match divide 0 by 0 (NaN) and are reset to 0 for every field.
df[field_cols] = df[field_cols].div(df["total"], axis=0).fillna(0)