In [1]:
import requests
import numpy as np
import pandas as pd
from openai import OpenAI
import json

In [2]:
DATA_PATH = r"..\data\arxiv_metadata.parquet.gzip"
CATEGORIES_PATH = r"..\data\categories.json"

In [3]:
def load_data(p):
    return pd.read_parquet(p)

def load_categories(p):
    with open(p, "r") as f:
        return json.load(f)

In [4]:
df = load_data(DATA_PATH)
display(df.head())
print(df.shape)

categories = load_categories(CATEGORIES_PATH)
display(categories)
print(len(categories))

Unnamed: 0,id,title,abstract,categories,update_date
0,704.0001,Calculation of prompt diphoton production cros...,A fully differential calculation in perturba...,hep-ph,2008-11-26
1,704.0002,Sparsity-certifying Graph Decompositions,"We describe a new algorithm, the $(k,\ell)$-...",math.CO cs.CG,2008-12-13
2,704.0003,The evolution of the Earth-Moon system based o...,The evolution of Earth-Moon system is descri...,physics.gen-ph,2008-01-13
3,704.0004,A determinant of Stirling cycle numbers counts...,We show that a determinant of Stirling cycle...,math.CO,2007-05-23
4,704.0005,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,In this paper we show how to compute the $\L...,math.CA math.FA,2013-10-15


(2468403, 5)


{'acc-phys': 'Accelerator Physics',
 'adap-org': 'Not available',
 'q-bio': 'Not available',
 'cond-mat': 'Not available',
 'chao-dyn': 'Not available',
 'patt-sol': 'Not available',
 'dg-ga': 'Not available',
 'solv-int': 'Not available',
 'bayes-an': 'Not available',
 'comp-gas': 'Not available',
 'alg-geom': 'Not available',
 'funct-an': 'Not available',
 'q-alg': 'Not available',
 'ao-sci': 'Not available',
 'atom-ph': 'Atomic Physics',
 'chem-ph': 'Chemical Physics',
 'plasm-ph': 'Plasma Physics',
 'mtrl-th': 'Not available',
 'cmp-lg': 'Not available',
 'supr-con': 'Not available',
 'econ.GN': 'General Economics',
 'econ.TH': 'Theoretical Economics',
 'eess.SY': 'Systems and Control',
 'astro-ph': 'Astrophysics',
 'astro-ph.CO': 'Cosmology and Nongalactic Astrophysics',
 'astro-ph.EP': 'Earth and Planetary Astrophysics',
 'astro-ph.GA': 'Astrophysics of Galaxies',
 'astro-ph.HE': 'High Energy Astrophysical Phenomena',
 'astro-ph.IM': 'Instrumentation and Methods for Astrophysics'

176


In [11]:
df_cs = df[df["categories"].apply(lambda x: x.startswith("cs."))]
print(df_cs.shape)

categories_cs = {k: v for k, v in categories.items() if "cs" in k}
print(len(categories_cs))

(503248, 5)
62


In [18]:
# Sort categories by number of papers
top_categories = df_cs["categories"].value_counts().sort_values(ascending=False)
top_categories = top_categories[top_categories > 1000]

len(top_categories)

# Get N papers for each category
N = 1000

def get_papers(df, N):
    return pd.concat([df[df["categories"] == c].sample(N) for c in df["categories"].unique()])

df_cs_top = df_cs[df_cs["categories"].isin(top_categories.index)]

df_cs_top_sample = get_papers(df_cs_top, N)

print(df_cs_top_sample.shape)

(29000, 5)


In [19]:
# Make sure to `pip install openai` first

client = OpenAI(base_url="http://localhost:5000/v1", api_key="lm-studio")


def get_embeddings(texts, model="second-state/All-MiniLM-L6-v2-Embedding-GGUF"):
   """
   texts: list of strings
   """

   return client.embeddings.create(input = texts, model=model)

In [20]:
texts = df_cs_top_sample["title"].tolist()
display(texts[:5])
embeddings = get_embeddings(texts)
#

['Online Matching with Stochastic Rewards: Optimal Competitive Ratio via\n  Path Based Formulation',
 'On the Configuration-LP of the Restricted Assignment Problem',
 'A Polynomial Kernel for Diamond-Free Editing',
 'Asymptotically exact streaming algorithms',
 'Fast Prefix Search in Little Space, with Applications']

In [32]:
# Save embeddings
# embeddings = np.array([embeddings.data[i].embedding for i in range(len(embeddings.data))])
# np.save(r"embeddings.npy", embeddings)

# Save texts
# with open(r"texts.txt", "w") as f:
#     f.write("\n".join([t.replace("\n", " ") for t in texts]))

In [44]:
embeddings = np.load(r"embeddings.npy")
labels = df_cs_top_sample["categories"].tolist()

label2id = {label: i for i, label in enumerate(np.unique(labels))}

labels = np.array([label2id[label] for label in labels])


In [45]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(embeddings, labels, test_size=0.2, random_state=42)

# Create a pipeline with a standard scaler and logistic regression classifier
pipeline = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))

# Train the classifier
pipeline.fit(X_train, y_train)


In [46]:
from sklearn.metrics import accuracy_score, classification_report

# Predict on the test set
y_pred = pipeline.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Print classification report for detailed metrics
print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.5684
Classification Report:
              precision    recall  f1-score   support

           0       0.41      0.36      0.38       198
           1       0.63      0.68      0.65       205
           2       0.55      0.56      0.55       189
           3       0.55      0.57      0.56       194
           4       0.59      0.60      0.60       199
           5       0.66      0.72      0.69       192
           6       0.65      0.65      0.65       205
           7       0.57      0.61      0.59       191
           8       0.41      0.40      0.41       191
           9       0.59      0.59      0.59       213
          10       0.54      0.50      0.52       212
          11       0.72      0.74      0.73       192
          12       0.48      0.41      0.44       212
          13       0.45      0.49      0.47       180
          14       0.60      0.60      0.60       184
          15       0.58      0.61      0.60       208
          16       0.65      0.73      0.

In [48]:
from sklearn.cluster import KMeans

# Number of clusters is set to the number of unique labels
num_clusters = len(np.unique(labels))

# Perform KMeans clustering
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
cluster_labels = kmeans.fit_predict(embeddings)

from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, silhouette_score

# Adjusted Rand Index (ARI)
ari = adjusted_rand_score(labels, cluster_labels)
print(f"Adjusted Rand Index (ARI): {ari:.4f}")

# Normalized Mutual Information (NMI)
nmi = normalized_mutual_info_score(labels, cluster_labels)
print(f"Normalized Mutual Information (NMI): {nmi:.4f}")

# Silhouette Score
silhouette_avg = silhouette_score(embeddings, labels)
print(f"Silhouette Score: {silhouette_avg:.4f}")


  super()._check_params_vs_input(X, default_n_init=10)


Adjusted Rand Index (ARI): 0.2519
Normalized Mutual Information (NMI): 0.4182
Silhouette Score: 0.0052
