In [54]:
import json
import os

import chromadb
import pandas as pd
import seaborn as sns
import torch
import torch.nn.functional as F
from chromadb.utils import embedding_functions
from tqdm.notebook import tqdm
from transformers import AutoModel, AutoTokenizer

In [55]:
# https://docs.trychroma.com/guides

CHROMA_DATA_PATH = "chroma_data/"
EMBED_MODEL = "all-MiniLM-L6-v2"
# EMBED_MODEL = "all-mpnet-base-v2"
COLLECTION_NAME = "arxiv_papers"
BATCH_SIZE = 5000

CHROMA_DATA_PATH = os.path.join(CHROMA_DATA_PATH, EMBED_MODEL)

In [56]:
cache_dir = 'cache'
if not os.path.exists(cache_dir):
    os.makedirs(cache_dir)

parquet_path = '../data/arxiv_metadata_sample.parquet.gzip'
arxiv_df = pd.read_parquet(parquet_path)

In [57]:
def text_processing(sample):
    title = sample['title']
    abstract = sample['abstract']

    # remove special characters
    title = title.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
    abstract = abstract.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')

    # remove multiple spaces
    title = ' '.join(title.split())
    abstract = ' '.join(abstract.split())

    return f"Title: {title} - Abstract: {abstract}"

In [58]:
arxiv_df['text'] = arxiv_df.apply(text_processing, axis=1)
arxiv_df.head(3)

Unnamed: 0,id,title,abstract,categories,update_date,title_words,abstract_words,mapped_categories,amount_categories,update_year,text
0,1911.1286,Double-Scaling Limit in Principal Chiral Model...,"We initiate a systematic, non-perturbative s...",[hep-th],2020-05-20,10,116,[High Energy Physics - Theory],1,2020,Title: Double-Scaling Limit in Principal Chira...
1,astro-ph/9509086,Genus Statistics of the Large-Scale Structure ...,As a statistical measure to quantify the top...,[astro-ph],2009-10-28,10,95,[Astrophysics],1,2009,Title: Genus Statistics of the Large-Scale Str...
2,1106.3718,Quantum Efficiency of Intermediate-Band Solar ...,As an appealing concept for developing next-...,[cond-mat.mtrl-sci],2012-10-10,12,165,[Materials Science],1,2012,Title: Quantum Efficiency of Intermediate-Band...


In [59]:
def create_collection(client, collection_name, embedding_function):
    collection = client.create_collection(
        name=collection_name,
        embedding_function=embedding_function,
        metadata={"hnsw:space": "cosine"},
        get_or_create=True,
    )

    return collection

def delete_collection_data(client, collection, collection_name):
    print(f"Deleting data from collection {collection_name} with {collection.count()} documents")
    client.delete_collection(collection_name)

In [60]:
# delete the collection if it exists
client = chromadb.PersistentClient(path=CHROMA_DATA_PATH)

embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=EMBED_MODEL,
    device="cuda",
)

collection = create_collection(client, COLLECTION_NAME, embedding_func)
delete_collection_data(client, collection, COLLECTION_NAME)
collection = create_collection(client, COLLECTION_NAME, embedding_func)

Deleting data from collection arxiv_papers with 30000 documents


In [61]:
metadatas = []
for categories in arxiv_df['mapped_categories']:
    metadata = {f"category_{i}": category for i, category in enumerate(categories)}
    metadatas.append(metadata)

In [62]:
for i in tqdm(range(0, len(arxiv_df), BATCH_SIZE)):
    collection.upsert(
        documents=arxiv_df['text'].iloc[i:i + BATCH_SIZE].tolist(),
        ids=arxiv_df['id'].iloc[i:i + BATCH_SIZE].tolist(),
        metadatas=metadatas[i:i + BATCH_SIZE],
    )

  0%|          | 0/6 [00:00<?, ?it/s]

In [63]:
# Define query
words_per_line = 10
# define papers to show
top_n_papers = 3
query = "Black holes in the universe and suns in the galaxy"
print("Query:\n", query, "\n")
query_results = collection.query(query_texts=[query], n_results=top_n_papers)
for _id, _doc, _dist, _meta in zip(query_results["ids"][0], query_results["documents"][0], query_results["distances"][0], query_results["metadatas"][0]):
    print(f"#####   ID: {_id}   #####")
    print(f"Distance: {_dist}")
    print(f"Metadata: {_meta}")
    _doc_lines = _doc.split()
    for i in range(0, len(_doc_lines), words_per_line):
        print(" ".join(_doc_lines[i:i + words_per_line]))
    print("\n")

Query:
 Black holes in the universe and suns in the galaxy 

#####   ID: astro-ph/9910088   #####
Distance: 0.32320648431777954
Metadata: {'category_0': 'Astrophysics'}
Title: Evolution of Black Holes in the Galaxy - Abstract:
In this article we consider the formation and evolution of
black holes, especially those in binary stars where radiation from
the matter falling on them can be seen. We consider
a number of effects introduced by some of us, which
are not traditionally included in binary evolution of massive stars.
These are (i) hypercritical accretion, which allows neutron stars to
accrete enough matter to collapse to a black hole during
their spiral-in into another star. (ii) the strong mass loss
of helium stars, which causes their evolution to differ from
that of the helium core of a massive star. (iii)
The direct formation of low-mass black holes ($M\sim2\msun$) from single
stars, a consequence of a significant strange-matter content of the
nuclear-matter equation of state at 

In [64]:
response = collection.get(include=["metadatas", "documents", "embeddings"])
df = pd.DataFrame({
    "id": response["ids"],
    "document": response["documents"],
    "embedding": response["embeddings"],
    "categories": [list(m.values()) for m in response["metadatas"]],
})
all_categories = df["categories"].explode().unique()
df["first_category"] = df["categories"].apply(lambda x: x[0])
df.loc[:, all_categories] = df["categories"].apply(lambda x: [1 if cat in x else 0 for cat in all_categories]).tolist()
df.head(3)

Unnamed: 0,id,document,embedding,categories,first_category,Quantum Physics,General Relativity and Quantum Cosmology,Strongly Correlated Electrons,High Energy Physics - Phenomenology,Mesoscale and Nanoscale Physics,Materials Science,High Energy Physics - Theory,Analysis of PDEs,Computer Vision and Pattern Recognition,Computation and Language,Astrophysics
0,1001.0116,Title: One Dimensional Magnetized TG Gas Prope...,"[-0.06618823856115341, -0.03881627321243286, -...",[Quantum Physics],Quantum Physics,1,0,0,0,0,0,0,0,0,0,0
1,1001.0359,Title: Circular Orbits in Extremal Reissner No...,"[0.004333099815994501, -0.024190759286284447, ...",[General Relativity and Quantum Cosmology],General Relativity and Quantum Cosmology,0,1,0,0,0,0,0,0,0,0,0
2,1001.046,Title: Non-uniqueness of the Dirac theory in a...,"[-0.14928904175758362, 0.0017066379077732563, ...",[General Relativity and Quantum Cosmology],General Relativity and Quantum Cosmology,0,1,0,0,0,0,0,0,0,0,0


In [65]:
# from renumics import spotlight

# spotlight.show(df)

# cluster

In [81]:
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

classes = arxiv_df['mapped_categories'].explode().unique()
classes = {c: i for i, c in enumerate(classes)}
num_classes = len(classes)
print(f"Number of classes: {num_classes}")
arxiv_df['class'] = arxiv_df['mapped_categories'].apply(lambda x: classes[x[0]])

Number of classes: 11


In [78]:
kmeans = KMeans(init="k-means++", n_clusters=num_classes, n_init=4, random_state=0)
estimator = make_pipeline(StandardScaler(), kmeans).fit(df["embedding"].tolist())
# estimator = make_pipeline(kmeans).fit(df["embedding"].tolist())

df["cluster"] = estimator.predict(df["embedding"].tolist())
merged_df = pd.merge(df[['id', 'cluster']], arxiv_df, on="id")

accuracy = metrics.accuracy_score(merged_df['class'], merged_df['cluster'])
print(f"Accuracy: {accuracy}")

Accuracy: 0.17893333333333333
