In [1]:
import duckdb

# Connect to the DuckDB database
db_path = 'isrecon_AIS11.duckdb'

# Connect to the DuckDB database and fetch the data
with duckdb.connect(database=db_path, read_only=True) as conn:
    query = 'SELECT title, abstract FROM papers LIMIT 500'
    df = conn.execute(query).fetchdf()

In [27]:
print(df)

                                                 title  \
0    Examining interdependence between product user...   
1    Computer support for strategic organizational ...   
2            Essence: facilitating software innovation   
3    The dark side of data ecosystems: A longitudin...   
4    Symbolic Action Research in Information System...   
..                                                 ...   
495  Pricing Models for Online Advertising: CPM vs....   
496                Ad-Blockers: A Blessing or a Curse?   
497  Assessing the benefits from e-business transfo...   
498  Trust and technologies: Implications for organ...   
499  The ethical and social implications of persona...   

                                              abstract  
0    Firm-sponsored online user communities have be...  
1    While information systems continue to be promo...  
2    This paper suggests ways to facilitate creativ...  
3    Data are often vividly depicted as strategic a...  
4    An essay is p

In [2]:
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange
comet_ml is installed but `COMET_API_KEY` is not set.


In [4]:
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
texts = (df['title'] + ' ' + df['abstract']).tolist() 
embeddings = model.encode(texts)

In [13]:
df['title'] = df['title'].fillna('')
df['abstract'] = df['abstract'].fillna('')

In [14]:
def clean_metadata(metadata_list):
    for metadata in metadata_list:
        for key, value in metadata.items():
            if value is None:
                metadata[key] = ''
            elif not isinstance(value, (str, int, float, bool)):
                metadata[key] = str(value)
    return metadata_list

In [15]:
import chromadb

In [16]:
ids = [str(i) for i in range(len(df))]
titles = [row.title for row in df.itertuples()]
abstracts = [row.abstract for row in df.itertuples()]
embeddings_list = [embedding.tolist() for embedding in embeddings]

In [17]:
metadatas = [{"title": title, "abstract": abstract} for title, abstract in zip(titles, abstracts)]
metadatas = clean_metadata(metadatas)

In [18]:
client = chromadb.Client()

In [19]:
collection = client.create_collection('paper_embeddings')

UniqueConstraintError: Collection paper_embeddings already exists

In [20]:
try:
    collection.add(ids=ids, embeddings=embeddings_list, metadatas=metadatas)
    print("Data inserted successfully.")
except ValueError as e:
    print(f"Error inserting data: {e}")

Data inserted successfully.


In [28]:
query = "give me info about this article Examining interdependence between product"
results = collection.query(
    query_texts=[query], # Chroma will embed this for you
    n_results=2 # how many results to return
)

# Inspect the raw results
print("Raw results:")
print(results)

Raw results:
{'ids': [['333', '145']], 'distances': [[10.06134033203125, 10.169628143310547]], 'metadatas': [[{'abstract': 'Searching for the right information and making quick, accurate decisions within time-pressured settings is often non-trivial. We contrast the relative efficacies of written English (Text) and a more concise, compact communication mode (Image) for information search and decision making by using a financial incentive scheme to apply implicit time pressure on subjects. We found that, while Image users earned as much as Text users, they achieved this earnings parity by following speedier but less accurate strategies. We conclude with thoughts on possible refinements to our work that could steer subjects in the ideal direction of fast, accurate, lucrative decisions with languages like Image.', 'title': 'Searching for information in a time-pressured setting: experiences with a Text-based and an Image-based decision support system'}, {'abstract': 'We appreciate the inter

In [26]:
collections = client.list_collections()
print("Existing collections before deletion:")
for collection in collections:
    print(f" - {collection.name}")

Existing collections before deletion:
 - paper_embeddings


In [29]:
collection_name = 'paper_embeddings'
if collection_name in [col.name for col in collections]:
    client.delete_collection(collection_name)
    print(f"\nCollection '{collection_name}' has been deleted.")
else:
    print(f"\nCollection '{collection_name}' does not exist.")


Collection 'paper_embeddings' has been deleted.
