In [18]:
import duckdb

# Path of the DuckDB database file that contains the `papers` table.
db_path = 'isrecon_AIS11.duckdb'

# Open the database read-only and load 500 papers into a DataFrame.
# ORDER BY pins which rows are returned: LIMIT without an ordering may yield
# an arbitrary subset, so the sample could differ between runs.
with duckdb.connect(database=db_path, read_only=True) as conn:
    query = 'SELECT * FROM papers ORDER BY article_id LIMIT 500'
    df = conn.execute(query).fetchdf()

In [19]:
# Preview the loaded papers; pandas abbreviates the printed frame (elided rows shown as "..").
print(df)

     article_id                                            citekey  \
0             1  kevin_yan_j_examining_interdependence_between_...   
1             2  a_wilson_f_computer_support_for_strategic_orga...   
2             3  aaen_i_essence_facilitating_software_innovatio...   
3             4  aaen_j_the_dark_side_of_data_ecosystems_a_long...   
4             5  aakhus_m_symbolic_action_research_in_informati...   
..          ...                                                ...   
495         496  asdemir_k_pricing_models_for_online_advertisin...   
496         497  aseri_m_ad_blockers_a_blessing_or_a_curse_2020...   
497         498  ash_c_assessing_the_benefits_from_e_business_t...   
498         499  ashleigh_m_trust_and_technologies_implications...   
499         500  ashman_h_the_ethical_and_social_implications_o...   

                                               authors  year  \
0    (Kevin) Yan, Jie; Leidner, Dorothy E.; Benbya,...  2021   
1                              

In [20]:
# Build one text field per paper from the columns that should be embedded.
columns_to_embed = ['title', 'abstract']

# Missing values are replaced with '' before joining. Without this,
# astype(str) turns them into the literal words "nan"/"None", which would then
# be embedded as if they were real text. The final strip() removes the stray
# space left behind when one of the fields is empty.
df["combined_text"] = (
    df[columns_to_embed]
    .fillna('')
    .astype(str)
    .agg(' '.join, axis=1)
    .str.strip()
)
print(df["combined_text"])

0      Examining interdependence between product user...
1      Computer support for strategic organizational ...
2      Essence: facilitating software innovation This...
3      The dark side of data ecosystems: A longitudin...
4      Symbolic Action Research in Information System...
                             ...                        
495    Pricing Models for Online Advertising: CPM vs....
496    Ad-Blockers: A Blessing or a Curse? Users who ...
497    Assessing the benefits from e-business transfo...
498    Trust and technologies: Implications for organ...
499    The ethical and social implications of persona...
Name: combined_text, Length: 500, dtype: object


In [21]:
from sentence_transformers import SentenceTransformer

In [22]:
# Load the pretrained sentence-transformers model used to encode the paper text.
EMBEDDING_MODEL_NAME = 'paraphrase-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [23]:
# Encode every combined title + abstract string with the sentence-transformers model.
# NOTE(review): `embeddings` is not passed to `collection.add(...)` further down,
# so these vectors are unused by the insert as written — confirm whether the
# intent was to store them (and to query with the same model).
embeddings = model.encode(df["combined_text"].tolist())

In [24]:
import chromadb


In [25]:
# Create the Chroma client that owns the collection used in the following cells.
# NOTE(review): no storage path is given — presumably an in-memory client whose
# data does not survive a kernel restart; confirm against the chromadb docs.
client = chromadb.Client()

In [26]:
# Fetch the collection if it already exists, otherwise create it.
# Plain create_collection fails when the name is already taken, which breaks
# re-running this cell in the same kernel; get_or_create_collection is idempotent.
collection = client.get_or_create_collection('paper_title_embeddings')

In [10]:
# Prepare the three parallel lists handed to the collection: the text to store,
# a string id per paper, and a small metadata dict per paper.
ids = df['article_id'].astype(str).tolist()
documents = df['combined_text'].tolist()
metadatas = [
    {'title': str(title), 'abstract': str(abstract)}
    for title, abstract in zip(df['title'], df['abstract'])
]

In [27]:
# Insert the prepared documents, ids and metadata into the collection.
# NOTE(review): the precomputed `embeddings` are not supplied here — confirm
# that letting the collection embed the documents itself is intended.
try:
    collection.add(documents=documents, ids=ids, metadatas=metadatas)
except ValueError as err:
    print("Error inserting data:", err)
    # Help locate the offending entry: report every metadata value whose type
    # is not one of the plain scalar types checked below.
    scalar_types = (str, int, float, bool)
    for i, metadata in enumerate(metadatas):
        for key, value in metadata.items():
            if isinstance(value, scalar_types):
                continue
            print(f"Invalid metadata at index {i}, key '{key}': {value} ({type(value)})")
else:
    print("Data inserted successfully!")

Data inserted successfully!


In [28]:
import pprint

# Look up the two stored documents closest to a free-text query and
# pretty-print the raw result dictionary returned by the collection.
query_text = "Ad-Blockers"
results = collection.query(query_texts=[query_text], n_results=2)

pprint.pprint(results)

{'data': None,
 'distances': [[0.7421619892120361, 1.1711894273757935]],
 'documents': [['Ad-Blockers: A Blessing or a Curse? Users who have an '
                'ad-blocker installed present a genuine predicament for a '
                'website (also known as the publisher): On the one hand, these '
                'users do not generate revenue for the website; on the other '
                'hand, denying them access can shrink the user base and '
                'adversely affect the popularity of the website, ultimately '
                'reducing traffic over the long run. This has led some '
                'websites to require that ad-block users “white-list” them for '
                'obtaining access to an “ad-light” experience. We model the '
                'decision problem for a website facing two user segments: '
                'regular users and ad-block users. The first-level decision or '
                'gating strategy is whether to allow ad-free access to '
    

In [13]:
# List the collections currently known to the client.
# The loop variable is deliberately NOT named `collection`: that name holds the
# collection handle created earlier, and reusing it here would silently
# overwrite the handle for any cell run afterwards.
collections = client.list_collections()
print("Existing collections before deletion:")
for existing in collections:
    print(f" - {existing.name}")

Existing collections before deletion:
 - paper_title_embeddings


In [17]:
# Delete the demo collection if it is present.
collection_name = 'paper_title_embeddings'

# Ask the client for the current collection names instead of relying on a
# `collections` list built in another cell: that list goes stale once the
# collection is deleted, and re-running this cell would then try to delete a
# collection that no longer exists.
existing_names = [col.name for col in client.list_collections()]
if collection_name in existing_names:
    client.delete_collection(collection_name)
    print(f"\nCollection '{collection_name}' has been deleted.")
else:
    print(f"\nCollection '{collection_name}' does not exist.")


Collection 'paper_title_embeddings' has been deleted.
