In [18]:
import os
import duckdb
from sentence_transformers import SentenceTransformer
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.schema import Document

In [19]:
db_path = '/Users/kiri/Documents/HWR/master/SS/Text & Web/Rag_project-1/duck_db/isrecon_all.duckdb'

with duckdb.connect(database=db_path, read_only=True) as conn:
    query = 'SELECT article_id, title, abstract FROM papers LIMIT 500'
    df = conn.execute(query).fetchdf()

In [20]:
df.head(5)

Unnamed: 0,article_id,title,abstract
0,1,Examining interdependence between product user...,Firm-sponsored online user communities have be...
1,2,Information management as an enabler of knowle...,This paper explores the much ignored but criti...
2,3,A Business Process Perspective on Enterprise C...,The huge amount of content in today’s work lif...
3,4,A Holistic Approach for Enriching Information ...,Past literature has indicated the need for add...
4,5,A NEW LOOK AT USER COMMITMENT TOWARDS INFORMAT...,This study is concerned with the factors affec...


In [21]:
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
texts = (df['title'] + ' ' + df['abstract']).tolist()

In [22]:
persist_directory = 'chroma_db'

In [23]:
for _, row in df.iterrows():
    article_id = row['article_id']
    title = row['title']
    abstract = row['abstract']
    
    if article_id is None:
        print(f"None value found for article_id in row with title={title} and abstract={abstract}")
    if title is None:
        print(f"None value found for title in row with article_id={article_id} and abstract={abstract}")
    if abstract is None:
        print(f"None value found for abstract in row with article_id={article_id} and title={title}")

None value found for abstract in row with article_id=9 and title=Data Quality: Success Factors
None value found for abstract in row with article_id=90 and title=Multi-sided platforms
None value found for abstract in row with article_id=113 and title=Guest Editorial
None value found for abstract in row with article_id=121 and title=Designing and Managing Human-AI Interactions
None value found for abstract in row with article_id=140 and title=‘Employees First’: The Relationship between Employee Experience Management Systems and Customer Experience Management
None value found for title in row with article_id=144 and abstract=None
None value found for abstract in row with article_id=144 and title=None
None value found for abstract in row with article_id=159 and title=Investigating Users' Continuous Adoption of Cryptocurrency
None value found for abstract in row with article_id=180 and title=Adapting a Process Model of Initial Representation Formation to a Knowledge Management Application
N

In [24]:
df['article_id'].fillna('Unknown article_id', inplace=True)
df['title'].fillna('No title available', inplace=True)
df['abstract'].fillna('No abstract available', inplace=True)

In [25]:
documents = [
    Document(page_content=text, metadata={'id': row['article_id'], 'title': row['title'], 'abstract': row['abstract']})
    for text, (_, row) in zip(texts, df.iterrows())
]

In [26]:
embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/paraphrase-MiniLM-L6-v2')

In [27]:
vectordb = Chroma.from_documents(documents=documents, 
                                 embedding=embedding_model,
                                 persist_directory=persist_directory,
                                 collection_name="title_abstract_chroma_db")

In [28]:
vectordb.persist()
vectordb = None

In [29]:
vectordb = Chroma(persist_directory=persist_directory, 
                  embedding_function=embedding_model,
                  collection_name="title_abstract_chroma_db")

In [30]:
retriever = vectordb.as_retriever()

In [31]:
def query_vectordb(query, top_k=1):
    results = retriever.get_relevant_documents(query, k=top_k)
    return results

In [32]:
query = "AD blockers"
results = query_vectordb(query)

In [33]:
print(results)

[Document(page_content='Adaptive Advertisement Recommender Systems for Digital Signage With the incorporation of new technologies, digital signages can adopt their content in real time to the audience demographic and temporal features. This research proposes an adaptive advertisement recommender system for digital signage. Our objective is to create a quantitative method for targeted advertising. After analyzing digital signage advertisement viewing data collected over the course of two months, our results show that learning-to-rank approach using Stochastic Gradient-Boosted Trees (SGBT) yields the best adaptive advertisement recommender system. Our system can identify the best sequence of advertisements to attract the most viewing. More importantly, we can use the same method for different business objectives like attracting the longest time of viewing or targeting a certain age groups or genders.', metadata={'abstract': 'With the incorporation of new technologies, digital signages ca

In [34]:
for i, doc in enumerate(results, 1):
    print(f"Document {i}:")
    print(f"Title: {doc.metadata['title']}")
    print(f"Abstract: {doc.metadata['abstract']}")
    print()

Document 1:
Title: Adaptive Advertisement Recommender Systems for Digital Signage
Abstract: With the incorporation of new technologies, digital signages can adopt their content in real time to the audience demographic and temporal features. This research proposes an adaptive advertisement recommender system for digital signage. Our objective is to create a quantitative method for targeted advertising. After analyzing digital signage advertisement viewing data collected over the course of two months, our results show that learning-to-rank approach using Stochastic Gradient-Boosted Trees (SGBT) yields the best adaptive advertisement recommender system. Our system can identify the best sequence of advertisements to attract the most viewing. More importantly, we can use the same method for different business objectives like attracting the longest time of viewing or targeting a certain age groups or genders.

Document 2:
Title: Adaptive Advertisement Recommender Systems for Digital Signage


In [36]:
import chromadb
print(chromadb.__file__)


/opt/homebrew/lib/python3.11/site-packages/chromadb/__init__.py
