In [1]:
import os
import duckdb
from sentence_transformers import SentenceTransformer
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.schema import Document

  from tqdm.autonotebook import tqdm, trange
comet_ml is installed but `COMET_API_KEY` is not set.


Connecting to DuckDB and choosing which columns we use for the vector database. We have decided to use only the papers from the MISQ journal that are listed in the article "MISQ Research Curation on IS Use".

In [2]:
# Relative path of the DuckDB database file that the query cells open read-only.
db_path = '../duck_db/isrecon_AIS11.duckdb'

In [3]:
# Titles of the MISQ papers to load. They are bound as query parameters instead of
# being written into the SQL text, so apostrophes (e.g. "Brazil's") need no escaping.
# NOTE(review): 'Conceputalization' below looks like a misspelling; confirm the
# spelling against the titles stored in the database.
curated_titles = [
    'Technology Adaptation: The Case of a Computer-Supported Inter-organizational Virtual Team',
    'How Do Suppliers Benefit from Information Technology Use in Supply Chain Relationships?',
    'A Multilevel Model of Resistance to Information Technology Implementation',
    'Understanding User Responses to Information Technology: A Coping Model of User Adaptation',
    'A Comprehensive Conceputalization of the Post-Adoptive Behaviors Associated with IT-Enabled Work Systems',
    'Information Technology and the Performance of the Customer Service Process: A Resource-Based Analysis',
    'Toward a Deeper Understanding of System Usage in Organizations: A Multilevel Perspective',
    'How Habit Limits the Predictive Power of Intention: The Case of Information Systems Continuance',
    'Predicting Different Conceptualizations of System Use: The Competing Roles of Behavioral Intention, Facilitating Conditions, and Behavioral Expectation',
    'The Integrative Framework of Technology Use: An Extension and Test',
    'Why Break the Habit of a Lifetime? Rethinking the Roles of Intention, Habit, and Emotion in Continuing Information Technology Use',
    'An Alternative to Methodological Individualism: A Non-Reductionist Approach to Studying Technology Adoption by Groups',
    'Capturing Bottom-Up Information Technology Use Processes: A Complex Adaptive Systems Model',
    'Understanding User Revisions When Using Information System Features: Adaptive System Use and Triggers',
    'Interfirm IT Capability Profiles and Communications for Cocreating Relational Value: Evidence from the Logistics Industry',
    'A Dramaturgical Model of the Production of Performance Data',
    'The Embeddedness of Information Systems Habits in Organizational and Individual Level Routines: Development and Disruption',
    'When Does Technology Use Enable Network Change in Organizations? A Comparative Study of Feature Use and Shared Affordances',
    'Nature and Nurture: The Impact of Automaticity and the Structuration of Communication on Virtual Team Behavior and Performance',
    'An Investigation of Information Systems Use Patterns: Technological Events as Triggers, the Effect of Time, and Consequences for Performance',
    'Toward Generalizable Sociomaterial Inquiry: A Computational Approach for Zooming In and Out of Sociomaterial Routines',
    'Coping with Information Technology: Mixed Emotions, Vacillation, and Nonconforming Use Patterns',
    'Information Technology Use as a Learning Mechanism: The Impact of IT Use on Knowledge Transfer Effectiveness, Absorptive Capacity, and Franchisee Performance',
    'ICT, Intermediaries, and the Transformation of Gendered Power Structures',
    "Multiplex Appropriation in Complex Systems Implementation: The Case of Brazil's Correspondent Banking System",
    'Revisiting Group-Based Technology Adoption as a Dynamic Process: The Role of Changing Attitude-Rationale Configurations',
    'Capturing the Complexity of Malleable IT Use: Adaptive Structuration Theory for Individuals',
    'A Temporally Situated Self-Agency Theory of Information Technology Reinvention',
]

with duckdb.connect(database=db_path, read_only=True) as conn:
    # One "?" placeholder per requested title.
    placeholders = ', '.join('?' for _ in curated_titles)
    query = f'''SELECT article_id,authors, year, title, journal, abstract, keywords, citation_count FROM papers
    WHERE title IN ({placeholders});'''
    df_article = conn.execute(query, curated_titles).fetchdf()

# The filter is an exact string comparison, so a title that is spelled differently
# in the database is dropped without any error. Report such titles explicitly
# instead of silently building the vector store from fewer papers than requested.
unmatched_titles = sorted(set(curated_titles) - set(df_article['title']))
if unmatched_titles:
    print(f'{len(unmatched_titles)} of {len(curated_titles)} requested titles were not found in papers:')
    for unmatched_title in unmatched_titles:
        print(f'  - {unmatched_title}')

Checking the values in the dataframe

In [4]:
# Plain-text preview of the first five article rows.
print(df_article.head(n=5))

   article_id                                     authors  year  \
0         926        Bartelt, Valerie L.; Dennis, Alan R.  2014   
1        1033          Beaudry, Anne; Pinsonneault, Alain  2005   
2        1658  Burton-Jones, Andrew; Gallivan, Michael J.  2007   
3        6541                                Kim, Sung S.  2009   
4        7061           Lapointe, Liette; Rivard, Suzanne  2005   

                                               title  \
0  Nature and Nurture: The Impact of Automaticity...   
1  Understanding User Responses to Information Te...   
2  Toward a Deeper Understanding of System Usage ...   
3  The Integrative Framework of Technology Use: A...   
4  A Multilevel Model of Resistance to Informatio...   

                                    journal  \
0  Management Information Systems Quarterly   
1  Management Information Systems Quarterly   
2  Management Information Systems Quarterly   
3  Management Information Systems Quarterly   
4  Management Information

In [5]:
# (number of rows, number of columns) of the article dataframe.
print((len(df_article.index), len(df_article.columns)))

(19, 8)


In [6]:
# Number of missing values in each column of the article dataframe.
null_counts = df_article.isna().sum()

Null values have to be handled before creating the vector database because they cause errors.

In [7]:
# Show the per-column null counts computed in the previous cell.
print(null_counts)

article_id        0
authors           0
year              0
title             0
journal           0
abstract          0
keywords          0
citation_count    0
dtype: int64


We see that there are no missing values here, but in other tests we had some null values, so we fill them anyway as a safeguard for future runs.

In [8]:
# Replace any missing value, in every column, with a fixed placeholder text.
df_article = df_article.fillna(
    value='Information not provided in the source DB',
)

In this step we concatenate the columns of the dataframe into a single text string per row, called page content. We do this because we cannot use tabular data as input to the embedding model; the tabular data has to be converted into a text format first.

In [9]:
# Build one space-separated string per article; the non-text columns are cast to
# str so they can be concatenated with the text columns.
page_content = (
    df_article['article_id'].astype(str)
    + ' ' + df_article['authors']
    + ' ' + df_article['year'].astype(str)
    + ' ' + df_article['title']
    + ' ' + df_article['journal']
    + ' ' + df_article['abstract']
    + ' ' + df_article['keywords']
    + ' ' + df_article['citation_count'].astype(str)
).tolist()

Defining the persist directory where the vector database will be stored

In [10]:
# Directory passed to Chroma as persist_directory for the article-level vector store.
persist_directory = '../RAG_multiple_vector_stores/article_chroma_db_MISQ'

Here we create one Document object per row, pairing the concatenated text (page content) with the metadata that belongs to it.

In [11]:
# Columns copied unchanged from each dataframe row into the Document metadata.
article_metadata_fields = (
    'article_id',
    'authors',
    'year',
    'title',
    'journal',
    'abstract',
    'keywords',
    'citation_count',
)

# page_content and df_article have the same row order, so zipping them pairs each
# concatenated text with the row it was built from.
documents = [
    Document(
        page_content=content,
        metadata={field: record[field] for field in article_metadata_fields},
    )
    for content, (_, record) in zip(page_content, df_article.iterrows())
]

We use a sentence-transformers model as our embedding model.

In [12]:
# Embedding model shared by both vector stores built in this notebook.
embedding_model = HuggingFaceEmbeddings(
    model_name='sentence-transformers/paraphrase-MiniLM-L6-v2',
)

In this step we create our vector database. It takes the documents, which are the source of information that gets embedded and stored; the embedding model, which is the sentence-transformers model; and the persist directory, where the vector database is saved for later use. This way the vector database is stored on disk and we can access it later without recreating it from scratch.

In [13]:
# Embed the article documents and store them in the persisted Chroma collection.
# Explicit ids derived from article_id keep the cell re-runnable: without ids the
# store generates fresh ones on every call, so running the cell again against the
# same persist_directory would add a second copy of every article.
# NOTE(review): assumes article_id is unique per row - confirm against the papers table.
vectordb_articles = Chroma.from_documents(
    documents=documents,
    embedding=embedding_model,
    ids=[str(doc.metadata['article_id']) for doc in documents],
    persist_directory=persist_directory,
)

In [29]:
# Wrap the article vector store in a retriever; no search arguments are passed,
# so as_retriever() uses its defaults.
retriever = vectordb_articles.as_retriever()

In [30]:
def query_vectordb(query, top_k=1):
    """Return the article documents most similar to ``query``.

    Args:
        query: Free-text question or search phrase.
        top_k: Maximum number of documents to return (default 1).

    Returns:
        A list of at most ``top_k`` documents from the article vector store,
        ordered from most to least similar.
    """
    # Pass k straight to the vector store so top_k is actually honoured; the
    # deprecated retriever.get_relevant_documents(query, k=...) call does not
    # reliably forward k to the underlying search.
    return vectordb_articles.similarity_search(query, k=top_k)

In [31]:
# Smoke test of the article vector store with a sample question.
# NOTE(review): the lookup ranks documents by embedding similarity to the question
# text; it does not compare citation_count values across articles, so the returned
# document is presumably not guaranteed to be the most-cited one - verify before
# relying on this kind of aggregate question.
query = "which article has highest citation count?"
results = query_vectordb(query)
print(results)

[Document(page_content='1658 Burton-Jones, Andrew; Gallivan, Michael J. 2007 Toward a Deeper Understanding of System Usage in Organizations: A Multilevel Perspective Management Information Systems Quarterly The objective of this paper is to contribute to a deeper understanding of system usage in organizations by examining its multilevel nature. Past research on system usage has suffered from a levels bias, with researchers studying system usage at single levels of analysis only (e.g., the individual, group, or organizational level). Although single-level research can be useful, we suggest that studying organizations one level at a time will ultimately lead to an unnatural, incomplete, and very disjointed view of how information systems are used in practice. To redress this situation, we draw on recent advances in multilevel theory to present system usage as a multilevel construct and provide an illustration for what it takes for researchers to study it as such. The multilevel perspecti

  warn_deprecated(


We take the same steps for the sentences table, where each full article is split into sentences (each row = one sentence).

In [14]:
# Titles of the MISQ papers whose sentences are loaded. They are bound as query
# parameters instead of being written into the SQL text, so apostrophes
# (e.g. "Brazil's") need no escaping.
# NOTE(review): 'Conceputalization' below looks like a misspelling; confirm the
# spelling against the titles stored in the database.
curated_titles = [
    'Technology Adaptation: The Case of a Computer-Supported Inter-organizational Virtual Team',
    'How Do Suppliers Benefit from Information Technology Use in Supply Chain Relationships?',
    'A Multilevel Model of Resistance to Information Technology Implementation',
    'Understanding User Responses to Information Technology: A Coping Model of User Adaptation',
    'A Comprehensive Conceputalization of the Post-Adoptive Behaviors Associated with IT-Enabled Work Systems',
    'Information Technology and the Performance of the Customer Service Process: A Resource-Based Analysis',
    'Toward a Deeper Understanding of System Usage in Organizations: A Multilevel Perspective',
    'How Habit Limits the Predictive Power of Intention: The Case of Information Systems Continuance',
    'Predicting Different Conceptualizations of System Use: The Competing Roles of Behavioral Intention, Facilitating Conditions, and Behavioral Expectation',
    'The Integrative Framework of Technology Use: An Extension and Test',
    'Why Break the Habit of a Lifetime? Rethinking the Roles of Intention, Habit, and Emotion in Continuing Information Technology Use',
    'An Alternative to Methodological Individualism: A Non-Reductionist Approach to Studying Technology Adoption by Groups',
    'Capturing Bottom-Up Information Technology Use Processes: A Complex Adaptive Systems Model',
    'Understanding User Revisions When Using Information System Features: Adaptive System Use and Triggers',
    'Interfirm IT Capability Profiles and Communications for Cocreating Relational Value: Evidence from the Logistics Industry',
    'A Dramaturgical Model of the Production of Performance Data',
    'The Embeddedness of Information Systems Habits in Organizational and Individual Level Routines: Development and Disruption',
    'When Does Technology Use Enable Network Change in Organizations? A Comparative Study of Feature Use and Shared Affordances',
    'Nature and Nurture: The Impact of Automaticity and the Structuration of Communication on Virtual Team Behavior and Performance',
    'An Investigation of Information Systems Use Patterns: Technological Events as Triggers, the Effect of Time, and Consequences for Performance',
    'Toward Generalizable Sociomaterial Inquiry: A Computational Approach for Zooming In and Out of Sociomaterial Routines',
    'Coping with Information Technology: Mixed Emotions, Vacillation, and Nonconforming Use Patterns',
    'Information Technology Use as a Learning Mechanism: The Impact of IT Use on Knowledge Transfer Effectiveness, Absorptive Capacity, and Franchisee Performance',
    'ICT, Intermediaries, and the Transformation of Gendered Power Structures',
    "Multiplex Appropriation in Complex Systems Implementation: The Case of Brazil's Correspondent Banking System",
    'Revisiting Group-Based Technology Adoption as a Dynamic Process: The Role of Changing Attitude-Rationale Configurations',
    'Capturing the Complexity of Malleable IT Use: Adaptive Structuration Theory for Individuals',
    'A Temporally Situated Self-Agency Theory of Information Technology Reinvention',
]

with duckdb.connect(database=db_path, read_only=True) as conn:
    # One "?" placeholder per requested title.
    placeholders = ', '.join('?' for _ in curated_titles)
    query = f'''SELECT title , last_section_title, sentence_type, sentence_original FROM sentences
                JOIN papers ON sentences.article_id = papers.article_id
                WHERE title IN ({placeholders});'''
    df_sentences = conn.execute(query, curated_titles).fetchdf()

# The filter is an exact string comparison, so a title that is spelled differently
# in the database contributes no sentences and raises no error. Report such titles
# explicitly instead of silently indexing fewer papers than requested.
unmatched_titles = sorted(set(curated_titles) - set(df_sentences['title']))
if unmatched_titles:
    print(f'{len(unmatched_titles)} of {len(curated_titles)} requested titles have no rows in sentences:')
    for unmatched_title in unmatched_titles:
        print(f'  - {unmatched_title}')

In [16]:
# Plain-text preview of the first five sentence rows.
print(df_sentences.head(n=5))

                                               title last_section_title  \
0  Nature and Nurture: The Impact of Automaticity...               None   
1  Nature and Nurture: The Impact of Automaticity...           Abstract   
2  Nature and Nurture: The Impact of Automaticity...           Abstract   
3  Nature and Nurture: The Impact of Automaticity...           Abstract   
4  Nature and Nurture: The Impact of Automaticity...           Abstract   

  sentence_type                                  sentence_original  
0     PARAGRAPH  NATURE AND NURTURE: THE IMPACT OF AUTOMATICITY...  
1      ABSTRACT  Much prior research on virtual teams has exami...  
2      ABSTRACT  In this paper, we examine how the social struc...  
3      ABSTRACT  During habitual use situations, team members e...  
4      ABSTRACT  These genre rules influence how teams interact...  


In [17]:
# (number of rows, number of columns) of the sentence dataframe.
print((len(df_sentences.index), len(df_sentences.columns)))

(17388, 4)


In [18]:
# Number of missing values in each column of the sentence dataframe
# (reuses the null_counts name from the article section).
null_counts = df_sentences.isna().sum()

In [19]:
# Show the per-column null counts computed in the previous cell.
print(null_counts)

title                    0
last_section_title    6009
sentence_type            0
sentence_original        0
dtype: int64


In [20]:
# Replace any missing value, in every column, with a fixed placeholder text.
df_sentences = df_sentences.fillna(
    value='No section information',
)

In [21]:
# Frequency of each sentence_type value, most common first.
print(df_sentences.loc[:, 'sentence_type'].value_counts())

sentence_type
PARAGRAPH        10229
TABLE             5462
HEADER             819
TABLE_HEADER       243
CAPTION            236
ABSTRACT           199
ANNEX              128
FIGURE_HEADER       26
FORMULA             25
FIGURE              20
RQ                   1
Name: count, dtype: int64


In [22]:
# NOTE(review): when the cells run in order this call changes nothing, because the
# earlier fillna('No section information') cell has already replaced every null.
# The placeholder text here also differs from that cell's; consider deleting this cell.
df_sentences = df_sentences.fillna('No section info')

In [25]:
# Build one space-separated string per sentence row from its four columns.
page_content2 = (
    df_sentences['title'].astype(str)
    + ' ' + df_sentences['last_section_title'].astype(str)
    + ' ' + df_sentences['sentence_type'].astype(str)
    + ' ' + df_sentences['sentence_original'].astype(str)
).tolist()

In [26]:
# Pair every sentence row with its own concatenated text. The texts must come from
# page_content2 (built from df_sentences); zipping with the article-level
# page_content would stop after the article count and attach article text to
# sentence metadata.
documents2 = [
    Document(
        page_content=text,
        metadata={
            'title': row['title'],
            'last_section_title': row['last_section_title'],
            'sentence_type': row['sentence_type'],
            'sentence_original': row['sentence_original'],
        },
    )
    for text, (_, row) in zip(page_content2, df_sentences.iterrows())
]

# zip() truncates silently to the shorter input, so check that no row was lost.
assert len(documents2) == len(df_sentences), 'every sentence row must yield one Document'

In [27]:
# Directory passed to Chroma as persist_directory for the sentence-level vector store.
persist_directory2 = '../RAG_multiple_vector_stores/sentence_chroma_db_MISQ'

In [28]:
# Embed the sentence documents and store them in the persisted Chroma collection.
# Deterministic positional ids keep the cell re-runnable: without ids the store
# generates fresh ones on every call, so running the cell again against the same
# persist_directory2 would add a second copy of every sentence.
# NOTE(review): the ids follow row position, so a re-run overwrites entries in place;
# entries left over from a previous, longer run are presumably not removed.
vectordb_sentences = Chroma.from_documents(
    documents=documents2,
    embedding=embedding_model,
    ids=[f'sentence-{position}' for position in range(len(documents2))],
    persist_directory=persist_directory2,
)