In [1]:
import os
import duckdb
from sentence_transformers import SentenceTransformer
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.schema import Document

  from tqdm.autonotebook import tqdm, trange
2024-07-21 16:49:22.086914: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
comet_ml is installed but `COMET_API_KEY` is not set.


Connect to DuckDB and select the columns we use for the vector database. We decided to use only the papers from the MISQ journal that are mentioned in the article "MISQ Research Curation on IS Use".

In [2]:
# Path to the DuckDB database file; it is a relative path, so it is resolved
# against the directory the notebook is run from.
db_path = '../duck_db/isrecon_AIS11.duckdb'

In [3]:
# Titles of the MISQ papers selected for the article-level vector store.
# They are kept as a Python list and bound as query parameters, so quotes inside a
# title (e.g. "Brazil's") need no manual SQL escaping.
article_titles = [
    'Technology Adaptation: The Case of a Computer-Supported Inter-organizational Virtual Team',
    'How Do Suppliers Benefit from Information Technology Use in Supply Chain Relationships?',
    'A Multilevel Model of Resistance to Information Technology Implementation',
    'Understanding User Responses to Information Technology: A Coping Model of User Adaptation',
    'A Comprehensive Conceputalization of the Post-Adoptive Behaviors Associated with IT-Enabled Work Systems',
    'Information Technology and the Performance of the Customer Service Process: A Resource-Based Analysis',
    'Toward a Deeper Understanding of System Usage in Organizations: A Multilevel Perspective',
    'How Habit Limits the Predictive Power of Intention: The Case of Information Systems Continuance',
    'Predicting Different Conceptualizations of System Use: The Competing Roles of Behavioral Intention, Facilitating Conditions, and Behavioral Expectation',
    'The Integrative Framework of Technology Use: An Extension and Test',
    'Why Break the Habit of a Lifetime? Rethinking the Roles of Intention, Habit, and Emotion in Continuing Information Technology Use',
    'An Alternative to Methodological Individualism: A Non-Reductionist Approach to Studying Technology Adoption by Groups',
    'Capturing Bottom-Up Information Technology Use Processes: A Complex Adaptive Systems Model',
    'Understanding User Revisions When Using Information System Features: Adaptive System Use and Triggers',
    'Interfirm IT Capability Profiles and Communications for Cocreating Relational Value: Evidence from the Logistics Industry',
    'A Dramaturgical Model of the Production of Performance Data',
    'The Embeddedness of Information Systems Habits in Organizational and Individual Level Routines: Development and Disruption',
    'When Does Technology Use Enable Network Change in Organizations? A Comparative Study of Feature Use and Shared Affordances',
    'Nature and Nurture: The Impact of Automaticity and the Structuration of Communication on Virtual Team Behavior and Performance',
    'An Investigation of Information Systems Use Patterns: Technological Events as Triggers, the Effect of Time, and Consequences for Performance',
    'Toward Generalizable Sociomaterial Inquiry: A Computational Approach for Zooming In and Out of Sociomaterial Routines',
    'Coping with Information Technology: Mixed Emotions, Vacillation, and Nonconforming Use Patterns',
    'Information Technology Use as a Learning Mechanism: The Impact of IT Use on Knowledge Transfer Effectiveness, Absorptive Capacity, and Franchisee Performance',
    'ICT, Intermediaries, and the Transformation of Gendered Power Structures',
    "Multiplex Appropriation in Complex Systems Implementation: The Case of Brazil's Correspondent Banking System",
    'Revisiting Group-Based Technology Adoption as a Dynamic Process: The Role of Changing Attitude-Rationale Configurations',
    'Capturing the Complexity of Malleable IT Use: Adaptive Structuration Theory for Individuals',
    'A Temporally Situated Self-Agency Theory of Information Technology Reinvention',
]

# One "?" placeholder per title; DuckDB binds the list values positionally.
title_placeholders = ', '.join('?' for _ in article_titles)
query = f'''SELECT title, authors, year, abstract, keywords, citation_count FROM papers
    WHERE title IN ({title_placeholders});'''

# Read-only connection: this notebook only reads from the source database.
with duckdb.connect(database=db_path, read_only=True) as conn:
    df_article = conn.execute(query, article_titles).fetchdf()

# The filter is an exact string match, so a title spelled differently in the database
# is dropped without any error. Report such titles instead of losing them silently.
missing_titles = sorted(set(article_titles) - set(df_article['title']))
if missing_titles:
    print(f"{len(missing_titles)} of {len(article_titles)} requested titles were not found in the papers table:")
    for missing_title in missing_titles:
        print(f"  - {missing_title}")

IOException: IO Error: Cannot open database "/Users/yashill/Documents/2. Semester BIPM/Text, Web and Social Media/Rag_project_3/Rag_project/RAG_3_vectordb_3_separate codes/../duck_db/isrecon_AIS11.duckdb" in read-only mode: database does not exist

Checking the values in the dataframe

In [None]:
# Preview the first rows. A bare last expression uses the notebook's table display,
# which avoids the wrapped and truncated text layout that print() produces.
df_article.head()

                                               title  \
0  Nature and Nurture: The Impact of Automaticity...   
1  Understanding User Responses to Information Te...   
2  Toward a Deeper Understanding of System Usage ...   
3  The Integrative Framework of Technology Use: A...   
4  A Multilevel Model of Resistance to Informatio...   

                                      authors  year  \
0        Bartelt, Valerie L.; Dennis, Alan R.  2014   
1          Beaudry, Anne; Pinsonneault, Alain  2005   
2  Burton-Jones, Andrew; Gallivan, Michael J.  2007   
3                                Kim, Sung S.  2009   
4           Lapointe, Liette; Rivard, Suzanne  2005   

                                            abstract  \
0  Much prior research on virtual teams has exami...   
1  This paper defines user adaptation as the cogn...   
2  The objective of this paper is to contribute t...   
3  The integrative framework of technology use (I...   
4  To better explain resistance to information te...

In [None]:
# (number of rows, number of columns) returned by the article query.
print(df_article.shape)

(19, 6)


In [None]:
# Count the null values in each column of the article dataframe.
null_counts = df_article.isnull().sum()

Null values have to be handled before creating the vector database because they cause errors.

In [None]:
# Show the per-column null counts computed in the previous cell.
print(null_counts)

title             0
authors           0
year              0
abstract          0
keywords          0
citation_count    0
dtype: int64


We see that there are no missing values here, but in other tests we had some null values, so we fill them anyway as a safeguard for future runs.

In [None]:
# Replace every null value, in any column, with a fixed placeholder string.
df_article = df_article.fillna('Information not provided in the source DB')

In this step we concatenate the columns of the dataframe into one text per row, called the page content. We do this because we cannot use tabular data as input to the embedding model; the tabular data has to be converted into a text format first.

First we create an empty list. This list will hold the concatenated row strings.
Then we loop over each row in the dataframe and build its text.
At the end we collect all documents (one document = one row of the dataframe) into one list.

In [None]:
def concatenate_with_headers(df):
    """Render each row of ``df`` as a single "Header: value" text line.

    Every column contributes one ``"<header>: <value>"`` fragment, in the
    dataframe's column order, and the fragments are joined with single spaces.
    Known article columns get a readable header; any other column falls back
    to its own name instead of raising ``KeyError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose rows should be converted to text.

    Returns
    -------
    list of str
        One string per row, in row order; empty when ``df`` has no rows.
    """
    header_mapping = {
        "authors": "Authors",
        "year": "Publication Year",
        "title": "Title",
        "abstract": "Abstract",
        "keywords": "Keywords",
        "citation_count": "Citation Count"
    }

    concatenated_rows = []
    for _, row in df.iterrows():
        # .get(col, col): columns without a mapped header keep their column name.
        fragments = [f"{header_mapping.get(col, col)}: {row[col]}" for col in df.columns]
        concatenated_rows.append(" ".join(fragments))
    return concatenated_rows

In [None]:
# Turn every concatenated row string into a Document that carries only the text.
article_texts = concatenate_with_headers(df_article)
documents = [Document(page_content=article_text) for article_text in article_texts]

In [None]:
# Preview the first five documents: the text on one line, its metadata on the next.
for preview_doc in documents[:5]:
    print(preview_doc.page_content, preview_doc.metadata, sep="\n")

Title: Nature and Nurture: The Impact of Automaticity and the Structuration of Communication on Virtual Team Behavior and Performance Authors: Bartelt, Valerie L.; Dennis, Alan R. Publication Year: 2014 Abstract: Much prior research on virtual teams has examined the impact of the features and capabilities of different communication tools (the nature of communication) on team performance. In this paper, we examine how the social structures (i.e., genre rules) that emerge around different communication tools (the nurture of communication) can be as important in influencing performance. During habitual use situations, team members enact genre rules associated with communication tools without conscious thought via automaticity. These genre rules influence how teams interact and ultimately how well they perform. We conducted an experimental study to examine the impact of different genre rules that have developed for two communication tools: instant messenger and discussion forum. Our result

In [None]:
# Rebuild the documents so that each one also carries its source row as metadata.
article_texts = concatenate_with_headers(df_article)
# Lazily convert every dataframe row into a plain dict, in the same row order as the texts.
article_records = (record.to_dict() for _, record in df_article.iterrows())
documents = [
    Document(page_content=article_text, metadata=article_record)
    for article_text, article_record in zip(article_texts, article_records)
]

In [None]:
# Preview the first five documents as labelled text/metadata pairs.
for preview_doc in documents[:5]:
    labelled_fields = (
        ("Page Content:", preview_doc.page_content),
        ("Metadata:", preview_doc.metadata),
    )
    for label, value in labelled_fields:
        print(label, value)

Page Content: Title: Nature and Nurture: The Impact of Automaticity and the Structuration of Communication on Virtual Team Behavior and Performance Authors: Bartelt, Valerie L.; Dennis, Alan R. Publication Year: 2014 Abstract: Much prior research on virtual teams has examined the impact of the features and capabilities of different communication tools (the nature of communication) on team performance. In this paper, we examine how the social structures (i.e., genre rules) that emerge around different communication tools (the nurture of communication) can be as important in influencing performance. During habitual use situations, team members enact genre rules associated with communication tools without conscious thought via automaticity. These genre rules influence how teams interact and ultimately how well they perform. We conducted an experimental study to examine the impact of different genre rules that have developed for two communication tools: instant messenger and discussion for

Creating a persist directory where the vector database will be stored

In [None]:
# Directory (relative path) in which the article vector store is persisted.
persist_directory = '../RAG_3_vectordb_3_separate codes/article_chroma_db'

Here we create objects that combine the concatenated text (page content) with the metadata associated with that page content.

We use a sentence-transformers model as our embedding model.

In [None]:
# Embedding model shared by all three vector stores built in this notebook.
embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/paraphrase-MiniLM-L6-v2')

In this step we create our vector database. It takes the documents, which are the source of information that gets embedded and stored; the embedding model, which is a sentence-transformers model; and the persist directory, where the vector database is saved. This way the vector database is stored on disk and we can access it later without recreating it from scratch.

In [None]:
# Embed the article documents with `embedding_model` and store them in a Chroma
# vector store located in `persist_directory`.
# NOTE(review): re-running this cell against an already populated directory presumably
# adds the same documents a second time — confirm, and clear the directory first if so.
vectordb_articles = Chroma.from_documents(documents=documents, 
                                 embedding=embedding_model,
                                 persist_directory=persist_directory)

We take the same steps for the paragraphs table, where the full article text is split into paragraphs. The section title comes from the joined sentences table, so after removing duplicates each row = 1 paragraph.

In [None]:
# Load the paragraphs of the selected MISQ papers together with the paper title, the
# publication year and the section title taken from the `sentences` table.
# The read-only connection is closed automatically when the `with` block ends.
with duckdb.connect(database=db_path, read_only=True) as conn:
    # `sentences` is joined on para_id, so the result can contain the same paragraph
    # in several rows; the duplicates are removed in a later cell.
    # Titles are matched exactly; an apostrophe inside a title is escaped by doubling it.
    query = '''SELECT  title, year, paragraphs.paragraph, sentences.last_section_title   FROM paragraphs
                JOIN papers ON paragraphs.article_id = papers.article_id
                join sentences on paragraphs.para_id = sentences.para_id
                WHERE title IN (
                'Technology Adaptation: The Case of a Computer-Supported Inter-organizational Virtual Team',
                'How Do Suppliers Benefit from Information Technology Use in Supply Chain Relationships?',
                'A Multilevel Model of Resistance to Information Technology Implementation',
                'Understanding User Responses to Information Technology: A Coping Model of User Adaptation',
                'A Comprehensive Conceputalization of the Post-Adoptive Behaviors Associated with IT-Enabled Work Systems',
                'Information Technology and the Performance of the Customer Service Process: A Resource-Based Analysis',
                'Toward a Deeper Understanding of System Usage in Organizations: A Multilevel Perspective',
                'How Habit Limits the Predictive Power of Intention: The Case of Information Systems Continuance',
                'Predicting Different Conceptualizations of System Use: The Competing Roles of Behavioral Intention, Facilitating Conditions, and Behavioral Expectation',
                'The Integrative Framework of Technology Use: An Extension and Test',
                'Why Break the Habit of a Lifetime? Rethinking the Roles of Intention, Habit, and Emotion in Continuing Information Technology Use',
                'An Alternative to Methodological Individualism: A Non-Reductionist Approach to Studying Technology Adoption by Groups',
                'Capturing Bottom-Up Information Technology Use Processes: A Complex Adaptive Systems Model',
                'Understanding User Revisions When Using Information System Features: Adaptive System Use and Triggers',
                'Interfirm IT Capability Profiles and Communications for Cocreating Relational Value: Evidence from the Logistics Industry',
                'A Dramaturgical Model of the Production of Performance Data',
                'The Embeddedness of Information Systems Habits in Organizational and Individual Level Routines: Development and Disruption',
                'When Does Technology Use Enable Network Change in Organizations? A Comparative Study of Feature Use and Shared Affordances',
                'Nature and Nurture: The Impact of Automaticity and the Structuration of Communication on Virtual Team Behavior and Performance',
                'An Investigation of Information Systems Use Patterns: Technological Events as Triggers, the Effect of Time, and Consequences for Performance',
                'Toward Generalizable Sociomaterial Inquiry: A Computational Approach for Zooming In and Out of Sociomaterial Routines',
                'Coping with Information Technology: Mixed Emotions, Vacillation, and Nonconforming Use Patterns',
                'Information Technology Use as a Learning Mechanism: The Impact of IT Use on Knowledge Transfer Effectiveness, Absorptive Capacity, and Franchisee Performance',
                'ICT, Intermediaries, and the Transformation of Gendered Power Structures',
                'Multiplex Appropriation in Complex Systems Implementation: The Case of Brazil''s Correspondent Banking System',
                'Revisiting Group-Based Technology Adoption as a Dynamic Process: The Role of Changing Attitude-Rationale Configurations',
                'Capturing the Complexity of Malleable IT Use: Adaptive Structuration Theory for Individuals',
                'A Temporally Situated Self-Agency Theory of Information Technology Reinvention'
                );'''
    df_paragraphs= conn.execute(query).fetchdf()

In [None]:
# Preview the first rows. A bare last expression uses the notebook's table display,
# which avoids the wrapped and truncated text layout that print() produces.
df_paragraphs.head()

                                               title  year  \
0  Nature and Nurture: The Impact of Automaticity...  2014   
1  Nature and Nurture: The Impact of Automaticity...  2014   
2  Nature and Nurture: The Impact of Automaticity...  2014   
3  Nature and Nurture: The Impact of Automaticity...  2014   
4  Nature and Nurture: The Impact of Automaticity...  2014   

                                           paragraph last_section_title  
0  NATURE AND NURTURE: THE IMPACT OF AUTOMATICITY...               None  
1  Much prior research on virtual teams has exami...           Abstract  
2  Much prior research on virtual teams has exami...           Abstract  
3  Much prior research on virtual teams has exami...           Abstract  
4  Much prior research on virtual teams has exami...           Abstract  


In [None]:
# Keep only the first row for every distinct paragraph text; the query result repeats
# the same paragraph in several rows (see the preview above).
# NOTE(review): duplicates are detected on the paragraph text alone, so identical text
# occurring in two different papers would be collapsed as well — confirm this is intended.
df_paragraphs = df_paragraphs.drop_duplicates(subset=['paragraph'])
print(df_paragraphs.head())

                                                title  year  \
0   Nature and Nurture: The Impact of Automaticity...  2014   
1   Nature and Nurture: The Impact of Automaticity...  2014   
10  Nature and Nurture: The Impact of Automaticity...  2014   
13  Nature and Nurture: The Impact of Automaticity...  2014   
19  Nature and Nurture: The Impact of Automaticity...  2014   

                                            paragraph last_section_title  
0   NATURE AND NURTURE: THE IMPACT OF AUTOMATICITY...               None  
1   Much prior research on virtual teams has exami...           Abstract  
10  Prior research has argued-and demonstrated emp...     Introduction 1  
13  In this paper, we argue that nurture has an eq...     Introduction 1  
19  Genre rules are like many other social structu...     Introduction 1  


In [None]:
# (number of rows, number of columns) left after removing duplicate paragraphs.
print(df_paragraphs.shape)

(2464, 4)


In [None]:
# Replace every null value, in any column, with a fixed placeholder string
# (the preview above shows a null section title in the first row).
df_paragraphs = df_paragraphs.fillna('No section information')

In [None]:
def concatenate_with_headers(df):
    """Build one "Header: value" string per row of a paragraph dataframe.

    Each column is rendered as ``"<readable header>: <value>"`` and the pieces
    are joined with single spaces in column order. A column without an entry
    in the header table raises ``KeyError``.

    Note: a function with the same name is defined again in the next cell and
    replaces this one once that cell has run.
    """
    header_mapping = dict(
        year="Publication Year",
        title="Title",
        paragraph="Paragraph",
        last_section_title="Title of the section",
    )

    def render(record):
        # Label every column value of one row, then glue the pieces together.
        pieces = []
        for column in df.columns:
            pieces.append(f"{header_mapping[column]}: {record[column]}")
        return " ".join(pieces)

    return [render(record) for _, record in df.iterrows()]

In [None]:
def concatenate_with_headers(df):
    """Return one string per row, made of "column: value" pairs joined by spaces.

    The raw column names are used as labels and columns appear in dataframe order.
    """
    column_names = list(df.columns)
    return [
        " ".join(f"{name}: {record[name]}" for name in column_names)
        for _, record in df.iterrows()
    ]

In [None]:
def _paragraph_document(record):
    """Build the Document for one paragraph row: labelled text plus paper metadata."""
    page_text = (
        f"Title: {record['title']} "
        f"Year: {record['year']} "
        f"Last Section Title: {record['last_section_title']} "
        f"Paragraph: {record['paragraph']}"
    )
    # The paragraph text itself is only part of the page content, not of the metadata.
    paper_metadata = {
        field: record[field] for field in ('title', 'year', 'last_section_title')
    }
    return Document(page_content=page_text, metadata=paper_metadata)


# One Document per row of the paragraph dataframe.
documents = [_paragraph_document(record) for _, record in df_paragraphs.iterrows()]

In [None]:
# Preview at most the first five paragraph documents.
for position in range(min(5, len(documents))):
    preview_doc = documents[position]
    print("Page Content:", preview_doc.page_content)
    print("Metadata:", preview_doc.metadata)

Page Content: Title: Nature and Nurture: The Impact of Automaticity and the Structuration of Communication on Virtual Team Behavior and Performance Year: 2014 Last Section Title: No section information Paragraph: NATURE AND NURTURE: THE IMPACT OF AUTOMATICITY AND THE STRUCTURATION OF COMMUNICATION ON VIRTUAL TEAM BEHAVIOR AND PERFORMANCE 
Metadata: {'title': 'Nature and Nurture: The Impact of Automaticity and the Structuration of Communication on Virtual Team Behavior and Performance', 'year': 2014, 'last_section_title': 'No section information'}
Page Content: Title: Nature and Nurture: The Impact of Automaticity and the Structuration of Communication on Virtual Team Behavior and Performance Year: 2014 Last Section Title: Abstract Paragraph: Much prior research on virtual teams has examined the impact of the features and capabilities of different communication tools (the nature of communication) on team performance. In this paper, we examine how the social structures (i.e., genre rules

In [None]:
# Directory (relative path) in which the paragraph vector store is persisted.
persist_directory2 = '../RAG_3_vectordb_3_separate codes/paragraphs_chroma_db'

In [None]:
# Embed the paragraph documents with `embedding_model` and store them in a Chroma
# vector store located in `persist_directory2`.
# The store gets its own name: assigning it to `vectordb_articles` would overwrite the
# handle to the article store created earlier in the notebook.
vectordb_paragraphs = Chroma.from_documents(
    documents=documents,
    embedding=embedding_model,
    persist_directory=persist_directory2,
)

In [None]:
# Load the extracted entities of the selected MISQ papers: the sentence an entity
# occurs in, the entity id and its label, plus the paper title and publication year.
# The read-only connection is closed automatically when the `with` block ends.
with duckdb.connect(database=db_path, read_only=True) as conn:
    # The stray comma that used to follow `label` is removed: DuckDB tolerates a
    # trailing comma in the SELECT list, but it is not standard SQL.
    # Titles are matched exactly; an apostrophe inside a title is escaped by doubling it.
    query = '''SELECT papers.title, papers.year, sentence, ent_id, label FROM entities
                JOIN papers ON entities.article_id = papers.article_id
                WHERE title IN (
                'Technology Adaptation: The Case of a Computer-Supported Inter-organizational Virtual Team',
                'How Do Suppliers Benefit from Information Technology Use in Supply Chain Relationships?',
                'A Multilevel Model of Resistance to Information Technology Implementation',
                'Understanding User Responses to Information Technology: A Coping Model of User Adaptation',
                'A Comprehensive Conceputalization of the Post-Adoptive Behaviors Associated with IT-Enabled Work Systems',
                'Information Technology and the Performance of the Customer Service Process: A Resource-Based Analysis',
                'Toward a Deeper Understanding of System Usage in Organizations: A Multilevel Perspective',
                'How Habit Limits the Predictive Power of Intention: The Case of Information Systems Continuance',
                'Predicting Different Conceptualizations of System Use: The Competing Roles of Behavioral Intention, Facilitating Conditions, and Behavioral Expectation',
                'The Integrative Framework of Technology Use: An Extension and Test',
                'Why Break the Habit of a Lifetime? Rethinking the Roles of Intention, Habit, and Emotion in Continuing Information Technology Use',
                'An Alternative to Methodological Individualism: A Non-Reductionist Approach to Studying Technology Adoption by Groups',
                'Capturing Bottom-Up Information Technology Use Processes: A Complex Adaptive Systems Model',
                'Understanding User Revisions When Using Information System Features: Adaptive System Use and Triggers',
                'Interfirm IT Capability Profiles and Communications for Cocreating Relational Value: Evidence from the Logistics Industry',
                'A Dramaturgical Model of the Production of Performance Data',
                'The Embeddedness of Information Systems Habits in Organizational and Individual Level Routines: Development and Disruption',
                'When Does Technology Use Enable Network Change in Organizations? A Comparative Study of Feature Use and Shared Affordances',
                'Nature and Nurture: The Impact of Automaticity and the Structuration of Communication on Virtual Team Behavior and Performance',
                'An Investigation of Information Systems Use Patterns: Technological Events as Triggers, the Effect of Time, and Consequences for Performance',
                'Toward Generalizable Sociomaterial Inquiry: A Computational Approach for Zooming In and Out of Sociomaterial Routines',
                'Coping with Information Technology: Mixed Emotions, Vacillation, and Nonconforming Use Patterns',
                'Information Technology Use as a Learning Mechanism: The Impact of IT Use on Knowledge Transfer Effectiveness, Absorptive Capacity, and Franchisee Performance',
                'ICT, Intermediaries, and the Transformation of Gendered Power Structures',
                'Multiplex Appropriation in Complex Systems Implementation: The Case of Brazil''s Correspondent Banking System',
                'Revisiting Group-Based Technology Adoption as a Dynamic Process: The Role of Changing Attitude-Rationale Configurations',
                'Capturing the Complexity of Malleable IT Use: Adaptive Structuration Theory for Individuals',
                'A Temporally Situated Self-Agency Theory of Information Technology Reinvention'
                );'''
    df_entities = conn.execute(query).fetchdf()

In [None]:
# Preview the first rows. A bare last expression uses the notebook's table display,
# which avoids the wrapped and truncated text layout that print() produces.
df_entities.head()

                                               title  year  \
0  Understanding User Responses to Information Te...  2005   
1  Understanding User Responses to Information Te...  2005   
2  Understanding User Responses to Information Te...  2005   
3  Understanding User Responses to Information Te...  2005   
4  Understanding User Responses to Information Te...  2005   

                                            sentence  \
0  UNDERSTANDING USER RESPONSES TO INFORMATION TE...   
1  This paper defines user adaptation as the cogn...   
2  Drawing on coping theory , we posit that users...   
3  Drawing on coping theory , we posit that users...   
4  On that basis , we identify four adaptation st...   

                          ent_id          label  
0                  IS technology     TECHNOLOGY  
1                  IS technology     TECHNOLOGY  
2              theoretical model  MODEL_ELEMENT  
3                  IS technology     TECHNOLOGY  
4  theory of bounded rationality        

In [None]:
def concatenate_with_headers(df):
    """Render each dataframe row as space-separated "column: value" pairs.

    Labels are the raw column names; the result has one string per row, in row order.
    """
    def describe(record):
        # Pair every column name with this row's value for that column.
        return " ".join(map(lambda field: f"{field}: {record[field]}", df.columns))

    return [describe(record) for _index, record in df.iterrows()]

In [None]:
# Null values are filled on a copy before the documents are built, as is done for the
# article and paragraph dataframes; `df_entities` itself is left unchanged.
entity_rows = df_entities.fillna('Information not provided in the source DB')

# One Document per entity mention: the text holds the sentence, the entity and its label;
# the metadata holds the paper title and year plus the entity id and label.
documents = [
    Document(
        page_content=f"Sentence: {row['sentence']} Entity: {row['ent_id']} Label of entity: {row['label']}",
        metadata={
            'title': row['title'],
            'year': row['year'],
            'ent_id': row['ent_id'],
            'label': row['label'],
        }
    )
    for _, row in entity_rows.iterrows()
]

In [None]:
# Preview the first five entity documents with their metadata.
preview_docs = documents[:5]
for entity_doc in preview_docs:
    print("Page Content:", entity_doc.page_content)
    print("Metadata:", entity_doc.metadata)

Page Content: Sentence: UNDERSTANDING USER RESPONSES TO INFORMATION TECHNOLOGY : A COPING MODEL OF USER ADAPTATION 1 Entity: IS technology Label of entity: TECHNOLOGY
Metadata: {'title': 'Understanding User Responses to Information Technology: A Coping Model of User Adaptation', 'year': 2005, 'ent_id': 'IS technology', 'label': 'TECHNOLOGY'}
Page Content: Sentence: This paper defines user adaptation as the cognitive and behavioral efforts performed by users to cope with significant information technology events that occur in their work environment . Entity: IS technology Label of entity: TECHNOLOGY
Metadata: {'title': 'Understanding User Responses to Information Technology: A Coping Model of User Adaptation', 'year': 2005, 'ent_id': 'IS technology', 'label': 'TECHNOLOGY'}
Page Content: Sentence: Drawing on coping theory , we posit that users choose different adaptation strategies based on a combination of primary appraisal ( i.e. , a user 's assessment of the 1 Ritu Agarwal was the acc

In [None]:
# Directory (relative path) in which the entity vector store is persisted.
persist_directory3 = '../RAG_3_vectordb_3_separate codes/entities_chroma_db'

In [None]:
# Embed the entity documents with `embedding_model` and store them in a Chroma
# vector store located in `persist_directory3`.
# NOTE(review): despite the name `vectordb_sentences`, the documents come from the
# entities query and the store is written to the entities directory.
# NOTE(review): re-running this cell against an already populated directory presumably
# adds the same documents a second time — confirm, and clear the directory first if so.
vectordb_sentences = Chroma.from_documents(documents=documents, 
                                 embedding=embedding_model,
                                 persist_directory=persist_directory3)