In [2]:
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import math
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity



In [3]:
# Corpus and query for the retrieval demo: four short passages about
# ChatGPT / OpenAI (D1-D4) and one free-text query (q).
# The passages are triple-quoted, so the line breaks inside each literal are
# part of the string.

# D1: public experimentation with ChatGPT since its release.
D1 = """Since OpenAI released its blockbuster bot ChatGPT in November, users have casually 
experimented with the tool, with even Insider reporters trying to simulate news stories or 
message potential dates.To older millennials who grew up with IRC chat rooms — a text 
instant message system — the personal tone of conversations with the bot can evoke the 
experience of chatting online. But ChatGPT, the latest in technology known as "large 
language model tools," doesn't speak with sentience and doesn't "think" the way people do."""
    
    
# D2: other companies' language-model tools and troubled real-world uses.
D2 = """Other tech companies like Google and Meta have developed their own large language 
model tools, which use programs that take in human prompts and devise sophisticated 
responses. OpenAI, in a revolutionary move, also created a user interface that is letting the 
general public experiment with it directly. Some recent efforts to use chat bots for real-world 
services have proved troubling — with odd results. The mental health company Koko came 
under fire this month after its founder wrote about how the company used GPT-3 in an 
experiment to reply to users."""
# D3: the DoNotPay service and a general description of the chatbot.
D3 =  """The founder of the controversial DoNotPay service, which claims its GPT-3-driven chat 
bot helps users resolve customer service disputes, also said an AI "lawyer" would advise 
defendants in actual courtroom traffic cases in real time, though he later walked that
back over concerns about its risks. Chat GPT is an AI Chatbot developed by Open AI. The 
chatbot has a language-based model that the developer fine-tunes for human interaction in a 
conversational manner. Effectively it’s a simulated chatbot primarily designed for customer 
service; people use it for various other purposes too though. These range from writing essays 
to drafting business plans, to generating code. But what is it and what can it really do? """

# D4: what Chat GPT is, where its data comes from, and what it can generate.
D4= """Chat GPT is an AI chatbot auto-generative system created by Open AI for online 
customer care. It is a pre-trained generative chat, which makes use of (NLP) Natural 
Language Processing. The source of its data is textbooks, websites, and various articles, 
which it uses to model its own language for responding to human interaction. The main 
feature of Chat GPT is generating responses like those humans would provide, in a text box. 
Therefore, it is suitable for chatbots, AI system conversations, and virtual 
assistants. However, it can also give natural answers to questions in a conversational tone and 
can generate stories poems and more. Moreover, it can: Write code, Write an article or blog 
post, Translate, Debug, Write a story/poem, Recommend chords and lyrics"""

# Default query string used by the first ranking call below.
q = "OpenAI chatbot chatGPT"

# Create a list of documents.
# Order matters: the position in this list becomes the row index (0-3) shown
# in the ranking output.
documents = [D1, D2, D3, D4]


In [4]:
def preprocess_text(text):
    """Convert raw text into a list of lowercase, lemmatised, stopword-free tokens.

    The text is split into runs of word characters (``\\w+``), so punctuation
    is dropped and a word such as "doesn't" becomes the two tokens "doesn"
    and "t".  Each token is lowercased and lemmatised as a verb (``pos='v'``);
    lemmas that appear in NLTK's English stopword list are then removed.

    Assumes the NLTK ``wordnet`` and ``stopwords`` data are already available
    locally -- TODO confirm for a fresh environment.

    Args:
        text: the string to process.

    Returns:
        list[str]: the surviving lemmas, in their original order (duplicates kept).
    """
    tokeniser = RegexpTokenizer(r'\w+')
    lemmatiser = WordNetLemmatizer()

    # Load the stopword list once and keep it as a set: evaluating
    # stopwords.words('english') inside the filter would re-read the list and
    # scan it linearly for every single token.
    stop_words = set(stopwords.words('english'))

    # Lowercase before lemmatising; the stopword test is applied to the lemma,
    # not to the surface form.
    lemmas = (lemmatiser.lemmatize(token.lower(), pos='v')
              for token in tokeniser.tokenize(text))
    return [lemma for lemma in lemmas if lemma not in stop_words]


In [9]:
def retrieveResults(D, query):
    """Rank documents against a query by cosine similarity and print the ranking.

    Document vectors are weighted as ``(1 + ln f) * ln(N / df)`` where ``f`` is
    the term's count in the document, ``N`` the number of documents and ``df``
    the number of documents that contain the term.  The query vector uses the
    log-scaled term frequency ``1 + ln f`` only (no idf factor); query terms
    that never occur in the documents are ignored.

    Args:
        D: sequence of document strings.
        query: query string; it goes through the same ``preprocess_text``
            pipeline as the documents.

    Returns:
        None.  A DataFrame with the columns ``text`` and ``qSim`` is printed,
        sorted by ``qSim`` in descending order.  Its index is the position of
        each document in ``D``.
    """
    x = pd.DataFrame({'text': D})
    tokenized_docs = [preprocess_text(doc) for doc in x['text']]
    num_docs = len(x)

    # A sorted vocabulary fixes one column order that the document matrix and
    # the query vector both share.
    vocabulary = sorted({token for doc_tokens in tokenized_docs for token in doc_tokens})
    column_of = {token: col for col, token in enumerate(vocabulary)}

    # Raw term counts: one row per document, one column per vocabulary term.
    counts = np.zeros((num_docs, len(vocabulary)))
    for row, doc_tokens in enumerate(tokenized_docs):
        for token in doc_tokens:
            counts[row, column_of[token]] += 1

    # Document frequency = number of documents containing the term (not the
    # number of occurrences).  It is >= 1 for every vocabulary term, so the
    # division is safe; a term present in every document gets idf 0.
    doc_freq = np.count_nonzero(counts, axis=0)
    idf = np.log(num_docs / doc_freq)

    # Log-scaled term frequency: 1 + ln(f) where the term occurs, 0 elsewhere.
    tf = np.zeros_like(counts)
    present = counts > 0
    tf[present] = 1 + np.log(counts[present])

    tf_idf_matrix = tf * idf

    # Query vector over the same columns; out-of-vocabulary tokens are skipped.
    query_vector = np.zeros((1, len(vocabulary)))
    for token in preprocess_text(query):
        col = column_of.get(token)
        if col is not None:
            query_vector[0, col] += 1
    in_query = query_vector > 0
    query_vector[in_query] = 1 + np.log(query_vector[in_query])

    # One call scores every document.  With no documents or an empty vocabulary
    # there is nothing to compare, so every similarity is reported as 0.
    if tf_idf_matrix.size:
        similarities = cosine_similarity(query_vector, tf_idf_matrix)[0]
    else:
        similarities = np.zeros(num_docs)

    x['qSim'] = similarities

    #The ranked documents are
    print(x.sort_values(by=['qSim'], ascending=False))

In [10]:
# Rank the four documents against the default query string q.
retrieveResults(documents, q)

                                                text      qSim
0  Since OpenAI released its blockbuster bot Chat...  0.205515
2  The founder of the controversial DoNotPay serv...  0.088627
1  Other tech companies like Google and Meta have...  0.046062
3  Chat GPT is an AI chatbot auto-generative syst...  0.039781


In [13]:
# Single-term query: "controversial" occurs only in D3 (row index 2), so the
# other three documents score 0.
retrieveResults(documents, "controversial")

                                                text      qSim
2  The founder of the controversial DoNotPay serv...  0.146293
0  Since OpenAI released its blockbuster bot Chat...  0.000000
1  Other tech companies like Google and Meta have...  0.000000
3  Chat GPT is an AI chatbot auto-generative syst...  0.000000
