In [2]:
#imports
import pandas as pd
import numpy as np
import numpy as np
from nltk.tokenize import  word_tokenize 
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

In [3]:
# Read the input data: the inventory spreadsheet plus the six
# tudelft_repository CSV exports (loaded in the order listed).
dfInventory = pd.read_excel("../data/inventory.xlsx")

dfRepo1, dfRepo2, dfRepo3, dfRepo4, dfRepo5, dfRepo6 = (
    pd.read_csv(csvPath)
    for csvPath in (
        "../data/tudelft_repository_1679269258.csv",
        "../data/tudelft_repository_1679269271.csv",
        "../data/tudelft_repository_1679269276.csv",
        "../data/tudelft_repository_1679269281.csv",
        "../data/tudelft_repository_1679269287.csv",
        "../data/tudelft_repository_1679269292.csv",
    )
)


In [4]:
# Stack the six exports into a single frame (row labels of the parts are kept as-is).
dfRepo = pd.concat((dfRepo1, dfRepo2, dfRepo3, dfRepo4, dfRepo5, dfRepo6))
# Keep only the rows that actually have an abstract.
dfWithAbstract = dfRepo.loc[dfRepo["abstract"].notna()]

In [5]:
# Spot-check one record: row position 855, column position 6
# (column 6 is the one later cells use as the abstract text).
dfWithAbstract.iloc[855,6]

'Artificial Intelligence (AI) is increasingly helping people with all kinds of tasks, due to its promising capabilities. In some tasks, an AI system by itself will take over tasks, but in other tasks, an AI system making decisions on its own would be undesired due to ethical and legal reasons. In those cases, AI can still be of help by forming human-AI teams, in which humans get advice from the AI system helping them with making their final decisions. Human-AI teams are for instance used in the medical and legal fields. One problem arises, in which instances should one trust an AI system and in which not? Trusting the AI system when it is correct and trusting yourself when you are correct, results in a high appropriate reliance. If users appropriately rely on AI systems, it is possible to achieve complementary team performance, which is better than any single teammate. However, as known from previous literature, people struggle with assessing their performance and knowing how well they

In [9]:
#implement tf-idf:

# NOTE: only the first 1k rows are used for now; slice the frame before
# converting so that only those rows are turned into a numpy array.
npWithAbstract = dfWithAbstract.iloc[:1000].to_numpy()

In [10]:
#tf-idf with sklearn
# Column 6 of every row is the document text; both names refer to the same array.
corpus = allAbstracts = npWithAbstract[:, 6]

In [11]:
# Fit the vectorizer on the corpus; fit_transform returns the document-term matrix.
tfIdfVectorizer = TfidfVectorizer(use_idf=True)
tfIdfMatrix = tfIdfVectorizer.fit_transform(corpus)

# TF-IDF scores of the first document, one row per vocabulary term, highest score first.
df = pd.DataFrame(
    tfIdfMatrix[0].T.todense(),
    index=tfIdfVectorizer.get_feature_names_out(),
    columns=["TF-IDF"],
).sort_values(by="TF-IDF", ascending=False)

# From here on the matrix is a dense ndarray (one row per document).
tfIdfMatrix = tfIdfMatrix.toarray()

In [12]:
# Vocabulary terms in matrix-column order, plus a term -> column position lookup.
uniqueWords = tfIdfVectorizer.get_feature_names_out()
uniqueWordsIndexDict = {term: position for position, term in enumerate(uniqueWords)}

In [13]:
# Inverse document frequency for every vocabulary term:
#     idf(term) = log(numDocuments / (documentFrequency(term) + 1))
#
# Document frequency has to be counted with the SAME tokenization that built
# the vocabulary. The vocabulary keys come from the vectorizer (lower-cased,
# punctuation stripped), so counting with a plain text.split() would miss
# occurrences such as "AI" or "tasks," and distort the IDF values.
analyzer = tfIdfVectorizer.build_analyzer()
numDocuments = len(corpus)

docFreq = Counter()
for text in corpus:
    # set(): each document contributes at most once per term
    docFreq.update(set(analyzer(text)))

invDocFreq = {
    word: np.log(numDocuments / (docFreq[word] + 1))
    for word in uniqueWords
}

# Despite its name this is the mean IDF over the vocabulary; it is used as the
# fallback weight when a term has no entry in invDocFreq.
avgDocFreq = np.mean(np.array(list(invDocFreq.values())))

In [14]:
# inference

def executeQuery(query: str):
    """Rank the corpus documents by cosine similarity to a free-text query.

    The query is lower-cased and split on whitespace, turned into a TF-IDF
    vector over the notebook-level vocabulary (``uniqueWords`` /
    ``uniqueWordsIndexDict``, weighted with ``invDocFreq``) and compared with
    every row of ``tfIdfMatrix``.

    Args:
        query: Search text. Tokens that are not in the vocabulary are reported
            on stdout and otherwise ignored.

    Returns:
        1-D integer array with one entry per row of ``tfIdfMatrix``: the row
        positions ordered from most to least similar. Rows with equal
        similarity keep ascending row order. Use ``corpus[i]`` to map a
        position back to its abstract.
    """
    tokens = query.lower().split()
    termCounts = Counter(tokens)
    numTokens = len(tokens)

    # TF-IDF vector of the query over the document vocabulary.
    Q = np.zeros(len(uniqueWords))
    for token in sorted(termCounts):
        idx = uniqueWordsIndexDict.get(token)
        if idx is None:  # cannot score a word the corpus has never seen
            print(f"word {token} does not exist in vocabulary")
            continue
        tf = termCounts[token] / numTokens
        idf = invDocFreq.get(token, avgDocFreq)
        Q[idx] = tf * idf

    # Cosine similarity against all documents at once.
    dotProducts = tfIdfMatrix @ Q
    norms = np.linalg.norm(tfIdfMatrix, axis=1) * np.linalg.norm(Q)

    # A zero norm (query without any known word, or an all-zero document row)
    # would give 0/0 = NaN, and NaN would end up at the TOP of a descending
    # argsort. Such pairs are defined as similarity 0 instead.
    similarities = np.zeros(len(dotProducts))
    np.divide(dotProducts, norms, out=similarities, where=norms > 0)

    # Stable sort on the negated scores: best match first, ties by row order.
    return np.argsort(-similarities, kind="stable")

In [18]:
# results holds corpus row positions ordered from most relevant to least relevant
results = executeQuery("AI in python with pytorch")

numberOfDocumentsToShow = 3
# Slice the ranking instead of indexing it by range(): if fewer documents than
# requested exist, this prints what is available rather than raising IndexError.
for docPosition in results[:numberOfDocumentsToShow]:
    # The corpus is built from the leading rows of dfWithAbstract, so the
    # positions line up; column 6 holds the abstract text.
    print(dfWithAbstract.iloc[int(docPosition), 6])



word pytorch does not exist in vocabulary
Artificial Intelligence (AI) is increasingly helping people with all kinds of tasks, due to its promising capabilities. In some tasks, an AI system by itself will take over tasks, but in other tasks, an AI system making decisions on its own would be undesired due to ethical and legal reasons. In those cases, AI can still be of help by forming human-AI teams, in which humans get advice from the AI system helping them with making their final decisions. Human-AI teams are for instance used in the medical and legal fields. One problem arises, in which instances should one trust an AI system and in which not? Trusting the AI system when it is correct and trusting yourself when you are correct, results in a high appropriate reliance. If users appropriately rely on AI systems, it is possible to achieve complementary team performance, which is better than any single teammate. However, as known from previous literature, people struggle with assessing th

OLD CODE

In [None]:
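# Manual TF-IDF implementation, kept for reference only.
# The abstract text is at column index 6 of each row.
# NOTE: everything below is wrapped in a triple-quoted string, so none of it is executed.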
"""
abstractIndex = 6

sentences = []
word_set = []

for i in tqdm(range(npWithAbstract.shape[0])):
    abstractText = npWithAbstract[i, abstractIndex]
    
    x = [i.lower() for  i in word_tokenize(abstractText) if i.isalpha()]
    sentences.append(x)
    for word in x:
        if word not in word_set:
            word_set.append(word)

 
word_set = set(word_set)

total_documents = (npWithAbstract.shape[0])
 
#Creating an index for each word in our vocab.
index_dict = {} #Dictionary to store index for each word
i = 0
for word in word_set:
    index_dict[word] = i
    i += 1

#Create a count dictionary
 
def count_dict(sentences):
    word_count = {}
    for word in tqdm(word_set):
        word_count[word] = 0
        for sent in sentences:
            if word in sent:
                word_count[word] += 1
    return word_count
 
word_count = count_dict(sentences)

def termfreq(document, word):
    N = len(document)
    occurance = len([token for token in document if token == word])
    return occurance/N

def inverse_doc_freq(word):
    try:
        word_occurance = word_count[word] + 1
    except:
        word_occurance = 1
    return np.log(total_documents/word_occurance)

def tf_idf(sentence):
    tf_idf_vec = np.zeros((len(word_set),))
    for word in sentence:
        tf = termfreq(sentence,word)
        idf = inverse_doc_freq(word)
         
        value = tf*idf
        tf_idf_vec[index_dict[word]] = value 
    return tf_idf_vec


vectors = []
for sent in tqdm(sentences):
    vec = tf_idf(sent)
    vectors.append(vec)
"""

'\nabstractIndex = 6\n\nsentences = []\nword_set = []\n\nfor i in tqdm(range(npWithAbstract.shape[0])):\n    abstractText = npWithAbstract[i, abstractIndex]\n    \n    x = [i.lower() for  i in word_tokenize(abstractText) if i.isalpha()]\n    sentences.append(x)\n    for word in x:\n        if word not in word_set:\n            word_set.append(word)\n\n \nword_set = set(word_set)\n\ntotal_documents = (npWithAbstract.shape[0])\n \n#Creating an index for each word in our vocab.\nindex_dict = {} #Dictionary to store index for each word\ni = 0\nfor word in word_set:\n    index_dict[word] = i\n    i += 1\n\n#Create a count dictionary\n \ndef count_dict(sentences):\n    word_count = {}\n    for word in tqdm(word_set):\n        word_count[word] = 0\n        for sent in sentences:\n            if word in sent:\n                word_count[word] += 1\n    return word_count\n \nword_count = count_dict(sentences)\n\ndef termfreq(document, word):\n    N = len(document)\n    occurance = len([token fo

In [None]:
# Inference with TF-IDF (older, inline version of the query logic).
# Each word in the query has a value for every document; add up these values over all words in the query, then sort the documents by that total.
# -> convert the query to a vector, computing TF from the query itself and DF from all documents.

"""
query = "eindhoven cultural heritage"


Q = np.zeros((17020)) # number of unique words
tokens = query.split()
counter = Counter(tokens)
words_count = len(tokens)
query_weights = {}
for token in np.unique(tokens):
    if token not in uniqueWords: #cannot calc for word that does not exist
        print(f"word {token} does not exist in vocabulary")
        continue

    tf = counter[token]/words_count
    idf = invDocFreq.get(token, avgDocFreq)
    tfIdf = tf*idf
    #find idx of word in the vector
    idx = uniqueWordsIndexDict.get(token, None)
    if (idx is None):
        continue

    Q[idx] = tfIdf  

#do cosine sim between Q and all vectors in tfIdfMatrix

res[res[:, 1].argsort()[::-1]]

#now do cosine sim between all vectors and Q
res = []
for idx, doc in enumerate(tfIdfMatrix):
    cosineSim = np.dot(doc,Q)/(np.linalg.norm(doc)*np.linalg.norm(Q))
    res.append((idx, cosineSim))
res = np.array(res)
# arr = tfIdfMatrix.flatten()
"""


'\nquery = "eindhoven cultural heritage"\n\n\nQ = np.zeros((17020)) # number of unique words\ntokens = query.split()\ncounter = Counter(tokens)\nwords_count = len(tokens)\nquery_weights = {}\nfor token in np.unique(tokens):\n    if token not in uniqueWords: #cannot calc for word that does not exist\n        print(f"word {token} does not exist in vocabulary")\n        continue\n\n    tf = counter[token]/words_count\n    idf = invDocFreq.get(token, avgDocFreq)\n    tfIdf = tf*idf\n    #find idx of word in the vector\n    idx = uniqueWordsIndexDict.get(token, None)\n    if (idx is None):\n        continue\n\n    Q[idx] = tfIdf  \n\n#do cosine sim between Q and all vectors in tfIdfMatrix\n\nres[res[:, 1].argsort()[::-1]]\n\n#now do cosine sim between all vectors and Q\nres = []\nfor idx, doc in enumerate(tfIdfMatrix):\n    cosineSim = np.dot(doc,Q)/(np.linalg.norm(doc)*np.linalg.norm(Q))\n    res.append((idx, cosineSim))\nres = np.array(res)\n# arr = tfIdfMatrix.flatten()\n'