In [1]:
import os
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer 

from natsort import natsorted

import pandas as pd
import numpy as np

import math
# import nltk
# nltk.download('punkt')  # Add this line for word_tokenize
# from sklearn.feature_extraction.text import TfidfVectorizer



# Read Files

In [29]:
def read_files(file, directory='Articles', encoding=None):
    """Return the text of a ``.txt`` file, or ``None`` for any other entry.

    Parameters
    ----------
    file : str
        File name (not a path) inside ``directory``.
    directory : str, optional
        Folder that holds the articles. Defaults to ``'Articles'``.
    encoding : str or None, optional
        Encoding passed to ``open``; ``None`` keeps the platform default.

    Returns
    -------
    str or None
        The whole file content, or ``None`` when ``file`` does not end in
        ``.txt`` (callers must be ready to skip ``None``).
    """
    # Test the real extension: a substring check such as `'txt' in file`
    # would also accept names like 'txt_notes.csv'.
    if not file.endswith('.txt'):
        return None
    with open(os.path.join(directory, file), 'r', encoding=encoding) as f:
        return f.read()



In [31]:
# Load the raw text of every article, in natural file-name order so the
# result does not depend on the arbitrary order returned by os.listdir.
documents = []
for file in natsorted(os.listdir('Articles')):
    content = read_files(file)
    # read_files returns None for entries it does not treat as text files;
    # skip those so `documents` only ever contains strings.
    if content is not None:
        documents.append(content)

len(documents)

10

In [4]:
def preprocessing(doc):
    """Tokenize ``doc``, Porter-stem every token and return them space-joined.

    Parameters
    ----------
    doc : str
        Raw document text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces.
    """
    stemmer = PorterStemmer()

    stems = []
    for raw_token in word_tokenize(doc):
        stems.append(stemmer.stem(raw_token))

    # Callers expect one string and split it again when they need a list.
    return " ".join(stems)

In [34]:
# Positional index: term -> [total occurrences, {doc_id: [positions]}].
pos_index = {}

# Natural sort keeps '10.txt' after '9.txt'. Only .txt entries are indexed so
# that stray directory entries (e.g. `.ipynb_checkpoints`) are never opened.
file_names = natsorted(
    [name for name in os.listdir("Articles") if name.endswith('.txt')]
)
print(file_names)

# Document IDs are 1-based and follow the natural file order.
for fileno, file_name in enumerate(file_names, start=1):
    # Read file content
    with open(os.path.join('Articles', file_name), 'r') as f:
        stuff = f.read()

    # preprocessing returns one space-joined string of stemmed tokens;
    # split it back into a list of terms.
    final_token_list = preprocessing(stuff)
    terms = final_token_list.split()

    # enumerate yields (position, term) pairs for this document.
    for pos, term in enumerate(terms):
        # The first sighting of a term creates [0, {}]; later sightings
        # reuse the same entry.
        entry = pos_index.setdefault(term, [0, {}])
        # Total frequency across the whole collection.
        entry[0] += 1
        # Record the position under this document's postings list.
        entry[1].setdefault(fileno, []).append(pos)




['1.txt', '2.txt', '3.txt', '4.txt', '5.txt', '6.txt', '7.txt', '8.txt', '9.txt', '10.txt']


In [35]:
# Show the positional index: term -> [total occurrences, {doc_id: [positions]}].
pos_index

{'antoni': [3, {1: [0], 2: [0], 6: [0]}],
 'brutu': [3, {1: [1], 2: [1], 4: [0]}],
 'caeser': [5, {1: [2], 2: [2], 4: [1], 5: [0], 6: [1]}],
 'cleopatra': [1, {1: [3]}],
 'merci': [5, {1: [4], 3: [0], 4: [2], 5: [1], 6: [2]}],
 'worser': [4, {1: [5], 3: [1], 4: [3], 5: [2]}],
 'calpurnia': [1, {2: [3]}],
 'angel': [3, {7: [0], 8: [0], 9: [0]}],
 'fool': [4, {7: [1], 8: [1], 9: [1], 10: [0]}],
 'fear': [3, {7: [2], 8: [2], 10: [1]}],
 'in': [4, {7: [3], 8: [3], 9: [2], 10: [2]}],
 'rush': [4, {7: [4], 8: [4], 9: [3], 10: [3]}],
 'to': [4, {7: [5], 8: [5], 9: [4], 10: [4]}],
 'tread': [4, {7: [6], 8: [6], 9: [5], 10: [5]}],
 'where': [4, {7: [7], 8: [7], 9: [6], 10: [6]}]}

In [36]:
# Rebuild `documents` as preprocessed (stemmed) text, one entry per article.
# NOTE: the collection is assumed to be exactly 1.txt .. 10.txt.
files = os.listdir('Articles')
documents = [
    preprocessing(read_files(f'{doc_no}.txt'))
    for doc_no in range(1, 11)
]
print(documents)



['antoni brutu caeser cleopatra merci worser', 'antoni brutu caeser calpurnia', 'merci worser', 'brutu caeser merci worser', 'caeser merci worser', 'antoni caeser merci', 'angel fool fear in rush to tread where', 'angel fool fear in rush to tread where', 'angel fool in rush to tread where', 'fool fear in rush to tread where']


In [37]:

# Vocabulary: every distinct stemmed term that occurs in any document.
all_terms = {term for doc in documents for term in doc.split()}
print(all_terms)



{'brutu', 'where', 'cleopatra', 'tread', 'merci', 'caeser', 'worser', 'calpurnia', 'fool', 'rush', 'antoni', 'to', 'fear', 'in', 'angel'}


In [40]:
def get_tf(document):
    """Count how often each vocabulary term occurs in ``document``.

    Parameters
    ----------
    document : str
        Whitespace-separated terms; every term must be in ``all_terms``
        (an unknown term raises ``KeyError``).

    Returns
    -------
    dict
        One entry per term of the global ``all_terms``, mapped to its raw
        count in ``document`` (0 when absent).
    """
    counts = {term: 0 for term in all_terms}
    for token in document.split():
        counts[token] = counts[token] + 1
    return counts
# print(get_tf(documents[0]).keys())
# print(get_tf(documents[0]).values())

In [10]:
# Term-frequency matrix: one row per vocabulary term, one column per document
# ('doc1', 'doc2', ...). Column names are derived from the number of documents
# instead of a fixed 1..10 range, so the cell works for any corpus size.
doc_term_counts = [get_tf(document) for document in documents]
tf = pd.DataFrame(
    {
        'doc' + str(doc_no): list(counts.values())
        for doc_no, counts in enumerate(doc_term_counts, start=1)
    },
    # Every get_tf result is keyed in the same vocabulary order, so the
    # first one supplies the row labels for all columns.
    index=list(doc_term_counts[0].keys()) if doc_term_counts else [],
)


In [11]:
# Show the raw term-frequency matrix (rows: terms, columns: documents).
tf

Unnamed: 0,doc1,doc2,doc3,doc4,doc5,doc6,doc7,doc8,doc9,doc10
brutu,1,1,0,1,0,0,0,0,0,0
where,0,0,0,0,0,0,1,1,1,1
cleopatra,1,0,0,0,0,0,0,0,0,0
tread,0,0,0,0,0,0,1,1,1,1
merci,1,0,1,1,1,1,0,0,0,0
caeser,1,1,0,1,1,1,0,0,0,0
worser,1,0,1,1,1,0,0,0,0,0
calpurnia,0,1,0,0,0,0,0,0,0,0
fool,0,0,0,0,0,0,1,1,1,1
rush,0,0,0,0,0,0,1,1,1,1


In [12]:
def weighted_tf(x):
    """Return the log-weighted term frequency ``1 + log10(x)``.

    Base 10 matches the idf computation and the query-side weighting used
    elsewhere in this notebook, so document and query weights stay on the
    same scale when a term occurs more than once.

    Parameters
    ----------
    x : int or float
        Raw term frequency.

    Returns
    -------
    float or int
        ``1 + log10(x)`` for ``x > 0``, otherwise ``0``.
    """
    if x > 0:
        return 1 + math.log10(x)
    return 0


In [13]:
# Apply the tf weighting to every document column, leaving `tf` untouched.
doc_columns = ['doc' + str(doc_no) for doc_no in range(1, len(documents) + 1)]
w_tf = tf.assign(
    **{name: tf[name].apply(weighted_tf) for name in doc_columns}
)

In [14]:
# Show the weighted term-frequency matrix produced by weighted_tf.
w_tf

Unnamed: 0,doc1,doc2,doc3,doc4,doc5,doc6,doc7,doc8,doc9,doc10
brutu,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
where,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
cleopatra,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tread,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
merci,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
caeser,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
worser,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
calpurnia,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fool,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
rush,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0


In [15]:
# Document frequency and inverse document frequency per term.
#   df  = number of documents that contain the term (NOT the sum of the
#         weighted frequencies, which only coincides while every tf is 0 or 1)
#   idf = log10(N / df), with N taken from the number of document columns.
n_docs = w_tf.shape[1]
doc_freq = (w_tf > 0).sum(axis=1)
tdf = pd.DataFrame(
    {'df': doc_freq, 'idf': np.log10(n_docs / doc_freq)},
    index=w_tf.index,
)


In [16]:
# Show the per-term `df` and `idf` columns.
tdf

Unnamed: 0,df,idf
brutu,3,0.522879
where,4,0.39794
cleopatra,1,1.0
tread,4,0.39794
merci,5,0.30103
caeser,5,0.30103
worser,4,0.39794
calpurnia,1,1.0
fool,4,0.39794
rush,4,0.39794


In [17]:
# Scale each term row of the weighted tf matrix by that term's idf
# (rows are aligned on the term index).
tf_idf = w_tf.mul(tdf['idf'], axis='index')
tf_idf

Unnamed: 0,doc1,doc2,doc3,doc4,doc5,doc6,doc7,doc8,doc9,doc10
brutu,0.522879,0.522879,0.0,0.522879,0.0,0.0,0.0,0.0,0.0,0.0
where,0.0,0.0,0.0,0.0,0.0,0.0,0.39794,0.39794,0.39794,0.39794
cleopatra,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tread,0.0,0.0,0.0,0.0,0.0,0.0,0.39794,0.39794,0.39794,0.39794
merci,0.30103,0.0,0.30103,0.30103,0.30103,0.30103,0.0,0.0,0.0,0.0
caeser,0.30103,0.30103,0.0,0.30103,0.30103,0.30103,0.0,0.0,0.0,0.0
worser,0.39794,0.0,0.39794,0.39794,0.39794,0.0,0.0,0.0,0.0,0.0
calpurnia,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fool,0.0,0.0,0.0,0.0,0.0,0.0,0.39794,0.39794,0.39794,0.39794
rush,0.0,0.0,0.0,0.0,0.0,0.0,0.39794,0.39794,0.39794,0.39794


In [18]:
def get_doc_len(col):
    """Return the Euclidean length of column ``col`` of the global ``tf_idf``.

    Parameters
    ----------
    col : str
        Document column name in ``tf_idf`` (e.g. ``'doc1'``).

    Returns
    -------
    float
        Square root of the sum of the squared tf-idf weights in that column.
    """
    squared_weights = tf_idf[col].apply(lambda weight: weight ** 2)
    return np.sqrt(squared_weights.sum())

# One-row table holding the vector length of every document, with columns
# named '<doc>_length'. Built in a single constructor call instead of
# enlarging an empty frame one cell at a time.
doc_len = pd.DataFrame(
    {col + '_length': [get_doc_len(col)] for col in tf_idf.columns}
)


In [19]:
# Show the single-row table of document vector lengths.
doc_len

Unnamed: 0,doc1_length,doc2_length,doc3_length,doc4_length,doc5_length,doc6_length,doc7_length,doc8_length,doc9_length,doc10_length
0,1.373462,1.279618,0.498974,0.782941,0.582747,0.67427,1.223496,1.223496,1.106137,1.106137


In [20]:
# Spot check: pull the scalar length of doc2 out of the one-row table.
doc_len['doc2_length'].values[0]

1.2796184676775093

In [21]:
def get_norm_tf_idf(col, x):
    """Divide a tf-idf weight by the vector length of its document.

    Parameters
    ----------
    col : str
        Document column name; its length is read from the global ``doc_len``
        under ``col + '_length'``.
    x : float
        A tf-idf weight taken from that document's column.

    Returns
    -------
    float or int
        ``x / length``, or ``0`` when the document has no recorded length or
        its length is zero.
    """
    try:
        length = doc_len[col + '_length'].values[0]
    except (KeyError, IndexError):
        # No length recorded for this document: keep the best-effort 0.
        return 0
    # numpy division by zero yields NaN/inf instead of raising, so the
    # zero-length case has to be checked explicitly.
    if not length:
        return 0
    return x / length

In [22]:
# Length-normalise every document column of the tf-idf matrix.
norm_tf_idf = pd.DataFrame(
    {
        doc_col: tf_idf[doc_col].apply(
            lambda weight, name=doc_col: get_norm_tf_idf(name, weight)
        )
        for doc_col in tf_idf.columns
    }
)


In [27]:
# Show the length-normalised tf-idf matrix (rows: terms, columns: documents).
norm_tf_idf

Unnamed: 0,doc1,doc2,doc3,doc4,doc5,doc6,doc7,doc8,doc9,doc10
brutu,0.380701,0.408621,0.0,0.667839,0.0,0.0,0.0,0.0,0.0,0.0
where,0.0,0.0,0.0,0.0,0.0,0.0,0.325248,0.325248,0.359756,0.359756
cleopatra,0.728087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tread,0.0,0.0,0.0,0.0,0.0,0.0,0.325248,0.325248,0.359756,0.359756
merci,0.219176,0.0,0.603298,0.384486,0.51657,0.446453,0.0,0.0,0.0,0.0
caeser,0.219176,0.23525,0.0,0.384486,0.51657,0.446453,0.0,0.0,0.0,0.0
worser,0.289735,0.0,0.797516,0.508263,0.682869,0.0,0.0,0.0,0.0,0.0
calpurnia,0.0,0.781483,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fool,0.0,0.0,0.0,0.0,0.0,0.0,0.325248,0.325248,0.359756,0.359756
rush,0.0,0.0,0.0,0.0,0.0,0.0,0.325248,0.325248,0.359756,0.359756


In [26]:

# vectorizer = TfidfVectorizer()
# X = vectorizer.fit_transform(documents)
# X = X.T.toarray()
# df = pd.DataFrame(X, index=vectorizer.get_feature_names())


In [43]:
query = input("Please enter the query:")

# Tokenize and apply stemming, mirroring how the documents were preprocessed.
tokenized_query = word_tokenize(query)
porter = PorterStemmer()

stemmed_query = [porter.stem(word) for word in tokenized_query]
# Join the stemmed tokens to form the new query
stemmed_query_str = ' '.join(stemmed_query)

# Distinct query terms in first-seen order. De-duplicating keeps `.loc` from
# returning the same row twice, which would inflate the printed similarity
# and query length when a term is repeated in the query.
query_terms = list(dict.fromkeys(stemmed_query_str.split()))


def get_wtf(x):
    """Return the log-weighted frequency ``1 + log10(x)``, or 0 when x <= 0."""
    if x > 0:
        return 1 + math.log10(x)
    return 0


# Terms missing from the vocabulary cannot be scored. Detect them up front
# rather than letting a blanket `except` report every failure as a bad query.
unknown_terms = [term for term in query_terms if term not in norm_tf_idf.index]

if not query_terms or unknown_terms:
    print("The query is doesn't exist")
else:
    query_term_set = set(query_terms)

    query_df = pd.DataFrame(index=norm_tf_idf.index)
    # Binary presence of each vocabulary term in the query.
    query_df['tf'] = [1 if term in query_term_set else 0 for term in norm_tf_idf.index]
    query_df['w_tf'] = query_df['tf'].apply(get_wtf)
    # Keep only the rows of the document matrix that belong to query terms
    # (w_tf is 1 for query terms and 0 otherwise, so it acts as a mask).
    product = norm_tf_idf.multiply(query_df['w_tf'], axis=0)
    query_df['idf'] = tdf['idf'] * query_df['w_tf']
    query_df['tf*idf'] = query_df['tf'] * query_df['idf']
    # Length-normalise the query vector.
    query_df['normalized'] = query_df['idf'] / np.sqrt((query_df['idf'] ** 2).sum())

    filtered_query_df = query_df[query_df['tf'] != 0]
    print(filtered_query_df)

    # Per-term contribution to the cosine similarity for every document.
    product2 = product.multiply(query_df['normalized'], axis=0)
    scores = {}
    for col in product2.columns:
        # A zero contribution means the document lacks that query term;
        # only documents containing every query term are scored.
        if 0 not in product2[col].loc[query_terms].values:
            scores[col] = product2[col].sum()
    prod_res = product2[list(scores.keys())].loc[query_terms]
    similarity = prod_res.sum()

    print()
    print(prod_res)
    print(f'\n{similarity}\n')
    print(f'The similarity between "{query}" and the documents is :')
    print(f'\n{similarity}\n')
    print(f"\nThe query length = {math.sqrt(sum(x ** 2 for x in query_df['idf'].loc[query_terms]))}")

    # Rank matching documents by descending similarity.
    final_score = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    print(f"\n returned docs '{query}' are :\n")
    for doc in final_score:
        print(doc[0], end=' ')
    print()
    print()

Please enter the query:antony brutus
        tf  w_tf       idf    tf*idf normalized
brutu    1   1.0  0.522879  0.522879   0.707107
antoni   1   1.0  0.522879  0.522879   0.707107

            doc1      doc2
antoni  0.269196  0.288939
brutu   0.269196  0.288939

doc1    0.538393
doc2    0.577877
dtype: float64

The similarity between "antony brutus" and the documents is :

doc1    0.538393
doc2    0.577877
dtype: float64


The query length = 0.7394622130520805

 returned docs 'antony brutus' are :

doc2 doc1 

