## Libraries

In [1]:
import os
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer  # Add this line

from natsort import natsorted

import pandas as pd
import numpy as np

import math
import nltk
nltk.download('punkt')  # Add this line for word_tokenize
nltk.download('stopwords')
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Read files

In [24]:
def read_files(file):
    """Return the text of one article stored in the ``Articles`` directory.

    Parameters
    ----------
    file : str
        File name relative to ``Articles`` (for example ``'1.txt'``).

    Returns
    -------
    str or None
        The full file contents, or ``None`` when ``file`` does not have a
        ``.txt`` extension.
    """
    # Match on the extension; a plain substring test would also accept names
    # such as 'txt_notes.md'.
    if not file.endswith('.txt'):
        return None
    with open(os.path.join('Articles', file), 'r') as f:
        return f.read()

In [3]:
# Load the raw text of every .txt article. Other directory entries are skipped
# (read_files returns None for them), and natural sorting keeps '10.txt' after
# '9.txt' so the list order does not depend on os.listdir.
documents = [
    read_files(file_name)
    for file_name in natsorted(os.listdir('Articles'))
    if file_name.endswith('.txt')
]

In [4]:
len(documents)

10

# First Phase $:-$

## Apply tokenization

In [5]:
# token_docs = []
# for document in documents:
#     token_docs.append(word_tokenize(document))
# token_docs

[['antony', 'brutus', 'caeser', 'cleopatra', 'mercy', 'worser'],
 ['fools', 'fear', 'in', 'rush', 'to', 'tread', 'where'],
 ['antony', 'brutus', 'caeser', 'calpurnia'],
 ['mercy', 'worser'],
 ['brutus', 'caeser', 'mercy', 'worser'],
 ['caeser', 'mercy', 'worser'],
 ['antony', 'caeser', 'mercy'],
 ['angels', 'fools', 'fear', 'in', 'rush', 'to', 'tread', 'where'],
 ['angels', 'fools', 'fear', 'in', 'rush', 'to', 'tread', 'where'],
 ['angels', 'fools', 'in', 'rush', 'to', 'tread', 'where']]

## Stemming

In [6]:
# porter = PorterStemmer()

# # Now, use the porter instance to stem the tokens
# stemmed_docs = []
# for token_list in token_docs:
#     stemmed_tokens = [porter.stem(token) for token in token_list]
#     stemmed_docs.append(stemmed_tokens)
# stemmed_docs

[['antoni', 'brutu', 'caeser', 'cleopatra', 'merci', 'worser'],
 ['fool', 'fear', 'in', 'rush', 'to', 'tread', 'where'],
 ['antoni', 'brutu', 'caeser', 'calpurnia'],
 ['merci', 'worser'],
 ['brutu', 'caeser', 'merci', 'worser'],
 ['caeser', 'merci', 'worser'],
 ['antoni', 'caeser', 'merci'],
 ['angel', 'fool', 'fear', 'in', 'rush', 'to', 'tread', 'where'],
 ['angel', 'fool', 'fear', 'in', 'rush', 'to', 'tread', 'where'],
 ['angel', 'fool', 'in', 'rush', 'to', 'tread', 'where']]

# Second phase $:-$

### Implement function to do all steps in first phase

In [2]:
def preprocessing(doc):
    """Tokenize ``doc`` and Porter-stem every token.

    Parameters
    ----------
    doc : str
        Raw document text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces, in their original order.
    """
    stemmer = PorterStemmer()
    return " ".join(stemmer.stem(word) for word in word_tokenize(doc))

In [6]:
# for file_name in file_names:

#     # Read file contents.
#     with open(f'Articles/{file_name}', 'r') as f:
#         stuff = f.read()
# preprocessing(stuff)

'fool fear in rush to tread where'

## apply positional index

In [18]:
# Positional index: term -> [total frequency, {doc_id: [positions]}].
pos_index = {}

# Natural sort so that '10.txt' comes after '9.txt'; only .txt articles are indexed.
file_names = natsorted(
    [name for name in os.listdir("Articles") if name.endswith('.txt')]
)
print(file_names)

# Document IDs start at 1 and follow the order of file_names.
for fileno, file_name in enumerate(file_names, start=1):
    # Read file contents.
    with open(f'Articles/{file_name}', 'r') as f:
        stuff = f.read()

    # preprocessing() tokenizes and stems the text and returns the tokens as one
    # space-joined string (it does not strip punctuation). Token order is kept
    # because the index stores positions.
    final_token_list = preprocessing(stuff)

    # Split the string back into an ordered list of terms.
    terms = final_token_list.split()

    for pos, term in enumerate(terms):
        # First encounter of a term creates [0, {}]; the count is then bumped.
        entry = pos_index.setdefault(term, [0, {}])
        entry[0] += 1
        # Record this position under the current document ID.
        entry[1].setdefault(fileno, []).append(pos)


['1.txt', '2.txt', '3.txt', '4.txt', '5.txt', '6.txt', '7.txt', '8.txt', '9.txt', '10.txt']
antoni
brutu
caeser
cleopatra
merci
worser
antoni
brutu
caeser
calpurnia
merci
worser
brutu
caeser
merci
worser
caeser
merci
worser
antoni
caeser
merci
angel
fool
fear
in
rush
to
tread
where
angel
fool
fear
in
rush
to
tread
where
angel
fool
in
rush
to
tread
where
fool
fear
in
rush
to
tread
where


### displays each term 

In [19]:
pos_index

{'antoni': [3, {1: [0], 2: [0], 6: [0]}],
 'brutu': [3, {1: [1], 2: [1], 4: [0]}],
 'caeser': [5, {1: [2], 2: [2], 4: [1], 5: [0], 6: [1]}],
 'cleopatra': [1, {1: [3]}],
 'merci': [5, {1: [4], 3: [0], 4: [2], 5: [1], 6: [2]}],
 'worser': [4, {1: [5], 3: [1], 4: [3], 5: [2]}],
 'calpurnia': [1, {2: [3]}],
 'angel': [3, {7: [0], 8: [0], 9: [0]}],
 'fool': [4, {7: [1], 8: [1], 9: [1], 10: [0]}],
 'fear': [3, {7: [2], 8: [2], 10: [1]}],
 'in': [4, {7: [3], 8: [3], 9: [2], 10: [2]}],
 'rush': [4, {7: [4], 8: [4], 9: [3], 10: [3]}],
 'to': [4, {7: [5], 8: [5], 9: [4], 10: [4]}],
 'tread': [4, {7: [6], 8: [6], 9: [5], 10: [5]}],
 'where': [4, {7: [7], 8: [7], 9: [6], 10: [6]}]}

### Allow users to write phrase query 

In [20]:
q = 'angel fools'

# Third phase $:-$

In [138]:
def put_query(q):
    """Return the documents that contain the query terms as a consecutive phrase.

    The lookup uses the module-level ``pos_index`` mapping
    ``term -> [total_frequency, {doc_id: [positions]}]``. Query terms are
    compared to the index keys as-is, so they must already be in the same
    (stemmed) form as the indexed terms.

    Parameters
    ----------
    q : str
        Whitespace-separated query terms.

    Returns
    -------
    list of str
        ``'document <id>'`` labels in ascending document-ID order. Empty when
        the query is empty, a term is not indexed, or no document contains the
        phrase.
    """
    terms = q.split()
    if not terms or any(term not in pos_index for term in terms):
        return []

    postings = [pos_index[term][1] for term in terms]

    # Only documents that contain every query term can contain the phrase.
    candidates = set(postings[0])
    for posting in postings[1:]:
        candidates.intersection_update(posting)

    positions = []
    for doc_id in sorted(candidates):
        following = [set(posting[doc_id]) for posting in postings[1:]]
        # Try every occurrence of the first term, not only the first one:
        # the k-th following term must sit exactly k positions after it.
        for start in postings[0][doc_id]:
            if all(start + offset in occurrences
                   for offset, occurrences in enumerate(following, start=1)):
                positions.append('document ' + str(doc_id))
                break
    return positions

In [139]:
q='angel fool'
put_query(q)

['document 7', 'document 8', 'document 9']

In [140]:
# Preprocess every article in natural file order ('1.txt', ..., '10.txt') so that
# documents[i] corresponds to document i + 1. The file list is read from the
# directory instead of assuming exactly ten files.
files = natsorted([name for name in os.listdir('Articles') if name.endswith('.txt')])
documents = [preprocessing(read_files(name)) for name in files]
print(documents)

['antoni brutu caeser cleopatra merci worser', 'antoni brutu caeser calpurnia', 'merci worser', 'brutu caeser merci worser', 'caeser merci worser', 'antoni caeser merci', 'angel fool fear in rush to tread where', 'angel fool fear in rush to tread where', 'angel fool in rush to tread where', 'fool fear in rush to tread where']


In [67]:
# Vocabulary: the set of distinct terms found across all preprocessed documents.
all_terms = {term for doc in documents for term in doc.split()}
print(all_terms)

{'calpurnia', 'where', 'fear', 'worser', 'in', 'rush', 'to', 'brutu', 'caeser', 'antoni', 'tread', 'merci', 'angel', 'cleopatra', 'fool'}


In [142]:
# # Initialize the dictionary.
# tf = pd.DataFrame()

# # For every file.
# for file_name in file_names:
#     # Read file contents.
#     with open(f'Articles/{file_name}', 'r') as f:
#         stuff = f.read()

#     # This is the list of words in order of the text.
#     # We need to preserve the order because we require positions.
#     # 'preprocessing' function does some basic punctuation removal,
#     final_token_list = preprocessing(stuff)

#     # Split the string into a list of terms
#     terms = final_token_list.split()

#     # Initialize a dictionary to store the term frequency for the current document
#     doc_term_freq = dict.fromkeys(terms, 0)

#     # For position and term in the tokens.
#     for term in terms:
#         doc_term_freq[term] += 1

#     # Append the term frequency for the current document to the DataFrame
#     tf[file_name] = pd.Series(doc_term_freq)

# # Display the term frequency DataFrame
# print("Term frequency for each term in each document:")
# print(tf)


## Term Frequency
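$$ tf = \frac{\text{number of times the term appears in a document}}{\text{total number of words in the document}} $$

Note: `get_tf` below returns only the raw count (the numerator); it does not divide by the document length.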

In [147]:
def get_tf(document):
    """Count how many times each vocabulary term occurs in ``document``.

    Parameters
    ----------
    document : str
        Whitespace-separated terms.

    Returns
    -------
    dict
        One entry per term in the module-level ``all_terms`` (in its iteration
        order) mapped to its raw count in ``document``; absent terms map to 0.

    Raises
    ------
    KeyError
        If ``document`` contains a term that is not in ``all_terms``.
    """
    counts = {term: 0 for term in all_terms}
    for token in document.split():
        counts[token] = counts[token] + 1
    return counts
print(get_tf(documents[0]).values())
print(get_tf(documents[0]).keys())

dict_values([0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0])
dict_keys(['calpurnia', 'where', 'fear', 'worser', 'in', 'rush', 'to', 'brutu', 'caeser', 'antoni', 'tread', 'merci', 'angel', 'cleopatra', 'fool'])


In [150]:
# Raw term-frequency matrix: one row per vocabulary term, one column per document
# ('doc1', 'doc2', ...). Column labels are derived from the number of documents
# instead of assuming there are exactly ten.
vocabulary = list(get_tf(documents[0]).keys())
tf = pd.DataFrame(
    {
        'doc' + str(doc_number): list(get_tf(document).values())
        for doc_number, document in enumerate(documents, start=1)
    },
    index=vocabulary,
)

In [151]:
tf

Unnamed: 0,doc1,doc2,doc3,doc4,doc5,doc6,doc7,doc8,doc9,doc10
calpurnia,0,1,0,0,0,0,0,0,0,0
where,0,0,0,0,0,0,1,1,1,1
fear,0,0,0,0,0,0,1,1,0,1
worser,1,0,1,1,1,0,0,0,0,0
in,0,0,0,0,0,0,1,1,1,1
rush,0,0,0,0,0,0,1,1,1,1
to,0,0,0,0,0,0,1,1,1,1
brutu,1,1,0,1,0,0,0,0,0,0
caeser,1,1,0,1,1,1,0,0,0,0
antoni,1,1,0,0,0,1,0,0,0,0


## Weighted tf(1+ log tf)

In [152]:
def weighted_tf(x):
    """Return the log-weighted term frequency ``1 + ln(x)``.

    The natural logarithm (``math.log``) is used. Counts that are zero or
    negative yield 0.
    """
    return math.log(x) + 1 if x > 0 else 0

In [153]:
# Log-weighted copy of the raw term-frequency matrix, filled one document column at a time.
w_tf = tf.copy()
for doc_number in range(1, len(documents) + 1):
    column = 'doc' + str(doc_number)
    w_tf[column] = tf[column].apply(weighted_tf)

In [154]:
w_tf

Unnamed: 0,doc1,doc2,doc3,doc4,doc5,doc6,doc7,doc8,doc9,doc10
calpurnia,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
where,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
fear,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
worser,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
in,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
rush,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
to,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
brutu,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
caeser,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
antoni,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


## Inverse Document Frequency
$$ idf = \log_{10} \frac{\text{number of documents in the corpus}}{\text{number of documents in the corpus that contain the term}} $$

In [155]:
# Document frequency (df): the number of documents in which a term occurs at least
# once. It is counted from the raw tf matrix; summing the log-weighted values
# would over-count any term that repeats inside a document.
n_docs = tf.shape[1]
doc_freq = (tf > 0).sum(axis=1)

# idf = log10(N / df), with N taken from the matrix instead of a hard-coded 10.
# Built in one step so both columns keep numeric dtypes; the index is the term index of tf.
tdf = pd.DataFrame({'df': doc_freq, 'idf': np.log10(n_docs / doc_freq)})

In [156]:
tdf

Unnamed: 0,df,idf
calpurnia,1,1.0
where,4,0.39794
fear,3,0.522879
worser,4,0.39794
in,4,0.39794
rush,4,0.39794
to,4,0.39794
brutu,3,0.522879
caeser,5,0.30103
antoni,3,0.522879


## TF.IDF

In [76]:
tf_idf = w_tf.multiply(tdf['idf'], axis=0)

In [77]:
tf_idf

Unnamed: 0,doc1,doc2,doc3,doc4,doc5,doc6,doc7,doc8,doc9,doc10
calpurnia,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
where,0.0,0.0,0.0,0.0,0.0,0.0,0.39794,0.39794,0.39794,0.39794
fear,0.0,0.0,0.0,0.0,0.0,0.0,0.522879,0.522879,0.0,0.522879
worser,0.39794,0.0,0.39794,0.39794,0.39794,0.0,0.0,0.0,0.0,0.0
in,0.0,0.0,0.0,0.0,0.0,0.0,0.39794,0.39794,0.39794,0.39794
rush,0.0,0.0,0.0,0.0,0.0,0.0,0.39794,0.39794,0.39794,0.39794
to,0.0,0.0,0.0,0.0,0.0,0.0,0.39794,0.39794,0.39794,0.39794
brutu,0.522879,0.522879,0.0,0.522879,0.0,0.0,0.0,0.0,0.0,0.0
caeser,0.30103,0.30103,0.0,0.30103,0.30103,0.30103,0.0,0.0,0.0,0.0
antoni,0.522879,0.522879,0.0,0.0,0.0,0.522879,0.0,0.0,0.0,0.0


# Allow users to write phrase query

In [169]:
def put_query(q):
    """Return the documents that contain the query terms as a consecutive phrase.

    The lookup uses the module-level ``pos_index`` mapping
    ``term -> [total_frequency, {doc_id: [positions]}]``. Query terms are
    compared to the index keys as-is, so they must already be in the same
    (stemmed) form as the indexed terms.

    Parameters
    ----------
    q : str
        Whitespace-separated query terms.

    Returns
    -------
    list of str
        ``'document <id>'`` labels in ascending document-ID order. Empty when
        the query is empty, a term is not indexed, or no document contains the
        phrase.
    """
    terms = q.split()
    if not terms or any(term not in pos_index for term in terms):
        return []

    postings = [pos_index[term][1] for term in terms]

    # Only documents that contain every query term can contain the phrase.
    candidates = set(postings[0])
    for posting in postings[1:]:
        candidates.intersection_update(posting)

    positions = []
    for doc_id in sorted(candidates):
        following = [set(posting[doc_id]) for posting in postings[1:]]
        # Try every occurrence of the first term, not only the first one:
        # the k-th following term must sit exactly k positions after it.
        for start in postings[0][doc_id]:
            if all(start + offset in occurrences
                   for offset, occurrences in enumerate(following, start=1)):
                positions.append('document ' + str(doc_id))
                break
    return positions

# Example usage
query = 'fool'
matched_documents = put_query(query)
print("Matched Documents for the Query:")
print(matched_documents)


Matched Documents for the Query:
['document 7', 'document 8', 'document 9', 'document 10']


## Document length

In [78]:
def get_doc_len(col):
    """Return the Euclidean length of column ``col`` of the module-level ``tf_idf`` frame.

    This is the square root of the sum of the squared entries of that column.
    """
    squared = tf_idf[col].apply(lambda weight: weight**2)
    return np.sqrt(squared.sum())

In [79]:
# Single-row table of vector lengths: row label 0, one '<column>_length' column
# for every column of tf_idf, each filled with get_doc_len of that column.
doc_len = pd.DataFrame()
for col in tf_idf.columns:
    # Assigning through .loc on a missing label/column creates it.
    doc_len.loc[0, col+'_length']= get_doc_len(col)


In [80]:
doc_len

Unnamed: 0,doc1_length,doc2_length,doc3_length,doc4_length,doc5_length,doc6_length,doc7_length,doc8_length,doc9_length,doc10_length
0,1.373462,1.279618,0.498974,0.782941,0.582747,0.67427,1.223496,1.223496,1.106137,1.106137


In [81]:
doc_len['doc1_length'].values[0]

1.3734623153231016

## Normalized TF.IDF

In [82]:
def get_norm_tf_idf(col, x):
    """Divide one tf-idf weight by the vector length of its document.

    Parameters
    ----------
    col : str
        Document column name; the length is read from the ``col + '_length'``
        column of the module-level ``doc_len`` frame.
    x : float
        The tf-idf weight to normalize.

    Returns
    -------
    float or int
        ``x`` divided by the document length, or 0 when the length is zero,
        the length column is missing, or the division is not possible.
    """
    try:
        length = doc_len[col + '_length'].values[0]
        # A zero-length document has no direction; report 0 rather than inf/nan.
        if length == 0:
            return 0
        return x / length
    except (KeyError, IndexError, TypeError, ZeroDivisionError):
        # Best-effort fallback kept from the original, but limited to the lookup
        # and arithmetic failures so KeyboardInterrupt etc. still propagate.
        return 0

In [83]:
# Length-normalized tf-idf: each weight divided by the vector length of its document column.
norm_tf_idf = pd.DataFrame(
    {
        col: tf_idf[col].apply(lambda x: get_norm_tf_idf(col, x))
        for col in tf_idf.columns
    }
)

In [84]:
norm_tf_idf

Unnamed: 0,doc1,doc2,doc3,doc4,doc5,doc6,doc7,doc8,doc9,doc10
calpurnia,0.0,0.781483,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
where,0.0,0.0,0.0,0.0,0.0,0.0,0.325248,0.325248,0.359756,0.359756
fear,0.0,0.0,0.0,0.0,0.0,0.0,0.427365,0.427365,0.0,0.472707
worser,0.289735,0.0,0.797516,0.508263,0.682869,0.0,0.0,0.0,0.0,0.0
in,0.0,0.0,0.0,0.0,0.0,0.0,0.325248,0.325248,0.359756,0.359756
rush,0.0,0.0,0.0,0.0,0.0,0.0,0.325248,0.325248,0.359756,0.359756
to,0.0,0.0,0.0,0.0,0.0,0.0,0.325248,0.325248,0.359756,0.359756
brutu,0.380701,0.408621,0.0,0.667839,0.0,0.0,0.0,0.0,0.0,0.0
caeser,0.219176,0.23525,0.0,0.384486,0.51657,0.446453,0.0,0.0,0.0,0.0
antoni,0.380701,0.408621,0.0,0.0,0.0,0.775474,0.0,0.0,0.0,0.0


In [85]:
# Fit scikit-learn's TF-IDF model on the preprocessed documents.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(documents)
# Transpose to a dense terms x documents array so each column is one document vector.
X = X.T.toarray()
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# replaces it (available since 1.0). Fall back to the old name on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
df = pd.DataFrame(X, index=feature_names)


In [102]:
def get_similar_articles(q, df, threshold=0.5):
    """Print the documents most similar to a query, plus the weight tables of its terms.

    The query is vectorized with the module-level ``vectorizer`` and compared
    with every column of ``df`` by cosine similarity. The per-term tables are
    read from the module-level ``tf``, ``w_tf``, ``tdf``, ``tf_idf`` and
    ``norm_tf_idf`` frames.

    Parameters
    ----------
    q : str
        Query text. It is passed to ``vectorizer.transform`` unchanged, so it
        should already be preprocessed the same way as the text the vectorizer
        was fitted on.
    df : pandas.DataFrame
        Term-by-document matrix: one row per vocabulary term, one column per
        document.
    threshold : float, optional
        Only documents whose similarity is strictly greater than this value
        are printed. Defaults to 0.5.

    Returns
    -------
    None
        All results are printed.
    """
    print("query:", q)

    # Convert the query into a vector over the same vocabulary as df's rows.
    q_vec = vectorizer.transform([q]).toarray().reshape(df.shape[0],)
    q_norm = np.linalg.norm(q_vec)

    # Cosine similarity: dot(d, q) / (|d| * |q|). The product of the norms must
    # be parenthesized; `a / b * c` would multiply by the query norm instead.
    sim = {}
    for doc_pos, column in enumerate(df.columns):
        doc_vec = df[column].values
        denominator = np.linalg.norm(doc_vec) * q_norm
        # An all-zero query or document vector has no direction: similarity 0.
        sim[doc_pos] = np.dot(doc_vec, q_vec) / denominator if denominator else 0.0

    # Most similar documents first; article numbers are 1-based.
    sim_sorted = sorted(sim.items(), key=lambda item: item[1], reverse=True)
    for doc_pos, score in sim_sorted:
        if score > threshold:
            print("Similarity value:", score)
            print("The article is:", doc_pos + 1)

    # Query terms present in the vocabulary, de-duplicated in query order.
    # A list is used because pandas no longer accepts a set as a .loc indexer.
    existing_terms = list(dict.fromkeys(term for term in q.split() if term in df.index))
    if existing_terms:
        print('\t\t\t\tf-raw')
        print(tf.loc[existing_terms].to_string(index=False))

        print('\t\t\t\tw tf(1+ log tf)')
        print(w_tf.loc[existing_terms].to_string(index=False))

        print('\t\t\t\tidf')
        print(tdf.loc[existing_terms].to_string(index=False))

        print('\t\t\t\ttf*idf')
        print(tf_idf.loc[existing_terms].to_string(index=False))

        print('\t\t\t\tnormalized')
        print(norm_tf_idf.loc[existing_terms].to_string(index=False))
    else:
        print("No matching terms found in the dataframe.")

# ...


In [170]:
q1 = 'rush'
get_similar_articles(q1, df)

query: rush
				f-raw
 doc1  doc2  doc3  doc4  doc5  doc6  doc7  doc8  doc9  doc10
    0     0     0     0     0     0     1     1     1      1
				w tf(1+ log tf)
 doc1  doc2  doc3  doc4  doc5  doc6  doc7  doc8  doc9  doc10
  0.0   0.0   0.0   0.0   0.0   0.0   1.0   1.0   1.0    1.0
				idf
df      idf
 4  0.39794
				tf*idf
doc1 doc2 doc3 doc4 doc5 doc6     doc7     doc8     doc9    doc10
   0    0    0    0    0    0  0.39794  0.39794  0.39794  0.39794
				normalized
 doc1  doc2  doc3  doc4  doc5  doc6      doc7      doc8      doc9     doc10
  0.0   0.0   0.0   0.0   0.0   0.0  0.325248  0.325248  0.359756  0.359756
