In [1]:
import os
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from natsort import natsorted
import math
import numpy as np
import pandas as pd

file_name = natsorted(os.listdir('file'))
stemmer = PorterStemmer()

## Tokenization and Stemming

In [2]:
def preprocessing(doc):
    token_docs = word_tokenize(doc)
    prepared_doc = []

    for term in token_docs:
        stemmed_term = stemmer.stem(term.lower())
        prepared_doc.append(stemmed_term)

    return prepared_doc


document_of_terms = []
for file in file_name:
    with open(f'file/{file}', 'r') as f:
        document = f.read()
    document_of_terms.append(preprocessing(document))

print('Terms after tokenization and stemming ')
print(document_of_terms,"\n")

Terms after tokenization and stemming 
[['antoni', 'brutu', 'caeser', 'cleopatra', 'merci', 'worser'], ['antoni', 'brutu', 'caeser', 'calpurnia'], ['merci', 'worser'], ['brutu', 'caeser', 'merci', 'worser'], ['caeser', 'merci', 'worser'], ['antoni', 'caeser', 'merci'], ['angel', 'fool', 'fear', 'in', 'rush', 'to', 'tread', 'where'], ['angel', 'fool', 'fear', 'in', 'rush', 'to', 'tread', 'where'], ['angel', 'fool', 'in', 'rush', 'to', 'tread', 'where'], ['fool', 'fear', 'in', 'rush', 'to', 'tread', 'where']] 



## Positional Index

In [3]:
document_number = 0
positional_index = {}

for document in document_of_terms:
    # For position and term in the tokens.
    for positional, term in enumerate(document):
        # If term already exists in the positional index dictionary.
        if term in positional_index:
            # Increment total freq by 1.
            positional_index[term][0] = positional_index[term][0] + 1

            # Check if the term has existed in that DocID before.
            if document_number in positional_index[term][1]:
                positional_index[term][1][document_number].append(positional)

            else:
                positional_index[term][1][document_number] = [positional]

        # If term does not exist in the positional index dictionary
        # (first encounter).
        else:
            # Initialize the list.
            positional_index[term] = []
            # The total frequency is 1.
            positional_index[term].append(1)
            # The postings list is initially empty.
            positional_index[term].append({})
            # Add doc ID to postings list.
            positional_index[term][1][document_number] = [positional]

    # Increment the file no. counter for document ID mapping
    document_number += 1

print('Positional index')
#print(positional_index)


positional_index_df = pd.DataFrame.from_dict(positional_index, orient='index', columns=['Total Frequency', 'Postings List'])

# Display the DataFrame
positional_index_df


Positional index


Unnamed: 0,Total Frequency,Postings List
antoni,3,"{0: [0], 1: [0], 5: [0]}"
brutu,3,"{0: [1], 1: [1], 3: [0]}"
caeser,5,"{0: [2], 1: [2], 3: [1], 4: [0], 5: [1]}"
cleopatra,1,{0: [3]}
merci,5,"{0: [4], 2: [0], 3: [2], 4: [1], 5: [2]}"
worser,4,"{0: [5], 2: [1], 3: [3], 4: [2]}"
calpurnia,1,{1: [3]}
angel,3,"{6: [0], 7: [0], 8: [0]}"
fool,4,"{6: [1], 7: [1], 8: [1], 9: [0]}"
fear,3,"{6: [2], 7: [2], 9: [1]}"


## Phrase Query

In [17]:
query = input('Input Phrase Query: ')


def query_input(q):
    lis = [[] for _ in range(10)]
    for term in preprocessing(query):
        if term in positional_index:
            for key in positional_index[term][1].keys():
                if not lis[key] or lis[key][-1] == positional_index[term][1][key][0] - 1:
                    lis[key].append(positional_index[term][1][key][0])

    positions = [f'doc{pos}' for pos, lst in enumerate(lis, start=1) if len(lst) == len(preprocessing(query))]
    return positions


print('++++++phrase query+++++')
print(query_input(query))

++++++phrase query+++++
['doc4']


### Term frequency

In [16]:
all_words = []
for doc in document_of_terms:
    for word in doc:
        all_words.append(word)


def get_term_freq(doc):
    words_found = dict.fromkeys(all_words, 0)
    for word in doc:
        words_found[word] += 1
    return words_found


term_freq = pd.DataFrame(get_term_freq(document_of_terms[0]).values(), index=get_term_freq(document_of_terms[0]).keys())
for i in range(1, len(document_of_terms)):
    term_freq[i] = get_term_freq(document_of_terms[i]).values()

term_freq.columns = ['doc' + str(i) for i in range(1, 11)]
print('TF')
term_freq

TF


Unnamed: 0,doc1,doc2,doc3,doc4,doc5,doc6,doc7,doc8,doc9,doc10
antoni,1,1,0,0,0,1,0,0,0,0
brutu,1,1,0,1,0,0,0,0,0,0
caeser,1,1,0,1,1,1,0,0,0,0
cleopatra,1,0,0,0,0,0,0,0,0,0
merci,1,0,1,1,1,1,0,0,0,0
worser,1,0,1,1,1,0,0,0,0,0
calpurnia,0,1,0,0,0,0,0,0,0,0
angel,0,0,0,0,0,0,1,1,1,0
fool,0,0,0,0,0,0,1,1,1,1
fear,0,0,0,0,0,0,1,1,0,1


## Print tables before Input Query

### Weighted tf(1+ log tf)

In [6]:
def get_weighted_term_freq(x):
    if x > 0:
        return math.log10(x) + 1
    return 0

w_tf =term_freq.copy()
for i in range(1, len(document_of_terms) + 1):
    w_tf['doc' + str(i)] = term_freq['doc' + str(i)].apply(get_weighted_term_freq)

print('Weighted TF')
w_tf

Weighted TF


Unnamed: 0,doc1,doc2,doc3,doc4,doc5,doc6,doc7,doc8,doc9,doc10
antoni,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
brutu,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
caeser,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
cleopatra,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
merci,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
worser,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
calpurnia,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
angel,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
fool,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
fear,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0


### Inverse Document Frequency


In [7]:
tfd = pd.DataFrame(columns=['freq', 'idf'])

for i in range(len(term_freq)):

    frequency = term_freq.iloc[i].values.sum()

    tfd.loc[i, 'freq'] = frequency

    tfd.loc[i, 'idf'] = math.log10(10 / (float(frequency)))

tfd.index = term_freq.index

print('IDF')
tfd

IDF


Unnamed: 0,freq,idf
antoni,3,0.522879
brutu,3,0.522879
caeser,5,0.30103
cleopatra,1,1.0
merci,5,0.30103
worser,4,0.39794
calpurnia,1,1.0
angel,3,0.522879
fool,4,0.39794
fear,3,0.522879


### TF.IDF

In [8]:
term_freq_inve_doc_freq = term_freq.multiply(tfd['idf'], axis=0)

print('TF.IDF')
term_freq_inve_doc_freq

TF.IDF


Unnamed: 0,doc1,doc2,doc3,doc4,doc5,doc6,doc7,doc8,doc9,doc10
antoni,0.522879,0.522879,0.0,0.0,0.0,0.522879,0.0,0.0,0.0,0.0
brutu,0.522879,0.522879,0.0,0.522879,0.0,0.0,0.0,0.0,0.0,0.0
caeser,0.30103,0.30103,0.0,0.30103,0.30103,0.30103,0.0,0.0,0.0,0.0
cleopatra,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
merci,0.30103,0.0,0.30103,0.30103,0.30103,0.30103,0.0,0.0,0.0,0.0
worser,0.39794,0.0,0.39794,0.39794,0.39794,0.0,0.0,0.0,0.0,0.0
calpurnia,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
angel,0.0,0.0,0.0,0.0,0.0,0.0,0.522879,0.522879,0.522879,0.0
fool,0.0,0.0,0.0,0.0,0.0,0.0,0.39794,0.39794,0.39794,0.39794
fear,0.0,0.0,0.0,0.0,0.0,0.0,0.522879,0.522879,0.0,0.522879


### Document length

In [9]:
document_length = pd.DataFrame()


def get_docs_length(col):
    return np.sqrt(term_freq_inve_doc_freq[col].apply(lambda x: x ** 2).sum())


for column in term_freq_inve_doc_freq.columns:
    document_length.loc[0, column + '_len'] = get_docs_length(column)

print('Document Length')
document_length

Document Length


Unnamed: 0,doc1_len,doc2_len,doc3_len,doc4_len,doc5_len,doc6_len,doc7_len,doc8_len,doc9_len,doc10_len
0,1.373462,1.279618,0.498974,0.782941,0.582747,0.67427,1.223496,1.223496,1.106137,1.106137


### Normalized TF.IDF

In [10]:
normalized_term_freq_idf = pd.DataFrame()


def get_normalized(col, x):
    try:
        return x / document_length[col + '_len'].values[0]
    except:
        return 0


for column in term_freq_inve_doc_freq.columns:
    normalized_term_freq_idf[column] = term_freq_inve_doc_freq[column].apply(lambda x: get_normalized(column, x))

print('Normalized TF.IDF')
normalized_term_freq_idf

Normalized TF.IDF


Unnamed: 0,doc1,doc2,doc3,doc4,doc5,doc6,doc7,doc8,doc9,doc10
antoni,0.380701,0.408621,0.0,0.0,0.0,0.775474,0.0,0.0,0.0,0.0
brutu,0.380701,0.408621,0.0,0.667839,0.0,0.0,0.0,0.0,0.0,0.0
caeser,0.219176,0.23525,0.0,0.384486,0.51657,0.446453,0.0,0.0,0.0,0.0
cleopatra,0.728087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
merci,0.219176,0.0,0.603298,0.384486,0.51657,0.446453,0.0,0.0,0.0,0.0
worser,0.289735,0.0,0.797516,0.508263,0.682869,0.0,0.0,0.0,0.0,0.0
calpurnia,0.0,0.781483,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
angel,0.0,0.0,0.0,0.0,0.0,0.0,0.427365,0.427365,0.472707,0.0
fool,0.0,0.0,0.0,0.0,0.0,0.0,0.325248,0.325248,0.359756,0.359756
fear,0.0,0.0,0.0,0.0,0.0,0.0,0.427365,0.427365,0.0,0.472707


## Input Query

### Boolean query

In [24]:
def boolean_search(query):
    operators = ["and", "or", "not"]

    def apply_operator(operands, operator):
        result = set(positional_index[operands[0]][1].keys())

        for operand in operands[1:]:
            if operator == "and":
                result &= set(positional_index[operand][1].keys())
            elif operator == "or":
                result |= set(positional_index[operand][1].keys())
            elif operator == "not":
                result -= set(positional_index[operand][1].keys())
        return list(result)

    def parse_query(subquery):
        for operator in operators:
            if operator in subquery:
                operands = subquery.split(operator)
                operands = [operand.strip() for operand in operands]
                return apply_operator(operands, operator)
        return list(set(positional_index[subquery.strip()][1].keys()))

    result_query = parse_query(query)
    positions = []
    for pos in result_query:     
            positions.append('document '+str(pos+1))
    return positions

def boolean_query_pre(query):
    token_docs = word_tokenize(query)
    prepared_doc = []

    for term in token_docs:
        stemmed_term = stemmer.stem(term.lower())
        prepared_doc.append(stemmed_term)

    return prepared_doc

variable = input('Enter your boolean query: ')
preprocessed_query_list = boolean_query_pre(variable)
query_string = ' '.join(preprocessed_query_list)
result = boolean_search(query_string)

print("Result of boolean search:", result)


Result of boolean search: ['document 1', 'document 2']


## Query

In [25]:
def insert_query(q):
    query = pd.DataFrame(index=normalized_term_freq_idf.index)
    query['tf'] = [1 if x in preprocessing(q) else 0 for x in list(normalized_term_freq_idf.index)]
    query['w_tf'] = query['tf'].apply(lambda x : get_weighted_term_freq(x))
    product = normalized_term_freq_idf.multiply(query['w_tf'], axis=0)
    query['idf'] = tfd['idf'] * query['w_tf']
    query['tf_idf'] = query['w_tf'] * query['idf']
    query['normalized'] = 0
    for i in range(len(query)):
        query['normalized'].iloc[i] = float(query['idf'].iloc[i]) / math.sqrt(sum(query['idf'].values**2))
    print('Query Details')
    print(query.loc[preprocessing(q)])
    product2 = product.multiply(query['normalized'], axis=0)
    scores = {}
    for col in product2.columns:
        if 0 in product2[col].loc[preprocessing(q)].values:
            pass
        else:
            scores[col] = product2[col].sum()
    product_result = product2[list(scores.keys())].loc[preprocessing(q)]
    print()
    print('Product (query*matched doc)')
    print(product_result)
    print()
    print('product sum')
    print(product_result.sum())
    print()
    print('Query Length')
    q_len = math.sqrt(sum([x**2 for x in query['idf'].loc[preprocessing(q)]]))
    print(q_len)
    print()
    print('Cosine Simliarity')
    print(product_result.sum())
    print()
    sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    print('Returned docs')
    for typle in sorted_scores:
        print(typle[0])

q = input('Input Query for print Query details and matched document: ')
insert_query(q)

Query Details
        tf  w_tf       idf    tf_idf  normalized
antoni   1   1.0  0.522879  0.522879    0.707107
brutu    1   1.0  0.522879  0.522879    0.707107

Product (query*matched doc)
            doc1      doc2
antoni  0.269196  0.288939
brutu   0.269196  0.288939

product sum
doc1    0.538393
doc2    0.577877
dtype: float64

Query Length
0.7394622130520805

Cosine Simliarity
doc1    0.538393
doc2    0.577877
dtype: float64

Returned docs
doc2
doc1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  query['normalized'].iloc[i] = float(query['idf'].iloc[i]) / math.sqrt(sum(query['idf'].values**2))
  query['normalized'].iloc[i] = float(query['idf'].iloc[i]) / math.sqrt(sum(query['idf'].values**2))
